Why not use an available solution? They all use ChatGPT, and I’m out of credits ;) And of course, I want to learn something!
How to generate a git commit message?
Git allows you to create hooks. Let’s use a global one: global hooks work without modifying every git repo.
Create a directory for hooks:
$ mkdir -p ~/.config/git/hooks/
Let git know where the hooks are:
$ git config --global core.hooksPath ~/.config/git/hooks/
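You can double-check that git picked it up; this should print the path you just set:
$ git config --global --get core.hooksPath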
Long story short, the prepare-commit-msg hook is the one we need. The file we need to update is passed as the first parameter. Create a simple script at ~/.config/git/hooks/prepare-commit-msg:
#!/bin/sh
echo "Fancy commit message" > $1
Make it executable:
$ chmod +x ~/.config/git/hooks/prepare-commit-msg
Does it work? Let’s commit something … Yep, our message shows up in the commit message.
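If you want to try it yourself, stage any change (the file name here is just an example) and commit; the editor should open with our message already in place:

$ echo test >> README.md
$ git add README.md
$ git commit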
Let’s generate something:
Generating commit message
Let’s build something that works offline. AI? Yes, let’s use AI!
We need a model, right?
Let’s look at Hugging Face!
There it is: https://huggingface.co/mamiksik/T5-commit-message-generation but there are no docs :( If you look deeper, though, you’ll find https://huggingface.co/spaces/mamiksik/commit-message-generator
We can use its https://huggingface.co/spaces/mamiksik/commit-message-generator/blob/main/app.py with a few modifications.
Since a hook can be any executable script, let’s use Python.
Let’s take a look at what’s there:
import re

import gradio as gr
import torch
from transformers import T5ForConditionalGeneration, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("mamiksik/CommitPredictorT5PL", revision="fb08d01")
model = T5ForConditionalGeneration.from_pretrained("mamiksik/CommitPredictorT5PL", revision="fb08d01")


def parse_files(patch):
    accumulator = []
    lines = patch.splitlines()

    filename_before = None
    for line in lines:
        if line.startswith("index") or line.startswith("diff"):
            continue
        if line.startswith("---"):
            filename_before = line.split(" ", 1)[1][1:]
            continue

        if line.startswith("+++"):
            filename_after = line.split(" ", 1)[1][1:]
            if filename_before == filename_after:
                accumulator.append(f"<ide><path>{filename_before}")
            else:
                accumulator.append(f"<add><path>{filename_after}")
                accumulator.append(f"<del><path>{filename_before}")
            continue

        line = re.sub("@@[^@@]*@@", "", line)
        if len(line) == 0:
            continue

        if line[0] == "+":
            line = line.replace("+", "<add>", 1)
        elif line[0] == "-":
            line = line.replace("-", "<del>", 1)
        else:
            line = f"<ide>{line}"

        accumulator.append(line)

    return '\n'.join(accumulator)


def predict(patch, max_length, min_length, num_beams, prediction_count):
    input_text = parse_files(patch)
    with torch.no_grad():
        token_count = tokenizer(input_text, return_tensors="pt").input_ids.shape[1]

        input_ids = tokenizer(
            input_text,
            truncation=True,
            padding=True,
            return_tensors="pt",
        ).input_ids

        outputs = model.generate(
            input_ids,
            max_length=max_length,
            min_length=min_length,
            num_beams=num_beams,
            num_return_sequences=prediction_count,
        )

    result = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return token_count, input_text, {k: 0 for k in result}


iface = gr.Interface(fn=predict, inputs=[
    gr.Textbox(label="Patch (as generated by git diff)"),
    gr.Slider(1, 128, value=40, label="Max message length"),
    gr.Slider(1, 128, value=5, label="Min message length"),
    gr.Slider(1, 10, value=7, label="Number of beams"),
    gr.Slider(1, 15, value=5, label="Number of predictions"),
], outputs=[
    gr.Textbox(label="Token count"),
    gr.Textbox(label="Parsed patch"),
    gr.Label(label="Predictions")
], examples=[
    ["""
diff --git a/.github/workflows/pylint.yml b/.github/workflows/codestyle_checks.yml
similarity index 86%
rename from .github/workflows/pylint.yml
rename to .github/workflows/codestyle_checks.yml
index a5d5c4d9..8cbf9713 100644
--- a/.github/workflows/pylint.yml
+++ b/.github/workflows/codestyle_checks.yml
@@ -20,3 +20,6 @@ jobs:
     - name: Analysing the code with pylint
       run: |
         pylint --rcfile=.pylintrc webapp core
+    - name: Analysing the code with flake8
+      run: |
+        flake8
""", 40, 5, 7, 5]
])

if __name__ == "__main__":
    iface.launch()
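To get a feel for what parse_files actually feeds the model, here’s a quick sanity check with a made-up one-line diff (hello.py and the expected output in the comments are my own illustration, not from the space):

sample = """diff --git a/hello.py b/hello.py
index 1234567..89abcde 100644
--- a/hello.py
+++ b/hello.py
@@ -1 +1 @@
-print("hello")
+print("hello, world")
"""

print(parse_files(sample))
# <ide><path>/hello.py   (note the leading slash: [1:] only strips the 'a')
# <del>print("hello")
# <add>print("hello, world")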
Everything we need is here! We need to:
- fetch the commit message file to update
- fetch the git diff
- use the script above to make predictions
- prepend the generated message to the commit message file
The file we need to update is passed as the first parameter, so sys.argv[1] gives us its path.
Heh, that was easy.
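One thing worth knowing (my addition, the original ignores it): git passes a second argument too, the source of the commit message. A minimal sketch of a guard that skips generation when the user already supplied a message with -m or -F:

import sys

# argv[1] is the message file; argv[2] (optional) is the commit source:
# 'message', 'template', 'merge', 'squash' or 'commit'
if len(sys.argv) > 2 and sys.argv[2] == 'message':
    sys.exit(0)  # the user already wrote a message; don't touch it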
Fetch git diff
import subprocess

diff = subprocess.run(['git', 'diff', '--cached'], capture_output=True).stdout.decode('utf-8')
Easy peasy!
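One caveat the original doesn’t handle: with nothing staged, git diff --cached returns an empty string and the model has nothing to work with. A guard is cheap (my addition):

import sys

if not diff.strip():
    sys.exit(0)  # nothing staged; keep git's default message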
Use the script to make predictions
max_message = 40
min_message = 5
num_beams = 10
num_predictions = 1

msg = predict(diff, max_message, min_message, num_beams, num_predictions)
Prepend our message to the commit message file
with open(sys.argv[1], 'r+') as f:
    content = f.read()  # remember whatever git already put there
    f.seek(0)
    f.write(msg + '\n' + content)
And that’s it. With a few small cleanups, this is our final script.
#!/usr/bin/env python
# Print before the heavy imports so there's immediate feedback.
print("Generating commit message", end="", flush=True)

import sys
import re
import subprocess

import torch
from transformers import T5ForConditionalGeneration, RobertaTokenizer


def parse_files(patch):
    accumulator = []
    lines = patch.splitlines()

    filename_before = None
    for line in lines:
        print(".", end="", flush=True)
        if line.startswith("index") or line.startswith("diff"):
            continue
        if line.startswith("---"):
            filename_before = line.split(" ", 1)[1][1:]
            continue

        if line.startswith("+++"):
            filename_after = line.split(" ", 1)[1][1:]
            if filename_before == filename_after:
                accumulator.append(f"<ide><path>{filename_before}")
            else:
                accumulator.append(f"<add><path>{filename_after}")
                accumulator.append(f"<del><path>{filename_before}")
            continue

        line = re.sub("@@[^@@]*@@", "", line)
        if len(line) == 0:
            continue

        if line[0] == "+":
            line = line.replace("+", "<add>", 1)
        elif line[0] == "-":
            line = line.replace("-", "<del>", 1)
        else:
            line = f"<ide>{line}"

        accumulator.append(line)

    return '\n'.join(accumulator)


def predict(patch, max_length, min_length, num_beams, prediction_count):
    print(".", end="", flush=True)
    input_text = parse_files(patch)

    tokenizer = RobertaTokenizer.from_pretrained("mamiksik/CommitPredictorT5PL", revision="fb08d01", low_cpu_mem_usage=True)
    print(".", end="", flush=True)
    model = T5ForConditionalGeneration.from_pretrained("mamiksik/CommitPredictorT5PL", revision="fb08d01", low_cpu_mem_usage=True)
    print(".", end="", flush=True)

    with torch.no_grad():
        input_ids = tokenizer(
            input_text,
            truncation=True,
            padding=True,
            return_tensors="pt",
        ).input_ids
        print(".", end="", flush=True)

        outputs = model.generate(
            input_ids,
            max_length=max_length,
            min_length=min_length,
            num_beams=num_beams,
            num_return_sequences=prediction_count,
        )
        print(".", end="", flush=True)

    result = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return result[0]


if __name__ == "__main__":
    diff = subprocess.run(['git', 'diff', '--cached'], capture_output=True).stdout.decode('utf-8')

    max_message = 40
    min_message = 5
    num_beams = 10
    num_predictions = 1

    msg = predict(diff, max_message, min_message, num_beams, num_predictions)

    # Prepend the generated message to whatever git already put in the file.
    with open(sys.argv[1], 'r+') as f:
        content = f.read()
        f.seek(0)
        f.write(msg + '\n' + content)

    print("Done!\n")
It’s fast on CPU, but loading the model takes most of the time. Anyway, 3s is OK. That’s all. It works. At least for me.