-
Notifications
You must be signed in to change notification settings - Fork 14
159 lines (127 loc) · 6.75 KB
/
spellcheck.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
---
# Manually-triggered spellcheck workflow: runs codespell, applies
# context-aware fixes via an inline Python script, and opens a PR
# with the corrections on a fresh branch.
name: Manual Spellcheck & Auto PR

on:
  workflow_dispatch:  # Runs only when manually triggered

permissions:
  contents: write        # Needed to push changes to a new branch
  pull-requests: write   # Needed to create a PR

jobs:
  spellcheck:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4

      - name: Install Dependencies
        run: |
          pip install codespell
          pip install fuzzywuzzy[speedup]
          pip install nltk
          python3 -c "import nltk; nltk.download('punkt'); nltk.data.path.append('/usr/share/nltk_data')"

      - name: Verify Spellcheck Ignore List Exists
        run: |
          if [ ! -f .github/spellcheck-ignore.txt ]; then
            echo "Error: spellcheck-ignore.txt not found!" && exit 1
          fi

      - name: Run Spellcheck and Apply Fixes
        run: |
          set -e  # Exit on error

          # Run codespell and save output. "|| true" because a non-zero
          # exit only means misspellings were found, not a failure.
          codespell --ignore-words=.github/spellcheck-ignore.txt \
            --skip=".git,*.lock,*.json,*.yaml,*.yml,*.css,*.html" \
            --quiet-level=2 > spellcheck_report_raw.txt || true

          # Process corrections with Python. The heredoc delimiter is
          # quoted ('EOF') so the shell does NOT perform backtick/$
          # expansion on the embedded Python source (the regex below
          # contains literal backticks).
          python3 <<'EOF'
          import re
          import nltk
          from nltk.tokenize import sent_tokenize
          from fuzzywuzzy import process

          # Ensure 'punkt' is available, and force-load the tokenizer
          # up front to avoid lazy lookup errors later.
          nltk.download('punkt', quiet=True)
          nltk.data.path.append('/usr/share/nltk_data')
          _ = nltk.data.load('tokenizers/punkt/english.pickle')

          # Load spellcheck ignore list, keyed lowercase -> correct case.
          ignore_list = {}
          with open(".github/spellcheck-ignore.txt", "r", encoding="utf-8") as f:
              for line in f:
                  word = line.strip()
                  ignore_list[word.lower()] = word

          def is_code_or_url_or_file(line):
              # Line contains inline code (backticks), a URL, or a file path.
              return re.search(r'`.*?`|https?://\S+|www\.\S+|/[\w./-]+', line)

          def is_markdown_link(line, original):
              # The misspelled word appears inside a Markdown link.
              return re.search(r'\[.*?\]\(.*' + re.escape(original) + r'.*\)', line)

          def should_use_ignore_list(original, suggestion, line):
              # Only substitute an ignore-list term on a strong fuzzy match.
              best_match, score = process.extractOne(original, ignore_list.keys())
              # Must be at least 90% similar to be considered a match.
              if score < 90:
                  return False
              # Reject if original merely contains best_match as a substring
              # (e.g. "certifcate" vs "CE").
              if best_match in original and len(original) > len(best_match):
                  return False
              return True

          # Process spellcheck output and apply fixes in place.
          with open("spellcheck_report_raw.txt", "r", encoding="utf-8") as infile, open("spellcheck_report.txt", "w", encoding="utf-8") as outfile:
              for line in infile:
                  match = re.match(r"(.*):(\d+): (\S+) ==> (\S+)", line)
                  if not match:
                      continue
                  file_path, line_number, original, suggestion = match.groups()
                  corrected_word = suggestion

                  # Read the full offending line from the target file.
                  with open(file_path, "r", encoding="utf-8") as file:
                      content_lines = file.readlines()
                  context_line = content_lines[int(line_number) - 1].strip()

                  # Tokenize the line for context-based correction.
                  sentences = sent_tokenize(context_line)
                  relevant_sentence = next((s for s in sentences if original in s), context_line)

                  # Fix #1: case-sensitive correction for ignore-list terms.
                  if original.lower() in ignore_list:
                      if is_code_or_url_or_file(relevant_sentence) or is_markdown_link(relevant_sentence, original):
                          corrected_word = original.lower()  # keep lowercase in URLs, links, or file paths
                      else:
                          corrected_word = ignore_list[original.lower()]  # exact case from ignore list
                  # Fix #2: reject weak matches and default to the dictionary suggestion.
                  elif should_use_ignore_list(original, suggestion, relevant_sentence):
                      best_match, _ = process.extractOne(original, ignore_list.keys())
                      corrected_word = ignore_list[best_match]

                  # Fix #3: apply the correction within the sentence context.
                  relevant_sentence = re.sub(r'\b' + re.escape(original) + r'\b', corrected_word, relevant_sentence, count=1)
                  # Fix #4: ensure no doubled punctuation is introduced.
                  relevant_sentence = relevant_sentence.replace("..", ".").replace(",.", ".")

                  # Write final report entry.
                  outfile.write(f"{file_path}:{line_number}: {original} ==> {corrected_word}\n")

                  # NOTE(review): this replaces the whole source line with the
                  # corrected sentence, dropping the line's leading whitespace
                  # and any sibling sentences on the same line — confirm this
                  # is acceptable for the targeted Markdown files.
                  content_lines[int(line_number) - 1] = relevant_sentence + "\n"
                  with open(file_path, "w", encoding="utf-8") as file:
                      file.writelines(content_lines)
          EOF

          # Check if any tracked files were modified.
          if git status --porcelain | grep -q '^ M'; then
            echo "Corrections applied. Preparing to create PR."
          else
            echo "No spelling corrections found. Exiting."
            exit 0
          fi

      - name: Create Pull Request with Corrections
        env:
          GITHUB_TOKEN: ${{ secrets.PAT_GITHUB_ACTIONS }}
        run: |
          # Canonical bot identity (the noreply domain in the scraped copy
          # was a mirror-rewritten artifact; github.com is correct).
          git config --global user.name "github-actions[bot]"
          git config --global user.email "github-actions[bot]@users.noreply.github.com"

          BRANCH_NAME="spellcheck-fixes-$(date +%s)"
          git checkout -b "$BRANCH_NAME"

          # Commit the changes if there are any.
          if [ -n "$(git status --porcelain)" ]; then
            git add .
            git commit -m "Spellcheck: Automatically fixed detected misspellings"
            git push origin "$BRANCH_NAME"

            # Create PR using GitHub CLI (gh reads GITHUB_TOKEN from env).
            gh pr create \
              --base main \
              --head "$BRANCH_NAME" \
              --title "Spellcheck Fixes" \
              --body "This PR contains automatically applied spelling corrections."
          else
            echo "No changes detected. Skipping PR creation."
          fi