Skip to content

Commit 9125424

Browse files
authored
Update spellcheck.yml
1 parent 6f809e6 commit 9125424

File tree

1 file changed

+21
-27
lines changed

1 file changed

+21
-27
lines changed

.github/workflows/spellcheck.yml

+21 -27
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,6 @@ jobs:
1919
run: |
2020
pip install codespell
2121
pip install fuzzywuzzy[speedup]
22-
pip install nltk
2322
2423
- name: Verify Spellcheck Ignore List Exists
2524
run: |
@@ -31,18 +30,15 @@ jobs:
3130
run: |
3231
set -e # Exit on error
3332
34-
# Run codespell on the full repo, save raw report, and prevent auto-fixes
33+
# Run codespell to detect misspellings but do not auto-correct
3534
codespell --ignore-words=.github/spellcheck-ignore.txt \
3635
--skip=".git,*.lock,*.json,*.yaml,*.yml,*.css,*.html" \
3736
--quiet-level=2 > spellcheck_report_raw.txt || true
3837
3938
# Process corrections with Python
4039
python3 <<EOF
4140
import re
42-
import nltk
4341
from fuzzywuzzy import process
44-
from nltk.tokenize import word_tokenize
45-
nltk.download('punkt')
4642
4743
# Load spellcheck ignore list with case sensitivity
4844
ignore_list = {}
@@ -56,9 +52,7 @@ jobs:
5652
"identity provider": ["identiy provider", "identify provider"],
5753
"access token": ["access toekn", "acess token"],
5854
"user authentication": ["user authentification", "user authenthication"],
59-
"API gateway": ["API getway", "API gatway"],
60-
"default reporter": ["defaul reporter"],
61-
"default identity provider": ["defaul identity provider"]
55+
"API gateway": ["API getway", "API gatway"]
6256
}
6357
6458
# Function to check if a word is inside a code block, backticks, URL, or file reference
@@ -76,14 +70,14 @@ jobs:
7670
# Must be at least 90% similar to be considered a match
7771
if score < 90:
7872
return False
79-
73+
8074
# Reject if original contains best_match as a substring (e.g., "certifcate" vs "CE")
8175
if best_match in original and len(original) > len(best_match):
8276
return False
8377
8478
# Enforce case-sensitive corrections for regular text, but NOT for file references/URLs/links
8579
if not is_code_or_url_or_file(line) and not is_markdown_link(line, original):
86-
return True if best_match in ignore_list else False
80+
return best_match in ignore_list
8781
8882
# Allow case-insensitive corrections for code blocks, backticks, URLs, and markdown links
8983
return best_match.lower() in ignore_list
@@ -94,7 +88,7 @@ jobs:
9488
for wrong_phrase in wrong_variants:
9589
if wrong_phrase in line:
9690
return line.replace(wrong_phrase, correct_phrase)
97-
return line.replace(original, suggestion, 1) # Replace only the first occurrence
91+
return line.replace(original, suggestion)
9892
9993
# Process spellcheck output and apply fixes
10094
with open("spellcheck_report_raw.txt", "r", encoding="utf-8") as infile, open("spellcheck_report.txt", "w", encoding="utf-8") as outfile:
@@ -109,25 +103,23 @@ jobs:
109103
content_lines = file.readlines()
110104
context_line = content_lines[int(line_number) - 1].strip()
111105
112-
# Preserve case-sensitive ignored terms exactly
113-
if original.lower() in ignore_list and any(c.isupper() for c in original):
114-
corrected_word = ignore_list[original.lower()]
115-
116-
# Tokenize line to avoid replacing partial words
117-
words = word_tokenize(context_line)
106+
# **Fix #1: Preserve case-sensitive ignored terms exactly**
107+
if original in ignore_list.values():
108+
corrected_word = original # Use exact case from ignore list
118109
119-
# Ensure weak matches use the English dictionary
120-
if not should_use_ignore_list(original, suggestion, context_line):
121-
corrected_word = suggestion # Use dictionary match
110+
# **Fix #2: Use English dictionary for weak matches**
111+
elif should_use_ignore_list(original, suggestion, context_line):
112+
best_match, _ = process.extractOne(original, ignore_list.keys())
113+
if not is_code_or_url_or_file(context_line) and not is_markdown_link(context_line, original):
114+
corrected_word = ignore_list[best_match]
115+
else:
116+
corrected_word = best_match.lower() # Keep it lowercase in URLs/links/files
122117
123-
# Apply corrections **only** to the first exact word match, not substrings
124-
for i, word in enumerate(words):
125-
if word.lower() == original.lower():
126-
words[i] = corrected_word
127-
break
118+
# **Fix #3: Apply context-based correction**
119+
corrected_line = apply_context_based_correction(context_line, original, corrected_word)
128120
129-
# Rebuild corrected line
130-
corrected_line = " ".join(words)
121+
# **Fix #4: Replace only the first occurrence of the word**
122+
corrected_line = re.sub(r'\b' + re.escape(original) + r'\b', corrected_word, corrected_line, count=1)
131123
132124
# Write final output
133125
outfile.write(f"{file_path}:{line_number}: {original} ==> {corrected_word}\n")
@@ -156,11 +148,13 @@ jobs:
156148
BRANCH_NAME="spellcheck-fixes-$(date +%s)"
157149
git checkout -b $BRANCH_NAME
158150
151+
# Commit the changes if there are any
159152
if [ -n "$(git status --porcelain)" ]; then
160153
git add .
161154
git commit -m "Spellcheck: Automatically fixed detected misspellings"
162155
git push origin $BRANCH_NAME
163156
157+
# Create PR using GitHub CLI
164158
gh pr create \
165159
--base main \
166160
--head $BRANCH_NAME \

0 commit comments

Comments
 (0)