19
19
run : |
20
20
pip install codespell
21
21
pip install fuzzywuzzy[speedup]
22
- pip install nltk
23
22
24
23
- name : Verify Spellcheck Ignore List Exists
25
24
run : |
@@ -31,18 +30,15 @@ jobs:
31
30
run : |
32
31
set -e # Exit on error
33
32
34
- # Run codespell on the full repo, save raw report, and prevent auto-fixes
33
+ # Run codespell to detect misspellings but do not auto-correct
35
34
codespell --ignore-words=.github/spellcheck-ignore.txt \
36
35
--skip=".git,*.lock,*.json,*.yaml,*.yml,*.css,*.html" \
37
36
--quiet-level=2 > spellcheck_report_raw.txt || true
38
37
39
38
# Process corrections with Python
40
39
python3 <<EOF
41
40
import re
42
- import nltk
43
41
from fuzzywuzzy import process
44
- from nltk.tokenize import word_tokenize
45
- nltk.download('punkt')
46
42
47
43
# Load spellcheck ignore list with case sensitivity
48
44
ignore_list = {}
56
52
"identity provider": ["identiy provider", "identify provider"],
57
53
"access token": ["access toekn", "acess token"],
58
54
"user authentication": ["user authentification", "user authenthication"],
59
- "API gateway": ["API getway", "API gatway"],
60
- "default reporter": ["defaul reporter"],
61
- "default identity provider": ["defaul identity provider"]
55
+ "API gateway": ["API getway", "API gatway"]
62
56
}
63
57
64
58
# Function to check if a word is inside a code block, backticks, URL, or file reference
@@ -76,14 +70,14 @@ jobs:
76
70
# Must be at least 90% similar to be considered a match
77
71
if score < 90:
78
72
return False
79
-
73
+
80
74
# Reject if original contains best_match as a substring (e.g., "certifcate" vs "CE")
81
75
if best_match in original and len(original) > len(best_match):
82
76
return False
83
77
84
78
# Enforce case-sensitive corrections for regular text, but NOT for file references/URLs/links
85
79
if not is_code_or_url_or_file(line) and not is_markdown_link(line, original):
86
- return True if best_match in ignore_list else False
80
+ return best_match in ignore_list
87
81
88
82
# Allow case-insensitive corrections for code blocks, backticks, URLs, and markdown links
89
83
return best_match.lower() in ignore_list
94
88
for wrong_phrase in wrong_variants:
95
89
if wrong_phrase in line:
96
90
return line.replace(wrong_phrase, correct_phrase)
97
- return line.replace(original, suggestion, 1) # Replace only the first occurrence
91
+ return line.replace(original, suggestion)
98
92
99
93
# Process spellcheck output and apply fixes
100
94
with open("spellcheck_report_raw.txt", "r", encoding="utf-8") as infile, open("spellcheck_report.txt", "w", encoding="utf-8") as outfile:
@@ -109,25 +103,23 @@ jobs:
109
103
content_lines = file.readlines()
110
104
context_line = content_lines[int(line_number) - 1].strip()
111
105
112
- # Preserve case-sensitive ignored terms exactly
113
- if original.lower() in ignore_list and any(c.isupper() for c in original):
114
- corrected_word = ignore_list[original.lower()]
115
-
116
- # Tokenize line to avoid replacing partial words
117
- words = word_tokenize(context_line)
106
+ # **Fix #1: Preserve case-sensitive ignored terms exactly**
107
+ if original in ignore_list.values():
108
+ corrected_word = original # Use exact case from ignore list
118
109
119
- # Ensure weak matches use the English dictionary
120
- if not should_use_ignore_list(original, suggestion, context_line):
121
- corrected_word = suggestion # Use dictionary match
110
+ # **Fix #2: Use English dictionary for weak matches**
111
+ elif should_use_ignore_list(original, suggestion, context_line):
112
+ best_match, _ = process.extractOne(original, ignore_list.keys())
113
+ if not is_code_or_url_or_file(context_line) and not is_markdown_link(context_line, original):
114
+ corrected_word = ignore_list[best_match]
115
+ else:
116
+ corrected_word = best_match.lower() # Keep it lowercase in URLs/links/files
122
117
123
- # Apply corrections **only** to the first exact word match, not substrings
124
- for i, word in enumerate(words):
125
- if word.lower() == original.lower():
126
- words[i] = corrected_word
127
- break
118
+ # **Fix #3: Apply context-based correction**
119
+ corrected_line = apply_context_based_correction(context_line, original, corrected_word)
128
120
129
- # Rebuild corrected line
130
- corrected_line = " ".join(words )
121
+ # **Fix #4: Replace only the first occurrence of the word**
122
+ corrected_line = re.sub(r'\b' + re.escape(original) + r'\b', corrected_word, corrected_line, count=1 )
131
123
132
124
# Write final output
133
125
outfile.write(f"{file_path}:{line_number}: {original} ==> {corrected_word}\n")
@@ -156,11 +148,13 @@ jobs:
156
148
BRANCH_NAME="spellcheck-fixes-$(date +%s)"
157
149
git checkout -b $BRANCH_NAME
158
150
151
+ # Commit the changes if there are any
159
152
if [ -n "$(git status --porcelain)" ]; then
160
153
git add .
161
154
git commit -m "Spellcheck: Automatically fixed detected misspellings"
162
155
git push origin $BRANCH_NAME
163
156
157
+ # Create PR using GitHub CLI
164
158
gh pr create \
165
159
--base main \
166
160
--head $BRANCH_NAME \
0 commit comments