Skip to content

Commit 93d37d7

Browse files
authored
Update spellcheck.yml
1 parent bd9ed2a commit 93d37d7

File tree

1 file changed: +29 additions, −41 deletions

.github/workflows/spellcheck.yml

+29-41
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ jobs:
1919
run: |
2020
pip install codespell
2121
pip install fuzzywuzzy[speedup]
22+
pip install nltk
23+
python3 -c "import nltk; nltk.download('punkt')"
2224
2325
- name: Verify Spellcheck Ignore List Exists
2426
run: |
@@ -30,14 +32,16 @@ jobs:
3032
run: |
3133
set -e # Exit on error
3234
33-
# Run codespell to detect misspellings but do not auto-correct
35+
# Run codespell and save output
3436
codespell --ignore-words=.github/spellcheck-ignore.txt \
3537
--skip=".git,*.lock,*.json,*.yaml,*.yml,*.css,*.html" \
3638
--quiet-level=2 > spellcheck_report_raw.txt || true
3739
3840
# Process corrections with Python
3941
python3 <<EOF
4042
import re
43+
import nltk
44+
from nltk.tokenize import sent_tokenize
4145
from fuzzywuzzy import process
4246
4347
# Load spellcheck ignore list with case sensitivity
@@ -47,47 +51,27 @@ jobs:
4751
word = line.strip()
4852
ignore_list[word.lower()] = word # Store lowercase -> correct-case
4953
50-
# Common word pairs and phrases to check for context-based correction
51-
common_phrases = {
52-
"identity provider": ["identiy provider", "identify provider"],
53-
"access token": ["access toekn", "acess token"],
54-
"user authentication": ["user authentification", "user authenthication"],
55-
"API gateway": ["API getway", "API gatway"]
56-
}
57-
5854
# Function to check if a word is inside a code block, backticks, URL, or file reference
5955
def is_code_or_url_or_file(line):
60-
return bool(re.search(r'`.*?`|https?://\S+|www\.\S+|/[\w./-]+', line))
56+
return re.search(r'`.*?`|https?://\S+|www\.\S+|/[\w./-]+', line)
6157
6258
# Function to check if a word is part of a Markdown link
6359
def is_markdown_link(line, original):
64-
return bool(re.search(r'\[.*?\]\(.*' + re.escape(original) + r'.*\)', line))
60+
return re.search(r'\[.*?\]\(.*' + re.escape(original) + r'.*\)', line)
6561
6662
# Function to determine if an ignore list word should be used
6763
def should_use_ignore_list(original, suggestion, line):
68-
best_match, score = process.extractOne(original, ignore_list.keys(), scorer=lambda x, y: sum(c1 == c2 for c1, c2 in zip(x, y)))
69-
64+
best_match, score = process.extractOne(original, ignore_list.keys())
65+
7066
# Must be at least 90% similar to be considered a match
7167
if score < 90:
7268
return False
7369
74-
# Reject if original contains best_match as a substring
70+
# Reject if original contains best_match as a substring (e.g., "certifcate" vs "CE")
7571
if best_match in original and len(original) > len(best_match):
7672
return False
7773
78-
# Enforce case-sensitive corrections for regular text, but lowercase for files/URLs
79-
if not is_code_or_url_or_file(line) and not is_markdown_link(line, original):
80-
return best_match in ignore_list
81-
82-
return best_match.lower() in ignore_list
83-
84-
# Function to apply context-based correction
85-
def apply_context_based_correction(line, original, suggestion):
86-
for correct_phrase, wrong_variants in common_phrases.items():
87-
for wrong_phrase in wrong_variants:
88-
if wrong_phrase in line:
89-
return line.replace(wrong_phrase, correct_phrase)
90-
return re.sub(r'\b' + re.escape(original) + r'\b', suggestion, line, count=1)
74+
return True
9175
9276
# Process spellcheck output and apply fixes
9377
with open("spellcheck_report_raw.txt", "r", encoding="utf-8") as infile, open("spellcheck_report.txt", "w", encoding="utf-8") as outfile:
@@ -97,34 +81,38 @@ jobs:
9781
file_path, line_number, original, suggestion = match.groups()
9882
corrected_word = suggestion
9983
100-
# Read the line content from the file
84+
# Read the full line from the file
10185
with open(file_path, "r", encoding="utf-8") as file:
10286
content_lines = file.readlines()
10387
context_line = content_lines[int(line_number) - 1].strip()
10488
105-
# Fix #1: Ensure case-sensitive corrections match exactly
89+
# Tokenize the sentence for context-based correction
90+
sentences = sent_tokenize(context_line)
91+
relevant_sentence = next((s for s in sentences if original in s), context_line)
92+
93+
# **Fix #1: Case-sensitive correction for ignore list terms**
10694
if original.lower() in ignore_list:
107-
corrected_word = ignore_list[original.lower()]
95+
if is_code_or_url_or_file(relevant_sentence) or is_markdown_link(relevant_sentence, original):
96+
corrected_word = original.lower() # Keep lowercase in URLs, links, or file paths
97+
else:
98+
corrected_word = ignore_list[original.lower()] # Use exact case from ignore list
10899
109-
# Fix #2: Use English dictionary for weak matches
110-
elif should_use_ignore_list(original, suggestion, context_line):
100+
# **Fix #2: Reject weak matches and default to the English dictionary**
101+
elif should_use_ignore_list(original, suggestion, relevant_sentence):
111102
best_match, _ = process.extractOne(original, ignore_list.keys())
112-
corrected_word = ignore_list[best_match] if not is_code_or_url_or_file(context_line) else best_match.lower()
113-
114-
# Fix #3: Apply context-based correction
115-
corrected_line = apply_context_based_correction(context_line, original, corrected_word)
103+
corrected_word = ignore_list[best_match]
116104
117-
# Fix #4: Replace only the first occurrence of the word
118-
corrected_line = re.sub(r'\b' + re.escape(original) + r'\b', corrected_word, corrected_line, count=1)
105+
# **Fix #3: Apply corrections based on full sentence**
106+
relevant_sentence = re.sub(r'\b' + re.escape(original) + r'\b', corrected_word, relevant_sentence, count=1)
119107
120-
# Debugging Output
121-
print(f"🔍 Correction: {original} -> {corrected_word} in {file_path}:{line_number}")
108+
# **Fix #4: Ensure no extra punctuation is introduced**
109+
relevant_sentence = relevant_sentence.replace("..", ".").replace(",.", ".")
122110
123111
# Write final output
124112
outfile.write(f"{file_path}:{line_number}: {original} ==> {corrected_word}\n")
125113
126114
# Apply fix while maintaining case rules
127-
content_lines[int(line_number) - 1] = corrected_line + "\n"
115+
content_lines[int(line_number) - 1] = relevant_sentence + "\n"
128116
with open(file_path, "w", encoding="utf-8") as file:
129117
file.writelines(content_lines)
130118
EOF

Comments (0)