```diff
         run: |
           pip install codespell
           pip install fuzzywuzzy[speedup]
+          pip install nltk
+          python3 -c "import nltk; nltk.download('punkt')"
 
       - name: Verify Spellcheck Ignore List Exists
         run: |
```
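The two added lines introduce NLTK and pre-download the `punkt` model during setup, so the sentence tokenizer used later works without a network call at correction time. A minimal sketch of what the tokenizer provides (the example text is hypothetical):

```python
# Assumes punkt was downloaded as in the workflow step above.
from nltk.tokenize import sent_tokenize

line = "Configure the identity provider. Then request an access token."
print(sent_tokenize(line))
# ['Configure the identity provider.', 'Then request an access token.']
```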
```diff
@@ -30,14 +32,16 @@ jobs:
         run: |
           set -e  # Exit on error
 
-          # Run codespell to detect misspellings but do not auto-correct
+          # Run codespell and save output
           codespell --ignore-words=.github/spellcheck-ignore.txt \
             --skip=".git,*.lock,*.json,*.yaml,*.yml,*.css,*.html" \
             --quiet-level=2 > spellcheck_report_raw.txt || true
 
           # Process corrections with Python
           python3 <<EOF
           import re
+          import nltk
+          from nltk.tokenize import sent_tokenize
           from fuzzywuzzy import process
 
           # Load spellcheck ignore list with case sensitivity
```
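The redirected report feeds the Python step. Codespell's default output is one finding per line in the form `path:line: word ==> suggestion`; the regex that later yields `match.groups()` sits outside the visible hunks, so the pattern below is an assumed equivalent, not the workflow's actual code:

```python
import re

# Hypothetical finding in codespell's default report format.
report_line = "docs/auth.md:12: identiy ==> identity"

# Assumed parsing pattern producing the four fields unpacked later
# as file_path, line_number, original, suggestion.
match = re.match(r"^(.+?):(\d+): (\S+) ==> (\S+)", report_line)
print(match.groups())  # ('docs/auth.md', '12', 'identiy', 'identity')
```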
```diff
@@ -47,47 +51,27 @@ jobs:
               word = line.strip()
               ignore_list[word.lower()] = word  # Store lowercase -> correct-case
 
-          # Common word pairs and phrases to check for context-based correction
-          common_phrases = {
-              "identity provider": ["identiy provider", "identify provider"],
-              "access token": ["access toekn", "acess token"],
-              "user authentication": ["user authentification", "user authenthication"],
-              "API gateway": ["API getway", "API gatway"]
-          }
-
           # Function to check if a word is inside a code block, backticks, URL, or file reference
           def is_code_or_url_or_file(line):
-              return bool(re.search(r'`.*?`|https?://\S+|www\.\S+|/[\w./-]+', line))
+              return re.search(r'`.*?`|https?://\S+|www\.\S+|/[\w./-]+', line)
 
           # Function to check if a word is part of a Markdown link
           def is_markdown_link(line, original):
-              return bool(re.search(r'\[.*?\]\(.*' + re.escape(original) + r'.*\)', line))
+              return re.search(r'\[.*?\]\(.*' + re.escape(original) + r'.*\)', line)
 
           # Function to determine if an ignore list word should be used
           def should_use_ignore_list(original, suggestion, line):
-              best_match, score = process.extractOne(original, ignore_list.keys(), scorer=lambda x, y: sum(c1 == c2 for c1, c2 in zip(x, y)))
-
+              best_match, score = process.extractOne(original, ignore_list.keys())
+
               # Must be at least 90% similar to be considered a match
               if score < 90:
                   return False
 
-              # Reject if original contains best_match as a substring
+              # Reject if original contains best_match as a substring (e.g., "certifcate" vs "CE")
               if best_match in original and len(original) > len(best_match):
                   return False
 
-              # Enforce case-sensitive corrections for regular text, but lowercase for files/URLs
-              if not is_code_or_url_or_file(line) and not is_markdown_link(line, original):
-                  return best_match in ignore_list
-
-              return best_match.lower() in ignore_list
-
-          # Function to apply context-based correction
-          def apply_context_based_correction(line, original, suggestion):
-              for correct_phrase, wrong_variants in common_phrases.items():
-                  for wrong_phrase in wrong_variants:
-                      if wrong_phrase in line:
-                          return line.replace(wrong_phrase, correct_phrase)
-              return re.sub(r'\b' + re.escape(original) + r'\b', suggestion, line, count=1)
+              return True
 
           # Process spellcheck output and apply fixes
           with open("spellcheck_report_raw.txt", "r", encoding="utf-8") as infile, open("spellcheck_report.txt", "w", encoding="utf-8") as outfile:
```
```diff
@@ -97,34 +81,38 @@ jobs:
                   file_path, line_number, original, suggestion = match.groups()
                   corrected_word = suggestion
 
-                  # Read the line content from the file
+                  # Read the full line from the file
                   with open(file_path, "r", encoding="utf-8") as file:
                       content_lines = file.readlines()
                   context_line = content_lines[int(line_number) - 1].strip()
 
-                  # Fix #1: Ensure case-sensitive corrections match exactly
+                  # Tokenize the sentence for context-based correction
+                  sentences = sent_tokenize(context_line)
+                  relevant_sentence = next((s for s in sentences if original in s), context_line)
+
+                  # **Fix #1: Case-sensitive correction for ignore list terms**
                   if original.lower() in ignore_list:
-                      corrected_word = ignore_list[original.lower()]
+                      if is_code_or_url_or_file(relevant_sentence) or is_markdown_link(relevant_sentence, original):
+                          corrected_word = original.lower()  # Keep lowercase in URLs, links, or file paths
+                      else:
+                          corrected_word = ignore_list[original.lower()]  # Use exact case from ignore list
 
-                  # Fix #2: Use English dictionary for weak matches
-                  elif should_use_ignore_list(original, suggestion, context_line):
+                  # **Fix #2: Reject weak matches and default to the English dictionary**
+                  elif should_use_ignore_list(original, suggestion, relevant_sentence):
                       best_match, _ = process.extractOne(original, ignore_list.keys())
-                      corrected_word = ignore_list[best_match] if not is_code_or_url_or_file(context_line) else best_match.lower()
-
-                  # Fix #3: Apply context-based correction
-                  corrected_line = apply_context_based_correction(context_line, original, corrected_word)
+                      corrected_word = ignore_list[best_match]
 
-                  # Fix #4: Replace only the first occurrence of the word
-                  corrected_line = re.sub(r'\b' + re.escape(original) + r'\b', corrected_word, corrected_line, count=1)
+                  # **Fix #3: Apply corrections based on full sentence**
+                  relevant_sentence = re.sub(r'\b' + re.escape(original) + r'\b', corrected_word, relevant_sentence, count=1)
 
-                  # Debugging Output
-                  print(f"🔍 Correction: {original} -> {corrected_word} in {file_path}:{line_number}")
+                  # **Fix #4: Ensure no extra punctuation is introduced**
+                  relevant_sentence = relevant_sentence.replace("..", ".").replace(",.", ". ")
 
                   # Write final output
                   outfile.write(f"{file_path}:{line_number}: {original} ==> {corrected_word}\n")
 
                   # Apply fix while maintaining case rules
-                  content_lines[int(line_number) - 1] = corrected_line + "\n"
+                  content_lines[int(line_number) - 1] = relevant_sentence + "\n"
                   with open(file_path, "w", encoding="utf-8") as file:
                       file.writelines(content_lines)
           EOF
```
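Taken together, the new Fix #1 branch keeps ignore-list casing in prose but leaves URLs, links, and file paths lowercase. A minimal standalone sketch of that rule, reusing the helper from the diff with a hypothetical ignore entry:

```python
import re

# Hypothetical ignore entry: lowercase -> correct case.
ignore_list = {"github": "GitHub"}

def is_code_or_url_or_file(line):
    return re.search(r'`.*?`|https?://\S+|www\.\S+|/[\w./-]+', line)

for line in ["Deploy from github today.",
             "See https://github.com/octo/repo for details."]:
    original = "github"
    if is_code_or_url_or_file(line):
        corrected = original.lower()               # leave casing alone in URLs/paths
    else:
        corrected = ignore_list[original.lower()]  # exact case from the ignore list
    print(re.sub(r'\b' + re.escape(original) + r'\b', corrected, line, count=1))
# Deploy from GitHub today.
# See https://github.com/octo/repo for details.
```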