49
49
word = line.strip()
50
50
ignore_list[word.lower()] = word # Store lowercase -> correct-case
51
51
52
+ # Common phrases to prioritize in spellcheck corrections: each key is the correct phrase, each value lists known misspelled variants (consulted by apply_strict_context_correction before single-word fixes)
53
+ common_phrases = {
54
+ "identity provider": ["identiy provider", "identify provider"],
55
+ "access token": ["access toekn", "acess token"],
56
+ "user authentication": ["user authentification", "user authenthication"],
57
+ "API gateway": ["API getway", "API gatway"]
58
+ }
59
+
52
60
# Function to check if a word is inside a code block, backticks, URL, or file reference
53
61
def is_code_or_url_or_file(line):
    """Return True when *line* contains inline code, a URL, or a path-like token.

    The line matches if it contains any of:
      * a backtick-delimited span (Markdown inline code),
      * an ``http://`` / ``https://`` URL,
      * a ``www.`` host,
      * a ``/`` followed by word, dot, slash, or dash characters.

    Args:
        line: The text to inspect.

    Returns:
        bool: True if any of the patterns above occurs in ``line``.
    """
    # The inline-code alternative must keep its backtick delimiters: a bare
    # ``.*?`` matches the empty string, which makes every line match.
    return bool(re.search(r'`.*?`|https?://\S+|www\.\S+|/[\w./-]+', line))
55
63
56
64
# Function to check if a word is part of a Markdown link
57
65
def is_markdown_link(line, original):
@@ -60,12 +68,28 @@ jobs:
60
68
# Function to determine if an ignore list word should be used
61
69
def should_use_ignore_list(original, suggestion, line):
    """Decide whether the closest ignore-list entry should replace *original*.

    Args:
        original: The word flagged by the spellchecker.
        suggestion: Unused here; kept so existing callers keep working.
        line: Unused here; kept so existing callers keep working.

    Returns:
        bool: True only when the best fuzzy match among the ignore-list keys
        scores at least 90 and is not merely a shorter substring of
        ``original``.
    """
    # Nothing to match against: bail out before calling the fuzzy matcher.
    if not original or not ignore_list:
        return False

    match = process.extractOne(original, ignore_list.keys())
    # NOTE(review): extractOne presumably returns None when no candidate is
    # available; unpacking None directly would raise TypeError.
    if match is None:
        return False

    # Index instead of unpacking so a result tuple carrying extra fields
    # (e.g. an index/key element) does not break the call.
    best_match, score = match[0], match[1]

    # Must be at least 90% similar to be considered a match
    if score < 90:
        return False

    # Reject if original contains best_match as a substring
    # (e.g., "certifcate" vs "CE")
    if best_match in original and len(original) > len(best_match):
        return False

    return True
68
81
82
+ # Function to apply strict context-based correction rules
83
def apply_strict_context_correction(sentence, original, suggestion, phrases=None):
    """Correct *sentence*, preferring known phrase fixes over a single-word swap.

    Args:
        sentence: The sentence containing the flagged word.
        original: The misspelled word as it appears in ``sentence``.
        suggestion: The replacement text for ``original``; inserted literally.
        phrases: Optional mapping of correct phrase -> list of misspelled
            variants. Defaults to the module-level ``common_phrases``.

    Returns:
        str: If any misspelled phrase variant occurs in ``sentence``, the
        sentence with every occurrence of that variant replaced by the correct
        phrase (only the first matching variant is applied). Otherwise the
        sentence with the first whole-word occurrence of ``original`` replaced
        by ``suggestion``. Unchanged if ``original`` is empty or absent.
    """
    if phrases is None:
        phrases = common_phrases

    # Prioritize known common phrases first
    for correct_phrase, wrong_variants in phrases.items():
        for wrong_phrase in wrong_variants:
            if wrong_phrase in sentence:
                return sentence.replace(wrong_phrase, correct_phrase)

    # An empty word would produce the pattern r'\b\b', which matches at the
    # first word boundary and would insert the suggestion there.
    if not original:
        return sentence

    # Replace the misspelled word with the correct word only once. A callable
    # replacement keeps backslashes and group references in ``suggestion``
    # from being interpreted as a regex replacement template.
    pattern = r'\b' + re.escape(original) + r'\b'
    return re.sub(pattern, lambda _match: suggestion, sentence, count=1)
92
+
69
93
# Process spellcheck output and apply fixes
70
94
with open("spellcheck_report_raw.txt", "r", encoding="utf-8") as infile, open("spellcheck_report.txt", "w", encoding="utf-8") as outfile:
71
95
for line in infile:
@@ -79,31 +103,31 @@ jobs:
79
103
content_lines = file.readlines()
80
104
context_line = content_lines[int(line_number) - 1].strip()
81
105
82
- # Use sentence-splitter to analyze full sentence context
106
+ # ✅ Use sentence-splitter to analyze full sentence context
83
107
splitter = SentenceSplitter(language="en")
84
108
sentences = splitter.split(context_line)
85
109
relevant_sentence = next((s for s in sentences if original in s), context_line)
86
110
87
- # Enforce strict case-sensitive ignore list rules
111
+ # **Fix #1: Enforce strict case-sensitive ignore list rules**
88
112
if original.lower() in ignore_list:
89
113
if is_code_or_url_or_file(relevant_sentence) or is_markdown_link(relevant_sentence, original):
90
114
corrected_word = original.lower() # Keep lowercase in URLs, links, or file paths
91
115
else:
92
116
corrected_word = ignore_list[original.lower()] # Use exact case from ignore list
93
117
94
- # Reject weak matches to ignore words
118
+ # **Fix #2: Reject weak matches to ignore words**
95
119
elif should_use_ignore_list(original, suggestion, relevant_sentence):
96
120
best_match, _ = process.extractOne(original, ignore_list.keys())
97
121
corrected_word = ignore_list[best_match]
98
122
99
- # Prevent weak ignore word matches
123
+ # **Fix #3: Strictly prevent weak ignore word matches**
100
124
elif len(original) < 3 or len(original) < len(ignore_list.get(suggestion.lower(), "")) / 2:
101
125
corrected_word = suggestion # Use the English dictionary instead
102
126
103
- # Apply strict context-based correction
104
- relevant_sentence = re.sub(r'\b' + re.escape( original) + r'\b' , corrected_word, relevant_sentence, count=1 )
127
+ # **Fix #4: Apply strict context-based correction**
128
+ relevant_sentence = apply_strict_context_correction(relevant_sentence, original, corrected_word)
105
129
106
- # Prevent punctuation modifications
130
+ # **Fix #5: Strictly prevent punctuation modifications**
107
131
relevant_sentence = relevant_sentence.replace("..", ".").replace(",.", ".").replace(" ,", ",")
108
132
109
133
# Write final output
0 commit comments