Skip to content

Commit a06fef6

Browse files
authored
Update spellcheck.yml
1 parent d5e6fae commit a06fef6

File tree

1 file changed

+34
-10
lines changed

1 file changed

+34
-10
lines changed

.github/workflows/spellcheck.yml

+34-10
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,17 @@ jobs:
4949
word = line.strip()
5050
ignore_list[word.lower()] = word # Store lowercase -> correct-case
5151
52+
# Known multi-word terms that take priority during spellcheck correction.
# Each key is the correctly spelled phrase; its value lists the misspelled
# variants that should be rewritten to that phrase.
common_phrases = {
    "identity provider": [
        "identiy provider",
        "identify provider",
    ],
    "access token": [
        "access toekn",
        "acess token",
    ],
    "user authentication": [
        "user authentification",
        "user authenthication",
    ],
    "API gateway": [
        "API getway",
        "API gatway",
    ],
}
59+
5260
# Function to check if a word is inside a code block, backticks, URL, or file reference
5361
def is_code_or_url_or_file(line):
    """Return True if ``line`` contains inline code, a URL, or a path-like token.

    The line is searched for any of:
      * a backtick-delimited span (`` `...` ``),
      * an ``http://`` or ``https://`` URL,
      * a ``www.`` host,
      * a ``/`` followed by word characters, dots, slashes, or dashes.

    Args:
        line: The text to inspect.

    Returns:
        bool: True when at least one of the patterns above occurs in ``line``.
    """
    # The backticks around ``.*?`` are essential: without them the first
    # alternative matches the empty string at position 0, so the search would
    # succeed for every input and this function would always return True.
    return bool(re.search(r'`.*?`|https?://\S+|www\.\S+|/[\w./-]+', line))
5563
5664
# Function to check if a word is part of a Markdown link
5765
def is_markdown_link(line, original):
@@ -60,12 +68,28 @@ jobs:
6068
# Function to determine if an ignore list word should be used
6169
def should_use_ignore_list(original, suggestion, line):
    """Decide whether the closest ignore-list entry should replace ``original``.

    Args:
        original: The word flagged by the spellchecker.
        suggestion: The spellchecker's suggested replacement (not used here;
            kept so existing callers keep working).
        line: The sentence containing the word (not used here; kept so
            existing callers keep working).

    Returns:
        bool: True only when the best fuzzy match among the ignore-list keys
        scores at least 90 and is not merely a fragment of a longer word.
    """
    match = process.extractOne(original, ignore_list.keys())

    # No candidate at all (for example an empty ignore list): nothing to use.
    if match is None:
        return False

    # Index instead of unpacking so a result tuple carrying extra trailing
    # fields is still accepted.
    best_match, score = match[0], match[1]

    # Must be at least 90% similar to be considered a match
    if score < 90:
        return False

    # Reject if original contains best_match as a substring
    # (e.g., "certifcate" vs "CE"). The ignore-list keys are stored
    # lowercase, so compare against the lowercased word; otherwise a
    # capitalised word such as "Certifcate" would slip past this guard.
    lowered = original.lower()
    if best_match in lowered and len(lowered) > len(best_match):
        return False

    return True
6881
82+
# Function to apply strict context-based correction rules
83+
def apply_strict_context_correction(sentence, original, suggestion):
    """Return ``sentence`` with known phrases and the flagged word corrected.

    Known misspelled phrases from ``common_phrases`` are rewritten first, then
    the first whole-word occurrence of ``original`` is replaced by
    ``suggestion``.

    Args:
        sentence: The sentence that contains the misspelling.
        original: The misspelled word reported by the spellchecker.
        suggestion: The replacement text for ``original``.

    Returns:
        str: The corrected sentence.
    """
    # Prioritize known common phrases first. Do not return early on a hit:
    # the reported word may be unrelated to the phrase and must still be
    # corrected below.
    for correct_phrase, wrong_variants in common_phrases.items():
        for wrong_phrase in wrong_variants:
            sentence = sentence.replace(wrong_phrase, correct_phrase)

    # An empty word would turn the pattern into r'\b\b', which matches at a
    # word boundary and would insert the suggestion there.
    if not original:
        return sentence

    # Replace the misspelled word with the correct word only once. A callable
    # replacement inserts ``suggestion`` literally, so backslashes in it are
    # not interpreted as escapes or group references.
    return re.sub(
        r'\b' + re.escape(original) + r'\b',
        lambda _match: suggestion,
        sentence,
        count=1,
    )
92+
6993
# Process spellcheck output and apply fixes
7094
with open("spellcheck_report_raw.txt", "r", encoding="utf-8") as infile, open("spellcheck_report.txt", "w", encoding="utf-8") as outfile:
7195
for line in infile:
@@ -79,31 +103,31 @@ jobs:
79103
content_lines = file.readlines()
80104
context_line = content_lines[int(line_number) - 1].strip()
81105
82-
# Use sentence-splitter to analyze full sentence context
106+
# Use sentence-splitter to analyze full sentence context
83107
splitter = SentenceSplitter(language="en")
84108
sentences = splitter.split(context_line)
85109
relevant_sentence = next((s for s in sentences if original in s), context_line)
86110
87-
# Enforce strict case-sensitive ignore list rules
111+
# **Fix #1: Enforce strict case-sensitive ignore list rules**
88112
if original.lower() in ignore_list:
89113
if is_code_or_url_or_file(relevant_sentence) or is_markdown_link(relevant_sentence, original):
90114
corrected_word = original.lower() # Keep lowercase in URLs, links, or file paths
91115
else:
92116
corrected_word = ignore_list[original.lower()] # Use exact case from ignore list
93117
94-
# Reject weak matches to ignore words
118+
# **Fix #2: Reject weak matches to ignore words**
95119
elif should_use_ignore_list(original, suggestion, relevant_sentence):
96120
best_match, _ = process.extractOne(original, ignore_list.keys())
97121
corrected_word = ignore_list[best_match]
98122
99-
# Prevent weak ignore word matches
123+
# **Fix #3: Strictly prevent weak ignore word matches**
100124
elif len(original) < 3 or len(original) < len(ignore_list.get(suggestion.lower(), "")) / 2:
101125
corrected_word = suggestion # Use the English dictionary instead
102126
103-
# Apply strict context-based correction
104-
relevant_sentence = re.sub(r'\b' + re.escape(original) + r'\b', corrected_word, relevant_sentence, count=1)
127+
# **Fix #4: Apply strict context-based correction**
128+
relevant_sentence = apply_strict_context_correction(relevant_sentence, original, corrected_word)
105129
106-
# Prevent punctuation modifications
130+
# **Fix #5: Clean up punctuation artifacts ("..", ",.", " ,") left behind by replacements**
107131
relevant_sentence = relevant_sentence.replace("..", ".").replace(",.", ".").replace(" ,", ",")
108132
109133
# Write final output

0 commit comments

Comments
 (0)