@@ -57,29 +57,28 @@ jobs:
57
57
58
58
# Function to check if a word is inside a code block, backticks, URL, or file reference
59
59
def is_code_or_url_or_file(line: str) -> bool:
    """Return True if *line* contains inline code, a URL, or a path-like token.

    The check is line-wide, not word-specific: it reports whether ANY of the
    following appears anywhere in ``line``:

    * a backtick-delimited span (`` `...` ``),
    * an ``http://`` / ``https://`` URL or a ``www.`` host,
    * a ``/`` followed by word characters, dots, slashes or dashes
      (treated as a file reference).

    Args:
        line: The text line to inspect.

    Returns:
        A plain ``bool`` (never a ``re.Match``), so callers can compare or
        serialize the result safely.
    """
    # NOTE: the path alternative also matches things like "and/or"; that is
    # the existing behavior and is kept unchanged here.
    pattern = r'`.*?`|https?://\S+|www\.\S+|/[\w./-]+'
    return re.search(pattern, line) is not None
61
61
62
62
# Function to check if a word is part of a Markdown link
63
63
def is_markdown_link(line: str, original: str) -> bool:
    """Return True if *original* occurs inside the target of a Markdown link.

    A match requires a ``[text](target)`` construct in ``line`` where
    ``original`` appears somewhere between the opening ``(`` and a closing
    ``)``. Occurrences of ``original`` outside the parenthesised target do
    not count.

    Args:
        line: The text line to inspect.
        original: The literal word to look for; it is regex-escaped, so
            characters such as ``+`` or ``.`` are matched verbatim.

    Returns:
        A plain ``bool`` (never a ``re.Match``).
    """
    pattern = r'\[.*?\]\(.*' + re.escape(original) + r'.*\)'
    return re.search(pattern, line) is not None
65
65
66
66
# Function to determine if an ignore list word should be used
67
67
def should_use_ignore_list(original, suggestion, line):
    """Decide whether the ignore list should supply the correction for *original*.

    Args:
        original: The word flagged by the spellchecker.
        suggestion: The spellchecker's proposed replacement. Unused here;
            kept so existing callers do not break.
        line: The full source line containing ``original``; used to detect
            code spans, URLs, file references and Markdown links.

    Returns:
        True if the closest ignore-list key is a strong enough match to be
        used as the correction, otherwise False.
    """
    # Use the fuzzy-matching library's default scorer, which reports
    # similarity on a 0-100 scale. A scorer that merely counts position-wise
    # equal characters tops out at the word length, so it could never reach
    # the 90 threshold below and every word would be rejected. Using the
    # default also keeps this lookup consistent with callers that re-run
    # process.extractOne(original, ignore_list.keys()) to fetch the match.
    result = process.extractOne(original, ignore_list.keys())
    if result is None:
        # No candidate at all (e.g. an empty ignore list).
        return False
    # Index instead of unpacking so a longer result tuple is tolerated.
    best_match, score = result[0], result[1]

    # Must be at least 90% similar to be considered a match
    if score < 90:
        return False

    # Reject if original merely contains best_match as a proper substring
    # (a short ignore-list term embedded in a longer, unrelated word).
    if best_match in original and len(original) > len(best_match):
        return False

    # Regular text: require the exact-case key to be present in the ignore list.
    if not is_code_or_url_or_file(line) and not is_markdown_link(line, original):
        return best_match in ignore_list

    # Code spans, URLs, file references and Markdown links: look the key up
    # in lowercase instead.
    return best_match.lower() in ignore_list
84
83
85
84
# Function to apply context-based correction
88
87
for wrong_phrase in wrong_variants:
89
88
if wrong_phrase in line:
90
89
return line.replace(wrong_phrase, correct_phrase)
91
- return line.replace( original, suggestion)
90
+ return re.sub(r'\b' + re.escape( original) + r'\b' , suggestion, line, count=1 )
92
91
93
92
# Process spellcheck output and apply fixes
94
93
with open("spellcheck_report_raw.txt", "r", encoding="utf-8") as infile, open("spellcheck_report.txt", "w", encoding="utf-8") as outfile:
@@ -103,24 +102,24 @@ jobs:
103
102
content_lines = file.readlines()
104
103
context_line = content_lines[int(line_number) - 1].strip()
105
104
106
- # ** Fix #1: Preserve case-sensitive ignored terms exactly**
107
- if original in ignore_list.values() :
108
- corrected_word = original # Use exact case from ignore list
105
+ # Fix #1: Ensure case-sensitive corrections match exactly
106
+ if original.lower() in ignore_list:
107
+ corrected_word = ignore_list[ original.lower()]
109
108
110
- # ** Fix #2: Use English dictionary for weak matches**
109
+ # Fix #2: Use English dictionary for weak matches
111
110
elif should_use_ignore_list(original, suggestion, context_line):
112
111
best_match, _ = process.extractOne(original, ignore_list.keys())
113
- if not is_code_or_url_or_file(context_line) and not is_markdown_link(context_line, original):
114
- corrected_word = ignore_list[best_match]
115
- else:
116
- corrected_word = best_match.lower() # Keep it lowercase in URLs/links/files
112
+ corrected_word = ignore_list[best_match] if not is_code_or_url_or_file(context_line) else best_match.lower()
117
113
118
- # ** Fix #3: Apply context-based correction**
114
+ # Fix #3: Apply context-based correction
119
115
corrected_line = apply_context_based_correction(context_line, original, corrected_word)
120
116
121
- # ** Fix #4: Replace only the first occurrence of the word**
117
+ # Fix #4: Replace only the first occurrence of the word
122
118
corrected_line = re.sub(r'\b' + re.escape(original) + r'\b', corrected_word, corrected_line, count=1)
123
119
120
+ # Debugging Output
121
+ print(f"🔍 Correction: {original} -> {corrected_word} in {file_path}:{line_number}")
122
+
124
123
# Write final output
125
124
outfile.write(f"{file_path}:{line_number}: {original} ==> {corrected_word}\n")
126
125
0 commit comments