Update spellcheck.yml

Lyd1aCla1r3 · web-flow · commit a06fef677b0c · 2025-02-23T19:34:43.000-08:00
diff --git a/.github/workflows/spellcheck.yml b/.github/workflows/spellcheck.yml
@@ -49,9 +49,17 @@ jobs:
                   word = line.strip()
                   ignore_list[word.lower()] = word  # Store lowercase -> correct-case
 
+          # Correct phrase -> known misspellings; matched as exact, case-sensitive substrings
+          common_phrases = {
+              "identity provider": ["identiy provider", "identify provider"],
+              "access token": ["access toekn", "acess token"],
+              "user authentication": ["user authentification", "user authenthication"],
+              "API gateway": ["API getway", "API gatway"]
+          }
+
           # Function to check if a word is inside a code block, backticks, URL, or file reference
           def is_code_or_url_or_file(line):
-              return bool(re.search(r'`.*?`|https?://\S+|www\.\S+|/[\w./-]+', line))
+              return bool(re.search(r'.*?|https?://\S+|www\.\S+|/[\w./-]+', line))
 
           # Function to check if a word is part of a Markdown link
           def is_markdown_link(line, original):
@@ -60,12 +68,28 @@ jobs:
           # Function to determine if an ignore list word should be used
           def should_use_ignore_list(original, suggestion, line):
               best_match, score = process.extractOne(original, ignore_list.keys())
+              # NOTE(review): only original is scored; suggestion and line are unused here - confirm intended
+              # Require a score of at least 90; anything lower is treated as no match
               if score < 90:
-                  return False  # Reject weak matches
+                  return False
+              # ignore_list keys are stored lowercased, so the containment test below is case-sensitive
+              # Reject when best_match is only a fragment of a longer original (e.g., "certifcate" vs "ce")
               if best_match in original and len(original) > len(best_match):
-                  return False  # Prevent incorrect substring matches
+                  return False
+
               return True
 
+          # Return sentence with known phrase typos fixed, then the flagged word fixed at most once
+          def apply_strict_context_correction(sentence, original, suggestion):
+              # Phrase fixes run first so a typo inside a known phrase gets the phrase-level correction
+              for correct_phrase, wrong_variants in common_phrases.items():
+                  for wrong_phrase in wrong_variants:
+                      # No early return: the flagged word may sit outside the matched phrase
+                      sentence = sentence.replace(wrong_phrase, correct_phrase)
+
+              # Callable replacement keeps any backslash in suggestion literal; count=1 fixes one occurrence
+              return re.sub(r'\b' + re.escape(original) + r'\b', lambda _m: suggestion, sentence, count=1)
+
           # Process spellcheck output and apply fixes
           with open("spellcheck_report_raw.txt", "r", encoding="utf-8") as infile, open("spellcheck_report.txt", "w", encoding="utf-8") as outfile:
               for line in infile:
@@ -79,31 +103,31 @@ jobs:
                           content_lines = file.readlines()
                           context_line = content_lines[int(line_number) - 1].strip()
 
-                      # Use sentence-splitter to analyze full sentence context
+                      # ✅ Use sentence-splitter to analyze full sentence context
                       splitter = SentenceSplitter(language="en")
                       sentences = splitter.split(context_line)
                       relevant_sentence = next((s for s in sentences if original in s), context_line)
 
-                      # Enforce strict case-sensitive ignore list rules
+                      # **Fix #1: Enforce strict case-sensitive ignore list rules**
                       if original.lower() in ignore_list:
                           if is_code_or_url_or_file(relevant_sentence) or is_markdown_link(relevant_sentence, original):
                               corrected_word = original.lower()  # Keep lowercase in URLs, links, or file paths
                           else:
                               corrected_word = ignore_list[original.lower()]  # Use exact case from ignore list
 
-                      # Reject weak matches to ignore words
+                      # **Fix #2: Reject weak matches to ignore words**
                       elif should_use_ignore_list(original, suggestion, relevant_sentence):
                           best_match, _ = process.extractOne(original, ignore_list.keys())
                           corrected_word = ignore_list[best_match]
 
-                      # Prevent weak ignore word matches
+                      # **Fix #3: Strictly prevent weak ignore word matches**
                       elif len(original) < 3 or len(original) < len(ignore_list.get(suggestion.lower(), "")) / 2:
                           corrected_word = suggestion  # Use the English dictionary instead
 
-                      # Apply strict context-based correction
-                      relevant_sentence = re.sub(r'\b' + re.escape(original) + r'\b', corrected_word, relevant_sentence, count=1)
+                      # **Fix #4: Apply strict context-based correction**
+                      relevant_sentence = apply_strict_context_correction(relevant_sentence, original, corrected_word)
 
-                      # Prevent punctuation modifications
+                      # **Fix #5: Strictly prevent punctuation modifications**
                       relevant_sentence = relevant_sentence.replace("..", ".").replace(",.", ".").replace(" ,", ",")
 
                       # Write final output