Update spellcheck.yml

Lyd1aCla1r3 · web-flow · commit 9125424c0e29 · 2025-02-23T15:46:35.000-08:00
diff --git a/.github/workflows/spellcheck.yml b/.github/workflows/spellcheck.yml
@@ -19,7 +19,6 @@ jobs:
         run: |
           pip install codespell
           pip install fuzzywuzzy[speedup]
-          pip install nltk
 
       - name: Verify Spellcheck Ignore List Exists
         run: |
@@ -31,18 +30,15 @@ jobs:
         run: |
           set -e  # Exit on error
 
-          # Run codespell on the full repo, save raw report, and prevent auto-fixes
+          # Run codespell to detect misspellings but do not auto-correct
           codespell --ignore-words=.github/spellcheck-ignore.txt \
                     --skip=".git,*.lock,*.json,*.yaml,*.yml,*.css,*.html" \
                     --quiet-level=2 > spellcheck_report_raw.txt || true
 
           # Process corrections with Python
           python3 <<EOF
           import re
-          import nltk
           from fuzzywuzzy import process
-          from nltk.tokenize import word_tokenize
-          nltk.download('punkt')
 
           # Load spellcheck ignore list with case sensitivity
           ignore_list = {}
@@ -56,9 +52,7 @@ jobs:
               "identity provider": ["identiy provider", "identify provider"],
               "access token": ["access toekn", "acess token"],
               "user authentication": ["user authentification", "user authenthication"],
-              "API gateway": ["API getway", "API gatway"],
-              "default reporter": ["defaul reporter"],
-              "default identity provider": ["defaul identity provider"]
+              "API gateway": ["API getway", "API gatway"]
           }
 
           # Function to check if a word is inside a code block, backticks, URL, or file reference
@@ -76,14 +70,14 @@ jobs:
               # Must be at least 90% similar to be considered a match
               if score < 90:
                   return False
-              
+
               # Reject if original contains best_match as a substring (e.g., "certifcate" vs "CE")
               if best_match in original and len(original) > len(best_match):
                   return False
 
               # Enforce case-sensitive corrections for regular text, but NOT for file references/URLs/links
               if not is_code_or_url_or_file(line) and not is_markdown_link(line, original):
-                  return True if best_match in ignore_list else False
+                  return best_match in ignore_list
 
               # Allow case-insensitive corrections for code blocks, backticks, URLs, and markdown links
               return best_match.lower() in ignore_list
@@ -94,7 +88,7 @@ jobs:
                   for wrong_phrase in wrong_variants:
                       if wrong_phrase in line:
                           return line.replace(wrong_phrase, correct_phrase)
-              return line.replace(original, suggestion, 1)  # Replace only the first occurrence
+              return line.replace(original, suggestion)
 
           # Process spellcheck output and apply fixes
           with open("spellcheck_report_raw.txt", "r", encoding="utf-8") as infile, open("spellcheck_report.txt", "w", encoding="utf-8") as outfile:
@@ -109,25 +103,23 @@ jobs:
                           content_lines = file.readlines()
                           context_line = content_lines[int(line_number) - 1].strip()
 
-                      # Preserve case-sensitive ignored terms exactly
-                      if original.lower() in ignore_list and any(c.isupper() for c in original):
-                          corrected_word = ignore_list[original.lower()]
-
-                      # Tokenize line to avoid replacing partial words
-                      words = word_tokenize(context_line)
+                      # **Fix #1: Preserve case-sensitive ignored terms exactly**
+                      if original in ignore_list.values():
+                          corrected_word = original  # Use exact case from ignore list
 
-                      # Ensure weak matches use the English dictionary
-                      if not should_use_ignore_list(original, suggestion, context_line):
-                          corrected_word = suggestion  # Use dictionary match
+                      # **Fix #2: Use the ignore-list entry when it is a strong match for the word**
+                      elif should_use_ignore_list(original, suggestion, context_line):
+                          best_match, _ = process.extractOne(original, ignore_list.keys())
+                          if not is_code_or_url_or_file(context_line) and not is_markdown_link(context_line, original):
+                              corrected_word = ignore_list[best_match]
+                          else:
+                              corrected_word = best_match.lower()  # Keep it lowercase in URLs/links/files
 
-                      # Apply corrections **only** to the first exact word match, not substrings
-                      for i, word in enumerate(words):
-                          if word.lower() == original.lower():
-                              words[i] = corrected_word
-                              break
+                      # **Fix #3: Apply context-based correction**
+                      corrected_line = apply_context_based_correction(context_line, original, corrected_word)
 
-                      # Rebuild corrected line
-                      corrected_line = " ".join(words)
+                      # **Fix #4: Replace the first remaining whole-word occurrence, if any**
+                      corrected_line = re.sub(r'\b' + re.escape(original) + r'\b', corrected_word, corrected_line, count=1)
 
                       # Write final output
                       outfile.write(f"{file_path}:{line_number}: {original} ==> {corrected_word}\n")
@@ -156,11 +148,13 @@ jobs:
           BRANCH_NAME="spellcheck-fixes-$(date +%s)"
           git checkout -b $BRANCH_NAME
 
+          # Commit the changes if there are any
           if [ -n "$(git status --porcelain)" ]; then
             git add .
             git commit -m "Spellcheck: Automatically fixed detected misspellings"
             git push origin $BRANCH_NAME
 
+            # Create PR using GitHub CLI
             gh pr create \
               --base main \
               --head $BRANCH_NAME \