|
19 | 19 | run: |
|
20 | 20 | pip install codespell
|
21 | 21 | pip install fuzzywuzzy[speedup]
|
| 22 | + pip install nltk |
22 | 23 |
|
23 | 24 | - name: Verify Spellcheck Ignore List Exists
|
24 | 25 | run: |
|
|
38 | 39 | # Process corrections with Python
|
39 | 40 | python3 <<EOF
|
40 | 41 | import re
|
| 42 | + import nltk |
41 | 43 | from fuzzywuzzy import process
|
| 44 | + from nltk.tokenize import word_tokenize |
| 45 | + nltk.download('punkt') |
42 | 46 |
|
43 | 47 | # Load spellcheck ignore list with case sensitivity
|
44 | 48 | ignore_list = {}
|
@@ -109,16 +113,21 @@ jobs:
|
109 | 113 | if original.lower() in ignore_list and any(c.isupper() for c in original):
|
110 | 114 | corrected_word = ignore_list[original.lower()]
|
111 | 115 |
|
112 |
| - # Use English dictionary for weak matches |
113 |
| - elif should_use_ignore_list(original, suggestion, context_line): |
114 |
| - best_match, _ = process.extractOne(original, ignore_list.keys()) |
115 |
| - if not is_code_or_url_or_file(context_line) and not is_markdown_link(context_line, original): |
116 |
| - corrected_word = ignore_list[best_match] |
117 |
| - else: |
118 |
| - corrected_word = best_match.lower() # Keep it lowercase in URLs/links/files |
| 116 | + # Tokenize line to avoid replacing partial words |
| 117 | + words = word_tokenize(context_line) |
119 | 118 |
|
120 |
| - # Apply context-based correction |
121 |
| - corrected_line = apply_context_based_correction(context_line, original, corrected_word) |
| 119 | + # Ensure weak matches use the English dictionary |
| 120 | + if not should_use_ignore_list(original, suggestion, context_line): |
| 121 | + corrected_word = suggestion # Use dictionary match |
| 122 | +
|
| 123 | + # Apply corrections **only** to the first exact word match, not substrings |
| 124 | + for i, word in enumerate(words): |
| 125 | + if word.lower() == original.lower(): |
| 126 | + words[i] = corrected_word |
| 127 | + break |
| 128 | +
|
| 129 | + # Rebuild corrected line |
| 130 | + corrected_line = " ".join(words) |
122 | 131 |
|
123 | 132 | # Write final output
|
124 | 133 | outfile.write(f"{file_path}:{line_number}: {original} ==> {corrected_word}\n")
|
@@ -147,13 +156,11 @@ jobs:
|
147 | 156 | BRANCH_NAME="spellcheck-fixes-$(date +%s)"
|
148 | 157 | git checkout -b $BRANCH_NAME
|
149 | 158 |
|
150 |
| - # Commit the changes if there are any |
151 | 159 | if [ -n "$(git status --porcelain)" ]; then
|
152 | 160 | git add .
|
153 | 161 | git commit -m "Spellcheck: Automatically fixed detected misspellings"
|
154 | 162 | git push origin $BRANCH_NAME
|
155 | 163 |
|
156 |
| - # Create PR using GitHub CLI |
157 | 164 | gh pr create \
|
158 | 165 | --base main \
|
159 | 166 | --head $BRANCH_NAME \
|
|
0 commit comments