-
Notifications
You must be signed in to change notification settings - Fork 14
165 lines (133 loc) · 7.29 KB
/
spellcheck.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
name: Manual Spellcheck & Auto PR

# Runs only when manually triggered
on:
  workflow_dispatch:
# Token scopes requested for the jobs in this workflow.
permissions:
  # Needed to push changes to a new branch
  contents: write
  # Needed to create a PR
  pull-requests: write
jobs:
  # Single job: run codespell over the repository, apply the corrections it
  # proposes (adjusted by the project's ignore list), and open a pull request
  # containing the modified files.
  spellcheck:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4

      - name: Install Dependencies
        # The extra is quoted so the shell never treats "[speedup]" as a glob.
        run: |
          pip install codespell
          pip install "fuzzywuzzy[speedup]"

      - name: Verify Spellcheck Ignore List Exists
        run: |
          if [ ! -f .github/spellcheck-ignore.txt ]; then
            echo "Error: spellcheck-ignore.txt not found!" && exit 1
          fi

      - name: Run Spellcheck and Apply Fixes
        # The id lets the pull-request step read this step's "changed" output.
        id: apply_fixes
        run: |
          set -e  # Exit on error

          # Reports are written outside the workspace so they are never staged
          # or committed together with the spelling corrections.
          export RAW_REPORT="${RUNNER_TEMP}/spellcheck_report_raw.txt"
          export FINAL_REPORT="${RUNNER_TEMP}/spellcheck_report.txt"

          # Run codespell on the full repo in report-only mode (no auto-fix).
          # codespell exits non-zero when it finds misspellings, hence "|| true".
          codespell --ignore-words=.github/spellcheck-ignore.txt \
            --skip=".git,*.lock,*.json,*.yaml,*.yml,*.css,*.html" \
            --quiet-level=2 > "$RAW_REPORT" || true

          # Process corrections with Python. The heredoc delimiter is quoted so
          # the shell passes the script through verbatim; unquoted, the
          # backticks in the regex below would run as a command substitution.
          python3 <<'EOF'
          import os
          import re
          from fuzzywuzzy import process

          IGNORE_FILE = ".github/spellcheck-ignore.txt"
          RAW_REPORT = os.environ["RAW_REPORT"]
          FINAL_REPORT = os.environ["FINAL_REPORT"]

          # One codespell finding: "<path>:<line>: <word> ==> <suggestion(s)>"
          FINDING_RE = re.compile(r"(.*):(\d+): (\S+) ==> (.+)")

          # Ignore list keyed by lowercase form, mapped to the canonical spelling
          ignore_list = {}
          with open(IGNORE_FILE, "r", encoding="utf-8") as f:
              for entry in f:
                  word = entry.strip()
                  if word:  # a blank line would otherwise register "" as a term
                      ignore_list[word.lower()] = word
          canonical_terms = set(ignore_list.values())

          # Common word pairs and phrases to check for context-based correction
          common_phrases = {
              "identity provider": ["identiy provider", "identify provider"],
              "access token": ["access toekn", "acess token"],
              "user authentication": ["user authentification", "user authenthication"],
              "API gateway": ["API getway", "API gatway"]
          }


          def is_code_or_url_or_file(line):
              """Return a match if the line has inline code, a URL, or a path-like token."""
              return re.search(r'`.*?`|https?://\S+|www\.\S+|/[\w./-]+', line)


          def is_markdown_link(line, original):
              """Return a match if `original` appears in the target of a Markdown link."""
              return re.search(r'\[.*?\]\(.*' + re.escape(original) + r'.*\)', line)


          def closest_ignored_term(original):
              """Return the lowercase ignore-list key `original` nearly matches, else None."""
              if not ignore_list:
                  return None  # extractOne() has nothing to compare against
              result = process.extractOne(original, ignore_list.keys())
              if result is None:
                  return None
              best_match, score = result[0], result[1]
              # Must be at least 90% similar to be considered a match
              if score < 90:
                  return None
              # Reject if original merely contains the term (e.g. "certifcate" vs "CE")
              if best_match in original and len(original) > len(best_match):
                  return None
              return best_match


          def apply_context_based_correction(line, original, suggestion):
              """Fix known misspelled phrases, then the first whole-word `original`."""
              for correct_phrase, wrong_variants in common_phrases.items():
                  for wrong_phrase in wrong_variants:
                      line = line.replace(wrong_phrase, correct_phrase)
              # A callable replacement keeps backslashes in `suggestion` literal
              return re.sub(r'\b' + re.escape(original) + r'\b',
                            lambda _m: suggestion, line, count=1)


          file_cache = {}         # path -> list of lines (endings kept), None if unreadable
          modified_files = set()  # paths whose cached lines differ from disk

          # Process spellcheck output and apply fixes
          with open(RAW_REPORT, "r", encoding="utf-8") as infile, \
                  open(FINAL_REPORT, "w", encoding="utf-8") as outfile:
              for finding in infile:
                  match = FINDING_RE.match(finding.strip())
                  if not match:
                      continue
                  file_path, line_number, original, suggestion = match.groups()

                  # Several candidates ("a, b") need a human decision; skip them
                  if "," in suggestion:
                      continue
                  # Preserve case-sensitive ignored terms exactly as written
                  if original in canonical_terms:
                      continue

                  if file_path not in file_cache:
                      try:
                          # newline="" keeps each line's original ending intact
                          with open(file_path, "r", encoding="utf-8", newline="") as fh:
                              file_cache[file_path] = fh.readlines()
                      except (OSError, UnicodeDecodeError):
                          file_cache[file_path] = None
                  content_lines = file_cache[file_path]
                  if content_lines is None:
                      continue

                  index = int(line_number) - 1
                  if not 0 <= index < len(content_lines):
                      continue

                  # Split off the line ending so indentation and trailing
                  # whitespace survive the rewrite untouched
                  raw_line = content_lines[index]
                  text = raw_line.rstrip("\r\n")
                  ending = raw_line[len(text):]

                  # Prefer a near-matching ignore-list term over codespell's suggestion
                  corrected_word = suggestion
                  best_match = closest_ignored_term(original)
                  if best_match is not None:
                      if is_code_or_url_or_file(text) or is_markdown_link(text, original):
                          corrected_word = best_match  # keep lowercase in URLs/links/files
                      else:
                          corrected_word = ignore_list[best_match]  # canonical case in prose

                  corrected_line = apply_context_based_correction(text, original, corrected_word)
                  if corrected_line == text:
                      continue

                  content_lines[index] = corrected_line + ending
                  modified_files.add(file_path)
                  outfile.write(f"{file_path}:{line_number}: {original} ==> {corrected_word}\n")

          # Write every modified file back once
          for path in sorted(modified_files):
              with open(path, "w", encoding="utf-8", newline="") as fh:
                  fh.writelines(file_cache[path])
          EOF

          echo "Applied corrections:"
          cat "$FINAL_REPORT"

          # Tell the pull-request step whether any tracked file was modified
          if git diff --quiet; then
            echo "No spelling corrections found. Exiting."
            echo "changed=false" >> "$GITHUB_OUTPUT"
          else
            echo "Corrections applied. Preparing to create PR."
            echo "changed=true" >> "$GITHUB_OUTPUT"
          fi

      - name: Create Pull Request with Corrections
        # Skipped entirely when the previous step made no corrections
        if: steps.apply_fixes.outputs.changed == 'true'
        env:
          GITHUB_TOKEN: ${{ secrets.PAT_GITHUB_ACTIONS }}
        run: |
          git config --global user.name "github-actions[bot]"
          git config --global user.email "github-actions[bot]@users.noreply.github.com"
          BRANCH_NAME="spellcheck-fixes-$(date +%s)"
          git checkout -b "$BRANCH_NAME"

          # Stage modifications to tracked files only, never stray new files
          git add --update

          # Commit the changes if there are any
          if git diff --cached --quiet; then
            echo "No changes detected. Skipping PR creation."
          else
            git commit -m "Spellcheck: Automatically fixed detected misspellings"
            git push origin "$BRANCH_NAME"
            # Create PR using GitHub CLI
            gh pr create \
              --base main \
              --head "$BRANCH_NAME" \
              --title "Spellcheck Fixes" \
              --body "This PR contains automatically applied spelling corrections."
          fi