Skip to content

Manual Spellcheck Review & Approval #105

Manual Spellcheck Review & Approval

Manual Spellcheck Review & Approval #105

Workflow file for this run

name: Manual Spellcheck & Auto PR

# Runs only when manually triggered
on: workflow_dispatch

# Token scopes granted to the jobs in this workflow.
permissions:
  contents: write  # Needed to push changes to a new branch
  pull-requests: write  # Needed to create a PR
jobs:
  spellcheck:
    runs-on: ubuntu-latest
    steps:
      # Fetch the repository into the runner workspace so later steps can
      # scan and modify its files.
      - name: Checkout Repository
        uses: actions/checkout@v4
# Install the spellchecker CLI and the fuzzy-matching library imported by the
# inline correction script.
- name: Install Dependencies
  run: |
    # "python3 -m pip" installs into the same interpreter that the later
    # "python3" heredoc runs under. The extras specifier is quoted so the
    # shell can never treat "[speedup]" as a glob character class.
    python3 -m pip install codespell
    python3 -m pip install 'fuzzywuzzy[speedup]'
# Stop the job early when the ignore list read by the spellcheck step is
# missing from the checkout.
- name: Verify Spellcheck Ignore List Exists
  run: |
    if [ -f .github/spellcheck-ignore.txt ]; then
      exit 0
    fi
    echo "Error: spellcheck-ignore.txt not found!"
    exit 1
# Runs codespell over the repository, then hands its report to an inline
# Python script that rewrites the affected lines.
- name: Run Spellcheck and Apply Fixes
  run: |
    set -e # Exit on error
    # Run codespell on the full repo and save the raw report. No write flag
    # is passed, so codespell itself does not modify files. "|| true" keeps
    # the step going when codespell returns a non-zero status.
    codespell --ignore-words=.github/spellcheck-ignore.txt \
      --skip=".git,*.lock,*.json,*.yaml,*.yml,*.css,*.html" \
      --quiet-level=2 > spellcheck_report_raw.txt || true
    # Process corrections with Python. The heredoc delimiter is quoted so the
    # shell passes the script through verbatim; with an unquoted delimiter
    # bash performs expansions inside the body and would execute the
    # backtick-delimited part of the script's regex as a command substitution.
    python3 <<'EOF'
import re
from fuzzywuzzy import process
# Build the ignore list as a mapping from each entry's lowercase form to the
# exact spelling found in the file, one entry per line.
with open(".github/spellcheck-ignore.txt", "r", encoding="utf-8") as f:
    ignore_list = {entry.lower(): entry for entry in map(str.strip, f)}
# Known multi-word phrases, keyed by the correct wording. Each value lists the
# misspelled variants that are replaced as a whole when found in a line.
common_phrases = {
    "identity provider": [
        "identiy provider",
        "identify provider",
    ],
    "access token": [
        "access toekn",
        "acess token",
    ],
    "user authentication": [
        "user authentification",
        "user authenthication",
    ],
    "API gateway": [
        "API getway",
        "API gatway",
    ],
}
# Detect lines that contain inline code, a URL, or a path-like reference.
def is_code_or_url_or_file(line):
    """Search a line for inline code, a URL, or a slash-prefixed path.

    Args:
        line: the text to inspect.

    Returns:
        The first regex match for a backtick-quoted span, an http(s):// or
        www. URL, or a "/" followed by path-like characters; None when the
        line contains none of these.
    """
    code_url_or_path = re.compile(r'`.*?`|https?://\S+|www\.\S+|/[\w./-]+')
    return code_url_or_path.search(line)
# Detect a Markdown link whose target part contains the given word.
def is_markdown_link(line, original):
    """Search a line for a Markdown link with the word inside its parentheses.

    Args:
        line: the text to inspect.
        original: the word to look for; it is regex-escaped before matching.

    Returns:
        The regex match for a "[text](...word...)" construct, or None when
        the line has no such link.
    """
    link_with_word = r'\[.*?\]\(.*' + re.escape(original) + r'.*\)'
    return re.search(link_with_word, line)
# Decide whether a flagged word should be replaced by an ignore-list entry.
def should_use_ignore_list(original, suggestion, line):
    """Report whether the closest ignore-list entry is a credible replacement.

    Args:
        original: the word reported as misspelled.
        suggestion: the spellchecker's proposed replacement. Not consulted
            here; kept so existing callers keep working.
        line: the text line containing the word. Not consulted here; kept
            for the same reason.

    Returns:
        True when the best fuzzy match among the ignore-list keys scores at
        least 90 and is not merely a shorter substring of the word. False
        otherwise, including when the ignore list is empty.
    """
    # With no candidates there is nothing to match against, and unpacking the
    # matcher's result would fail, so bail out first.
    found = process.extractOne(original, ignore_list.keys()) if ignore_list else None
    if found is None:
        return False
    best_match, score = found
    # Must be at least 90% similar to be considered a match
    if score < 90:
        return False
    # Reject if the word merely contains the match (e.g., "certifcate" vs "CE").
    # The keys are stored lowercase, so the word is lowercased before comparing.
    if best_match in original.lower() and len(original) > len(best_match):
        return False
    # best_match is one of ignore_list's own lowercase keys, so the membership
    # tests that used to follow here were true on every path. Which casing to
    # apply for code, URLs and links is decided by the caller.
    return True
# Apply a known phrase fix if one matches, otherwise correct the single word.
def apply_context_based_correction(line, original, suggestion):
    """Return the line with a phrase-level or whole-word correction applied.

    Args:
        line: the text to correct.
        original: the misspelled word.
        suggestion: the text that should replace the misspelled word.

    Returns:
        The line with the first matching misspelled phrase from
        common_phrases replaced by its correct form. When no phrase matches,
        every standalone occurrence of the word is replaced by the suggestion.
    """
    for correct_phrase, wrong_variants in common_phrases.items():
        for wrong_phrase in wrong_variants:
            if wrong_phrase in line:
                return line.replace(wrong_phrase, correct_phrase)
    if not original:
        return line
    # Replace the word only where it stands alone. A plain substring replace
    # would also rewrite it inside longer words ("teh" inside "tehran").
    # Lookarounds are used instead of a word-boundary escape so words that
    # start or end with punctuation are still matched.
    standalone = r'(?<!\w)' + re.escape(original) + r'(?!\w)'
    # A callable replacement inserts the suggestion verbatim, without regex
    # escape processing.
    return re.sub(standalone, lambda _found: suggestion, line)
# Process spellcheck output and apply fixes.
# Each usable report line has the form "path:line: word ==> suggestion".
# Every entry is echoed to spellcheck_report.txt, and the corrected line is
# written back to the source file.
with open("spellcheck_report_raw.txt", "r", encoding="utf-8") as infile, open("spellcheck_report.txt", "w", encoding="utf-8") as outfile:
    for line in infile:
        match = re.match(r"(.*):(\d+): (\S+) ==> (.+)", line)
        if not match:
            continue
        file_path, line_number, original, suggestion = match.groups()
        suggestion = suggestion.strip()
        # Apply only a single, unambiguous suggestion. Taking the first token
        # of "fix1, fix2" would write "fix1," (comma included) into the file.
        # NOTE(review): assumes codespell separates multiple candidates with
        # commas - confirm against the codespell version in use.
        if "," in suggestion or len(suggestion.split()) != 1:
            outfile.write(f"{file_path}:{line_number}: {original} ==> {suggestion} (not applied: ambiguous suggestion)\n")
            continue
        corrected_word = suggestion
        # newline="" keeps each line's own terminator, so rewriting the file
        # does not convert CRLF files to LF.
        with open(file_path, "r", encoding="utf-8", newline="") as file:
            content_lines = file.readlines()
        line_index = int(line_number) - 1
        if not 0 <= line_index < len(content_lines):
            continue
        # Split off the line terminator but keep leading whitespace, so the
        # indentation of the corrected line survives the rewrite.
        raw_line = content_lines[line_index]
        context_line = raw_line.rstrip("\r\n")
        line_ending = raw_line[len(context_line):]
        # Preserve terms that appear in the ignore list with this exact case.
        if original in ignore_list.values():
            corrected_word = original  # Use exact case from original text
        # Prefer a close ignore-list entry over the spellchecker's suggestion.
        elif ignore_list and should_use_ignore_list(original, suggestion, context_line):
            best_match, _ = process.extractOne(original, ignore_list.keys())
            if not is_code_or_url_or_file(context_line) and not is_markdown_link(context_line, original):
                corrected_word = ignore_list[best_match]
            else:
                corrected_word = best_match.lower()  # Keep it lowercase in URLs/links/files
        # Apply the phrase-level or word-level correction.
        corrected_line = apply_context_based_correction(context_line, original, corrected_word)
        # Replace the first remaining whole-word occurrence, if any. The
        # callable keeps the replacement text literal.
        corrected_line = re.sub(r'\b' + re.escape(original) + r'\b', lambda _found: corrected_word, corrected_line, count=1)
        # Write final output
        outfile.write(f"{file_path}:{line_number}: {original} ==> {corrected_word}\n")
        # Apply the fix, restoring the original line terminator.
        content_lines[line_index] = corrected_line + line_ending
        with open(file_path, "w", encoding="utf-8", newline="") as file:
            file.writelines(content_lines)
EOF
# Report whether the script left unstaged modifications in tracked files
# (porcelain status lines that begin with " M").
if ! git status --porcelain | grep -q '^ M'; then
  echo "No spelling corrections found. Exiting."
  exit 0
fi
echo "Corrections applied. Preparing to create PR."
# Commits the corrected files to a fresh branch and opens a pull request
# against main using the GitHub CLI.
- name: Create Pull Request with Corrections
  env:
    GITHUB_TOKEN: ${{ secrets.PAT_GITHUB_ACTIONS }}
  run: |
    # Gate on modifications to tracked files only. The spellcheck step writes
    # spellcheck_report_raw.txt and spellcheck_report.txt into the workspace,
    # so a plain "git status --porcelain" check is non-empty even when no
    # correction was made. "git diff --quiet" exits 0 only when the tracked
    # files are unchanged.
    if git diff --quiet; then
      echo "No changes detected. Skipping PR creation."
      exit 0
    fi
    git config --global user.name "github-actions[bot]"
    git config --global user.email "github-actions[bot]@users.noreply.github.com"
    BRANCH_NAME="spellcheck-fixes-$(date +%s)"
    git checkout -b "$BRANCH_NAME"
    # Stage changes to tracked files only, which keeps the scratch report
    # files out of the commit.
    git add --update
    git commit -m "Spellcheck: Automatically fixed detected misspellings"
    git push origin "$BRANCH_NAME"
    # Create PR using GitHub CLI
    gh pr create \
      --base main \
      --head "$BRANCH_NAME" \
      --title "Spellcheck Fixes" \
      --body "This PR contains automatically applied spelling corrections."