# .github/workflows/spellcheck.yml

# Display name of the workflow.
name: "Manual Spellcheck & Auto PR"

on:
  # Runs only when manually triggered
  workflow_dispatch:

permissions:
  # Needed to push changes to a new branch
  contents: write
  # Needed to create a PR
  pull-requests: write

jobs:
  # Single job: every step below runs sequentially on one ubuntu-latest runner.
  spellcheck:
    runs-on: ubuntu-latest

    steps:
      # Fetch the repository contents so the later steps can scan and edit them.
      - name: Checkout Repository
        uses: actions/checkout@v4

      # Installs the spell checker (codespell) and the fuzzy matcher
      # (fuzzywuzzy) used by the correction script in a later step.
      - name: Install Dependencies
        run: |
          # Use "python3 -m pip" so the packages are installed for the same
          # interpreter that later runs the correction script via "python3".
          python3 -m pip install codespell
          # Quote the extras spec: unquoted, [speedup] is a shell glob pattern.
          python3 -m pip install "fuzzywuzzy[speedup]"

      # Fails the job early when the ignore list the later steps read is absent.
      - name: Verify Spellcheck Ignore List Exists
        run: |
          # Guard clause: only the "file missing" case does anything.
          [ -f .github/spellcheck-ignore.txt ] || {
            echo "Error: spellcheck-ignore.txt not found!"
            exit 1
          }

      # Runs codespell in report-only mode, then post-processes the report with
      # an inline Python script that rewrites the affected lines in place and
      # records every applied correction in spellcheck_report.txt.
      - name: Run Spellcheck and Apply Fixes
        run: |
          set -e  # Exit on error

          # Run codespell on the full repo and save the raw report. No -w flag is
          # passed, so codespell itself does not modify any file. The report
          # files are skipped so codespell never scans its own output.
          codespell_status=0
          codespell --ignore-words=.github/spellcheck-ignore.txt \
                    --skip=".git,*.lock,*.json,*.yaml,*.yml,*.css,*.html,spellcheck_report*.txt" \
                    --quiet-level=2 > spellcheck_report_raw.txt || codespell_status=$?

          # codespell exits with 65 when it finds misspellings; that is the
          # expected case. Any other non-zero status is a genuine failure.
          if [ "$codespell_status" -ne 0 ] && [ "$codespell_status" -ne 65 ]; then
            echo "Error: codespell exited with status $codespell_status"
            exit "$codespell_status"
          fi

          # Process corrections with Python.
          # The heredoc delimiter is quoted so the shell passes the script
          # through verbatim; unquoted, the backticks inside the regex below
          # would be executed as a command substitution.
          python3 <<'EOF'
          import re
          from fuzzywuzzy import process

          # Ignore list: lowercase term -> term exactly as written in the file.
          ignore_list = {}
          with open(".github/spellcheck-ignore.txt", "r", encoding="utf-8") as f:
              for line in f:
                  word = line.strip()
                  if word:  # blank lines must not become an empty "" term
                      ignore_list[word.lower()] = word

          # Known multi-word phrases: correct phrase -> misspelled variants.
          common_phrases = {
              "identity provider": ["identiy provider", "identify provider"],
              "access token": ["access toekn", "acess token"],
              "user authentication": ["user authentification", "user authenthication"],
              "API gateway": ["API getway", "API gatway"]
          }

          def is_code_or_url_or_file(line):
              """Return a match if the line contains inline code, a URL, or a path-like token."""
              return re.search(r'`.*?`|https?://\S+|www\.\S+|/[\w./-]+', line)

          def is_markdown_link(line, original):
              """Return a match if `original` appears inside the target of a Markdown link."""
              return re.search(r'\[.*?\]\(.*' + re.escape(original) + r'.*\)', line)

          def should_use_ignore_list(original, suggestion, line):
              """Decide whether the closest ignore-list term should replace `original`.

              `suggestion` and `line` are accepted for call compatibility; the
              decision depends only on how closely `original` matches a term.
              """
              if not ignore_list:
                  # extractOne returns None when there are no choices.
                  return False

              best_match, score = process.extractOne(original, ignore_list.keys())

              # Must be at least 90% similar to be considered a match
              if score < 90:
                  return False

              # Reject if original contains best_match as a substring (e.g., "certifcate" vs "CE")
              if best_match in original and len(original) > len(best_match):
                  return False

              # best_match is always one of the ignore-list keys, so nothing
              # further needs checking here.
              return True

          def apply_context_based_correction(line, original, suggestion):
              """Return `line` with a known phrase fixed, or with `original` replaced.

              A known misspelled phrase takes priority. Otherwise only the first
              whole-word occurrence of `original` is replaced, so other words
              that merely contain it are left untouched.
              """
              for correct_phrase, wrong_variants in common_phrases.items():
                  for wrong_phrase in wrong_variants:
                      if wrong_phrase in line:
                          return line.replace(wrong_phrase, correct_phrase)
              # A callable replacement keeps `suggestion` literal (no escape processing).
              return re.sub(r'\b' + re.escape(original) + r'\b', lambda _m: suggestion, line, count=1)

          # Process spellcheck output and apply fixes
          with open("spellcheck_report_raw.txt", "r", encoding="utf-8") as infile, open("spellcheck_report.txt", "w", encoding="utf-8") as outfile:
              for line in infile:
                  match = re.match(r"(.*):(\d+): (\S+) ==> (\S+)", line)
                  if not match:
                      continue

                  file_path, line_number, original, suggestion = match.groups()
                  index = int(line_number) - 1

                  # When codespell lists several candidates ("a, b") the first
                  # one is captured with its trailing comma; drop the comma.
                  corrected_word = suggestion.rstrip(",")

                  # newline="" keeps each line's original terminator (LF or CRLF).
                  try:
                      with open(file_path, "r", encoding="utf-8", newline="") as file:
                          content_lines = file.readlines()
                  except (OSError, UnicodeDecodeError) as exc:
                      print(f"Skipping {file_path}:{line_number}: {exc}")
                      continue

                  if not 0 <= index < len(content_lines):
                      print(f"Skipping {file_path}:{line_number}: line not found")
                      continue

                  # Separate the text from its line ending; indentation and
                  # trailing spaces stay part of the text and are preserved.
                  raw_line = content_lines[index]
                  context_line = raw_line.rstrip("\r\n")
                  line_ending = raw_line[len(context_line):]

                  # Fix #1: a term that is on the ignore list is kept as written.
                  if original in ignore_list.values():
                      corrected_word = original

                  # Fix #2: prefer a closely matching ignore-list term over
                  # codespell's suggestion.
                  elif should_use_ignore_list(original, corrected_word, context_line):
                      best_match, _ = process.extractOne(original, ignore_list.keys())
                      if not is_code_or_url_or_file(context_line) and not is_markdown_link(context_line, original):
                          corrected_word = ignore_list[best_match]
                      else:
                          corrected_word = best_match.lower()  # Keep it lowercase in URLs/links/files

                  # Fix #3: phrase-level correction, else first whole-word replacement.
                  corrected_line = apply_context_based_correction(context_line, original, corrected_word)

                  # Write final output
                  outfile.write(f"{file_path}:{line_number}: {original} ==> {corrected_word}\n")

                  # Rewrite the file only when the line actually changed.
                  if corrected_line != context_line:
                      content_lines[index] = corrected_line + line_ending
                      with open(file_path, "w", encoding="utf-8", newline="") as file:
                          file.writelines(content_lines)
          EOF

          # Check if any tracked files were modified
          if git diff --quiet; then
            echo "No spelling corrections found. Exiting."
          else
            echo "Corrections applied. Preparing to create PR."
          fi

      # Commits the corrected files to a new timestamped branch, pushes it, and
      # opens a pull request against main. Does nothing when no tracked file
      # was changed.
      - name: Create Pull Request with Corrections
        env:
          # Token read by the GitHub CLI (gh) when creating the PR.
          GITHUB_TOKEN: ${{ secrets.PAT_GITHUB_ACTIONS }}
        run: |
          git config --global user.name "github-actions[bot]"
          git config --global user.email "github-actions[bot]@users.noreply.github.com"

          # Stage modifications to tracked files only. The untracked
          # spellcheck_report*.txt files produced earlier must not be committed,
          # and must not make an otherwise clean tree look changed.
          git add -u

          # Decide before creating a branch whether there is anything to commit.
          if git diff --cached --quiet; then
            echo "No changes detected. Skipping PR creation."
            exit 0
          fi

          BRANCH_NAME="spellcheck-fixes-$(date +%s)"
          git checkout -b "$BRANCH_NAME"

          git commit -m "Spellcheck: Automatically fixed detected misspellings"
          git push origin "$BRANCH_NAME"

          # Create PR using GitHub CLI
          gh pr create \
            --base main \
            --head "$BRANCH_NAME" \
            --title "Spellcheck Fixes" \
            --body "This PR contains automatically applied spelling corrections."