#!/usr/bin/env python3
"""Markdown Link Linter (MongoDB)
=================================

Checks Markdown files under `src/mongo` for broken internal links.

Link Types Validated
--------------------
1. Intra-document anchors: `[text](#some-heading)`
2. Relative file links: `[text](../../path/to/OtherFile.md#anchor)`
3. Repo-root relative paths beginning with `/` (e.g. `[feature flags](/src/mongo/db/query/README_query_feature_flags.md)`)
4. Reference-style links: `[text][label]` or `[text][]` with definitions like `[label]: url`

External (http/https) links are currently skipped (no network requests are performed), except for a trivial malformed-scheme check (e.g. `hhttps://`).

GitHub Repository Link Policy
-----------------------------
Links to the MongoDB server repository (`github.com/mongodb/mongo` or the private clone `github.com/10gen/mongo`) must not reference the mutable `master` branch. Allowed:
* Release/tag branches (e.g. `r6.2.0`)
* Specific commit SHAs (40 hex chars)
* Any other non-`master` branch

Unpinned `master` links are reported as issues. Auto-fix rewrites them to a repo-root relative path (`/src/...`), preserving any line fragment (e.g. `#L89`).

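For example (illustrative path), `https://github.com/mongodb/mongo/blob/master/src/mongo/db/README.md`
would be flagged and auto-fixed to `/src/mongo/db/README.md`, while the same link pinned
to `r6.2.0` or to a commit SHA is accepted.
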
Anchor Normalization
--------------------
GitHub-style anchors are derived from headings:
* Lowercased
* Punctuation stripped (most symbols except `-` and `_`)
* Spaces collapsed to a single `-`

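For example, a heading `Plan Cache (v2)` yields the anchor `#plan-cache-v2`.
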
Usage
-----
Run from the repository root:

    python buildscripts/lint_markdown_links.py --verbose

JSON output (exit code still meaningful):

    python buildscripts/lint_markdown_links.py --json > link_report.json

Auto-Fix Renamed Paths
----------------------
Auto-fix (`--auto-fix`) automatically handles:
* Directory renames via `--rename-map old=new`
* Moved files (searches by basename across the repository)
* Broken anchors (relocates them to the correct file)
* Common typos and malformed schemes

Example with a rename mapping:

    python buildscripts/lint_markdown_links.py --auto-fix --rename-map catalog=local_catalog --root src/mongo/db/storage --verbose

Multiple mappings:

    python buildscripts/lint_markdown_links.py --auto-fix \
        --rename-map catalog=local_catalog \
        --rename-map query_stats=query_shape_stats

After auto-fix the script re-runs linting to verify all fixes.

Safety Characteristics
----------------------
* Only the specific `](oldpath...)` occurrence is replaced.
* A fix is skipped if the replacement yields an identical path.
* Always review diffs before committing.

Exit Codes
----------
0 = all links OK
1 = usage error / root not found
2 = one or more link issues detected

Sample Output
-------------
src/mongo/example/README.md:42: file does not exist: /abs/path/src/mongo/missing.md [missing.md]
src/mongo/example/README.md:57: anchor "overview" not found in target file [other.md#overview]

Common False Positives
----------------------
* Headings generated dynamically (e.g. code-generated docs)
* Links to files produced by a build step
* Root-relative paths not starting with `/src/` (extend the logic if needed)
* External links (intentionally not validated)

Performance
-----------
Validation runs in parallel on a thread pool sized to the available CPUs (capped at 32).

Suppressing Specific Links
--------------------------
Not implemented yet. Potential future directive:

    <!-- linklint-ignore-next -->

Future Enhancements (Ideas)
---------------------------
* Ignore patterns via CLI or config file
* CI integration (Bazel / GitHub Actions) enforcing link health
* Levenshtein suggestions for typos
* Anchor remapping when heading text changes

CI Integration Example
----------------------
Add a step:

    python buildscripts/lint_markdown_links.py --json

Fail the build if the exit code is 2.

Implementation Notes
--------------------
* Uses regex heuristics (no full Markdown parsing) for speed.
* Anchor generation and link fragment normalization share the same logic (`github_anchor`).

Maintained by MongoDB Engineering Tooling.
"""

from __future__ import annotations

import argparse
import concurrent.futures
import json
import os
import re
import sys
import urllib.parse
from dataclasses import dataclass
from typing import Iterable, List, Optional, Tuple

HEADING_RE = re.compile(r"^(#{1,6})\s+(.*)$")
HTML_ANCHOR_RE = re.compile(r'<a\s+(?:name|id)=["\']([^"\']+)["\']\s*>\s*</a>?', re.IGNORECASE)
LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
# Reference-style link definitions: [label]: url
REF_DEF_RE = re.compile(r"^\s*\[([^\]]+)\]:\s+(\S+)")
# Reference-style links: [text][label] or [text][] but NOT [[double brackets]].
# The negative lookbehind (?<!\[) ensures the first [ is not preceded by another [;
# the negative lookahead (?!\]) ensures the closing ] is not part of a ]] pair.
REF_USE_RE = re.compile(r"(?<!\[)\[([^\]]+)\](?!\])\[(?:[^\]]+)?\]")

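# Illustrative examples of what these patterns match (assumed, not from the original source):
#   "[docs]: https://www.mongodb.com/docs/"  -> REF_DEF_RE captures label "docs" and the URL
#   "[driver docs][docs]"                    -> REF_USE_RE matches; the label is "docs"
#   "[docs][]"                               -> REF_USE_RE matches; the text doubles as the label
#   "[[wiki-style]]"                         -> not matched (double brackets are excluded)
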
# Characters removed from anchor IDs (GitHub rules, simplified). We strip most punctuation except hyphen and underscore.
PUNCT_TO_STRIP = "\"'!#$%&()*+,./:;<=>?@[]^`{|}~"  # punctuation characters to remove
ANCHOR_CACHE: dict[str, set[str]] = {}
# Cache for reference-style link definitions: file_path -> {label: target_url}
REFERENCE_CACHE: dict[str, dict[str, str]] = {}


def _detect_repo_root(start: str | None = None) -> str:
    """Walk upwards to locate the repository root (presence of WORKSPACE.bazel, MODULE.bazel, or a .git directory).

    Falls back to the current working directory if no sentinel is found.
    """
    if start is None:
        start = os.getcwd()
    cur = os.path.abspath(start)
    last = None
    while cur != last:
        if (
            os.path.exists(os.path.join(cur, "WORKSPACE.bazel"))
            or os.path.exists(os.path.join(cur, "MODULE.bazel"))
            or os.path.isdir(os.path.join(cur, ".git"))
        ):
            return cur
        last = cur
        cur = os.path.dirname(cur)
    return os.getcwd()


REPO_ROOT = _detect_repo_root()


@dataclass
class LinkIssue:
    file: str
    line: int
    link_text: str
    target: str
    message: str

    def to_dict(self):
        return {
            "file": self.file,
            "line": self.line,
            "link_text": self.link_text,
            "target": self.target,
            "message": self.message,
        }


def github_anchor(text: str) -> str:
    t = text.strip().lower()
    # remove punctuation
    t2 = "".join(ch for ch in t if ch not in PUNCT_TO_STRIP)
    # spaces to hyphens
    t2 = re.sub(r"\s+", "-", t2)
    # collapse multiple hyphens
    t2 = re.sub(r"-+", "-", t2)
    return t2


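# Worked examples of the normalization above (illustrative, chosen to exercise each rule):
#   github_anchor("Plan Cache & Eviction") == "plan-cache-eviction"
#   github_anchor("Read/Write Concern")    == "readwrite-concern"   ('/' is stripped, no hyphen inserted)
#   github_anchor("FAQ -- Common Issues")  == "faq-common-issues"   (hyphen runs collapse to one)

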
def collect_headings(path: str) -> set[str]:
    if path in ANCHOR_CACHE:
        return ANCHOR_CACHE[path]
    anchors: set[str] = set()
    try:
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                # Support blockquoted headings: strip leading '>' plus following space(s)
                if line.lstrip().startswith(">"):
                    # Remove successive '>' prefixes (nested blockquotes) while preserving heading markers
                    stripped = line.lstrip()
                    while stripped.startswith(">"):
                        stripped = stripped[1:].lstrip()
                    candidate_line = stripped
                else:
                    candidate_line = line
                m = HEADING_RE.match(candidate_line)
                if m:
                    heading_text = m.group(2)
                    # Extract any embedded HTML anchors first
                    for a in HTML_ANCHOR_RE.finditer(heading_text):
                        raw = a.group(1).strip()
                        if raw:
                            anchors.add(github_anchor(raw))
                            anchors.add(raw)  # also allow direct reference without normalization
                    # Remove HTML anchor tags from the heading text before normalizing (supports no-space joins)
                    cleaned = HTML_ANCHOR_RE.sub("", heading_text).strip()
                    # Strip inline markdown links (e.g., Classic [PlanCache](...)) to derive the anchor from visible text only.
                    cleaned = re.sub(r"\[([^\]]+)\]\([^)]*\)", r"\1", cleaned)
                    if cleaned:
                        norm_clean = github_anchor(cleaned)
                        # Duplicate tracking: if an anchor already exists, add numbered variants
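                        # Example (illustrative): a second "## Overview" heading yields
                        # "overview-1" alongside "overview", mirroring GitHub's numbering.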
                        if norm_clean in anchors:
                            # Count existing numbered variants to assign the next index
                            existing_indices = [0]
                            for existing in list(anchors):
                                if existing == norm_clean:
                                    existing_indices.append(0)
                                elif re.match(rf"^{re.escape(norm_clean)}-(\d+)$", existing):
                                    try:
                                        existing_indices.append(int(existing.rsplit("-", 1)[1]))
                                    except Exception:
                                        pass
                            next_idx = max(existing_indices) + 1
                            numbered = f"{norm_clean}-{next_idx}"
                            anchors.add(numbered)
                        anchors.add(norm_clean)
                    # Also add the normalized form of raw anchors (in case users link using the normalized visible-text form)
                    for a in HTML_ANCHOR_RE.finditer(heading_text):
                        raw = a.group(1).strip()
                        if raw:
                            anchors.add(github_anchor(raw))
    except Exception:
        pass
    ANCHOR_CACHE[path] = anchors
    return anchors


def collect_reference_definitions(path: str) -> dict[str, str]:
    """Parse all reference-style link definitions ([label]: url) from a markdown file."""
    if path in REFERENCE_CACHE:
        return REFERENCE_CACHE[path]
    references: dict[str, str] = {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                m = REF_DEF_RE.match(line)
                if m:
                    label = m.group(1).strip().lower()  # labels are matched case-insensitively
                    target = m.group(2).strip()
                    references[label] = target
    except Exception:
        pass
    REFERENCE_CACHE[path] = references
    return references


def is_http_url(url: str) -> bool:
    return url.startswith("http://") or url.startswith("https://")


def find_markdown_files(root: str) -> List[str]:
    files: List[str] = []
    for dirpath, _, filenames in os.walk(root):
        for fn in filenames:
            if fn.lower().endswith(".md"):
                files.append(os.path.join(dirpath, fn))
    return files


def parse_links(file_path: str) -> List[Tuple[int, str, str]]:
    links: List[Tuple[int, str, str]] = []
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            in_fence = False
            in_blockquote = False
            fence_delim = None  # tracks the opening ``` or ~~~ delimiter
            for idx, raw_line in enumerate(f, start=1):
                line = raw_line.rstrip("\n")
                # Detect the start/end of fenced code blocks. Accept ``` or ~~~ with an optional language and leading whitespace.
                fence_match = re.match(r"^\s*(?P<delim>`{3,}|~{3,})(.*)$", line)
                if fence_match:
                    full = fence_match.group("delim")
                    if not in_fence:
                        in_fence = True
                        fence_delim = full
                        continue
                    else:
                        # Only close the fence on the same delimiter length and character
                        if fence_delim == full:
                            in_fence = False
                            fence_delim = None
                            continue
                if in_fence:
                    continue  # skip link detection inside code fences
                # Blockquote handling: a line starting with '>' quotes the whole wrapped paragraph until a blank line
                if re.match(r"^\s*>", line):
                    in_blockquote = True
                    continue
                if in_blockquote:
                    if line.strip() == "":
                        in_blockquote = False
                    else:
                        continue
                # Skip lines that are reference definitions themselves
                if REF_DEF_RE.match(line):
                    continue

                # Find all backtick regions to exclude from link detection:
                # build a set of character positions that are inside backticks.
                backtick_positions = set()
                in_code = False
                for i, char in enumerate(line):
                    if char == "`":
                        in_code = not in_code
                    elif in_code:
                        backtick_positions.add(i)

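                # Example (hypothetical): in 'see `[not a link](x)` docs', every
                # position between the backticks is recorded, so the '[' there is
                # treated as code and skipped by the matchers below.
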
                # Helper to check whether the opening bracket of a link is inside backticks.
                # Only the start position is checked: if the [ is in code, the whole link is skipped.
                def is_in_code_span(match_start):
                    return match_start in backtick_positions

                # Track character ranges of all matched links to avoid double-processing
                matched_ranges = []

                def overlaps_matched_range(start, end):
                    """Check if a position range overlaps with any previously matched range."""
                    for m_start, m_end in matched_ranges:
                        if start < m_end and end > m_start:
                            return True
                    return False

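                # Overlap semantics (illustrative): the half-open ranges (0, 5) and
                # (4, 8) overlap, while (0, 5) and (5, 8) touch but do not.
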
                # Inline links [text](url)
                for m in LINK_RE.finditer(line):
                    if is_in_code_span(m.start()):
                        continue  # skip links inside backticks
                    text, target = m.group(1), m.group(2).strip()
                    links.append((idx, text, target))
                    matched_ranges.append((m.start(), m.end()))

                # Reference-style links [text][label] or [text][]
                for m in REF_USE_RE.finditer(line):
                    if is_in_code_span(m.start()):
                        continue  # skip links inside backticks
                    full_match = m.group(0)
                    text = m.group(1).strip()
                    # Extract the label from [text][label]; empty brackets [] mean the text doubles as the label.
                    # Use the unstripped group length so the offset stays correct if the text had surrounding spaces.
                    label_part = full_match[len(m.group(1)) + 2 :]  # skip [text]
                    if label_part == "[]":
                        label = text  # implicit reference: [text][] uses "text" as the label
                    else:
                        # Explicit label: [text][label]
                        label = label_part.strip("[]").strip()
                    # Use a special marker to indicate this is a reference link
                    links.append((idx, text, f"__REF__{label}"))
                    matched_ranges.append((m.start(), m.end()))

                # Shortcut reference links [text]: a single bracket pair that references a definition.
                # Only matched if not already covered by the inline or reference-style patterns.
                # Pattern: a single bracket pair not preceded by [ and not followed by ( or [.
                for m in re.finditer(r"(?<!\[)\[([^\]]+)\](?![(\[])", line):
                    if is_in_code_span(m.start()):
                        continue  # skip links inside backticks
                    # Skip if it overlaps with already-matched ranges
                    if overlaps_matched_range(m.start(), m.end()):
                        continue
                    # Skip if this is part of a double-bracket pattern [[...]]
                    if m.end() < len(line) and line[m.end()] == "]":
                        continue
                    text = m.group(1).strip()
                    # Only treat this as a reference link if it could plausibly be one
                    # (contains text, not just digits)
                    if text and not text.isdigit():
                        # For shortcut references, the label is the text itself
                        links.append((idx, text, f"__REF__{text}"))
    except Exception:
        pass
    return links


def validate_link(current_file: str, line: int, text: str, target: str) -> Optional[LinkIssue]:
    # Handle reference-style links [text][label]
    if target.startswith("__REF__"):
        label = target[7:].lower()  # extract the label and normalize to lowercase
        references = collect_reference_definitions(current_file)
        if label not in references:
            return LinkIssue(
                current_file,
                line,
                text,
                f"[{label}]",
                f'reference link label "{label}" not defined in this file',
            )
        # Resolve the reference and validate the actual target
        resolved_target = references[label]
        return validate_link(current_file, line, text, resolved_target)

    # Remove surrounding <> sometimes used in markdown
    if target.startswith("<") and target.endswith(">"):
        target = target[1:-1]

    # Report empty link targets
    if target == "":
        return LinkIssue(current_file, line, text, target, "empty link target")

    # Fragment-only (#anchor)
    if target.startswith("#"):
        anchors = collect_headings(current_file)
        raw_anchor = target[1:]
        # Normalize the link anchor the same way headings are normalized
        norm_anchor = github_anchor(raw_anchor)
        if norm_anchor not in anchors:
            # Fuzzy variants: tolerate missing or extra hyphens inside multi-token anchors.
            # Strategy:
            #   1. If the anchor has hyphens, try removing each hyphen individually (concatenation forms).
            #   2. Try removing all hyphens (fully concatenated form).
            #   3. Also try forms with one extra hyphen inserted between adjacent alphanumerics
            #      (covers classic-plancache -> classic-plan-cache).
            #   4. If the anchor has no hyphens, try inserting a hyphen at every internal boundary between alphanumerics.
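            # Example (illustrative): for "plan-cache" the variants include
            # "plancache" (hyphen removed) and insertions such as "plan-ca-che".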
            fuzzy_match = False
            variant_candidates: set[str] = set()
            a = norm_anchor
            if "-" in a:
                # (1) remove each hyphen separately
                for i, ch in enumerate(a):
                    if ch == "-":
                        variant_candidates.add(a[:i] + a[i + 1 :])
                # (2) remove all hyphens
                variant_candidates.add(a.replace("-", ""))
                # (3) insert an extra hyphen between alphanumerics where there is not one already
                for i in range(1, len(a)):
                    if a[i] != "-" and a[i - 1] != "-":
                        if a[i - 1].isalnum() and a[i].isalnum():
                            variant_candidates.add(a[:i] + "-" + a[i:])
            else:
                # (4) insert a hyphen at every internal boundary
                for i in range(1, len(a)):
                    if a[i - 1].isalnum() and a[i].isalnum():
                        variant_candidates.add(a[:i] + "-" + a[i:])
            # Limit the explosion: cap at 50 candidates
            if len(variant_candidates) > 50:
                variant_candidates = set(list(variant_candidates)[:50])
            for cand in variant_candidates:
                if cand in anchors:
                    fuzzy_match = True
                    break
            if fuzzy_match:
                return None  # suppress the issue since a fuzzy variant matches
            return LinkIssue(current_file, line, text, target, "anchor not found in this file")
        return None

    # Split off the fragment if present
    file_part, frag_part = target.split("#", 1) if "#" in target else (target, None)

    # Detect the malformed scheme 'hhttps://' before the generic URL check:
    # 'hhttps://...' does not satisfy is_http_url(), so checking it inside that
    # branch would be dead code.
    if file_part.startswith("hhttps://"):
        return LinkIssue(
            current_file, line, text, target, "malformed scheme (did you mean https:// ?)"
        )

    if is_http_url(file_part):
        # External validation is otherwise skipped (no network requests performed).
        # Enforce pinned GitHub refs for the mongodb/mongo and 10gen/mongo repositories.
        gh_match = re.match(
            r"^https://github.com/(mongodb|10gen)/mongo/(blob|tree)/([^/]+)/([^#]+)(?:#.*)?$",
            target,
        )
        if gh_match:
            owner, kind, ref, path_rest = gh_match.groups()
            if ref == "master":
                return LinkIssue(
                    current_file,
                    line,
                    text,
                    target,
                    "unpinned GitHub master reference; use tag/commit or relative path",
                )
            return None  # non-master GitHub link accepted
        return None

    # Remove query params if any
    if "?" in file_part:
        parsed = urllib.parse.urlparse(file_part)
        file_part = parsed.path

    # Normalize the relative path. A path starting with '/' is treated as repo-root relative.
    repo_root = REPO_ROOT  # resolved once; works under Bazel runfiles
    if file_part.startswith("/"):
        resolved_path = os.path.normpath(os.path.join(repo_root, file_part.lstrip("/")))
    else:
        current_dir = os.path.dirname(current_file)
        resolved_path = os.path.normpath(os.path.join(current_dir, file_part))

    if not os.path.exists(resolved_path):
        # Try appending a .md extension if the path doesn't exist
        if not resolved_path.endswith(".md"):
            resolved_path_with_md = resolved_path + ".md"
            if os.path.exists(resolved_path_with_md):
                resolved_path = resolved_path_with_md
            else:
                return LinkIssue(
                    current_file, line, text, target, f"file does not exist: {resolved_path}"
                )
        else:
            return LinkIssue(
                current_file, line, text, target, f"file does not exist: {resolved_path}"
            )

    if frag_part:
        # If the target file is NOT markdown and the fragment matches a GitHub line anchor (#Lnn or #Lnn-Lmm), accept it.
        if not resolved_path.lower().endswith(".md") and re.match(r"^L\d+(-L\d+)?$", frag_part):
            return None
        if resolved_path.lower().endswith(".md"):
            anchors = collect_headings(resolved_path)
            norm_frag = github_anchor(frag_part)
            if norm_frag not in anchors:
                return LinkIssue(
                    current_file,
                    line,
                    text,
                    target,
                    f'anchor "{frag_part}" not found in target file',
                )
        else:
            # Non-markdown target with a non-line fragment: the anchor cannot be validated, assume it is ok.
            return None

    return None


def lint_files(files: Iterable[str], workers: int) -> List[LinkIssue]:
    issues: List[LinkIssue] = []

    def process(file_path: str) -> List[LinkIssue]:
        file_issues: List[LinkIssue] = []
        links = parse_links(file_path)
        for line, text, target in links:
            issue = validate_link(file_path, line, text, target)
            if issue:
                file_issues.append(issue)
        return file_issues

    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as exe:
        futures = {exe.submit(process, f): f for f in files}
        for fut in concurrent.futures.as_completed(futures):
            issues.extend(fut.result())
    return issues


def main(argv: List[str]) -> int:
    ap = argparse.ArgumentParser(description="Markdown link linter for src/mongo markdown files.")
    ap.add_argument("--root", default="src/mongo", help="Root directory to scan")
    ap.add_argument(
        "--workers",
        type=int,
        default=min(32, (os.cpu_count() or 4)),
        help="Parallel worker threads",
    )
    ap.add_argument("--json", action="store_true", help="Output machine-readable JSON")
    ap.add_argument("--verbose", action="store_true", help="Verbose output")
    ap.add_argument(
        "--auto-fix",
        action="store_true",
        help="Attempt automatic fixes for simple broken links (renames)",
    )
    ap.add_argument(
        "--rename-map",
        action="append",
        metavar="OLD=NEW",
        help="Directory/file rename mapping, e.g. catalog=local_catalog (can be repeated)",
    )
    ap.add_argument(
        "--search-moved",
        action="store_true",
        help="Search for missing file basenames under root and rewrite the link if a unique match is found",
    )
    args = ap.parse_args(argv)

    root = args.root
    if not os.path.isdir(root):
        # Try resolving relative to the detected repo root
        candidate = os.path.join(REPO_ROOT, root.lstrip("/"))
        if os.path.isdir(candidate):
            root = candidate
        else:
            print(
                f"Error: root directory {root} not found (repo root: {REPO_ROOT})", file=sys.stderr
            )
            return 1

    files = find_markdown_files(root)
    if args.verbose:
        print(f"Scanning {len(files)} markdown files under {root} ...")

    issues = lint_files(files, args.workers)

    # Moved-file search index (basename -> list of full paths). We walk the entire
    # root tree to include non-markdown sources (e.g., .h/.cpp) since links may point to headers.
    # Auto-enabled when --auto-fix is used; can also be explicitly enabled with --search-moved.
    moved_index: dict[str, list[str]] = {}
    if args.auto_fix or args.search_moved:
        for dirpath, dirnames, filenames in os.walk(root):
            # Avoid descending into very large generated output dirs if present.
            # (Heuristic: skip bazel-* dirs under the root scan to reduce noise.)
            dirnames[:] = [d for d in dirnames if not d.startswith("bazel-")]
            for fn in filenames:
                if fn.startswith("."):
                    continue
                full = os.path.join(dirpath, fn)
                moved_index.setdefault(fn, []).append(full)
        if args.verbose:
            total_paths = sum(len(v) for v in moved_index.values())
            print(
                f"Built moved-file index: {len(moved_index)} unique basenames mapped to {total_paths} file(s)"
            )

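    # Shape of the index (hypothetical example): moved_index["README_plan_cache.md"]
    # might map to ["src/mongo/db/query/plan_cache/README_plan_cache.md"].
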
    # Auto-fix pass (only runs when --auto-fix is set and issues were found)
    if args.auto_fix and issues:
        rename_pairs = {}
        for pair in args.rename_map or []:
            if "=" in pair:
                old, new = pair.split("=", 1)
                rename_pairs[old.strip()] = new.strip()

        if rename_pairs and args.verbose:
            print(f"Auto-fix: applying rename map {rename_pairs}")

        fix_count = 0
        # Group issues by file for editing
        issues_by_file: dict[str, List[LinkIssue]] = {}
        for iss in issues:
            issues_by_file.setdefault(iss.file, []).append(iss)

        # Precompute an anchor -> candidate-files map to help relocate anchors.
        anchor_index: dict[str, list[str]] = {}

        def index_file_anchors(path: str):
            if not path.lower().endswith(".md"):
                return
            for a in collect_headings(path):
                anchor_index.setdefault(a, []).append(path)

        # Index lazily, on the first anchor issue, to keep performance reasonable.
        anchor_index_built = False

        for md_file, file_issues in issues_by_file.items():
            # Only attempt fixes if the source file still exists
            if not os.path.isfile(md_file):
                continue
            try:
                # Use a distinct name (fh) for the file handle to avoid shadowing
                # earlier loop variables (e.g., 'f' used for file paths).
                with open(md_file, "r", encoding="utf-8") as fh:
                    lines = fh.readlines()
            except Exception:
                continue

            # Deduplicate identical (message, target) pairs to avoid repeated work (retain the first occurrence)
            seen_sig = set()
            deduped: List[LinkIssue] = []
            for iss in file_issues:
                sig = (iss.message, iss.target)
                if sig in seen_sig:
                    continue
                seen_sig.add(sig)
                deduped.append(iss)

            modified = False
            for iss in deduped:
                # Capture the current target early to avoid scope issues
                original_target = iss.target

                # 0. GitHub master link auto-fix: rewrite to a repo-root relative path
                if "unpinned GitHub master reference" in iss.message:
                    m_gh = re.match(
                        r"^https://github.com/(mongodb|10gen)/mongo/(blob|tree)/master/([^#]+)(?:#(.*))?$",
                        original_target,
                    )
                    if m_gh:
                        path_part = m_gh.group(3)  # path inside the repository
                        frag_only = m_gh.group(4)  # optional fragment
                        # GitHub URLs may point to any repo path (src/, buildscripts/, jstests/, etc.).
                        # All must become absolute repo-root refs like /buildscripts/... not buildscripts/...
                        new_target = "/" + path_part
                        if frag_only:
                            new_target += "#" + frag_only  # append the single fragment only
                        for idx, line_text in enumerate(lines):
                            token = f"]({original_target})"
                            if token in line_text:
                                lines[idx] = line_text.replace(token, f"]({new_target})", 1)
                                modified = True
                                fix_count += 1
                                if args.verbose:
                                    print(
                                        f"Auto-fixed GitHub master link in {md_file}: {original_target} -> {new_target}"
                                    )
                                break

                # 1. Scheme / common typo fixes
                if "malformed scheme" in iss.message and original_target.startswith("hhttps://"):
                    fixed_target = original_target.replace("hhttps://", "https://", 1)
                    for idx, line_text in enumerate(lines):
                        if f"]({original_target})" in line_text:
                            lines[idx] = line_text.replace(
                                f"]({original_target})", f"]({fixed_target})", 1
                            )
                            modified = True
                            fix_count += 1
                            if args.verbose:
                                print(
                                    f"Auto-fixed malformed scheme in {md_file}: {original_target} -> {fixed_target}"
                                )
                            break

                # 2. Common directory typo fix (storgae -> storage)
                if "file does not exist:" in iss.message and "storgae" in original_target:
                    fixed_target = original_target.replace("storgae", "storage")
                    if fixed_target != original_target:
                        for idx, line_text in enumerate(lines):
                            if f"]({original_target})" in line_text:
                                lines[idx] = line_text.replace(
                                    f"]({original_target})", f"]({fixed_target})", 1
                                )
                                modified = True
                                fix_count += 1
                                if args.verbose:
                                    print(
                                        f"Auto-fixed path typo in {md_file}: {original_target} -> {fixed_target}"
                                    )
                                break

                # 3. Anchor relocation: only attempt it if we can extract a plausible fragment token
                if ('anchor "' in iss.message and "not found in target file" in iss.message) or (
                    "anchor not found in this file" in iss.message
                ):
                    # Accept fragments comprised of word chars, dashes, underscores, and periods
                    m_anchor = re.search(r'anchor "([A-Za-z0-9_.:-]+)"', iss.message)
                    frag: Optional[str] = None
                    if m_anchor:
                        frag = m_anchor.group(1)
                    else:
                        # Fall back to extracting ONLY from the original target if it starts with '#'
                        if original_target.startswith("#") and len(original_target) > 1:
                            frag = original_target[1:]
                        elif "#" in original_target:
                            frag = original_target.split("#", 1)[1]
                    # Guard against obviously wrong fragments like 'not' arising from message text
                    if frag and frag.lower() == "not":
                        frag = None
                    if frag:
                        norm_frag = github_anchor(frag)
                        if not anchor_index_built:
                            for fpath in files:
                                index_file_anchors(fpath)
                            anchor_index_built = True
                        candidates = anchor_index.get(norm_frag, [])
                        if not candidates and args.verbose:
                            print(
                                f'Verbose: no indexed candidates for anchor "{frag}" (normalized "{norm_frag}") referenced from {md_file}. Performing fallback scan...'
                            )
                        # Fallback: scan the sibling and parent directories (one level up) for the anchor
                        search_dirs = {os.path.dirname(md_file)}
                        parent_dir = os.path.dirname(os.path.dirname(md_file))
                        if os.path.isdir(parent_dir):
                            search_dirs.add(parent_dir)
                        fallback_matches: list[str] = []
                        for d in list(search_dirs):
                            try:
                                for fn in os.listdir(d):
                                    if fn.lower().endswith(".md"):
                                        candidate_path = os.path.join(d, fn)
                                        for a in collect_headings(candidate_path):
                                            if a == norm_frag:
                                                fallback_matches.append(candidate_path)
                                                break
                            except Exception:
                                pass
                        if fallback_matches:
                            candidates = fallback_matches
                            if args.verbose:
                                print(
                                    f'Verbose: fallback found {len(candidates)} candidate(s) for anchor "{frag}"'
                                )
                        # Global one-time fallback: scan the entire root if there are still no candidates
                        if not candidates:
                            if args.verbose:
                                print(
                                    f'Verbose: performing global scan for anchor "{frag}" under root {root}'
                                )
                            try:
                                for gfile in files:
                                    if gfile.lower().endswith(".md"):
                                        if norm_frag in collect_headings(gfile):
                                            candidates.append(gfile)
                            except Exception:
                                pass
                            if candidates and args.verbose:
                                print(
                                    f'Verbose: global scan found {len(candidates)} candidate(s) for anchor "{frag}"'
                                )
                        if candidates:
                            chosen: Optional[str] = None
                            if len(candidates) == 1:
                                chosen = candidates[0]
                            else:
                                # Proximity heuristic: minimal directory distance (count of differing path segments)
                                base_dir = os.path.dirname(md_file)

                                def dir_distance(a: str, b: str) -> int:
                                    a_parts = os.path.abspath(a).split(os.sep)
                                    b_parts = os.path.abspath(b).split(os.sep)
                                    # Find the common prefix length
                                    i = 0
                                    for x, y in zip(a_parts, b_parts):
                                        if x != y:
                                            break
                                        i += 1
                                    return (len(a_parts) - i) + (len(b_parts) - i)

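                                # Example (illustrative): dir_distance("/a/b/x.md", "/a/c/y.md") == 4,
                                # since two segments differ on each side of the common prefix "/a".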
                                # Rank by distance, then by path length for stability
                                chosen = sorted(
                                    candidates, key=lambda p: (dir_distance(base_dir, p), len(p))
                                )[0]
                            if chosen:
                                rel_path = os.path.relpath(chosen, os.path.dirname(md_file))
                                new_target = f"{rel_path}#{frag}"
                                search_token = f"]({original_target})"
                                replaced_any = False
                                for idx, line_text in enumerate(lines):
                                    if search_token in line_text:
                                        # Replace only the first occurrence per line to avoid accidental
                                        # nested replacements, but scan all lines.
                                        lines[idx] = line_text.replace(
                                            search_token, f"]({new_target})", 1
                                        )
                                        modified = True
                                        replaced_any = True
                                        fix_count += 1
                                if replaced_any and args.verbose:
                                    if len(candidates) > 1:
                                        print(
                                            f"Auto-relocated anchor (closest of {len(candidates)}) in {md_file}: {original_target} -> {new_target}"
                                        )
                                    else:
                                        print(
                                            f"Auto-relocated anchor in {md_file}: {original_target} -> {new_target}"
                                        )

                # 4. Path segment rename fixes (directory renames), independent of anchor relocation
                if rename_pairs and "file does not exist:" in iss.message:
                    path_part = original_target.split("#", 1)[0]
                    new_path_part = path_part
                    for old, new in rename_pairs.items():
                        pattern = re.compile(rf"(?:^|/)({re.escape(old)})(?=/|$)")
                        new_path_part = pattern.sub(
                            lambda m: m.group(0).replace(old, new), new_path_part
                        )
                    if new_path_part != path_part:
                        new_target = new_path_part + (
                            "" if "#" not in original_target else "#" + original_target.split("#", 1)[1]
                        )
                        for idx, line_text in enumerate(lines):
                            if f"]({original_target})" in line_text:
                                lines[idx] = line_text.replace(
                                    f"]({original_target})", f"]({new_target})", 1
                                )
                                modified = True
                                fix_count += 1
                                if args.verbose:
                                    print(
                                        f"Auto-fixed link in {md_file}: {original_target} -> {new_target}"
                                    )
                                break

                # 5. Moved-file basename search (auto-enabled with --auto-fix)
                if "file does not exist:" in iss.message and "#" not in original_target:
                    # Extract the basename of the missing file
                    missing_base = os.path.basename(original_target)
                    # Skip obviously non-file references (containing spaces).
                    # Basename-only references are allowed (original_target may equal missing_base).
                    if missing_base and " " not in missing_base:
                        candidates = moved_index.get(missing_base, [])
                        # If there are no candidates under the provided root, attempt a one-time scan of the
                        # full repo root (this can be expensive, so only do it on a local miss).
                        if not candidates:
                            global_hits: list[str] = []
                            for dirpath, dirnames, filenames in os.walk(REPO_ROOT):
                                # Skip bazel output directories to reduce noise.
                                dirnames[:] = [d for d in dirnames if not d.startswith("bazel-")]
                                if missing_base in filenames:
                                    global_hits.append(os.path.join(dirpath, missing_base))
                                    # Fast exit if more than one is found (ambiguity)
                                    if len(global_hits) > 1:
                                        break
                            if len(global_hits) == 1:
                                candidates = global_hits
                            if args.verbose:
                                if not global_hits:
                                    print(
                                        f"Verbose: moved-file search found no global candidates for {missing_base} (original target {original_target})"
                                    )
                                elif len(global_hits) > 1:
                                    print(
                                        f"Verbose: moved-file search ambiguous ({len(global_hits)} matches) for {missing_base}; skipping auto-fix"
                                    )
                                else:
                                    print(
                                        f"Verbose: moved-file global search matched unique file {global_hits[0]} for {missing_base}"
                                    )
                        if len(candidates) == 1:
                            target_file_candidate = candidates[0]
                            rel_path = os.path.relpath(
                                target_file_candidate, os.path.dirname(md_file)
                            )
                            new_target = rel_path
                            for idx, line_text in enumerate(lines):
                                token = f"]({original_target})"
                                if token in line_text:
                                    lines[idx] = line_text.replace(token, f"]({new_target})", 1)
                                    modified = True
                                    fix_count += 1
                                    if args.verbose:
                                        print(
                                            f"Auto-fixed moved file in {md_file}: {original_target} -> {new_target}"
                                        )
                                    break

            if modified:
                try:
                    with open(md_file, "w", encoding="utf-8") as fh:
                        fh.writelines(lines)
                except Exception:
                    print(f"Warning: failed to write fixes to {md_file}", file=sys.stderr)

        if args.verbose:
            print(f"Auto-fix completed: {fix_count} link(s) updated")
        # Re-run lint to refresh the issues list after the fixes
        if fix_count:
            ANCHOR_CACHE.clear()
            REFERENCE_CACHE.clear()
            issues = lint_files(files, args.workers)

    if args.json:
        print(json.dumps([i.to_dict() for i in issues], indent=2))
    else:
        for issue in issues:
            print(f"{issue.file}:{issue.line}: {issue.message} [{issue.target}]")

    if issues:
        print(f"Found {len(issues)} markdown link issue(s).", file=sys.stderr)
        return 2
    else:
        if args.verbose:
            print("All links OK.")
        return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))