#!/usr/bin/env python3
"""Markdown Link Linter (MongoDB)
=================================
Checks Markdown files under `src/mongo` for broken internal links.
Link Types Validated
--------------------
1. Intra-document anchors: `[text](#some-heading)`
2. Relative file links: `[text](../../path/to/OtherFile.md#anchor)`
3. Repo-root relative paths beginning with `/` (e.g. `[feature flags](/src/mongo/db/query/README_query_feature_flags.md)`)
4. Reference-style links: `[text][label]` or `[text][]` with definitions like `[label]: url`
External (http/https) links are never fetched (no network requests are performed); they are only checked for a trivially malformed scheme (e.g. `hhttps://`) and against the GitHub repository link policy described below.
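For example, each of the following link forms is validated (targets are illustrative):
    [overview](#overview)
    [catalog design](../catalog/README.md#design)
    [feature flags](/src/mongo/db/query/README_query_feature_flags.md)
    [design spec][spec]
    [spec]: ../ARCHITECTURE.md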
GitHub Repository Link Policy
-----------------------------
Links to the MongoDB server repository (`github.com/mongodb/mongo` or private clone `github.com/10gen/mongo`) must not reference the mutable `master` branch. Allowed:
* Release/tag branches (e.g. `r6.2.0`)
* Specific commit SHAs (40 hex chars)
* Any other non-`master` branch
Unpinned `master` links are reported as issues. Auto-fix rewrites them to a repo-root relative path (`/src/...`), preserving any line fragment (e.g. `#L89`).
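For example, an unpinned link such as
    https://github.com/mongodb/mongo/blob/master/src/mongo/db/catalog/README.md#L89
is rewritten by auto-fix to the repo-root relative form
    /src/mongo/db/catalog/README.md#L89
(illustrative path; any path inside the repository is handled the same way).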
Anchor Normalization
--------------------
GitHub-style anchors derived from headings:
* Lowercased
* Punctuation stripped (most symbols except `-` and `_`)
* Spaces collapsed to single `-`
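For example (illustrative heading text):
    >>> github_anchor("Plan Cache: Overview")
    'plan-cache-overview'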
Usage
-----
Run from repository root:
python buildscripts/lint_markdown_links.py --verbose
JSON output (exit code still meaningful):
python buildscripts/lint_markdown_links.py --json > link_report.json
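Each entry in the JSON array has this shape (illustrative values):
    {
      "file": "src/mongo/example/README.md",
      "line": 42,
      "link_text": "design doc",
      "target": "missing.md",
      "message": "file does not exist: /abs/path/src/mongo/missing.md"
    }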
Auto-Fix Renamed Paths
----------------------
Auto-fix (`--auto-fix`) handles:
* Directory renames via `--rename-map old=new`
* Moved files (searches by basename across the repository)
* Broken anchors (relocates the link to the file that defines the anchor)
* Common typos and malformed schemes (e.g. `hhttps://`, `storgae` -> `storage`)
Example with rename mapping:
python buildscripts/lint_markdown_links.py --auto-fix --rename-map catalog=local_catalog --root src/mongo/db/storage --verbose
Multiple mappings:
python buildscripts/lint_markdown_links.py --auto-fix \
--rename-map catalog=local_catalog \
--rename-map query_stats=query_shape_stats
After auto-fix the script re-runs linting to verify all fixes.
Safety Characteristics
----------------------
* Only replaces the specific `](oldpath...)` occurrence.
* Skips if replacement yields identical path.
* Always review diffs before committing.
Exit Codes
----------
0 = all links OK
1 = usage error / root directory not found
2 = one or more link issues detected
Sample Output
-------------
src/mongo/example/README.md:42: file does not exist: /abs/path/src/mongo/missing.md [missing.md]
src/mongo/example/README.md:57: anchor "overview" not found in target file [other.md#overview]
Common False Positives
----------------------
* Headings generated dynamically (e.g. code-generated docs)
* Links to files produced by a build step
* Root-relative paths not starting with `/src/` (extend logic if needed)
* External links (intentionally not validated)
Performance
-----------
Parallel validation using a thread pool sized to available CPUs (capped at 32).
Suppressing Specific Links
--------------------------
Not implemented yet; a future version could support an inline HTML-comment directive to skip validation of a specific link.
Future Enhancements (Ideas)
---------------------------
* Ignore patterns via CLI or config file
* CI integration (Bazel / GitHub Actions) enforcing link health
* Levenshtein suggestions for typos
* Anchor remapping when heading text changes
CI Integration Example
----------------------
Add a step:
python buildscripts/lint_markdown_links.py --json
Fail the build if the exit code is 2.
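A minimal Python wrapper for such a step might look like this (illustrative sketch; adjust the script path to match the checkout layout):
    import json
    import subprocess
    import sys
    proc = subprocess.run(
        [sys.executable, "buildscripts/lint_markdown_links.py", "--json"],
        capture_output=True,
        text=True,
    )
    issues = json.loads(proc.stdout or "[]")
    if proc.returncode == 2:
        print(f"Markdown link lint found {len(issues)} issue(s)", file=sys.stderr)
        sys.exit(1)
    sys.exit(proc.returncode)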
Implementation Notes
--------------------
* Uses regex heuristics (no full Markdown parsing) for speed.
* Anchor generation and link fragment normalization share the same logic (`github_anchor`).
Maintained by MongoDB Engineering Tooling.
"""
from __future__ import annotations
import argparse
import concurrent.futures
import json
import os
import re
import sys
import urllib.parse
from dataclasses import dataclass
from typing import Iterable, List, Optional, Tuple
HEADING_RE = re.compile(r"^(#{1,6})\s+(.*)$")
HTML_ANCHOR_RE = re.compile(r'<a\s+name=["\']([^"\']+)["\'][^>]*>\s*(?:</a>)?', re.IGNORECASE)
LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
# Reference link definitions: [label]: url
REF_DEF_RE = re.compile(r"^\s*\[([^\]]+)\]:\s+(\S+)")
# Reference-style links: [text][label] or [text][] but NOT [[double brackets]]
# Negative lookbehind (?<!\[) and lookahead (?!\]) keep [[double brackets]] from matching.
REF_USE_RE = re.compile(r"(?<!\[)\[([^\]]+)\]\[([^\]]*)\](?!\])")
PUNCT_TO_STRIP = "!\"#$%&'()*+,./:;<=>?@[]^`{|}~"  # punctuation characters to remove
ANCHOR_CACHE: dict[str, set[str]] = {}
# Cache for reference-style link definitions: file_path -> {label: target_url}
REFERENCE_CACHE: dict[str, dict[str, str]] = {}
def _detect_repo_root(start: str | None = None) -> str:
"""Walk upwards to locate repository root (presence of WORKSPACE.bazel or .git).
Falls back to current working directory if no sentinel found.
"""
if start is None:
start = os.getcwd()
cur = os.path.abspath(start)
last = None
while cur != last:
if (
os.path.exists(os.path.join(cur, "WORKSPACE.bazel"))
or os.path.exists(os.path.join(cur, "MODULE.bazel"))
or os.path.isdir(os.path.join(cur, ".git"))
):
return cur
last = cur
cur = os.path.dirname(cur)
return os.getcwd()
REPO_ROOT = _detect_repo_root()
@dataclass
class LinkIssue:
file: str
line: int
link_text: str
target: str
message: str
def to_dict(self):
return {
"file": self.file,
"line": self.line,
"link_text": self.link_text,
"target": self.target,
"message": self.message,
}
def github_anchor(text: str) -> str:
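    """Normalize heading text into a GitHub-style anchor slug: lowercase, strip
    punctuation (keeping `-` and `_`), and collapse whitespace runs into single
    hyphens, e.g. "Query Settings & Hints" -> "query-settings-hints".
    """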
t = text.strip().lower()
# remove punctuation
t2 = "".join(ch for ch in t if ch not in PUNCT_TO_STRIP)
# spaces to hyphens
t2 = re.sub(r"\s+", "-", t2)
# collapse multiple hyphens
t2 = re.sub(r"-+", "-", t2)
return t2
def collect_headings(path: str) -> set[str]:
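    """Return the set of anchors defined in a Markdown file (cached in ANCHOR_CACHE):
    GitHub-style slugs for each heading (including blockquoted headings), raw and
    normalized names from embedded HTML <a name="..."> tags, and numbered variants
    for duplicate headings.
    """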
if path in ANCHOR_CACHE:
return ANCHOR_CACHE[path]
anchors: set[str] = set()
try:
with open(path, "r", encoding="utf-8") as f:
for line in f:
# Support blockquoted headings: strip leading '>' plus following space(s)
if line.lstrip().startswith(">"):
# Remove successive '>' prefixes (nested blockquotes) while preserving heading markers
stripped = line.lstrip()
while stripped.startswith(">"):
stripped = stripped[1:].lstrip()
candidate_line = stripped
else:
candidate_line = line
m = HEADING_RE.match(candidate_line)
if m:
heading_text = m.group(2)
# Extract any embedded HTML anchors first
for a in HTML_ANCHOR_RE.finditer(heading_text):
raw = a.group(1).strip()
if raw:
anchors.add(github_anchor(raw))
anchors.add(raw) # also allow direct reference without normalization
# Remove HTML anchor tags from heading text before normalizing (support no-space join)
cleaned = HTML_ANCHOR_RE.sub("", heading_text).strip()
# Strip inline markdown links (e.g., Classic [PlanCache](...)) to derive anchor from visible text only.
cleaned = re.sub(r"\[([^\]]+)\]\([^)]*\)", r"\1", cleaned)
if cleaned:
norm_clean = github_anchor(cleaned)
# Duplicate tracking: if an anchor already exists, add numbered variants
if norm_clean in anchors:
# Count existing numbered variants to assign next index
existing_indices = [0]
for existing in list(anchors):
if existing == norm_clean:
existing_indices.append(0)
elif re.match(rf"^{re.escape(norm_clean)}-(\d+)$", existing):
try:
existing_indices.append(int(existing.rsplit("-", 1)[1]))
except Exception:
pass
next_idx = max(existing_indices) + 1
numbered = f"{norm_clean}-{next_idx}"
anchors.add(numbered)
anchors.add(norm_clean)
# Also add normalized form of raw anchors (in case users link using normalized visible text form)
for a in HTML_ANCHOR_RE.finditer(heading_text):
raw = a.group(1).strip()
if raw:
anchors.add(github_anchor(raw))
except Exception:
pass
ANCHOR_CACHE[path] = anchors
return anchors
def collect_reference_definitions(path: str) -> dict[str, str]:
"""Parse all reference-style link definitions [label]: url from a markdown file."""
if path in REFERENCE_CACHE:
return REFERENCE_CACHE[path]
references: dict[str, str] = {}
try:
with open(path, "r", encoding="utf-8") as f:
for line in f:
m = REF_DEF_RE.match(line)
if m:
label = m.group(1).strip().lower() # case-insensitive matching
target = m.group(2).strip()
references[label] = target
except Exception:
pass
REFERENCE_CACHE[path] = references
return references
def is_http_url(url: str) -> bool:
return url.startswith("http://") or url.startswith("https://")
def find_markdown_files(root: str) -> List[str]:
files: List[str] = []
for dirpath, _, filenames in os.walk(root):
for fn in filenames:
if fn.lower().endswith(".md"):
files.append(os.path.join(dirpath, fn))
return files
def parse_links(file_path: str) -> List[Tuple[int, str, str]]:
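    """Extract candidate links from a Markdown file as (line_number, text, target) tuples.
    Fenced code blocks, blockquotes, inline code spans, and reference definitions are
    skipped. Reference-style links are emitted with a "__REF__<label>" target marker
    that validate_link later resolves against this file's definitions.
    """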
links: List[Tuple[int, str, str]] = []
try:
with open(file_path, "r", encoding="utf-8") as f:
in_fence = False
in_blockquote = False
fence_delim = None # track ``` or ~~~
for idx, raw_line in enumerate(f, start=1):
line = raw_line.rstrip("\n")
# Detect start/end of fenced code blocks. Accept ``` or ~~~ with optional language and leading whitespace.
                fence_match = re.match(r"^\s*(?P<delim>`{3,}|~{3,})(.*)$", line)
if fence_match:
full = fence_match.group("delim")
# Toggle if same delimiter starts/ends
if not in_fence:
in_fence = True
fence_delim = full
continue
else:
# Only close if same delimiter length & char
if fence_delim == full:
in_fence = False
fence_delim = None
continue
if in_fence:
continue # skip link detection inside code fences
# Blockquote handling: if line starts with '>' treat entire following wrapped paragraph as quoted until blank line
if re.match(r"^\s*>", line):
in_blockquote = True
continue
if in_blockquote:
if line.strip() == "":
in_blockquote = False
else:
continue
# Skip lines that are reference definitions themselves
if REF_DEF_RE.match(line):
continue
# Find all backtick regions to exclude from link detection
# Build a set of character positions that are inside backticks
backtick_positions = set()
in_code = False
for i, char in enumerate(line):
if char == "`":
in_code = not in_code
elif in_code:
backtick_positions.add(i)
# Helper function to check if the opening bracket of a link is inside backticks
# We only check the start position because if the [ is in code, the whole link should be skipped
def is_in_code_span(match_start):
return match_start in backtick_positions
# Track character ranges of all matched links to avoid double-processing
matched_ranges = []
def overlaps_matched_range(start, end):
"""Check if a position range overlaps with any previously matched range."""
for m_start, m_end in matched_ranges:
# Check for any overlap
if start < m_end and end > m_start:
return True
return False
# Inline links [text](url)
for m in LINK_RE.finditer(line):
if is_in_code_span(m.start()):
continue # Skip links inside backticks
text, target = m.group(1), m.group(2).strip()
links.append((idx, text, target))
matched_ranges.append((m.start(), m.end()))
# Reference-style links [text][label] or [text][]
for m in REF_USE_RE.finditer(line):
if is_in_code_span(m.start()):
continue # Skip links inside backticks
full_match = m.group(0)
text = m.group(1).strip()
# Extract label from [text][label] - if empty brackets [], use text as label
                    label_part = full_match[len(m.group(1)) + 2 :]  # skip "[text]" (unstripped length)
if label_part == "[]":
label = text # implicit reference: [text][] uses "text" as label
else:
# Explicit label: [text][label]
label = label_part.strip("[]").strip()
# Use special marker to indicate this is a reference link
links.append((idx, text, f"__REF__{label}"))
matched_ranges.append((m.start(), m.end()))
# Shortcut reference links [text] - single bracket that references a definition
# Only match if not already matched by inline or reference-style patterns
# Pattern: single bracket pair not preceded by [ and not followed by ( or [
                for m in re.finditer(r"(?<!\[)\[([^\]]+)\](?![\[(])", line):
                    if is_in_code_span(m.start()):
                        continue  # Skip links inside backticks
                    if overlaps_matched_range(m.start(), m.end()):
                        continue  # Already captured as an inline or reference-style link
                    text = m.group(1).strip()
                    # Treat bare [text] as a shortcut reference only when a matching
                    # definition exists; otherwise ordinary bracketed prose would be flagged.
                    if text.lower() in collect_reference_definitions(file_path):
                        links.append((idx, text, f"__REF__{text}"))
                        matched_ranges.append((m.start(), m.end()))
    except Exception:
        pass
    return links
def validate_link(current_file: str, line: int, text: str, target: str) -> Optional[LinkIssue]:
    """Validate a single link target; return a LinkIssue on failure or None if the link is OK."""
# Handle reference-style links [text][label]
if target.startswith("__REF__"):
label = target[7:].lower() # Extract label and normalize to lowercase
references = collect_reference_definitions(current_file)
if label not in references:
return LinkIssue(
current_file,
line,
text,
f"[{label}]",
f'reference link label "{label}" not defined in this file',
)
# Resolve the reference and validate the actual target
resolved_target = references[label]
return validate_link(current_file, line, text, resolved_target)
# Remove surrounding <> used sometimes in markdown
if target.startswith("<") and target.endswith(">"):
target = target[1:-1]
# Ignore empty link
if target == "":
return LinkIssue(current_file, line, text, target, "empty link target")
# Fragment-only (#anchor)
if target.startswith("#"):
anchors = collect_headings(current_file)
raw_anchor = target[1:]
# Normalize link anchor the same way headings are normalized
norm_anchor = github_anchor(raw_anchor)
if norm_anchor not in anchors:
# Fuzzy variants: attempt to tolerate missing or extra hyphens inside multi-token anchors.
# Strategy:
# 1. If anchor has hyphens, try removing each hyphen individually (concatenation forms).
# 2. Try removing all hyphens (fully concatenated form).
# 3. If anchor has N hyphens, also try forms with one extra hyphen inserted between adjacent alphanumerics
# (covers classic-plancache -> classic-plan-cache).
# 4. If anchor has no hyphens, attempt inserting a hyphen at every internal boundary between alphanumerics.
fuzzy_match = False
variant_candidates: set[str] = set()
a = norm_anchor
# (1) remove each hyphen separately
if "-" in a:
for i, ch in enumerate(a):
if ch == "-":
variant_candidates.add(a[:i] + a[i + 1 :])
# (2) remove all hyphens
variant_candidates.add(a.replace("-", ""))
# (3) insert extra hyphen between alphanumerics where not already hyphen
for i in range(1, len(a)):
if a[i] != "-" and a[i - 1] != "-":
if a[i - 1].isalnum() and a[i].isalnum():
variant_candidates.add(a[:i] + "-" + a[i:])
else:
# (4) insert hyphen at every internal boundary
for i in range(1, len(a)):
if a[i - 1].isalnum() and a[i].isalnum():
variant_candidates.add(a[:i] + "-" + a[i:])
# Limit explosion: cap at 50 candidates
if len(variant_candidates) > 50:
variant_candidates = set(list(variant_candidates)[:50])
for cand in variant_candidates:
if cand in anchors:
fuzzy_match = True
break
if fuzzy_match:
return None # Suppress issue since a fuzzy variant matches
return LinkIssue(current_file, line, text, target, "anchor not found in this file")
return None
# Split fragment if present
file_part, frag_part = target.split("#", 1) if "#" in target else (target, None)
if is_http_url(file_part):
# Allow detection of malformed scheme 'hhttps://' but otherwise skip external validation
if file_part.startswith("hhttps://"):
return LinkIssue(
current_file, line, text, target, "malformed scheme (did you mean https:// ?)"
)
# Enforce pinned GitHub refs for mongodb/mongo and 10gen/mongo repositories.
gh_match = re.match(
r"^https://github.com/(mongodb|10gen)/mongo/(blob|tree)/([^/]+)/([^#]+)(?:#.*)?$",
target,
)
if gh_match:
owner, kind, ref, path_rest = gh_match.groups()
if ref == "master":
return LinkIssue(
current_file,
line,
text,
target,
"unpinned GitHub master reference; use tag/commit or relative path",
)
return None # Non-master GitHub link accepted
return None
# Remove query params if any
if "?" in file_part:
parsed = urllib.parse.urlparse(file_part)
file_part = parsed.path
# Normalize relative path. If path starts with '/' treat as repo-root relative.
repo_root = REPO_ROOT # resolved once; works under Bazel runfiles
if file_part.startswith("/"):
resolved_path = os.path.normpath(os.path.join(repo_root, file_part.lstrip("/")))
else:
current_dir = os.path.dirname(current_file)
resolved_path = os.path.normpath(os.path.join(current_dir, file_part))
if not os.path.exists(resolved_path):
# Try appending .md extension if the path doesn't exist
if not resolved_path.endswith(".md"):
resolved_path_with_md = resolved_path + ".md"
if os.path.exists(resolved_path_with_md):
resolved_path = resolved_path_with_md
else:
return LinkIssue(
current_file, line, text, target, f"file does not exist: {resolved_path}"
)
else:
return LinkIssue(
current_file, line, text, target, f"file does not exist: {resolved_path}"
)
if frag_part:
# If target file is NOT markdown and fragment matches a GitHub line anchor (#Lnn or #Lnn-Lmm), accept.
if not resolved_path.lower().endswith(".md") and re.match(r"^L\d+(-L\d+)?$", frag_part):
return None
anchors = (
collect_headings(resolved_path) if resolved_path.lower().endswith(".md") else set()
)
if resolved_path.lower().endswith(".md"):
norm_frag = github_anchor(frag_part)
if norm_frag not in anchors:
return LinkIssue(
current_file,
line,
text,
target,
f'anchor "{frag_part}" not found in target file',
)
else:
# Non-markdown + non line-fragment: cannot validate anchor, assume ok.
return None
return None
def lint_files(files: Iterable[str], workers: int) -> List[LinkIssue]:
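    """Validate links in every file, distributing work across a thread pool of *workers* threads."""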
issues: List[LinkIssue] = []
def process(file_path: str) -> List[LinkIssue]:
file_issues: List[LinkIssue] = []
links = parse_links(file_path)
for line, text, target in links:
issue = validate_link(file_path, line, text, target)
if issue:
file_issues.append(issue)
return file_issues
with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as exe:
futures = {exe.submit(process, f): f for f in files}
for fut in concurrent.futures.as_completed(futures):
for iss in fut.result():
issues.append(iss)
return issues
def main(argv: List[str]) -> int:
ap = argparse.ArgumentParser(description="Markdown link linter for src/mongo markdown files.")
ap.add_argument("--root", default="src/mongo", help="Root directory to scan")
ap.add_argument(
"--workers",
type=int,
default=min(32, (os.cpu_count() or 4)),
help="Parallel worker threads",
)
ap.add_argument("--json", action="store_true", help="Output machine-readable JSON")
ap.add_argument("--verbose", action="store_true", help="Verbose output")
ap.add_argument(
"--auto-fix",
action="store_true",
help="Attempt automatic fixes for simple broken links (renames)",
)
ap.add_argument(
"--rename-map",
action="append",
metavar="OLD=NEW",
help="Directory/file rename mapping, e.g. catalog=local_catalog (can be repeated)",
)
ap.add_argument(
"--search-moved",
action="store_true",
help="Search for missing file basenames under root and rewrite link if unique match found",
)
args = ap.parse_args(argv)
root = args.root
if not os.path.isdir(root):
# Try resolving relative to detected repo root
candidate = os.path.join(REPO_ROOT, root.lstrip("/"))
if os.path.isdir(candidate):
root = candidate
else:
print(
f"Error: root directory {root} not found (repo root: {REPO_ROOT})", file=sys.stderr
)
return 1
files = find_markdown_files(root)
if args.verbose:
print(f"Scanning {len(files)} markdown files under {root} ...")
issues = lint_files(files, args.workers)
# Moved-file search index (basename -> list of full paths). We walk the entire
# root tree to include non-markdown sources (e.g., .h/.cpp) since links may point to headers.
# Auto-enabled when --auto-fix is used; can also be explicitly enabled with --search-moved.
moved_index: dict[str, list[str]] = {}
if args.auto_fix or args.search_moved:
for dirpath, dirnames, filenames in os.walk(root):
# Avoid descending into very large generated output dirs if present.
# (Heuristic: skip bazel-* dirs under root scan to reduce noise.)
dirnames[:] = [d for d in dirnames if not d.startswith("bazel-")]
for fn in filenames:
if fn.startswith("."):
continue
full = os.path.join(dirpath, fn)
moved_index.setdefault(fn, []).append(full)
if args.verbose:
total_paths = sum(len(v) for v in moved_index.values())
print(
f"Built moved-file index: {len(moved_index)} unique basenames mapped to {total_paths} file(s)"
)
# Auto-fix pass (only for missing file issues with rename hints)
if args.auto_fix and issues:
rename_pairs = {}
for pair in args.rename_map or []:
if "=" in pair:
old, new = pair.split("=", 1)
rename_pairs[old.strip()] = new.strip()
if rename_pairs and args.verbose:
print(f"Auto-fix: applying rename map {rename_pairs}")
fix_count = 0
# Group issues by file for editing
issues_by_file: dict[str, List[LinkIssue]] = {}
for iss in issues:
issues_by_file.setdefault(iss.file, []).append(iss)
# Precompute anchor -> candidate files map to help relocation of anchors.
anchor_index: dict[str, list[str]] = {}
def index_file_anchors(path: str):
if not path.lower().endswith(".md"):
return
for a in collect_headings(path):
anchor_index.setdefault(a, []).append(path)
# Index only when we encounter first anchor issue to keep performance reasonable.
anchor_index_built = False
for md_file, file_issues in issues_by_file.items():
# Only attempt if file exists and we have rename hints
if not os.path.isfile(md_file):
continue
try:
# Use a distinct variable name (fh) for the file handle to avoid
# shadowing earlier loop variables (e.g., 'f' used for file paths),
# which was confusing the type checker.
with open(md_file, "r", encoding="utf-8") as fh:
lines = fh.readlines()
except Exception:
continue
# Deduplicate identical (message, target) to avoid repeated work (retain first occurrence)
seen_sig = set()
deduped: List[LinkIssue] = []
for iss in file_issues:
sig = (iss.message, iss.target)
if sig in seen_sig:
continue
seen_sig.add(sig)
deduped.append(iss)
modified = False
for iss in deduped:
# Always capture the current target early to avoid scope issues
original_target = iss.target
# 0. GitHub master link auto-fix: rewrite to repo-root relative path
if "unpinned GitHub master reference" in iss.message:
m_gh = re.match(
r"^https://github.com/(mongodb|10gen)/mongo/(blob|tree)/master/([^#]+)(?:#(.*))?$",
original_target,
)
if m_gh:
path_part = m_gh.group(
3
) # path inside repository (corrected: group 3, not 2)
frag_only = m_gh.group(4) # fragment (corrected: group 4, not 3)
# GitHub URLs point to any repo path (src/, buildscripts/, jstests/, etc)
# All must become absolute repo-root refs like /buildscripts/... not buildscripts/...
new_target = "/" + path_part
if frag_only:
new_target += "#" + frag_only # append single fragment only
for idx, line_text in enumerate(lines):
token = f"]({original_target})"
if token in line_text:
lines[idx] = line_text.replace(token, f"]({new_target})", 1)
modified = True
fix_count += 1
if args.verbose:
print(
f"Auto-fixed GitHub master link in {md_file}: {original_target} -> {new_target}"
)
break
# 1. Scheme / common typo fixes
if "malformed scheme" in iss.message and original_target.startswith("hhttps://"):
fixed_target = original_target.replace("hhttps://", "https://", 1)
for idx, line_text in enumerate(lines):
if f"]({original_target})" in line_text:
lines[idx] = line_text.replace(
f"]({original_target})", f"]({fixed_target})", 1
)
modified = True
fix_count += 1
if args.verbose:
print(
f"Auto-fixed malformed scheme in {md_file}: {original_target} -> {fixed_target}"
)
break
# 2. Common directory typo fix (storgae -> storage)
if "file does not exist:" in iss.message and "storgae" in original_target:
fixed_target = original_target.replace("storgae", "storage")
if fixed_target != original_target:
for idx, line_text in enumerate(lines):
if f"]({original_target})" in line_text:
lines[idx] = line_text.replace(
f"]({original_target})", f"]({fixed_target})", 1
)
modified = True
fix_count += 1
if args.verbose:
print(
f"Auto-fixed path typo in {md_file}: {original_target} -> {fixed_target}"
)
break
# 3. Anchor relocation: only attempt if we can extract a plausible fragment token
if ('anchor "' in iss.message and "not found in target file" in iss.message) or (
"anchor not found in this file" in iss.message
):
# Accept fragments comprised of word chars, dashes, underscores, and periods
m_anchor = re.search(r'anchor "([A-Za-z0-9_.:-]+)"', iss.message)
frag: Optional[str] = None
if m_anchor:
frag = m_anchor.group(1)
else:
# Fallback extraction ONLY from the original target if it starts with '#'
if original_target.startswith("#") and len(original_target) > 1:
frag = original_target[1:]
elif "#" in original_target:
frag = original_target.split("#", 1)[1]
# Guard against obviously wrong fragments like 'not' arising from message text
if frag and frag.lower() == "not":
frag = None
if frag:
norm_frag = github_anchor(frag)
if not anchor_index_built:
for fpath in files:
index_file_anchors(fpath)
anchor_index_built = True
candidates = anchor_index.get(norm_frag, [])
if not candidates and args.verbose:
print(
f'Verbose: no indexed candidates for anchor "{frag}" (normalized "{norm_frag}") referenced from {md_file}. Performing fallback scan...'
)
# Fallback: scan sibling and parent directories (one level up) for the anchor
search_dirs = {os.path.dirname(md_file)}
parent_dir = os.path.dirname(os.path.dirname(md_file))
if os.path.isdir(parent_dir):
search_dirs.add(parent_dir)
fallback_matches: list[str] = []
for d in list(search_dirs):
try:
for fn in os.listdir(d):
if fn.lower().endswith(".md"):
candidate_path = os.path.join(d, fn)
for a in collect_headings(candidate_path):
if a == norm_frag:
fallback_matches.append(candidate_path)
break
except Exception:
pass
if fallback_matches:
candidates = fallback_matches
if args.verbose:
print(
f'Verbose: fallback found {len(candidates)} candidate(s) for anchor "{frag}"'
)
# Global one-time fallback: scan entire root if still no candidates
if not candidates:
# Perform a global scan only once per fragment per run (simple memo via anchor_index miss)
if args.verbose:
print(
f'Verbose: performing global scan for anchor "{frag}" under root {root}'
)
try:
for gfile in files:
if gfile.lower().endswith(".md"):
if norm_frag in collect_headings(gfile):
candidates.append(gfile)
except Exception:
pass
if candidates and args.verbose:
print(
f'Verbose: global scan found {len(candidates)} candidate(s) for anchor "{frag}"'
)
if candidates:
chosen: Optional[str] = None
if len(candidates) == 1:
chosen = candidates[0]
else:
# Proximity heuristic: minimal directory distance (count of differing path segments)
base_dir = os.path.dirname(md_file)
def dir_distance(a: str, b: str) -> int:
a_parts = os.path.abspath(a).split(os.sep)
b_parts = os.path.abspath(b).split(os.sep)
# Find common prefix length
i = 0
for x, y in zip(a_parts, b_parts):
if x != y:
break
i += 1
return (len(a_parts) - i) + (len(b_parts) - i)
# Rank by distance then by path length for stability
chosen = sorted(
candidates, key=lambda p: (dir_distance(base_dir, p), len(p))
)[0]
if chosen:
rel_path = os.path.relpath(chosen, os.path.dirname(md_file))
new_target = f"{rel_path}#{frag}"
search_token = f"]({original_target})"
replaced_any = False
for idx, line_text in enumerate(lines):
if search_token in line_text:
# Replace only the first occurrence per line to avoid accidental nested replacements, but scan all lines.
lines[idx] = line_text.replace(
search_token, f"]({new_target})", 1
)
modified = True
replaced_any = True
fix_count += 1
if replaced_any and args.verbose:
if len(candidates) > 1:
print(
f"Auto-relocated anchor (closest of {len(candidates)}) in {md_file}: {original_target} -> {new_target}"
)
else:
print(
f"Auto-relocated anchor in {md_file}: {original_target} -> {new_target}"
)
# 4. Path segment rename fixes (directory renames) independent of anchor relocation
if rename_pairs and "file does not exist:" in iss.message:
path_part = original_target.split("#", 1)[0]
new_path_part = path_part
for old, new in rename_pairs.items():
pattern = re.compile(rf"(?:^|/)({re.escape(old)})(?=/|$)")
new_path_part = pattern.sub(
lambda m: m.group(0).replace(old, new), new_path_part
)
if new_path_part != path_part:
new_target = new_path_part + (
""
if "#" not in original_target
else "#" + original_target.split("#", 1)[1]
)
for idx, line_text in enumerate(lines):
if f"]({original_target})" in line_text:
lines[idx] = line_text.replace(
f"]({original_target})", f"]({new_target})", 1
)
modified = True
fix_count += 1
if args.verbose:
print(
f"Auto-fixed link in {md_file}: {original_target} -> {new_target}"
)
break
# 5. Moved file basename search (auto-enabled with --auto-fix)
if "file does not exist:" in iss.message and "#" not in original_target:
# Extract the basename of the missing file
missing_base = os.path.basename(original_target)
# Skip obviously non-file references (contain spaces or wildcard characters)
# Allow basename-only references (original_target may equal missing_base)
if missing_base and " " not in missing_base:
candidates = moved_index.get(missing_base, [])
# If no candidates under the provided root, attempt a one-time scan of the
# full repo root (this can be expensive, so only do it when we miss locally).
if not candidates:
global_hits: list[str] = []
for dirpath, dirnames, filenames in os.walk(REPO_ROOT):
# Skip bazel output directories to reduce noise.
dirnames[:] = [d for d in dirnames if not d.startswith("bazel-")]
if missing_base in filenames:
global_hits.append(os.path.join(dirpath, missing_base))
# Fast exit if >1 found (ambiguity)
if len(global_hits) > 1:
break
if len(global_hits) == 1:
candidates = global_hits
if args.verbose:
if not global_hits:
print(
f"Verbose: moved-file search found no global candidates for {missing_base} (original target {original_target})"
)
elif len(global_hits) > 1:
print(
f"Verbose: moved-file search ambiguous ({len(global_hits)} matches) for {missing_base}; skipping auto-fix"
)
else:
print(
f"Verbose: moved-file global search matched unique file {global_hits[0]} for {missing_base}"
)
if len(candidates) == 1:
target_file_candidate = candidates[0]
rel_path = os.path.relpath(
target_file_candidate, os.path.dirname(md_file)
)
new_target = rel_path
for idx, line_text in enumerate(lines):
token = f"]({original_target})"
if token in line_text:
lines[idx] = line_text.replace(token, f"]({new_target})", 1)
modified = True
fix_count += 1
if args.verbose:
print(
f"Auto-fixed moved file in {md_file}: {original_target} -> {new_target}"
)
break
if modified:
try:
with open(md_file, "w", encoding="utf-8") as fh:
fh.writelines(lines)
except Exception:
print(f"Warning: failed to write fixes to {md_file}", file=sys.stderr)
if args.verbose:
print(f"Auto-fix completed: {fix_count} link(s) updated")
# Re-run lint to update issues list after fixes
if fix_count:
ANCHOR_CACHE.clear()
REFERENCE_CACHE.clear()
issues = lint_files(files, args.workers)
if args.json:
print(json.dumps([i.to_dict() for i in issues], indent=2))
else:
for issue in issues:
print(f"{issue.file}:{issue.line}: {issue.message} [{issue.target}]")
if issues:
print(f"Found {len(issues)} markdown link issue(s).", file=sys.stderr)
return 2
else:
if args.verbose:
print("All links OK.")
return 0
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))