mirror of https://github.com/mongodb/mongo

SERVER-113600 Lint broken internal markdown links and fix them (#43704)

Co-authored-by: Pierlauro Sciarelli <pierlauro.sciarelli@mongodb.com>
GitOrigin-RevId: c684e293f08d9f5b7e151a280ca47974d6eb0bf8

This commit is contained in: parent 2e740e56a7, commit 85b80b1f13

@@ -1,5 +1,6 @@
load("@poetry//:dependencies.bzl", "dependency")
|
||||
load("@npm//:eslint/package_json.bzl", "bin")
|
||||
load("@rules_python//python:defs.bzl", "py_binary", "py_library")
|
||||
|
||||
exports_files([
|
||||
"candle_wrapper.py",
|
||||
|
|
@@ -399,3 +400,12 @@ py_binary(
        "local_rbe_container_url",
    ],
)

# Markdown link linter
py_binary(
    name = "markdown_link_linter",
    srcs = ["lint_markdown_links.py"],
    main = "lint_markdown_links.py",
    visibility = ["//visibility:public"],
    deps = [],  # 'requests' optional (external link checks skipped if absent)
)
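
With this target in place, the linter can presumably also be invoked through Bazel, e.g. `bazel run //buildscripts:markdown_link_linter -- --root=src/mongo --verbose` (label assuming this BUILD file lives under `buildscripts/`), in addition to running the script directly with Python as its docstring below describes.
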
@@ -0,0 +1,800 @@
#!/usr/bin/env python3
"""Markdown Link Linter (MongoDB)
=================================

Checks Markdown files under `src/mongo` for broken internal links.

Link Types Validated
--------------------
1. Intra-document anchors: `[text](#some-heading)`
2. Relative file links: `[text](../../path/to/OtherFile.md#anchor)`
3. Repo-root-relative paths beginning with `/src/` (e.g. `[feature flags](/src/mongo/db/query/README_query_feature_flags.md)`).

External (http/https) links are currently skipped (no network requests are performed) except for a trivial malformed-scheme check (e.g. `hhttps://`).

Anchor Normalization
--------------------
GitHub-style anchors are derived from headings:
* Lowercased
* Punctuation stripped (most symbols except `-` and `_`)
* Spaces collapsed to a single `-`
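
For illustration, a minimal sketch that mirrors these rules (the linter's actual
implementation is `github_anchor` below; its punctuation handling is slightly broader):

```python
import re

def simplified_anchor(heading: str) -> str:
    # Lowercase, drop punctuation other than "-"/"_", collapse whitespace runs to single hyphens.
    text = heading.strip().lower()
    text = "".join(ch for ch in text if ch.isalnum() or ch in " -_")
    text = re.sub(r"\s+", "-", text)
    return re.sub(r"-+", "-", text)

assert simplified_anchor("Startup Recovery") == "startup-recovery"
assert simplified_anchor("Read Concern: Available") == "read-concern-available"
```
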
Usage
-----
Run from the repository root:

    python buildscripts/lint_markdown_links.py --verbose

JSON output (exit code still meaningful):

    python buildscripts/lint_markdown_links.py --json > link_report.json

Auto-Fix Renamed Paths
----------------------
If directories or files were renamed (e.g. `catalog` -> `local_catalog`), attempt automatic fixes:

    python buildscripts/lint_markdown_links.py --auto-fix --rename-map catalog=local_catalog --root src/mongo/db/storage --verbose

Multiple mappings:

    python buildscripts/lint_markdown_links.py --auto-fix \
        --rename-map catalog=local_catalog \
        --rename-map query_stats=query_shape_stats

After auto-fixing, the script re-runs linting. Only simple missing-file cases where a path segment matches an OLD value are modified; anchors are preserved.

Safety Characteristics
----------------------
* Only replaces the specific `](oldpath...)` occurrence.
* Skips the fix if the replacement yields an identical path.
* Always review diffs before committing.

Exit Codes
----------
0 = all links OK
1 = usage error / root directory not found
2 = one or more link issues detected

Sample Output
-------------
src/mongo/example/README.md:42: file does not exist: /abs/path/src/mongo/missing.md [missing.md]
src/mongo/example/README.md:57: anchor "overview" not found in target file [other.md#overview]

Common False Positives
----------------------
* Headings generated dynamically (e.g. code-generated docs)
* Links to files produced by a build step
* Root-relative paths not starting with `/src/` (extend the logic if needed)
* External links (intentionally not validated)

Performance
-----------
Parallel validation uses a thread pool sized to the available CPUs (capped at 32).

Suppressing Specific Links
--------------------------
Not implemented yet. A potential future directive:

    <!-- linklint-ignore-next -->

Future Enhancements (Ideas)
---------------------------
* Reference-style link resolution (`[text][ref]` definitions)
* Ignore patterns via CLI or config file
* CI integration (Bazel / GitHub Actions) enforcing link health
* Levenshtein suggestions for typos
* Anchor remapping when heading text changes

CI Integration Example
----------------------
Add a CI step that runs:

    python buildscripts/lint_markdown_links.py --json

and fail the build if the exit code is 2.

Implementation Notes
--------------------
* Uses regex heuristics (no full Markdown parsing) for speed.
* Anchor generation and link-fragment normalization share the same logic (`github_anchor`).

Maintained by MongoDB Engineering Tooling.
"""

from __future__ import annotations

import argparse
import concurrent.futures
import json
import os
import re
import sys
import urllib.parse
from dataclasses import dataclass
from typing import Iterable, List, Optional, Tuple

HEADING_RE = re.compile(r"^(#{1,6})\s+(.*)$")
HTML_ANCHOR_RE = re.compile(r'<a\s+(?:name|id)=["\']([^"\']+)["\']\s*>\s*</a>?', re.IGNORECASE)
LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
# Reference-style link definitions: [text]: url
REF_DEF_RE = re.compile(r"^\s*\[([^\]]+)\]:\s+(\S+)")
REF_USE_RE = re.compile(r"\[([^\]]+)\]\[(?:(?:[^\]]+))?\]")  # simplified

# Characters removed for anchor IDs (GitHub rules simplified). We strip most punctuation except hyphen and underscore.
PUNCT_TO_STRIP = "\"'!#$%&()*+,./:;<=>?@[]^`{|}~"  # punctuation characters to remove
ANCHOR_CACHE: dict[str, set[str]] = {}


def _detect_repo_root(start: str | None = None) -> str:
    """Walk upwards to locate the repository root (presence of WORKSPACE.bazel,
    MODULE.bazel, or a .git directory).

    Falls back to the current working directory if no sentinel is found.
    """
    if start is None:
        start = os.getcwd()
    cur = os.path.abspath(start)
    last = None
    while cur != last:
        if (
            os.path.exists(os.path.join(cur, "WORKSPACE.bazel"))
            or os.path.exists(os.path.join(cur, "MODULE.bazel"))
            or os.path.isdir(os.path.join(cur, ".git"))
        ):
            return cur
        last = cur
        cur = os.path.dirname(cur)
    return os.getcwd()


REPO_ROOT = _detect_repo_root()


@dataclass
class LinkIssue:
    file: str
    line: int
    link_text: str
    target: str
    message: str

    def to_dict(self):
        return {
            "file": self.file,
            "line": self.line,
            "link_text": self.link_text,
            "target": self.target,
            "message": self.message,
        }


def github_anchor(text: str) -> str:
    t = text.strip().lower()
    # remove punctuation
    t2 = "".join(ch for ch in t if ch not in PUNCT_TO_STRIP)
    # spaces to hyphens
    t2 = re.sub(r"\s+", "-", t2)
    # collapse multiple hyphens
    t2 = re.sub(r"-+", "-", t2)
    return t2


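# Note: collect_headings() memoizes each file's anchor set in ANCHOR_CACHE, so a README
# that is linked to many times is only parsed once per run; the cache is cleared before
# re-linting when --auto-fix rewrites files.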
def collect_headings(path: str) -> set[str]:
|
||||
if path in ANCHOR_CACHE:
|
||||
return ANCHOR_CACHE[path]
|
||||
anchors: set[str] = set()
|
||||
try:
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
for line in f:
|
||||
# Support blockquoted headings: strip leading '>' plus following space(s)
|
||||
if line.lstrip().startswith(">"):
|
||||
# Remove successive '>' prefixes (nested blockquotes) while preserving heading markers
|
||||
stripped = line.lstrip()
|
||||
while stripped.startswith(">"):
|
||||
stripped = stripped[1:].lstrip()
|
||||
candidate_line = stripped
|
||||
else:
|
||||
candidate_line = line
|
||||
m = HEADING_RE.match(candidate_line)
|
||||
if m:
|
||||
heading_text = m.group(2)
|
||||
# Extract any embedded HTML anchors first
|
||||
for a in HTML_ANCHOR_RE.finditer(heading_text):
|
||||
raw = a.group(1).strip()
|
||||
if raw:
|
||||
anchors.add(github_anchor(raw))
|
||||
anchors.add(raw) # also allow direct reference without normalization
|
||||
# Remove HTML anchor tags from heading text before normalizing (support no-space join)
|
||||
cleaned = HTML_ANCHOR_RE.sub("", heading_text).strip()
|
||||
# Strip inline markdown links (e.g., Classic [PlanCache](...)) to derive anchor from visible text only.
|
||||
cleaned = re.sub(r"\[([^\]]+)\]\([^)]*\)", r"\1", cleaned)
|
||||
if cleaned:
|
||||
norm_clean = github_anchor(cleaned)
|
||||
# Duplicate tracking: if an anchor already exists, add numbered variants
|
||||
if norm_clean in anchors:
|
||||
# Count existing numbered variants to assign next index
|
||||
existing_indices = [0]
|
||||
for existing in list(anchors):
|
||||
if existing == norm_clean:
|
||||
existing_indices.append(0)
|
||||
elif re.match(rf"^{re.escape(norm_clean)}-(\d+)$", existing):
|
||||
try:
|
||||
existing_indices.append(int(existing.rsplit("-", 1)[1]))
|
||||
except Exception:
|
||||
pass
|
||||
next_idx = max(existing_indices) + 1
|
||||
numbered = f"{norm_clean}-{next_idx}"
|
||||
anchors.add(numbered)
|
||||
anchors.add(norm_clean)
|
||||
# Also add normalized form of raw anchors (in case users link using normalized visible text form)
|
||||
for a in HTML_ANCHOR_RE.finditer(heading_text):
|
||||
raw = a.group(1).strip()
|
||||
if raw:
|
||||
anchors.add(github_anchor(raw))
|
||||
except Exception:
|
||||
pass
|
||||
ANCHOR_CACHE[path] = anchors
|
||||
return anchors
|
||||
|
||||
|
||||
def is_http_url(url: str) -> bool:
|
||||
return url.startswith("http://") or url.startswith("https://")
|
||||
|
||||
|
||||
def find_markdown_files(root: str) -> List[str]:
|
||||
files: List[str] = []
|
||||
for dirpath, _, filenames in os.walk(root):
|
||||
for fn in filenames:
|
||||
if fn.lower().endswith(".md"):
|
||||
files.append(os.path.join(dirpath, fn))
|
||||
return files
|
||||
|
||||
|
||||
def parse_links(file_path: str) -> List[Tuple[int, str, str]]:
|
||||
links: List[Tuple[int, str, str]] = []
|
||||
try:
|
||||
with open(file_path, "r", encoding="utf-8") as f:
|
||||
in_fence = False
|
||||
in_blockquote = False
|
||||
fence_delim = None # track ``` or ~~~
|
||||
for idx, raw_line in enumerate(f, start=1):
|
||||
line = raw_line.rstrip("\n")
|
||||
# Detect start/end of fenced code blocks. Accept ``` or ~~~ with optional language.
|
||||
fence_match = re.match(r"^(?P<delim>`{3,}|~{3,})(.*)$", line)
|
||||
if fence_match:
|
||||
full = fence_match.group("delim")
|
||||
# Toggle if same delimiter starts/ends
|
||||
if not in_fence:
|
||||
in_fence = True
|
||||
fence_delim = full
|
||||
continue
|
||||
else:
|
||||
# Only close if same delimiter length & char
|
||||
if fence_delim == full:
|
||||
in_fence = False
|
||||
fence_delim = None
|
||||
continue
|
||||
if in_fence:
|
||||
continue # skip link detection inside code fences
|
||||
# Blockquote handling: if line starts with '>' treat entire following wrapped paragraph as quoted until blank line
|
||||
if re.match(r"^\s*>", line):
|
||||
in_blockquote = True
|
||||
continue
|
||||
if in_blockquote:
|
||||
if line.strip() == "":
|
||||
in_blockquote = False
|
||||
else:
|
||||
continue
|
||||
for m in LINK_RE.finditer(line):
|
||||
text, target = m.group(1), m.group(2).strip()
|
||||
links.append((idx, text, target))
|
||||
except Exception:
|
||||
pass
|
||||
return links
|
||||
|
||||
|
||||
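# How the pieces fit together (illustrative sketch; the path is hypothetical):
#   for line_no, text, target in parse_links("src/mongo/example/README.md"):
#       issue = validate_link("src/mongo/example/README.md", line_no, text, target)
#       if issue:
#           print(issue.to_dict())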
def validate_link(current_file: str, line: int, text: str, target: str) -> Optional[LinkIssue]:
|
||||
# Remove surrounding <> used sometimes in markdown
|
||||
if target.startswith("<") and target.endswith(">"):
|
||||
target = target[1:-1]
|
||||
|
||||
# Ignore empty link
|
||||
if target == "":
|
||||
return LinkIssue(current_file, line, text, target, "empty link target")
|
||||
|
||||
# Fragment-only (#anchor)
|
||||
if target.startswith("#"):
|
||||
anchors = collect_headings(current_file)
|
||||
raw_anchor = target[1:]
|
||||
# Normalize link anchor the same way headings are normalized
|
||||
norm_anchor = github_anchor(raw_anchor)
|
||||
if norm_anchor not in anchors:
|
||||
# Fuzzy variants: attempt to tolerate missing or extra hyphens inside multi-token anchors.
|
||||
# Strategy:
|
||||
# 1. If anchor has hyphens, try removing each hyphen individually (concatenation forms).
|
||||
# 2. Try removing all hyphens (fully concatenated form).
|
||||
# 3. If anchor has N hyphens, also try forms with one extra hyphen inserted between adjacent alphanumerics
|
||||
# (covers classic-plancache -> classic-plan-cache).
|
||||
# 4. If anchor has no hyphens, attempt inserting a hyphen at every internal boundary between alphanumerics.
|
||||
fuzzy_match = False
|
||||
variant_candidates: set[str] = set()
|
||||
a = norm_anchor
|
||||
# (1) remove each hyphen separately
|
||||
if "-" in a:
|
||||
for i, ch in enumerate(a):
|
||||
if ch == "-":
|
||||
variant_candidates.add(a[:i] + a[i + 1 :])
|
||||
# (2) remove all hyphens
|
||||
variant_candidates.add(a.replace("-", ""))
|
||||
# (3) insert extra hyphen between alphanumerics where not already hyphen
|
||||
for i in range(1, len(a)):
|
||||
if a[i] != "-" and a[i - 1] != "-":
|
||||
if a[i - 1].isalnum() and a[i].isalnum():
|
||||
variant_candidates.add(a[:i] + "-" + a[i:])
|
||||
else:
|
||||
# (4) insert hyphen at every internal boundary
|
||||
for i in range(1, len(a)):
|
||||
if a[i - 1].isalnum() and a[i].isalnum():
|
||||
variant_candidates.add(a[:i] + "-" + a[i:])
|
||||
# Limit explosion: cap at 50 candidates
|
||||
if len(variant_candidates) > 50:
|
||||
variant_candidates = set(list(variant_candidates)[:50])
|
||||
for cand in variant_candidates:
|
||||
if cand in anchors:
|
||||
fuzzy_match = True
|
||||
break
|
||||
if fuzzy_match:
|
||||
return None # Suppress issue since a fuzzy variant matches
|
||||
return LinkIssue(current_file, line, text, target, "anchor not found in this file")
|
||||
return None
|
||||
|
||||
# Split fragment if present
|
||||
file_part, frag_part = target.split("#", 1) if "#" in target else (target, None)
|
||||
|
||||
if is_http_url(file_part):
|
||||
# Allow detection of malformed scheme 'hhttps://' but otherwise skip external validation
|
||||
if file_part.startswith("hhttps://"):
|
||||
return LinkIssue(
|
||||
current_file, line, text, target, "malformed scheme (did you mean https:// ?)"
|
||||
)
|
||||
return None
|
||||
|
||||
# Remove query params if any
|
||||
if "?" in file_part:
|
||||
parsed = urllib.parse.urlparse(file_part)
|
||||
file_part = parsed.path
|
||||
|
||||
# Normalize relative path. If path starts with '/src/' treat as repo-root relative.
|
||||
repo_root = REPO_ROOT # resolved once; works under Bazel runfiles
|
||||
if file_part.startswith("/src/"):
|
||||
resolved_path = os.path.normpath(os.path.join(repo_root, file_part.lstrip("/")))
|
||||
else:
|
||||
current_dir = os.path.dirname(current_file)
|
||||
resolved_path = os.path.normpath(os.path.join(current_dir, file_part))
|
||||
|
||||
if not os.path.exists(resolved_path):
|
||||
return LinkIssue(current_file, line, text, target, f"file does not exist: {resolved_path}")
|
||||
|
||||
if frag_part:
|
||||
# If target file is NOT markdown and fragment matches a GitHub line anchor (#Lnn or #Lnn-Lmm), accept.
|
||||
if not resolved_path.lower().endswith(".md") and re.match(r"^L\d+(-L\d+)?$", frag_part):
|
||||
return None
|
||||
anchors = (
|
||||
collect_headings(resolved_path) if resolved_path.lower().endswith(".md") else set()
|
||||
)
|
||||
if resolved_path.lower().endswith(".md"):
|
||||
norm_frag = github_anchor(frag_part)
|
||||
if norm_frag not in anchors:
|
||||
return LinkIssue(
|
||||
current_file,
|
||||
line,
|
||||
text,
|
||||
target,
|
||||
f'anchor "{frag_part}" not found in target file',
|
||||
)
|
||||
else:
|
||||
# Non-markdown + non line-fragment: cannot validate anchor, assume ok.
|
||||
return None
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def lint_files(files: Iterable[str], workers: int) -> List[LinkIssue]:
|
||||
issues: List[LinkIssue] = []
|
||||
|
||||
def process(file_path: str) -> List[LinkIssue]:
|
||||
file_issues: List[LinkIssue] = []
|
||||
links = parse_links(file_path)
|
||||
for line, text, target in links:
|
||||
issue = validate_link(file_path, line, text, target)
|
||||
if issue:
|
||||
file_issues.append(issue)
|
||||
return file_issues
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as exe:
|
||||
futures = {exe.submit(process, f): f for f in files}
|
||||
for fut in concurrent.futures.as_completed(futures):
|
||||
for iss in fut.result():
|
||||
issues.append(iss)
|
||||
return issues
|
||||
|
||||
|
||||
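# Note: lint_files() fans the per-file work out across a ThreadPoolExecutor and collects
# results with as_completed(), so the order of reported issues is not deterministic.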
def main(argv: List[str]) -> int:
|
||||
ap = argparse.ArgumentParser(description="Markdown link linter for src/mongo markdown files.")
|
||||
ap.add_argument("--root", default="src/mongo", help="Root directory to scan")
|
||||
ap.add_argument(
|
||||
"--workers",
|
||||
type=int,
|
||||
default=min(32, (os.cpu_count() or 4)),
|
||||
help="Parallel worker threads",
|
||||
)
|
||||
ap.add_argument("--json", action="store_true", help="Output machine-readable JSON")
|
||||
ap.add_argument("--verbose", action="store_true", help="Verbose output")
|
||||
ap.add_argument(
|
||||
"--auto-fix",
|
||||
action="store_true",
|
||||
help="Attempt automatic fixes for simple broken links (renames)",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--rename-map",
|
||||
action="append",
|
||||
metavar="OLD=NEW",
|
||||
help="Directory/file rename mapping, e.g. catalog=local_catalog (can be repeated)",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--search-moved",
|
||||
action="store_true",
|
||||
help="Search for missing file basenames under root and rewrite link if unique match found",
|
||||
)
|
||||
args = ap.parse_args(argv)
|
||||
|
||||
root = args.root
|
||||
if not os.path.isdir(root):
|
||||
# Try resolving relative to detected repo root
|
||||
candidate = os.path.join(REPO_ROOT, root.lstrip("/"))
|
||||
if os.path.isdir(candidate):
|
||||
root = candidate
|
||||
else:
|
||||
print(
|
||||
f"Error: root directory {root} not found (repo root: {REPO_ROOT})", file=sys.stderr
|
||||
)
|
||||
return 1
|
||||
|
||||
files = find_markdown_files(root)
|
||||
if args.verbose:
|
||||
print(f"Scanning {len(files)} markdown files under {root} ...")
|
||||
|
||||
issues = lint_files(files, args.workers)
|
||||
|
||||
# Optional moved-file search index (basename -> list of full paths). We walk the entire
|
||||
# root tree to include non-markdown sources (e.g., .h/.cpp) since links may point to headers.
|
||||
moved_index: dict[str, list[str]] = {}
|
||||
if args.search_moved:
|
||||
for dirpath, dirnames, filenames in os.walk(root):
|
||||
# Avoid descending into very large generated output dirs if present.
|
||||
# (Heuristic: skip bazel-* dirs under root scan to reduce noise.)
|
||||
dirnames[:] = [d for d in dirnames if not d.startswith("bazel-")]
|
||||
for fn in filenames:
|
||||
if fn.startswith("."):
|
||||
continue
|
||||
full = os.path.join(dirpath, fn)
|
||||
moved_index.setdefault(fn, []).append(full)
|
||||
if args.verbose:
|
||||
total_paths = sum(len(v) for v in moved_index.values())
|
||||
print(
|
||||
f"Built moved-file index: {len(moved_index)} unique basenames mapped to {total_paths} file(s)"
|
||||
)
|
||||
|
||||
# Auto-fix pass (only for missing file issues with rename hints)
|
||||
if args.auto_fix and issues:
|
||||
rename_pairs = {}
|
||||
for pair in args.rename_map or []:
|
||||
if "=" in pair:
|
||||
old, new = pair.split("=", 1)
|
||||
rename_pairs[old.strip()] = new.strip()
|
||||
|
||||
if rename_pairs and args.verbose:
|
||||
print(f"Auto-fix: applying rename map {rename_pairs}")
|
||||
|
||||
fix_count = 0
|
||||
# Group issues by file for editing
|
||||
issues_by_file: dict[str, List[LinkIssue]] = {}
|
||||
for iss in issues:
|
||||
issues_by_file.setdefault(iss.file, []).append(iss)
|
||||
|
||||
# Precompute anchor -> candidate files map to help relocation of anchors.
|
||||
anchor_index: dict[str, list[str]] = {}
|
||||
|
||||
def index_file_anchors(path: str):
|
||||
if not path.lower().endswith(".md"):
|
||||
return
|
||||
for a in collect_headings(path):
|
||||
anchor_index.setdefault(a, []).append(path)
|
||||
|
||||
# Index only when we encounter first anchor issue to keep performance reasonable.
|
||||
anchor_index_built = False
|
||||
|
||||
for md_file, file_issues in issues_by_file.items():
|
||||
# Only attempt if file exists and we have rename hints
|
||||
if not os.path.isfile(md_file):
|
||||
continue
|
||||
try:
|
||||
# Use a distinct variable name (fh) for the file handle to avoid
|
||||
# shadowing earlier loop variables (e.g., 'f' used for file paths),
|
||||
# which was confusing the type checker.
|
||||
with open(md_file, "r", encoding="utf-8") as fh:
|
||||
lines = fh.readlines()
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# Deduplicate identical (message, target) to avoid repeated work (retain first occurrence)
|
||||
seen_sig = set()
|
||||
deduped: List[LinkIssue] = []
|
||||
for iss in file_issues:
|
||||
sig = (iss.message, iss.target)
|
||||
if sig in seen_sig:
|
||||
continue
|
||||
seen_sig.add(sig)
|
||||
deduped.append(iss)
|
||||
|
||||
modified = False
|
||||
for iss in deduped:
|
||||
# Always capture the current target early to avoid scope issues
|
||||
original_target = iss.target
|
||||
|
||||
# 1. Scheme / common typo fixes
|
||||
if "malformed scheme" in iss.message and original_target.startswith("hhttps://"):
|
||||
fixed_target = original_target.replace("hhttps://", "https://", 1)
|
||||
for idx, line_text in enumerate(lines):
|
||||
if f"]({original_target})" in line_text:
|
||||
lines[idx] = line_text.replace(
|
||||
f"]({original_target})", f"]({fixed_target})", 1
|
||||
)
|
||||
modified = True
|
||||
fix_count += 1
|
||||
if args.verbose:
|
||||
print(
|
||||
f"Auto-fixed malformed scheme in {md_file}: {original_target} -> {fixed_target}"
|
||||
)
|
||||
break
|
||||
|
||||
# 2. Common directory typo fix (storgae -> storage)
|
||||
if "file does not exist:" in iss.message and "storgae" in original_target:
|
||||
fixed_target = original_target.replace("storgae", "storage")
|
||||
if fixed_target != original_target:
|
||||
for idx, line_text in enumerate(lines):
|
||||
if f"]({original_target})" in line_text:
|
||||
lines[idx] = line_text.replace(
|
||||
f"]({original_target})", f"]({fixed_target})", 1
|
||||
)
|
||||
modified = True
|
||||
fix_count += 1
|
||||
if args.verbose:
|
||||
print(
|
||||
f"Auto-fixed path typo in {md_file}: {original_target} -> {fixed_target}"
|
||||
)
|
||||
break
|
||||
|
||||
# 3. Anchor relocation: only attempt if we can extract a plausible fragment token
|
||||
if ('anchor "' in iss.message and "not found in target file" in iss.message) or (
|
||||
"anchor not found in this file" in iss.message
|
||||
):
|
||||
# Accept fragments comprised of word chars, dashes, underscores, and periods
|
||||
m_anchor = re.search(r'anchor "([A-Za-z0-9_.:-]+)"', iss.message)
|
||||
frag: Optional[str] = None
|
||||
if m_anchor:
|
||||
frag = m_anchor.group(1)
|
||||
else:
|
||||
# Fallback extraction ONLY from the original target if it starts with '#'
|
||||
if original_target.startswith("#") and len(original_target) > 1:
|
||||
frag = original_target[1:]
|
||||
elif "#" in original_target:
|
||||
frag = original_target.split("#", 1)[1]
|
||||
# Guard against obviously wrong fragments like 'not' arising from message text
|
||||
if frag and frag.lower() == "not":
|
||||
frag = None
|
||||
if frag:
|
||||
norm_frag = github_anchor(frag)
|
||||
if not anchor_index_built:
|
||||
for fpath in files:
|
||||
index_file_anchors(fpath)
|
||||
anchor_index_built = True
|
||||
candidates = anchor_index.get(norm_frag, [])
|
||||
if not candidates and args.verbose:
|
||||
print(
|
||||
f'Verbose: no indexed candidates for anchor "{frag}" (normalized "{norm_frag}") referenced from {md_file}. Performing fallback scan...'
|
||||
)
|
||||
# Fallback: scan sibling and parent directories (one level up) for the anchor
|
||||
search_dirs = {os.path.dirname(md_file)}
|
||||
parent_dir = os.path.dirname(os.path.dirname(md_file))
|
||||
if os.path.isdir(parent_dir):
|
||||
search_dirs.add(parent_dir)
|
||||
fallback_matches: list[str] = []
|
||||
for d in list(search_dirs):
|
||||
try:
|
||||
for fn in os.listdir(d):
|
||||
if fn.lower().endswith(".md"):
|
||||
candidate_path = os.path.join(d, fn)
|
||||
for a in collect_headings(candidate_path):
|
||||
if a == norm_frag:
|
||||
fallback_matches.append(candidate_path)
|
||||
break
|
||||
except Exception:
|
||||
pass
|
||||
if fallback_matches:
|
||||
candidates = fallback_matches
|
||||
if args.verbose:
|
||||
print(
|
||||
f'Verbose: fallback found {len(candidates)} candidate(s) for anchor "{frag}"'
|
||||
)
|
||||
# Global one-time fallback: scan entire root if still no candidates
|
||||
if not candidates:
|
||||
# Perform a global scan only once per fragment per run (simple memo via anchor_index miss)
|
||||
if args.verbose:
|
||||
print(
|
||||
f'Verbose: performing global scan for anchor "{frag}" under root {root}'
|
||||
)
|
||||
try:
|
||||
for gfile in files:
|
||||
if gfile.lower().endswith(".md"):
|
||||
if norm_frag in collect_headings(gfile):
|
||||
candidates.append(gfile)
|
||||
except Exception:
|
||||
pass
|
||||
if candidates and args.verbose:
|
||||
print(
|
||||
f'Verbose: global scan found {len(candidates)} candidate(s) for anchor "{frag}"'
|
||||
)
|
||||
if candidates:
|
||||
chosen: Optional[str] = None
|
||||
if len(candidates) == 1:
|
||||
chosen = candidates[0]
|
||||
else:
|
||||
# Proximity heuristic: minimal directory distance (count of differing path segments)
|
||||
base_dir = os.path.dirname(md_file)
|
||||
|
||||
def dir_distance(a: str, b: str) -> int:
|
||||
a_parts = os.path.abspath(a).split(os.sep)
|
||||
b_parts = os.path.abspath(b).split(os.sep)
|
||||
# Find common prefix length
|
||||
i = 0
|
||||
for x, y in zip(a_parts, b_parts):
|
||||
if x != y:
|
||||
break
|
||||
i += 1
|
||||
return (len(a_parts) - i) + (len(b_parts) - i)
|
||||
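# e.g. dir_distance("/repo/src/mongo/db/query", "/repo/src/mongo/db/exec") == 2
# (one differing trailing segment on each side); illustrative paths only.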
|
||||
# Rank by distance then by path length for stability
|
||||
chosen = sorted(
|
||||
candidates, key=lambda p: (dir_distance(base_dir, p), len(p))
|
||||
)[0]
|
||||
if chosen:
|
||||
rel_path = os.path.relpath(chosen, os.path.dirname(md_file))
|
||||
new_target = f"{rel_path}#{frag}"
|
||||
search_token = f"]({original_target})"
|
||||
replaced_any = False
|
||||
for idx, line_text in enumerate(lines):
|
||||
if search_token in line_text:
|
||||
# Replace only the first occurrence per line to avoid accidental nested replacements, but scan all lines.
|
||||
lines[idx] = line_text.replace(
|
||||
search_token, f"]({new_target})", 1
|
||||
)
|
||||
modified = True
|
||||
replaced_any = True
|
||||
fix_count += 1
|
||||
if replaced_any and args.verbose:
|
||||
if len(candidates) > 1:
|
||||
print(
|
||||
f"Auto-relocated anchor (closest of {len(candidates)}) in {md_file}: {original_target} -> {new_target}"
|
||||
)
|
||||
else:
|
||||
print(
|
||||
f"Auto-relocated anchor in {md_file}: {original_target} -> {new_target}"
|
||||
)
|
||||
|
||||
# 4. Path segment rename fixes (directory renames) independent of anchor relocation
|
||||
if rename_pairs and "file does not exist:" in iss.message:
|
||||
path_part = original_target.split("#", 1)[0]
|
||||
new_path_part = path_part
|
||||
for old, new in rename_pairs.items():
|
||||
pattern = re.compile(rf"(?:^|/)({re.escape(old)})(?=/|$)")
|
||||
new_path_part = pattern.sub(
|
||||
lambda m: m.group(0).replace(old, new), new_path_part
|
||||
)
|
||||
if new_path_part != path_part:
|
||||
new_target = new_path_part + (
|
||||
""
|
||||
if "#" not in original_target
|
||||
else "#" + original_target.split("#", 1)[1]
|
||||
)
|
||||
for idx, line_text in enumerate(lines):
|
||||
if f"]({original_target})" in line_text:
|
||||
lines[idx] = line_text.replace(
|
||||
f"]({original_target})", f"]({new_target})", 1
|
||||
)
|
||||
modified = True
|
||||
fix_count += 1
|
||||
if args.verbose:
|
||||
print(
|
||||
f"Auto-fixed link in {md_file}: {original_target} -> {new_target}"
|
||||
)
|
||||
break
|
||||
# 5. Moved file basename search (only when enabled)
|
||||
if (
|
||||
args.search_moved
|
||||
and "file does not exist:" in iss.message
|
||||
and "#" not in original_target
|
||||
):
|
||||
# Extract the basename of the missing file
|
||||
missing_base = os.path.basename(original_target)
|
||||
# Skip obviously non-file references (contain spaces or wildcard characters)
|
||||
# Allow basename-only references (original_target may equal missing_base)
|
||||
if missing_base and " " not in missing_base:
|
||||
candidates = moved_index.get(missing_base, [])
|
||||
# If no candidates under the provided root, attempt a one-time scan of the
|
||||
# full repo root (this can be expensive, so only do it when we miss locally).
|
||||
if not candidates:
|
||||
global_hits: list[str] = []
|
||||
for dirpath, dirnames, filenames in os.walk(REPO_ROOT):
|
||||
# Skip bazel output directories to reduce noise.
|
||||
dirnames[:] = [d for d in dirnames if not d.startswith("bazel-")]
|
||||
if missing_base in filenames:
|
||||
global_hits.append(os.path.join(dirpath, missing_base))
|
||||
# Fast exit if >1 found (ambiguity)
|
||||
if len(global_hits) > 1:
|
||||
break
|
||||
if len(global_hits) == 1:
|
||||
candidates = global_hits
|
||||
if args.verbose:
|
||||
if not global_hits:
|
||||
print(
|
||||
f"Verbose: moved-file search found no global candidates for {missing_base} (original target {original_target})"
|
||||
)
|
||||
elif len(global_hits) > 1:
|
||||
print(
|
||||
f"Verbose: moved-file search ambiguous ({len(global_hits)} matches) for {missing_base}; skipping auto-fix"
|
||||
)
|
||||
else:
|
||||
print(
|
||||
f"Verbose: moved-file global search matched unique file {global_hits[0]} for {missing_base}"
|
||||
)
|
||||
if len(candidates) == 1:
|
||||
target_file_candidate = candidates[0]
|
||||
rel_path = os.path.relpath(
|
||||
target_file_candidate, os.path.dirname(md_file)
|
||||
)
|
||||
new_target = rel_path
|
||||
for idx, line_text in enumerate(lines):
|
||||
token = f"]({original_target})"
|
||||
if token in line_text:
|
||||
lines[idx] = line_text.replace(token, f"]({new_target})", 1)
|
||||
modified = True
|
||||
fix_count += 1
|
||||
if args.verbose:
|
||||
print(
|
||||
f"Auto-fixed moved file in {md_file}: {original_target} -> {new_target}"
|
||||
)
|
||||
break
|
||||
if modified:
|
||||
try:
|
||||
with open(md_file, "w", encoding="utf-8") as fh:
|
||||
fh.writelines(lines)
|
||||
except Exception:
|
||||
print(f"Warning: failed to write fixes to {md_file}", file=sys.stderr)
|
||||
|
||||
if args.verbose:
|
||||
print(f"Auto-fix completed: {fix_count} link(s) updated")
|
||||
# Re-run lint to update issues list after fixes
|
||||
if fix_count:
|
||||
ANCHOR_CACHE.clear()
|
||||
issues = lint_files(files, args.workers)
|
||||
|
||||
if args.json:
|
||||
print(json.dumps([i.to_dict() for i in issues], indent=2))
|
||||
else:
|
||||
for issue in issues:
|
||||
print(f"{issue.file}:{issue.line}: {issue.message} [{issue.target}]")
|
||||
|
||||
if issues:
|
||||
print(f"Found {len(issues)} markdown link issue(s).", file=sys.stderr)
|
||||
return 2
|
||||
else:
|
||||
if args.verbose:
|
||||
print("All links OK.")
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main(sys.argv[1:]))
@@ -773,6 +773,37 @@ tasks:
            - "merged_decls.json"
            - "--parse-only"

  - name: lint_markdown_links
    tags:
      [
        "assigned_to_jira_team_devprod_build",
        "development_critical_single_variant",
        "lint",
      ]
    commands:
      - command: timeout.update
        params:
          # 60 mins
          exec_timeout_secs: 3600
      - func: "f_expansions_write"
      - command: manifest.load
      - func: "git get project and add git tag"
      - func: "f_expansions_write"
      - func: "kill processes"
      - func: "cleanup environment"
      - func: "set up venv"
      - func: "upload pip requirements"
      - func: "get engflow creds"
      - command: subprocess.exec
        type: test
        params:
          binary: bash
          args:
            - "./src/evergreen/run_python_script.sh"
            - "buildscripts/lint_markdown_links.py"
            - "--root=src/mongo"
            - "--verbose"

  - name: bazel_coverage
    tags: ["assigned_to_jira_team_devprod_correctness", "auxiliary"]
    depends_on:
@@ -75,7 +75,7 @@ user credentials and roles. The authorization session is then used to check perm
On a server with authentication enabled, all but a small handful of commands require clients to
authenticate before performing any action. This typically occurs with a 1 to 3 round-trip
conversation using the `saslStart` and `saslContinue` commands, or through a single call to the
`authenticate` command. See [SASL](#SASL) and [X.509](#X509) below for the details of these
`authenticate` command. See [SASL](#SASL) and [X.509](../../util/net/README.md#X509) below for the details of these
exchanges.

### SASL
@ -119,7 +119,7 @@ encountered.
|
|||
|
||||
To reduce connection overhead time, clients may begin and possibly complete their authentication
|
||||
exchange as part of the
|
||||
[`CmdHello`](<(https://github.com/mongodb/mongo/blob/r4.7.0/src/mongo/db/repl/replication_info.cpp#L234)>)
|
||||
[`CmdHello`](https://github.com/mongodb/mongo/blob/r4.7.0/src/mongo/db/repl/replication_info.cpp#L234)
|
||||
exchange. In this mode, the body of the `saslStart` or `authenticate` command used for
|
||||
authentication may be embedded into the `hello` command under the field `{speculativeAuthenticate:
|
||||
$bodyOfAuthCmd}`.
@ -298,7 +298,7 @@ The only purpose of an arbiter is to participate in elections for replica set pr
|
|||
does not have a copy of the data set, including the system tables which contain user and role definitions,
and therefore cannot authenticate local users. It is possible to authenticate to an arbiter using
|
||||
external authentication methods such as cluster authentication or
|
||||
[x.509 authentication](#x509atn) and acquire a role using [x.509 authorization](#x509azn).
|
||||
x.509 authentication and acquire a role using [x.509 authorization](#x509azn).
|
||||
|
||||
It is also possible to connect to an arbiter with limited access using the
|
||||
[localhost auth bypass](#lhabp). If the localhost auth bypass is disabled using the
@ -608,8 +608,8 @@ A resource pattern is a combination of a [MatchType](action_type.idl) with a `Na
|
|||
| `kMatchNever` | _Unexpressable_ | A base type only used internally to indicate that the privilege specified by the ResourcePattern can not match any real resource |
|
||||
| `kMatchClusterResource` | `{ cluster : true }` | Commonly used with host and cluster management actions such as `ActionType::addShard`, `ActionType::setParameter`, or `ActionType::shutdown`. |
|
||||
| `kMatchAnyResource` | `{ anyResource: true }` | Matches all storage resources, even [non-normal namespaces](#normal-namespace) such as `db.system.views`. |
|
||||
| `kMatchAnyNormalResource` | `{ db: '', collection: '' }` | Matches all [normal](#normal-namespace) storage resources. Used with [builtin role](builtin_roles.cpp) `readWriteAnyDatabase`. |
|
||||
| `kMatchDatabaseName` | `{ db: 'dbname', collection: '' }` | Matches all [normal](#normal-namespace) storage resources for a specific named database. Used with [builtin role](builtin_roles.cpp) `readWrite`. |
|
||||
| `kMatchAnyNormalResource` | `{ db: '', collection: '' }` | Matches all [normal](#normal-namespace) storage resources. Used with [builtin role](builtin_roles.tpl.cpp) `readWriteAnyDatabase`. |
|
||||
| `kMatchDatabaseName` | `{ db: 'dbname', collection: '' }` | Matches all [normal](#normal-namespace) storage resources for a specific named database. Used with [builtin role](builtin_roles.tpl.cpp) `readWrite`. |
|
||||
| `kMatchCollectionName` | `{ db: '', collection: 'collname' }` | Matches all storage resources, normal or not, which have the exact collection suffix '`collname`'. For example, to provide read-only access to `*.system.js`. |
|
||||
| `kMatchExactNamespace` | `{ db: 'dbname', collection: 'collname' }` | Matches the exact namespace '`dbname`.`collname`'. |
|
||||
| `kMatchAnySystemBucketResource` | `{ db: '', system_buckets: '' }` | Matches the namespace pattern `*.system.buckets.*`. |
|
||||
@ -8,7 +8,7 @@ The classic engine makes this determination using a process called **multiplanni
|
|||
|
||||
By the nature of this strategy, we do not have a complete guarantee that the selected plan will be the most optimal. It is possible that by the nature of the data, a suboptimal plan appears optimal during the multiplanning trial period. Therefore, our goal is to pick a generally efficient plan, which is not necessarily the _most_ efficient plan.
|
||||
|
||||
The Query Optimization team is currently developing a cost-based plan ranker as an alternative to multiplanning. This initiative is documented [here](TODO: SERVER-100250).
|
||||
The Query Optimization team is currently developing a cost-based plan ranker as an alternative to multiplanning. This initiative is documented here (link TODO: SERVER-100250).
|
||||
|
||||
> ### Aside: Classic Runtime Planner for SBE
|
||||
>
@ -22,7 +22,7 @@ Note that if execution of a cached plan fails because the cached plan is less ef
|
|||
|
||||
## [`MultiPlanner`](https://github.com/mongodb/mongo/blob/12390d154c1d06b6082a03d2410ff2b3578a323e/src/mongo/db/query/classic_runtime_planner/planner_interface.h#L182)
|
||||
|
||||
If we have [more than one](#singlesolutionpassthroughplanner) `QuerySolution`, multiplanning begins in [`buildMultiPlan()`](https://github.com/mongodb/mongo/blob/12390d154c1d06b6082a03d2410ff2b3578a323e/src/mongo/db/query/get_executor.cpp#L586). This function initializes a `MultiPlanner` that manages the multiplanning process. For each `QuerySolution`, [`buildExecutableTree()`](https://github.com/mongodb/mongo/blob/12390d154c1d06b6082a03d2410ff2b3578a323e/src/mongo/db/query/classic_runtime_planner/multi_planner.cpp#L46) is called which constructs a tree of `PlanStage`s, where each stage corresponds to the tree in the `QuerySolution`. The `MultiPlanner` initiates planning within [`MultiPlanStage::pickBestPlan()`](https://github.com/mongodb/mongo/blob/6b012bcbe4610ef1e88f9f75d171faa017503713/src/mongo/db/exec/multi_plan.cpp#L246-L253).
|
||||
If we have [more than one](../classic_runtime_planner_for_sbe/README.md#singlesolutionpassthroughplanner) `QuerySolution`, multiplanning begins in [`buildMultiPlan()`](https://github.com/mongodb/mongo/blob/12390d154c1d06b6082a03d2410ff2b3578a323e/src/mongo/db/query/get_executor.cpp#L586). This function initializes a `MultiPlanner` that manages the multiplanning process. For each `QuerySolution`, [`buildExecutableTree()`](https://github.com/mongodb/mongo/blob/12390d154c1d06b6082a03d2410ff2b3578a323e/src/mongo/db/query/classic_runtime_planner/multi_planner.cpp#L46) is called which constructs a tree of `PlanStage`s, where each stage corresponds to the tree in the `QuerySolution`. The `MultiPlanner` initiates planning within [`MultiPlanStage::pickBestPlan()`](https://github.com/mongodb/mongo/blob/6b012bcbe4610ef1e88f9f75d171faa017503713/src/mongo/db/exec/multi_plan.cpp#L246-L253).
|
||||
|
||||
During multiplanning, each `QuerySolution` is evaluated by running each candidate for a trial period in a round-robin fashion. This round-robin execution successively calls `PlanStage::work()` on each query, performing one unit of work for each.
@ -194,7 +194,7 @@ flowchart TD
|
|||
|
||||
## Alternative Planners
|
||||
|
||||
Although `MultiPlanner` is our "standard" case, not all queries utilize a `MultiPlanner`. Under certain conditions, we may use a different planner that is a subclass of the abstract class [`ClassicPlannerInterface`](https://github.com/mongodb/mongo/blob/12390d154c1d06b6082a03d2410ff2b3578a323e/src/mongo/db/query/classic_runtime_planner/planner_interface.h#L59); [`CachedPlanner`](#plan-cache-consultation) is one such example. Each subclass of `ClassicPlannerInterface` overrides [`doPlan()`](https://github.com/mongodb/mongo/blob/6b012bcbe4610ef1e88f9f75d171faa017503713/src/mongo/db/query/classic_runtime_planner/planner_interface.h#L117). `MultiPlanner`'s override, for example, calls [`MultiPlanStage::pickBestPlan()`](https://github.com/mongodb/mongo/blob/6b012bcbe4610ef1e88f9f75d171faa017503713/src/mongo/db/query/classic_runtime_planner/multi_planner.cpp#L55).
|
||||
Although `MultiPlanner` is our "standard" case, not all queries utilize a `MultiPlanner`. Under certain conditions, we may use a different planner that is a subclass of the abstract class [`ClassicPlannerInterface`](https://github.com/mongodb/mongo/blob/12390d154c1d06b6082a03d2410ff2b3578a323e/src/mongo/db/query/classic_runtime_planner/planner_interface.h#L59); [`CachedPlanner`](../../../query/plan_cache/README.md) is one such example. Each subclass of `ClassicPlannerInterface` overrides [`doPlan()`](https://github.com/mongodb/mongo/blob/6b012bcbe4610ef1e88f9f75d171faa017503713/src/mongo/db/query/classic_runtime_planner/planner_interface.h#L117). `MultiPlanner`'s override, for example, calls [`MultiPlanStage::pickBestPlan()`](https://github.com/mongodb/mongo/blob/6b012bcbe4610ef1e88f9f75d171faa017503713/src/mongo/db/query/classic_runtime_planner/multi_planner.cpp#L55).
|
||||
|
||||
Each subclass is detailed below:
@ -370,7 +370,7 @@ The query plan chosen by the classic optimizer, represented as a `QuerySolution`
|
|||
```
|
||||
|
||||
In particular, it is an `IXSCAN` over the `{"major": 1}` index, followed by a `FETCH` and a filter of
|
||||
`year = 2020`. The SBE plan (generated by the [SBE stage builder](#sbe-stage-builder) with the [plan
|
||||
`year = 2020`. The SBE plan (generated by the [SBE stage builder](../docs/sbe.md#sbe-stage-builders) with the [plan
|
||||
cache](#sbe-plan-cache) disabled) for this query plan is as follows:
|
||||
|
||||
```
@ -131,7 +131,7 @@ is set (see `timeseries::isTimeseriesViewRequest`). If it is set, the shards cal
|
|||
like unsharded time-series collections. For example, for inserts, measurements will try to be inserted into an open bucket
|
||||
in the bucket catalog, then a reopened bucket, and finally a new bucket will be opened if necessary. Updates
|
||||
and deletes occur one bucket at a time, and the buckets will be unpacked if necessary. See
|
||||
[db/timeseries/README.md](../../../db/timeseries/README.md) for more details about the specific implementations
|
||||
[db/timeseries/README.md](../timeseries/README.md) for more details about the specific implementations
|
||||
of each CRUD operation.
|
||||
|
||||
## Query routing for aggregation
@ -187,7 +187,7 @@ Notes:
|
|||
sharded collection is reincarnated unsharded.
|
||||
|
||||
A formal specification of the placement versioning protocol and the protocol avoiding the data
|
||||
placement anomaly is available [here](/src/mongo/tla_plus/TxnsMoveRange).
|
||||
placement anomaly is available [here](/src/mongo/tla_plus/Sharding/TxnsMoveRange).
|
||||
|
||||
A formal specification of the protocol avoiding the collection generation and collection incarnation
|
||||
anomalies is available [here](/src/mongo/tla_plus/TxnsCollectionIncarnation).
|
||||
anomalies is available [here](/src/mongo/tla_plus/Sharding/TxnsCollectionIncarnation).
@ -47,17 +47,17 @@ You can also find below two real usage examples for each case:
|
|||
|
||||
These classes handle the following processes internally:
|
||||
|
||||
1. Fetch the routing information for the specified collection or DBPrimary shard, and pass it to the lambda function as either a [RoutingContext](../s/query/README_aggregation.md) or a `CachedDatabaseInfo` object.
|
||||
1. Fetch the routing information for the specified collection or DBPrimary shard, and pass it to the lambda function as either a [RoutingContext](./README_routing_context.md) or a `CachedDatabaseInfo` object.
|
||||
2. Detect and handle stale routing errors coming from shard responses. If the routing data is outdated, it is automatically refreshed and the operation is retried.
|
||||
3. Once the operation succeeds, the `RoutingContext` gets validated ([here](../s/query/README_routing_context.md#invariants) you'll find a more clear understanding of what's checked under a `RoutingContext` validation).
|
||||
3. Once the operation succeeds, the `RoutingContext` gets validated ([here](./README_routing_context.md#invariants) you'll find a more clear understanding of what's checked under a `RoutingContext` validation).
|
||||
|
||||
When using `CollectionRouter` or `DBPrimaryRouter`, keep the following in mind:
|
||||
|
||||
- The lambda function passed to `CollectionRouter::routeWithRoutingContext()` or `DBPrimaryRouter::route()` must use the provided [RoutingContext](../s/query/README_aggregation.md) or `CachedDatabaseInfo` objects to dispatch a shard-versioned command to the shards.
|
||||
- The lambda function passed to `CollectionRouter::routeWithRoutingContext()` or `DBPrimaryRouter::route()` must use the provided [RoutingContext](./README_routing_context.md) or `CachedDatabaseInfo` objects to dispatch a shard-versioned command to the shards.
|
||||
- Any stale routing error returned by a shard must be thrown so that it can be properly handled by the router logic.
|
||||
- During a single routing operation, it is crucial to consult only one version of the routing table.
|
||||
|
||||
For more details on routing internals, see the [Versioning Protocols](../versioning_protocol/README_versioning_protocols.md) architecture guide.
|
||||
For more details on routing internals, see the [Versioning Protocols](../../versioning_protocol/README_versioning_protocols.md) architecture guide.
|
||||
|
||||
## MultiCollectionRouter
@ -202,7 +202,7 @@ resuming the index builds when the server starts up. The persisted information i
|
|||
- The internal state of the external sorter.
|
||||
- Idents for side writes, duplicate keys, and skipped records.
|
||||
|
||||
During [startup recovery](../storgae/README.md#startup-recovery), the persisted information is used
|
||||
During [startup recovery](../storage/README.md#startup-recovery), the persisted information is used
|
||||
to reconstruct the in-memory state for the index build and resume from the phase that we left off
|
||||
in. If we fail to resume the index build for whatever reason, the index build will restart from the
|
||||
beginning.
@ -401,7 +401,7 @@ In addition `Collection` objects have shared ownership of:
|
|||
- A `RecordStore` - an interface to access and manipulate the documents in the collection as stored
|
||||
by the storage engine.
|
||||
|
||||
A writable `Collection` may only be requested in an active [WriteUnitOfWork](#WriteUnitOfWork). The
|
||||
A writable `Collection` may only be requested in an active [WriteUnitOfWork](../storage/README.md#WriteUnitOfWork). The
|
||||
new `Collection` instance is installed in the catalog when the storage transaction commits as the
|
||||
first `onCommit` [Changes](../storage/README.md#changes) that run. This means that it is not allowed
|
||||
to perform any modification to catalog, collection or index instances in `onCommit` handlers. Such
@ -527,7 +527,7 @@ The [`accumulate()`](https://github.com/mongodb/mongo/blob/28df8e56046e44f597767
|
|||
|
||||
Different executors will provide overrides of the functions in the `PlanExplainer` interface, and will provide its own stats in the `PlanSummaryStats` container:
|
||||
|
||||
- **Classic executor**: Uses [`PlanExplainerImpl`](https://github.com/mongodb/mongo/blob/11b6fc54aaeddbb6dd85d2a808827f8048f366a1/src/mongo/db/query/plan_explainer_impl.h#L59). All the information required to generate `explain` output in various formats is stored in the execution tree. Starting from the root stage of the execution tree, plan summary stats are gathered by traversing through the rest of the tree. The `MultiPlanStage` is skipped, and stats are extracted from its children. Note that if [subplanning](../exec/runtime_planners/classic_runtime_planner/README.md#subplanner) was triggered, it doesn't include information about rejected plans.
|
||||
- **Classic executor**: Uses [`PlanExplainerImpl`](https://github.com/mongodb/mongo/blob/11b6fc54aaeddbb6dd85d2a808827f8048f366a1/src/mongo/db/query/plan_explainer_impl.h#L59). All the information required to generate `explain` output in various formats is stored in the execution tree. Starting from the root stage of the execution tree, plan summary stats are gathered by traversing through the rest of the tree. The `MultiPlanStage` is skipped, and stats are extracted from its children. Note that if [subplanning](../exec/runtime_planners/classic_runtime_planner_for_sbe/README.md#subplanner) was triggered, it doesn't include information about rejected plans.
|
||||
- **Express executor**: Uses [`PlanExplainerExpress`](https://github.com/mongodb/mongo/blob/11b6fc54aaeddbb6dd85d2a808827f8048f366a1/src/mongo/db/query/plan_explainer_express.h#L193). Since we don't build a plan tree for express queries, this doesn't include stage information that's typically included in other `PlanExplainer`s, such as whether shard filtering was required and what index bounds were used. It will, however, include the chosen index.
|
||||
|
||||
> ### Aside: Express Executor
@ -42,7 +42,7 @@ Each collection has its own instance of the Classic plan cache. The plan cache o
|
|||
|
||||
### [`PlanCacheKey`](https://github.com/mongodb/mongo/blob/0765809bf08f0c55e37ab6d7ef496568b662cc33/src/mongo/db/query/plan_cache/classic_plan_cache.h#L54)
|
||||
|
||||
A `PlanCacheKey` is a hash value that encodes the [query shape](#aside-query-shapes).
|
||||
A `PlanCacheKey` is a hash value that encodes the [query shape](../query_shape/README.md).
|
||||
|
||||
[`encodeClassic()`](https://github.com/mongodb/mongo/blob/0765809bf08f0c55e37ab6d7ef496568b662cc33/src/mongo/db/query/canonical_query_encoder.cpp#L1287) is used to convert a `CanonicalQuery` into a hexadecimal string representation of the query shape: the `PlanCacheKey`.
@ -199,7 +199,7 @@ There is a long-standing request ([SERVER-13341](https://jira.mongodb.org/browse
|
|||
|
||||
## Subplanning
|
||||
|
||||
Rooted `$or` queries (queries that include a `$or` at the top level) interact differently with the [plan cache](https://github.com/mongodb/mongo/blob/17f71567688c266de1f9a4cfc20ef6a42570ba03/src/mongo/db/exec/subplan.cpp#L188-L203). For an introduction to subplanning, refer to [Classic Runtime Planning](../../exec/runtime_planners/classic_runtime_planner/README.md#subplanner).
|
||||
Rooted `$or` queries (queries that include a `$or` at the top level) interact differently with the [plan cache](https://github.com/mongodb/mongo/blob/17f71567688c266de1f9a4cfc20ef6a42570ba03/src/mongo/db/exec/subplan.cpp#L188-L203). For an introduction to subplanning, refer to [Classic Runtime Planning](../../exec/runtime_planners/classic_runtime_planner_for_sbe/README.md#subplanner).
|
||||
|
||||
Rooted `$or` queries interact with the plan cache on a [_per-clause basis_](https://github.com/mongodb/mongo/blob/17f71567688c266de1f9a4cfc20ef6a42570ba03/src/mongo/db/exec/subplan.cpp#L247-L249); each branch of the `$or` uses the plan cache separately.
@ -2,7 +2,7 @@
|
|||
|
||||
## Overview
|
||||
|
||||
After a query is [canonicalized](../README_logical_models.md#canonicalquery) and optimized through [heuristic rewrites](../../matcher/README.md), the query planner generates multiple candidate plans, exploring various combinations of available indexes to optimize data access. The resulting physical plans, represented as a vector of `QuerySolution`s, are passed on to either the [multiplanner](../../exec/runtime_planners/classic_runtime_planner/README.md) or the [cost-based ranker](TODO SERVER-100250) to determine an efficient winning plan.
|
||||
After a query is [canonicalized](../README_logical_models.md#canonicalquery) and optimized through [heuristic rewrites](../../matcher/README.md), the query planner generates multiple candidate plans, exploring various combinations of available indexes to optimize data access. The resulting physical plans, represented as a vector of `QuerySolution`s, are passed on to either the [multiplanner](../../exec/runtime_planners/classic_runtime_planner/README.md) or the cost-based ranker (link TODO SERVER-100250) to determine an efficient winning plan.
|
||||
|
||||
The entrypoint to query planning is [`QueryPlanner::plan()`](https://github.com/mongodb/mongo/blob/3b45ca6c10c2a964ab7d606d4f4b04fc3d493bcc/src/mongo/db/query/query_planner.cpp#L938), which is invoked during the [process](https://github.com/mongodb/mongo/blob/e16bc2248a3410167e39d09bb9bc29a96f026ead/src/mongo/db/query/get_executor.cpp#L521) of constructing a plan executor for a query. Given a `CanonicalQuery` and a list of available indices and other data in [`QueryPlannerParams`](https://github.com/mongodb/mongo/blob/e16bc2248a3410167e39d09bb9bc29a96f026ead/src/mongo/db/query/query_planner_params.h#L115), the function returns a list of possible query solutions. Broadly, planning involves two main phases:
@ -50,7 +50,7 @@ size will be treated separately from the example which does not specify a batch
|
|||
#### Engineering Considerations
|
||||
|
||||
The dimensions considered will depend on the command, but can generally be found in the
|
||||
[`KeyGenerator`](key_generator.h) interface, which will generate the query stats store keys by which
|
||||
[`Key`](key.h) interface, which will generate the query stats store keys by which
|
||||
we accumulate statistics. As one example, you can find the
|
||||
[`FindKey`](find_key.h) which will include all the things tracked in the
|
||||
`FindCmdQueryStatsStoreKeyComponents` (including `batchSize` shown in this example).
@@ -296,7 +296,7 @@ The `OplogWriter` runs in an endless loop doing the following:
|
|||
|
||||
1. Get a batch from the writer batcher, which is encapsulated in the [`OplogWriterBatcher`](https://github.com/mongodb/mongo/blob/r8.0.1/src/mongo/db/repl/oplog_writer_batcher.cpp#L60).
|
||||
2. Write the batch of oplog entries into the oplog.
|
||||
3. Update [**oplog visibility**](../catalog/README.md#oplog-visibility) by notifying the storage
|
||||
3. Update [**oplog visibility**](../storage/README.md#oplog-visibility) by notifying the storage
|
||||
engine of the new oplog entries.
|
||||
4. Advance the node's `lastWritten` optime to the last optime in the batch.
@@ -690,16 +690,14 @@ section.

**Available** read concern behaves identically to local read concern in most cases. The exception is
reads for sharded collections from secondary shard nodes. Local read concern will wait to refresh
-the routing table cache when the node realizes its
-[metadata is stale](../s/README.md#when-the-routing-table-cache-will-refresh), which requires
+the routing table cache when the node realizes its metadata is stale, which requires
contacting the shard's primary or config servers before being able to serve the read. Available read
concern does not provide consistency guarantees because it does not wait for routing table cache
refreshes. As a result, available read concern potentially serves reads faster and is more tolerant
to network partitions than any other read concern, since the node does not need to communicate with
another node in the cluster to serve the read. However, this also means that if the node's metadata
-was stale, available read concern could potentially return
-[orphan documents](../s/README.md#orphan-filtering) or even a stale view of a chunk that has been
-moved a long time ago and modified on another shard.
+was stale, available read concern could potentially return orphan documents or even a stale view of
+a chunk that has been moved a long time ago and modified on another shard.

Available read concern is not allowed to be used with causally consistent sessions or transactions.

@@ -1029,7 +1027,7 @@ The prepare state _must_ endure any state transition or failover, so they must b
reconstructed in all situations. If the in-memory state of a prepared transaction is lost, it can be
reconstructed using the information in the prepare oplog entry(s).

-[Startup recovery](#startup-recovery), [rollback](#rollback), and [initial sync](#initial-sync) all
+[Startup recovery](#startup-recovery), [rollback](../storage/wiredtiger/README.md#rollback-to-stable), and [initial sync](#initial-sync) all
use the same algorithm to reconstruct prepared transactions. In all situations, the node will go
through a period of applying oplog entries to get the data caught up with the rest of the replica
set.

@@ -1696,7 +1694,7 @@ and transition to the `SECONDARY` state. This transition must succeed if we ever

Initial sync is the process that we use to add a new node to a replica set. Initial sync is
initiated by the `ReplicationCoordinator` and done in a registered subclass of
-[**`InitialSyncerInterface`**](./initial_syncer_interface.h). The method used is specified by the server parameter `initialSyncMethod`.
+[**`InitialSyncerInterface`**](initial_sync/initial_syncer_interface.h). The method used is specified by the server parameter `initialSyncMethod`.

There are currently two initial sync methods implemented, [**Logical Initial Sync**](#logical-initial-sync) (the default)
and File Copy Based Initial Sync, which is available only in MongoDB Enterprise Server.

@@ -1730,7 +1728,7 @@ disk.
# Logical Initial Sync

Logical initial sync is the default initial sync method, implemented by
-[**`InitialSyncer`**](./initial_syncer.h).
+[**`InitialSyncer`**](initial_sync/initial_syncer.h).

At a high level, there are two phases to initial sync: the [**data clone phase**](#data-clone-phase)
and the [**oplog application phase**](#oplog-application-phase). During the data clone phase, the

@@ -2239,7 +2237,7 @@ replication consistently [maintains that](https://github.com/mongodb/mongo/blob/

**`currentCommittedSnapshot`**: An optime maintained in `ReplicationCoordinator` that is used to
serve majority reads and is always guaranteed to be <= `lastCommittedOpTime`. This
-is currently [set to the stable optime](hhttps://github.com/mongodb/mongo/blob/c8ebdc8b2ef2379bba978ab688e2eda1ac702b15/src/mongo/db/repl/replication_coordinator_impl.cpp#L5085).
+is currently [set to the stable optime](https://github.com/mongodb/mongo/blob/c8ebdc8b2ef2379bba978ab688e2eda1ac702b15/src/mongo/db/repl/replication_coordinator_impl.cpp#L5085).
Since it is reset every time we recalculate the stable optime, it will also be up to date.

**`initialDataTimestamp`**: A timestamp used to indicate the timestamp at which history “begins”.

@@ -473,7 +473,7 @@ that matches the read concern of the command. If the command uses `"snapshot"` r
the historical routing table at the selected read timestamp. If the command uses any other read concern,
it must use the latest cached routing table.

-The [routing table cache](#the-routing-table-cache) provides an interface for obtaining the routing table
+The routing table cache provides an interface for obtaining the routing table
at a particular timestamp and collection version, namely the `ChunkManager`. The `ChunkManager` has an
optional clusterTime associated with it and a `RoutingTableHistory` that contains historical routing
information for all chunks in the collection. That information is stored in an ordered map from the max

@@ -41,7 +41,7 @@ MongoDB uses multi-granular intent locking; see the [Concurrency FAQ][]. In all
ensure that operations to meta-data, such as creation and deletion of record stores, are serialized
with respect to other accesses.

-See the [Catalog](../local_catalog/README) and [Concurrency Control](../local_catalog/lock_manager/README.md) for more information.
+See the [Catalog](../local_catalog/README.md) and [Concurrency Control](../local_catalog/lock_manager/README.md) for more information.

## Transactions

@@ -34,16 +34,16 @@ the data files.
To avoid taking unnecessary checkpoints on an idle server, WiredTiger will only take checkpoints for
the following scenarios:

-- When the [stable timestamp](../../repl/README.md#replication-timestamp-glossary) is greater than or
-  equal to the [initial data timestamp](../../repl/README.md#replication-timestamp-glossary), we take a
+- When the [stable timestamp](/src/mongo/db/repl/README.md#replication-timestamp-glossary) is greater than or
+  equal to the [initial data timestamp](/src/mongo/db/repl/README.md#replication-timestamp-glossary), we take a
  stable checkpoint, which is a durable view of the data at a particular timestamp. This is for
  steady-state replication.
-- The [initial data timestamp](../../repl/README.md#replication-timestamp-glossary) is not set, so we
+- The [initial data timestamp](/src/mongo/db/repl/README.md#replication-timestamp-glossary) is not set, so we
  must take a full checkpoint. This is when there is no consistent view of the data, such as during
  initial sync.

Not only does checkpointing provide us with durability for the database, but it also enables us to
-take [backups of the data](../../storage/README.md#file-system-backups).
+take [backups of the data](/src/mongo/db/storage/README.md#file-system-backups).

When WiredTiger takes a checkpoint, it uses the
[`stable_timestamp`](https://github.com/mongodb/mongo/blob/87de9a0cb1/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp#L2011 "Github") (effectively a `read_timestamp`) for what data should be persisted in the checkpoint.

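The two checkpoint scenarios above reduce to a small decision rule. A rough Python sketch, with invented names, purely to illustrate the policy this hunk describes:

```python
# Illustration only: the checkpoint policy described above, with hypothetical names.
def checkpoint_kind(stable_ts, initial_data_ts):
    if initial_data_ts is None:
        return "full"    # no consistent view of the data yet (e.g. during initial sync)
    if stable_ts >= initial_data_ts:
        return "stable"  # durable view of the data at the stable timestamp
    return None          # otherwise no checkpoint is taken on an idle server
```
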
@@ -99,7 +99,7 @@ threads are restarted, and two-phase index builds are resumed.
See [here](https://source.wiredtiger.com/develop/arch-rts.html) for WiredTiger's architecture guide
on rollback-to-stable.

-See [here](../../repl/README.md#rollback-recover-to-a-timestamp-rtt) for more information on what
+See [here](/src/mongo/db/repl/README.md#rollback-recover-to-a-timestamp-rtt) for more information on what
happens in the replication layer during rollback-to-stable.

## Repair

@@ -120,7 +120,7 @@ MongoDB repair attempts to address the following forms of corruption:
- Missing WiredTiger data files
  - Includes all collections, `_mdb_catalog`, and `sizeStorer`
- Index inconsistencies
-  - Validate [repair mode](../../validate/README.md#repair-mode) attempts to fix index inconsistencies to avoid a full index
+  - Validate [repair mode](/src/mongo/db/validate/README.md#repair-mode) attempts to fix index inconsistencies to avoid a full index
    rebuild.
  - Indexes are rebuilt on collections after they have been salvaged or if they fail validation and
    validate repair mode is unable to fix all errors.

@@ -143,7 +143,7 @@ MongoDB repair attempts to address the following forms of corruption:
2. Initialize the StorageEngine and [salvage the `_mdb_catalog` table, if
   needed](https://github.com/mongodb/mongo/blob/r4.5.0/src/mongo/db/storage/storage_engine_impl.cpp#L95).
3. Recover orphaned collections.
-   - If an [ident](#glossary) is known to WiredTiger but is not present in the `_mdb_catalog`,
+   - If an [ident](/src/mongo/db/storage/README.md#glossary) is known to WiredTiger but is not present in the `_mdb_catalog`,
     [create a new
     collection](https://github.com/mongodb/mongo/blob/r4.5.0/src/mongo/db/storage/storage_engine_impl.cpp#L145-L189)
     with the prefix `local.orphan.<ident-name>` that references this ident.

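Step 3 above amounts to reconciling two catalogs. A tiny Python illustration of that reconciliation, using made-up data structures (the real logic lives in the `storage_engine_impl.cpp` code linked above):

```python
# Illustration of orphaned-collection recovery: any ident WiredTiger knows about
# but the _mdb_catalog does not gets a placeholder local.orphan.<ident> entry.
def recover_orphaned_idents(wiredtiger_idents, mdb_catalog):
    for ident in wiredtiger_idents:
        if ident not in mdb_catalog:
            mdb_catalog[ident] = f"local.orphan.{ident}"
    return mdb_catalog
```
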
@@ -175,7 +175,7 @@ MongoDB repair attempts to address the following forms of corruption:
   rebuilt](https://github.com/mongodb/mongo/blob/r4.5.0/src/mongo/db/repair_database.cpp#L134-L149)
   for that collection.
5. Validate collection and index consistency
-   - [Collection validation](#collection-validation) checks for consistency between the collection
+   - [Collection validation](/src/mongo/db/validate/README.md) checks for consistency between the collection
     and indexes. Validate repair mode attempts to fix any inconsistencies it finds.
6. Rebuild indexes
   - If a collection's data has been salvaged or any index inconsistencies are not repairable by

@@ -208,7 +208,7 @@ The oplog collection can be truncated both at the front end (most recent entries
be deleted when new writes increase the collection size past the cap. MongoDB using the WiredTiger
storage engine with `--replSet` handles oplog collection deletion specially via
OplogTruncateMarkers, an oplog specific implementation of the
-[CollectionTruncateMarkers](../README.md#collectionTruncateMarkers) mechanism, ignoring the generic capped
+[CollectionTruncateMarkers](/src/mongo/db/storage/README.md#collectionTruncateMarkers) mechanism, ignoring the generic capped
collection deletion mechanism. The front of the oplog may be truncated back to a particular
timestamp during replication startup recovery or replication rollback.

@@ -1,6 +1,6 @@
# Time-Series Collections

-MongoDB supports a new collection type for storing time-series data with the [timeseries](https://github.com/mongodb/mongo/blob/r8.2.1/src/mongo/db/commands/create.idl#L152-L156)
+MongoDB supports a new collection type for storing time-series data with the [timeseries](../local_catalog/ddl/create.idl)
collection option. A time-series collection presents a simple interface for inserting and querying
measurements while organizing the actual data in buckets.

@@ -32,9 +32,9 @@ Additionally, users can specify that they'd like to perform a `full` validation.
## Types of Validation

- Verifies the collection's durable catalog entry and in-memory state match.
-- Indexes are marked as [multikey](#multikey-indexes) correctly.
-- Index [multikey](#multikey-indexes) paths cover all of the records in the `RecordStore`.
-- Indexes are not missing [multikey](#multikey-indexes) metadata information.
+- Indexes are marked as [multikey](../local_catalog/README.md#multikey-indexes) correctly.
+- Index [multikey](../local_catalog/README.md#multikey-indexes) paths cover all of the records in the `RecordStore`.
+- Indexes are not missing [multikey](../local_catalog/README.md#multikey-indexes) metadata information.
- Index entries are in increasing order if the sort order is ascending.
- Index entries are in decreasing order if the sort order is descending.
- Unique indexes do not have duplicate keys.

@@ -12,7 +12,7 @@ Other operations are to be targeted to whichever shard(s) own data (or relevant

When a shard receives the request, it will check this token to make sure that it matches the shard's local information. If it matches, then the request will proceed. If the version does not match, the shard will throw [an exception](https://github.com/mongodb/mongo/blob/r6.0.0/src/mongo/s/stale_exception.h).

-When the router receives this exception, it knows that the routing information must have changed, and so it will [perform a refresh](#routing-information-refreshes) to get more recent information before sending the request again.
+When the router receives this exception, it knows that the routing information must have changed, and so it will [perform a refresh](#routing-aka-placement-information-refreshes) to get more recent information before sending the request again.

The following diagram depicts a simple example of the shard versioning protocol in action. It assumes that the router is a shard server primary, thus the refresh is simply fetching newer information from the config server.

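The protocol in this hunk is a check-throw-refresh-retry loop. A hedged Python sketch with hypothetical names (the real exception type is the stale-config family linked above):

```python
# Illustrative router-side retry loop for the shard versioning protocol described above.
class StaleRoutingInfo(Exception):
    pass

def run_versioned_request(router, request):
    while True:
        shard, version = router.route(request.namespace)  # attach the cached placement version
        try:
            return shard.run(request, expected_version=version)
        except StaleRoutingInfo:
            router.refresh_routing_info(request.namespace)  # fetch newer info, then retry
```
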
@@ -37,4 +37,4 @@ CRUD operations with a read concern weaker than snapshot roughly match the "read

- The [MoveRange TLA+ specification](https://github.com/mongodb/mongo/blob/d40899bd45db62def8941cc6ba65c44a2cbbb83a/src/mongo/tla_plus/MoveRange/MoveRange.tla), which models the distributed query protocol and verifies the safety and liveness properties described in this readme.
- The [Sharded Transactions and DDLs readme](https://github.com/mongodb/mongo/blob/master/src/mongo/db/s/README_transactions_and_ddl.md), covering aspects pertaining to CRUD operations in distributed transactions.
- The [RoutingContext readme](/src/mongo/s/query/README_routing_context.md) for information about routing operations safely with the `RoutingContext`
+- The [RoutingContext readme](../../db/global_catalog/router_role_api/README_routing_context.md) for information about routing operations safely with the `RoutingContext`

@@ -15,7 +15,7 @@ Stage implementations specify their splitting requirements by implementing the [
## Routing of aggregations that involve multiple collections

-**Note**: For information about routing aggregations and other query operations safely with the `RoutingContext`, refer to the [RoutingContext README](/src/mongo/s/query/README_routing_context.md).
+**Note**: For information about routing aggregations and other query operations safely with the `RoutingContext`, refer to the [RoutingContext README](../../db/global_catalog/router_role_api/README_routing_context.md).

Some aggregation stages reference a second (or more) collections. Some examples of this are the following stages: $lookup, $graphLookup, $out, $merge, $unionWith.