SERVER-114403: Markdown linter handles reference-style links (#44399)

GitOrigin-RevId: 24906c21af2a8a0412b4b052ab0eca5cfaa696be
2025-12-01 09:44:33 -05:00 · 2025-12-01 09:44:33 -05:00 · 87b363ce7d
parent 21759c25a0
commit 87b363ce7d
10 changed files with 159 additions and 39 deletions
--- a/buildscripts/lint_markdown_links.py
+++ b/buildscripts/lint_markdown_links.py
@ -8,7 +8,8 @@ Link Types Validated
 --------------------
 1. Intra-document anchors: `[text](#some-heading)`
 2. Relative file links: `[text](../../path/to/OtherFile.md#anchor)`
-3. Repo-root relative paths beginning with `/src/` (e.g. `[feature flags](/src/mongo/db/query/README_query_feature_flags.md)`).
+3. Repo-root relative paths beginning with `/` (e.g. `[feature flags](/src/mongo/db/query/README_query_feature_flags.md)`)
+4. Reference-style links: `[text][label]` or `[text][]` with definitions like `[label]: url`

 External (http/https) links are currently skipped (no network requests performed) except for a trivial malformed scheme check (e.g. `hhttps://`).

@ -126,11 +127,16 @@ HTML_ANCHOR_RE = re.compile(r'<a\s+(?:name|id)=["\']([^"\']+)["\']\s*>\s*</a>?',
 LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
 # Inline link references: [text]: url
 REF_DEF_RE = re.compile(r"^\s*\[([^\]]+)\]:\s+(\S+)")
-REF_USE_RE = re.compile(r"\[([^\]]+)\]\[(?:(?:[^\]]+))?\]")  # simplified
+# Reference-style links: [text][label] or [text][] but NOT [[double brackets]]
+# Negative lookbehind (?<!\[) ensures first [ is not preceded by [
+# Negative lookahead (?!\]) rejects a ] directly after the closing ] of the text (the \[ that must follow already implies this)
+REF_USE_RE = re.compile(r"(?<!\[)\[([^\]]+)\](?!\])\[(?:(?:[^\]]+))?\]")

 # Characters removed for anchor IDs (GitHub rules simplified). We strip most punctuation except hyphen and underscore.
 PUNCT_TO_STRIP = "\"'!#$%&()*+,./:;<=>?@[]^`{|}~"  # punctuation characters to remove
 ANCHOR_CACHE: dict[str, set[str]] = {}
+# Cache for reference-style link definitions: file_path -> {label: target_url}
+REFERENCE_CACHE: dict[str, dict[str, str]] = {}


 def _detect_repo_root(start: str | None = None) -> str:
@ -244,6 +250,25 @@ def collect_headings(path: str) -> set[str]:
    return anchors


+def collect_reference_definitions(path: str) -> dict[str, str]:
+    """Parse all reference-style link definitions [label]: url from a markdown file.
+
+    Labels are stripped and lower-cased so that lookups are case-insensitive.
+    When the same label is defined more than once, the first definition wins,
+    which is how CommonMark (and therefore GitHub) resolves duplicates.
+    Results are memoized in REFERENCE_CACHE, keyed by ``path``.
+
+    Args:
+        path: Path of the markdown file to scan.
+
+    Returns:
+        Mapping of normalized label to target URL. If the file cannot be read,
+        whatever was collected before the failure (possibly nothing) is
+        returned and cached.
+    """
+    if path in REFERENCE_CACHE:
+        return REFERENCE_CACHE[path]
+    references: dict[str, str] = {}
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            for line in f:
+                m = REF_DEF_RE.match(line)
+                if m:
+                    label = m.group(1).strip().lower()  # case-insensitive matching
+                    # Keep the first definition; later duplicates are ignored.
+                    references.setdefault(label, m.group(2).strip())
+    except (OSError, ValueError):
+        # Best effort: a missing/unreadable file, undecodable bytes
+        # (UnicodeDecodeError is a ValueError) or an invalid path simply
+        # yields the definitions gathered so far.
+        pass
+    REFERENCE_CACHE[path] = references
+    return references
+
+
 def is_http_url(url: str) -> bool:
    return url.startswith("http://") or url.startswith("https://")

@ -266,8 +291,8 @@ def parse_links(file_path: str) -> List[Tuple[int, str, str]]:
            fence_delim = None  # track ``` or ~~~
            for idx, raw_line in enumerate(f, start=1):
                line = raw_line.rstrip("\n")
-                # Detect start/end of fenced code blocks. Accept ``` or ~~~ with optional language.
-                fence_match = re.match(r"^(?P<delim>`{3,}|~{3,})(.*)$", line)
+                # Detect start/end of fenced code blocks. Accept ``` or ~~~ with optional language and leading whitespace.
+                fence_match = re.match(r"^\s*(?P<delim>`{3,}|~{3,})(.*)$", line)
                if fence_match:
                    full = fence_match.group("delim")
                    # Toggle if same delimiter starts/ends
@ -292,15 +317,102 @@ def parse_links(file_path: str) -> List[Tuple[int, str, str]]:
                        in_blockquote = False
                    else:
                        continue
+                # Skip lines that are reference definitions themselves
+                if REF_DEF_RE.match(line):
+                    continue
+
+                # Find all backtick regions to exclude from link detection
+                # Build a set of character positions that are inside backticks
+                backtick_positions = set()
+                in_code = False
+                for i, char in enumerate(line):
+                    if char == "`":
+                        in_code = not in_code
+                    elif in_code:
+                        backtick_positions.add(i)
+
+                # Helper function to check if the opening bracket of a link is inside backticks
+                # We only check the start position because if the [ is in code, the whole link should be skipped
+                def is_in_code_span(match_start):
+                    return match_start in backtick_positions
+
+                # Track character ranges of all matched links to avoid double-processing
+                matched_ranges = []
+
+                def overlaps_matched_range(start, end):
+                    """Check if a position range overlaps with any previously matched range."""
+                    for m_start, m_end in matched_ranges:
+                        # Check for any overlap
+                        if start < m_end and end > m_start:
+                            return True
+                    return False
+
+                # Inline links [text](url)
                for m in LINK_RE.finditer(line):
+                    if is_in_code_span(m.start()):
+                        continue  # Skip links inside backticks
                    text, target = m.group(1), m.group(2).strip()
                    links.append((idx, text, target))
+                    matched_ranges.append((m.start(), m.end()))
+
+                # Reference-style links [text][label] or [text][]
+                for m in REF_USE_RE.finditer(line):
+                    if is_in_code_span(m.start()):
+                        continue  # Skip links inside backticks
+                    full_match = m.group(0)
+                    text = m.group(1).strip()
+                    # Extract label from [text][label] - if empty brackets [], use text as label
+                    label_part = full_match[len(text) + 2 :]  # skip [text]
+                    if label_part == "[]":
+                        label = text  # implicit reference: [text][] uses "text" as label
+                    else:
+                        # Explicit label: [text][label]
+                        label = label_part.strip("[]").strip()
+                    # Use special marker to indicate this is a reference link
+                    links.append((idx, text, f"__REF__{label}"))
+                    matched_ranges.append((m.start(), m.end()))
+
+                # Shortcut reference links [text] - single bracket that references a definition
+                # Only match if not already matched by inline or reference-style patterns
+                # Pattern: single bracket pair not preceded by [ and not followed by ( or [
+                for m in re.finditer(r"(?<!\[)\[([^\]]+)\](?![(\[])", line):
+                    if is_in_code_span(m.start()):
+                        continue  # Skip links inside backticks
+                    # Skip if overlaps with already matched ranges
+                    if overlaps_matched_range(m.start(), m.end()):
+                        continue
+                    # Skip if this is part of a double bracket pattern [[...]]
+                    if m.end() < len(line) and line[m.end()] == "]":
+                        continue
+                    text = m.group(1).strip()
+                    # Only treat as reference link if it could plausibly be one
+                    # (contains text, not just punctuation or numbers)
+                    if text and not text.isdigit():
+                        # Use special marker to indicate this is a reference link
+                        # For shortcut references, the label is the text itself
+                        links.append((idx, text, f"__REF__{text}"))
    except Exception:
        pass
    return links


 def validate_link(current_file: str, line: int, text: str, target: str) -> Optional[LinkIssue]:
+    # Handle reference-style links [text][label]
+    if target.startswith("__REF__"):
+        label = target[7:].lower()  # Extract label and normalize to lowercase
+        references = collect_reference_definitions(current_file)
+        if label not in references:
+            return LinkIssue(
+                current_file,
+                line,
+                text,
+                f"[{label}]",
+                f'reference link label "{label}" not defined in this file',
+            )
+        # Resolve the reference and validate the actual target
+        resolved_target = references[label]
+        return validate_link(current_file, line, text, resolved_target)
+
    # Remove surrounding <> used sometimes in markdown
    if target.startswith("<") and target.endswith(">"):
        target = target[1:-1]
@ -396,7 +508,19 @@ def validate_link(current_file: str, line: int, text: str, target: str) -> Optio
        resolved_path = os.path.normpath(os.path.join(current_dir, file_part))

    if not os.path.exists(resolved_path):
-        return LinkIssue(current_file, line, text, target, f"file does not exist: {resolved_path}")
+        # Try appending .md extension if the path doesn't exist
+        if not resolved_path.endswith(".md"):
+            resolved_path_with_md = resolved_path + ".md"
+            if os.path.exists(resolved_path_with_md):
+                resolved_path = resolved_path_with_md
+            else:
+                return LinkIssue(
+                    current_file, line, text, target, f"file does not exist: {resolved_path}"
+                )
+        else:
+            return LinkIssue(
+                current_file, line, text, target, f"file does not exist: {resolved_path}"
+            )

    if frag_part:
        # If target file is NOT markdown and fragment matches a GitHub line anchor (#Lnn or #Lnn-Lmm), accept.
@ -835,6 +959,7 @@ def main(argv: List[str]) -> int:
        # Re-run lint to update issues list after fixes
        if fix_count:
            ANCHOR_CACHE.clear()
+            REFERENCE_CACHE.clear()
            issues = lint_files(files, args.workers)

    if args.json:
--- a/src/mongo/db/ftdc/README.md
+++ b/src/mongo/db/ftdc/README.md
@ -217,8 +217,7 @@ document as its baseline.
 #### Run length encoding of zeros

 A sequence of zeros is compressed to a pair of numbers `[0, x]` where `x` is non-zero positive
-integer that indicates the number of zeros in a sequence. For instance, an array of zeros `[0, 0, 0,
-0]` is transformed to `[0, 4]`.
+integer that indicates the number of zeros in a sequence. For instance, an array of zeros `[0, 0, 0, 0]` is transformed to `[0, 4]`.

 #### Varint compression

--- a/src/mongo/db/query/README_query_feature_flags.md
+++ b/src/mongo/db/query/README_query_feature_flags.md
@ -197,8 +197,6 @@ ianb:
 > the tests has an assertion about the order of the oplog entries to this
 > effect: [v2_delta_oplog_entries_fcv.js][v2_delta_oplog_entries_fcv_dot_js]

-\[snippet\]
-
 ```js
 // Check that the sequence of oplog entries is right. We expect to see the following
 // sequence, in ascending order by timestamp:
--- a/src/mongo/db/query/query_tester/README.md
+++ b/src/mongo/db/query/query_tester/README.md
@ -38,23 +38,23 @@ To perform other operations, consult the table below.

 ### Options

-| Option                           | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
-| -------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| -t <path/to/testName>            | Required. Can appear multiple times, each time it appears the test following will be run. All test files should use the `.test` suffix.                                                                                                                                                                                                                                                                                                                                                               |
-| --uri <MDBConnString>            | The address at which to connect to a mongod/mongos. Defaults to localhost::27017. Uses the MongoDB URI format                                                                                                                                                                                                                                                                                                                                                                                         |
-| -n <int>                         | Run a specific test in the file immediately preceding this -n argument. Invalid if not following a -t <testName> pair                                                                                                                                                                                                                                                                                                                                                                                 |
-| -r <int> <int>                   | Run a range of tests in the file immediately preceding this -r argument. Invalid if not following a -t <testName> pair                                                                                                                                                                                                                                                                                                                                                                                |
-| -v (verbose)                     | Only available in compare mode. This appends a summary of failing queries to an unsuccessful test file comparison.                                                                                                                                                                                                                                                                                                                                                                                    |
-| --extractFeatures                | Only available in compare mode, and must be specified with -v (verbose). Extracts metadata about most common features across failed queries for an enriched debugging experience. Note that this uses the [feature-extractor](https://github.com/10gen/feature-extractor), which must be present in the user's home directory.                                                                                                                                                                        |
-| --drop                           | Drops the collections needed by the tests to be run before running.                                                                                                                                                                                                                                                                                                                                                                                                                                   |
-| --load                           | Builds indexes and inserts documents into the collections needed by the specified test files. If not specified assumes the collection state is correct                                                                                                                                                                                                                                                                                                                                                |
-| --minimal-index                  | Only create the minimal set of indices necessary, currently just geospatial and text indices.                                                                                                                                                                                                                                                                                                                                                                                                         |
-| --mode [run, compare, normalize] | Specify whether to just run the tests, to also compare results (default), or only check that results are normalized. Just running is useful to generate result files. In 'run' mode tests will not fail unless a command fails.                                                                                                                                                                                                                                                                       |
-| --opt-off                        | Disables optimizations (always) and pushing down to the find layer (when possible). Mostly used for generating an initial results file for differential, multiversion testing. This flag requires `--enableTestCommands=true` to be passed to the MongoD.                                                                                                                                                                                                                                             |
-| --out [result, oneline]          | Only available in non-compare modes. **Result:** Generate a new '.results' file from the file being run, with each result in a test's result set appearing on a separate line. Will overwrite existing `.results` files. **Oneline:** Generate a new '.results' file from the file being run, with a test's entire result set appearing on one line. Will overwrite existing `.results` files. All of these apply to every file being run, and will add test numbers to tests if not already present. |
-| --populateAndExit                | Drops current data and loads documents and indexes per specification in the `*.test` file. No tests are run. `--drop` and `--load` are implicitly applied.                                                                                                                                                                                                                                                                                                                                            |
-| --diff [plain, word]             | Specify the type of diff to use when displaying result set differences. Defaults to word-based diff with color if not specified. It is recommended to use the default (`word`) if the terminal `query_tester` is being run in supports ANSI color codes for easier to read output. `plain` uses line based diff with no color.                                                                                                                                                                        |
-| --override [queryShapeHash]      | (Optional) Specify what override to use when running a test. When providing the `queryShapeHash` override, it uses the existing corpus of tests but runs explain of the original command instead, extracting the queryShapeHash and asserting that they match the corresponding `file.queryShapeHash.results` file.                                                                                                                                                                                   |
+| Option                             | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
+| ---------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `-t <path/to/testName>`            | Required. Can appear multiple times, each time it appears the test following will be run. All test files should use the `.test` suffix.                                                                                                                                                                                                                                                                                                                                                               |
+| `--uri <MDBConnString>`            | The address at which to connect to a mongod/mongos. Defaults to localhost::27017. Uses the MongoDB URI format                                                                                                                                                                                                                                                                                                                                                                                         |
+| `-n <int>`                         | Run a specific test in the file immediately preceding this -n argument. Invalid if not following a -t <testName> pair                                                                                                                                                                                                                                                                                                                                                                                 |
+| `-r <int> <int>`                   | Run a range of tests in the file immediately preceding this -r argument. Invalid if not following a -t <testName> pair                                                                                                                                                                                                                                                                                                                                                                                |
+| `-v (verbose)`                     | Only available in compare mode. This appends a summary of failing queries to an unsuccessful test file comparison.                                                                                                                                                                                                                                                                                                                                                                                    |
+| `--extractFeatures`                | Only available in compare mode, and must be specified with -v (verbose). Extracts metadata about most common features across failed queries for an enriched debugging experience. Note that this uses the [feature-extractor](https://github.com/10gen/feature-extractor), which must be present in the user's home directory.                                                                                                                                                                        |
+| `--drop`                           | Drops the collections needed by the tests to be run before running.                                                                                                                                                                                                                                                                                                                                                                                                                                   |
+| `--load`                           | Builds indexes and inserts documents into the collections needed by the specified test files. If not specified assumes the collection state is correct                                                                                                                                                                                                                                                                                                                                                |
+| `--minimal-index`                  | Only create the minimal set of indices necessary, currently just geospatial and text indices.                                                                                                                                                                                                                                                                                                                                                                                                         |
+| `--mode [run, compare, normalize]` | Specify whether to just run the tests, to also compare results (default), or only check that results are normalized. Just running is useful to generate result files. In 'run' mode tests will not fail unless a command fails.                                                                                                                                                                                                                                                                       |
+| `--opt-off`                        | Disables optimizations (always) and pushing down to the find layer (when possible). Mostly used for generating an initial results file for differential, multiversion testing. This flag requires `--enableTestCommands=true` to be passed to the MongoD.                                                                                                                                                                                                                                             |
+| `--out [result, oneline]`          | Only available in non-compare modes. **Result:** Generate a new '.results' file from the file being run, with each result in a test's result set appearing on a separate line. Will overwrite existing `.results` files. **Oneline:** Generate a new '.results' file from the file being run, with a test's entire result set appearing on one line. Will overwrite existing `.results` files. All of these apply to every file being run, and will add test numbers to tests if not already present. |
+| `--populateAndExit`                | Drops current data and loads documents and indexes per specification in the `*.test` file. No tests are run. `--drop` and `--load` are implicitly applied.                                                                                                                                                                                                                                                                                                                                            |
+| `--diff [plain, word]`             | Specify the type of diff to use when displaying result set differences. Defaults to word-based diff with color if not specified. It is recommended to use the default (`word`) if the terminal `query_tester` is being run in supports ANSI color codes for easier to read output. `plain` uses line based diff with no color.                                                                                                                                                                        |
+| `--override [queryShapeHash]`      | (Optional) Specify what override to use when running a test. When providing the `queryShapeHash` override, it uses the existing corpus of tests but runs explain of the original command instead, extracting the queryShapeHash and asserting that they match the corresponding `file.queryShapeHash.results` file.                                                                                                                                                                                   |

 ## File types and formats

--- a/src/mongo/db/query/search/mongot_queries_on_views.md
+++ b/src/mongo/db/query/search/mongot_queries_on_views.md
@ -14,7 +14,7 @@ Search queries on views operate differently from standard view queries. Normally

 To resolve this, the `$_internalSearchIdLookup` stage applies the view's transformations within its own sub-pipeline. This means the view is applied after the `$_internalSearchMongotRemote` stage but before the rest of the user's pipeline. While this technically violates the rule that a view pipeline must come first, it is permitted because `$_internalSearchMongotRemote` does not modify documents; it only retrieves document IDs from `mongot`.

-In summary, `$_internalSearchIdLookup` takes unmodified documents from the `_id` values returned by `$_internalSearchMongotRemote`, applies the view's data transforms, and passes said transformed documents through the rest of the user pipeline [^1].
+In summary, `$_internalSearchIdLookup` takes unmodified documents from the `_id` values returned by `$_internalSearchMongotRemote`, applies the view's data transforms, and passes said transformed documents through the rest of the user pipeline.

 ## Technical Details

--- a/src/mongo/db/repl/FCV_AND_FEATURE_FLAG_README.md
+++ b/src/mongo/db/repl/FCV_AND_FEATURE_FLAG_README.md
@ -374,7 +374,7 @@ There are three locks used in the setFCV command:
  - Other operations should [take this lock in shared mode](https://github.com/mongodb/mongo/blob/bd8a8d4d880577302c777ff961f359b03435126a/src/mongo/db/commands/feature_compatibility_version.cpp#L594-L599)
    if they want to ensure that the FCV state _does not change at all_ during the operation.
    See [example](https://github.com/mongodb/mongo/blob/bd8a8d4d880577302c777ff961f359b03435126a/src/mongo/db/s/config/sharding_catalog_manager_collection_operations.cpp#L489-L490)
- [Global lock]
+- [Global lock](/src/mongo/db/shard_role/lock_manager/d_concurrency.h)
  - The setFCV command [takes this lock in S mode and then releases it immediately](https://github.com/mongodb/mongo/blob/418028cf4dcf416d5ab87552721ed3559bce5507/src/mongo/db/commands/set_feature_compatibility_version_command.cpp#L551-L557)
    shortly after the FCV transitions to a new value (either to the upgrading/downgrading state,
    or to the fully upgrade/downgraded state).
--- a/src/mongo/db/shard_role/lock_manager/README.md
+++ b/src/mongo/db/shard_role/lock_manager/README.md
@ -119,3 +119,5 @@ behavior of the RAII locking types acquiring locks on resources upon their const
 unlocking the lock upon their destruction when going out of scope. Instead, the responsibility of
 unlocking the locks is transferred to the WriteUnitOfWork destructor. Note this is only true for
 transactions that do writes, and therefore only for code that uses WriteUnitOfWork.
+
+[Multiversion concurrency control]: https://en.wikipedia.org/wiki/Multiversion_concurrency_control
--- a/src/mongo/db/shard_role/shard_catalog/README.md
+++ b/src/mongo/db/shard_role/shard_catalog/README.md
@ -742,7 +742,7 @@ values are ObjectId's.

 For more information on time-series collections, see the [timeseries/README][].

-[timeseries/README]: ../timeseries/README.md
+[timeseries/README]: /src/mongo/db/timeseries/README.md

 ## Capped clustered collections

@ -804,8 +804,3 @@ requirements for atomicity, consistency, isolation, and durability, storage engi
 some form of transaction. In contrast, a multi-document transaction in MongoDB is a user-facing
 feature providing similar guarantees across many nodes in a sharded cluster; a storage transaction
 only provides guarantees within one node.
-
-[`BSONObj::woCompare`]: https://github.com/mongodb/mongo/blob/v4.4/src/mongo/bson/bsonobj.h#L460
-[`BSONElement::compareElements`]: https://github.com/mongodb/mongo/blob/v4.4/src/mongo/bson/bsonelement.cpp#L285
-[`Ordering`]: https://github.com/mongodb/mongo/blob/v4.4/src/mongo/bson/ordering.h
-[initial sync]: ../repl/README.md#initial-sync
--- a/src/mongo/db/sharding_environment/README_startup_and_shutdown.md
+++ b/src/mongo/db/sharding_environment/README_startup_and_shutdown.md
@ -71,7 +71,7 @@ on entering quiesce mode, prompting it to respond to all waiting hello requests.
 ### helloOk Protocol Negotation

 In order to preserve backwards compatibility with old drivers, mongos currently supports both
-the [`isMaster`] command and the [`hello`] command. New drivers and 5.0+ versions of the server
+the [isMaster command][] and the [hello command][]. New drivers and 5.0+ versions of the server
 will support `hello`. When connecting to a sharded cluster via mongos, a new driver will send
 "helloOk: true" as a part of the initial handshake. If mongos supports hello, it will respond
 with "helloOk: true" as well. This way, new drivers know that they're communicating with a version
@ -85,7 +85,5 @@ drivers will not specify this flag at all, so the behavior remains the same.
 When mongos establishes outgoing connections to mongod nodes in the cluster, it always uses `hello`
 rather than `isMaster`.

-#### Code references
-
- [isMaster command](https://github.com/mongodb/mongo/blob/r4.8.0-alpha/src/mongo/s/commands/cluster_is_master_cmd.cpp#L248) for mongos.
- [hello command](https://github.com/mongodb/mongo/blob/r4.8.0-alpha/src/mongo/s/commands/cluster_is_master_cmd.cpp#L64) for mongos.
+[isMaster command]: https://github.com/mongodb/mongo/blob/r4.8.0-alpha/src/mongo/s/commands/cluster_is_master_cmd.cpp#L248
+[hello command]: https://github.com/mongodb/mongo/blob/r4.8.0-alpha/src/mongo/s/commands/cluster_is_master_cmd.cpp#L64
--- a/src/mongo/db/storage/key_string/README.md
+++ b/src/mongo/db/storage/key_string/README.md
@ -129,4 +129,7 @@ representation, from lower memory addresses to higher addresses, is the same as
 for that type. For example, ASCII strings are binary comparable, but double precision floating point
 numbers and little-endian integers are not.

+[`BSONObj::woCompare`]: https://github.com/mongodb/mongo/blob/v4.4/src/mongo/bson/bsonobj.h#L460
+[`BSONElement::compareElements`]: https://github.com/mongodb/mongo/blob/v4.4/src/mongo/bson/bsonelement.cpp#L285
+[`Ordering`]: https://github.com/mongodb/mongo/blob/v4.4/src/mongo/bson/ordering.h
 [initial sync]: /src/mongo/db/repl/README.md#initial-sync