SERVER-114403: Markdown linter handles reference-style links (#44399)

GitOrigin-RevId: 24906c21af2a8a0412b4b052ab0eca5cfaa696be
This commit is contained in:
Louis Williams 2025-12-01 09:44:33 -05:00 committed by MongoDB Bot
parent 21759c25a0
commit 87b363ce7d
10 changed files with 159 additions and 39 deletions

View File

@ -8,7 +8,8 @@ Link Types Validated
--------------------
1. Intra-document anchors: `[text](#some-heading)`
2. Relative file links: `[text](../../path/to/OtherFile.md#anchor)`
3. Repo-root relative paths beginning with `/src/` (e.g. `[feature flags](/src/mongo/db/query/README_query_feature_flags.md)`).
3. Repo-root relative paths beginning with `/` (e.g. `[feature flags](/src/mongo/db/query/README_query_feature_flags.md)`)
4. Reference-style links: `[text][label]` or `[text][]` with definitions like `[label]: url`
External (http/https) links are currently skipped (no network requests performed) except for a trivial malformed scheme check (e.g. `hhttps://`).
@ -126,11 +127,16 @@ HTML_ANCHOR_RE = re.compile(r'<a\s+(?:name|id)=["\']([^"\']+)["\']\s*>\s*</a>?',
# Inline links: [text](target). Group 1 is the link text, group 2 the target.
LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
# Reference link definitions: [label]: url
# Group 1 is the label, group 2 the first whitespace-free token after the colon.
# At least one whitespace character is required between the colon and the url.
REF_DEF_RE = re.compile(r"^\s*\[([^\]]+)\]:\s+(\S+)")
REF_USE_RE = re.compile(r"\[([^\]]+)\]\[(?:(?:[^\]]+))?\]") # simplified
# Reference-style links: [text][label] or [text][] but NOT [[double brackets]]
# Negative lookbehind (?<!\[) ensures the first [ is not preceded by another [
# Negative lookahead (?!\]) sits after the first ] and rejects an immediately
# following ]; the pattern then requires the [ that opens the label part.
# Only the link text is captured (group 1); the label is in a non-capturing group.
REF_USE_RE = re.compile(r"(?<!\[)\[([^\]]+)\](?!\])\[(?:(?:[^\]]+))?\]")
# Characters removed for anchor IDs (GitHub rules simplified). We strip most punctuation except hyphen and underscore.
PUNCT_TO_STRIP = "\"'!#$%&()*+,./:;<=>?@[]^`{|}~" # punctuation characters to remove
# Cache of anchors per file; cleared in main() before re-linting after fixes.
ANCHOR_CACHE: dict[str, set[str]] = {}
# Cache for reference-style link definitions: file_path -> {label: target_url}
REFERENCE_CACHE: dict[str, dict[str, str]] = {}
def _detect_repo_root(start: str | None = None) -> str:
@ -244,6 +250,25 @@ def collect_headings(path: str) -> set[str]:
return anchors
def collect_reference_definitions(path: str) -> dict[str, str]:
    """Parse all reference-style link definitions ``[label]: url`` from a markdown file.

    Labels are stripped and lowercased so lookups are case-insensitive. When
    the same label is defined more than once, the first definition wins,
    which is how CommonMark resolves duplicate definitions. Results are
    memoized per path in ``REFERENCE_CACHE``.

    Args:
        path: Path of the markdown file to scan.

    Returns:
        Mapping of normalized label to the raw target string. The mapping is
        empty (or partial) when the file cannot be opened or decoded; the
        scan is deliberately best-effort and never raises for those cases.
    """
    cached = REFERENCE_CACHE.get(path)
    if cached is not None:
        return cached

    references: dict[str, str] = {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                m = REF_DEF_RE.match(line)
                if m is None:
                    continue
                label = m.group(1).strip().lower()  # case-insensitive matching
                # setdefault keeps the first definition of a duplicated label.
                references.setdefault(label, m.group(2).strip())
    except (OSError, ValueError):
        # Best-effort: an unreadable or undecodable file (UnicodeDecodeError is
        # a ValueError) simply contributes whatever was parsed so far.
        pass
    REFERENCE_CACHE[path] = references
    return references
def is_http_url(url: str) -> bool:
    """Return True when *url* starts with an ``http://`` or ``https://`` scheme.

    The scheme is compared case-insensitively because URI schemes are
    case-insensitive; ``HTTPS://example.com`` is an external link, not a
    relative file path. Misspelled schemes such as ``hhttps://`` are still
    rejected.
    """
    return url.lower().startswith(("http://", "https://"))
@ -266,8 +291,8 @@ def parse_links(file_path: str) -> List[Tuple[int, str, str]]:
fence_delim = None # track ``` or ~~~
for idx, raw_line in enumerate(f, start=1):
line = raw_line.rstrip("\n")
# Detect start/end of fenced code blocks. Accept ``` or ~~~ with optional language.
fence_match = re.match(r"^(?P<delim>`{3,}|~{3,})(.*)$", line)
# Detect start/end of fenced code blocks. Accept ``` or ~~~ with optional language and leading whitespace.
fence_match = re.match(r"^\s*(?P<delim>`{3,}|~{3,})(.*)$", line)
if fence_match:
full = fence_match.group("delim")
# Toggle if same delimiter starts/ends
@ -292,15 +317,102 @@ def parse_links(file_path: str) -> List[Tuple[int, str, str]]:
in_blockquote = False
else:
continue
# Skip lines that are reference definitions themselves
if REF_DEF_RE.match(line):
continue
# Find all backtick regions to exclude from link detection
# Build a set of character positions that are inside backticks
backtick_positions = set()
in_code = False
for i, char in enumerate(line):
if char == "`":
in_code = not in_code
elif in_code:
backtick_positions.add(i)
# Helper function to check if the opening bracket of a link is inside backticks
# We only check the start position because if the [ is in code, the whole link should be skipped
def is_in_code_span(match_start):
return match_start in backtick_positions
# Track character ranges of all matched links to avoid double-processing
matched_ranges = []
def overlaps_matched_range(start, end):
"""Check if a position range overlaps with any previously matched range."""
for m_start, m_end in matched_ranges:
# Check for any overlap
if start < m_end and end > m_start:
return True
return False
# Inline links [text](url)
for m in LINK_RE.finditer(line):
if is_in_code_span(m.start()):
continue # Skip links inside backticks
text, target = m.group(1), m.group(2).strip()
links.append((idx, text, target))
matched_ranges.append((m.start(), m.end()))
# Reference-style links [text][label] or [text][]
for m in REF_USE_RE.finditer(line):
if is_in_code_span(m.start()):
continue # Skip links inside backticks
full_match = m.group(0)
text = m.group(1).strip()
# Extract label from [text][label] - if empty brackets [], use text as label
label_part = full_match[len(text) + 2 :] # skip [text]
if label_part == "[]":
label = text # implicit reference: [text][] uses "text" as label
else:
# Explicit label: [text][label]
label = label_part.strip("[]").strip()
# Use special marker to indicate this is a reference link
links.append((idx, text, f"__REF__{label}"))
matched_ranges.append((m.start(), m.end()))
# Shortcut reference links [text] - single bracket that references a definition
# Only match if not already matched by inline or reference-style patterns
# Pattern: single bracket pair not preceded by [ and not followed by ( or [
for m in re.finditer(r"(?<!\[)\[([^\]]+)\](?![(\[])", line):
if is_in_code_span(m.start()):
continue # Skip links inside backticks
# Skip if overlaps with already matched ranges
if overlaps_matched_range(m.start(), m.end()):
continue
# Skip if this is part of a double bracket pattern [[...]]
if m.end() < len(line) and line[m.end()] == "]":
continue
text = m.group(1).strip()
# Only treat as reference link if it could plausibly be one
# (contains text, not just punctuation or numbers)
if text and not text.isdigit():
# Use special marker to indicate this is a reference link
# For shortcut references, the label is the text itself
links.append((idx, text, f"__REF__{text}"))
except Exception:
pass
return links
def validate_link(current_file: str, line: int, text: str, target: str) -> Optional[LinkIssue]:
# Handle reference-style links [text][label]
if target.startswith("__REF__"):
label = target[7:].lower() # Extract label and normalize to lowercase
references = collect_reference_definitions(current_file)
if label not in references:
return LinkIssue(
current_file,
line,
text,
f"[{label}]",
f'reference link label "{label}" not defined in this file',
)
# Resolve the reference and validate the actual target
resolved_target = references[label]
return validate_link(current_file, line, text, resolved_target)
# Remove surrounding <> used sometimes in markdown
if target.startswith("<") and target.endswith(">"):
target = target[1:-1]
@ -396,7 +508,19 @@ def validate_link(current_file: str, line: int, text: str, target: str) -> Optio
resolved_path = os.path.normpath(os.path.join(current_dir, file_part))
if not os.path.exists(resolved_path):
return LinkIssue(current_file, line, text, target, f"file does not exist: {resolved_path}")
# Try appending .md extension if the path doesn't exist
if not resolved_path.endswith(".md"):
resolved_path_with_md = resolved_path + ".md"
if os.path.exists(resolved_path_with_md):
resolved_path = resolved_path_with_md
else:
return LinkIssue(
current_file, line, text, target, f"file does not exist: {resolved_path}"
)
else:
return LinkIssue(
current_file, line, text, target, f"file does not exist: {resolved_path}"
)
if frag_part:
# If target file is NOT markdown and fragment matches a GitHub line anchor (#Lnn or #Lnn-Lmm), accept.
@ -835,6 +959,7 @@ def main(argv: List[str]) -> int:
# Re-run lint to update issues list after fixes
if fix_count:
ANCHOR_CACHE.clear()
REFERENCE_CACHE.clear()
issues = lint_files(files, args.workers)
if args.json:

View File

@ -217,8 +217,7 @@ document as its baseline.
#### Run length encoding of zeros
A sequence of zeros is compressed to a pair of numbers `[0, x]` where `x` is a non-zero positive
integer that indicates the number of zeros in a sequence. For instance, an array of zeros `[0, 0, 0,
0]` is transformed to `[0, 4]`.
integer that indicates the number of zeros in a sequence. For instance, an array of zeros `[0, 0, 0, 0]` is transformed to `[0, 4]`.
#### Varint compression

View File

@ -197,8 +197,6 @@ ianb:
> the tests has an assertion about the order of the oplog entries to this
> effect: [v2_delta_oplog_entries_fcv.js][v2_delta_oplog_entries_fcv_dot_js]
\[snippet\]
```js
// Check that the sequence of oplog entries is right. We expect to see the following
// sequence, in ascending order by timestamp:

View File

@ -38,23 +38,23 @@ To perform other operations, consult the table below.
### Options
| Option | Description |
| -------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| -t <path/to/testName> | Required. Can appear multiple times, each time it appears the test following will be run. All test files should use the `.test` suffix. |
| --uri <MDBConnString> | The address at which to connect to a mongod/mongos. Defaults to localhost::27017. Uses the MongoDB URI format |
| -n <int> | Run a specific test in the file immediately preceding this -n argument. Invalid if not following a -t <testName> pair |
| -r <int> <int> | Run a range of tests in the file immediately preceding this -r argument. Invalid if not following a -t <testName> pair |
| -v (verbose) | Only available in compare mode. This appends a summary of failing queries to an unsuccessful test file comparison. |
| --extractFeatures | Only available in compare mode, and must be specified with -v (verbose). Extracts metadata about most common features across failed queries for an enriched debugging experience. Note that this uses the [feature-extractor](https://github.com/10gen/feature-extractor), which must be present in the user's home directory. |
| --drop | Drops the collections needed by the tests to be run before running. |
| --load | Builds indexes and inserts documents into the collections needed by the specified test files. If not specified assumes the collection state is correct |
| --minimal-index | Only create the minimal set of indices necessary, currently just geospatial and text indices. |
| --mode [run, compare, normalize] | Specify whether to just run the tests, to also compare results (default), or only check that results are normalized. Just running is useful to generate result files. In 'run' mode tests will not fail unless a command fails. |
| --opt-off | Disables optimizations (always) and pushing down to the find layer (when possible). Mostly used for generating an initial results file for differential, multiversion testing. This flag requires `--enableTestCommands=true` to be passed to the MongoD. |
| --out [result, oneline] | Only available in non-compare modes. **Result:** Generate a new '.results' file from the file being run, with each result in a test's result set appearing on a separate line. Will overwrite existing `.results` files. **Oneline:** Generate a new '.results' file from the file being run, with a test's entire result set appearing on one line. Will overwrite existing `.results` files. All of these apply to every file being run, and will add test numbers to tests if not already present. |
| --populateAndExit | Drops current data and loads documents and indexes per specification in the `*.test` file. No tests are run. `--drop` and `--load` are implicitly applied. |
| --diff [plain, word] | Specify the type of diff to use when displaying result set differences. Defaults to word-based diff with color if not specified. It is recommended to use the default (`word`) if the terminal `query_tester` is being run in supports ANSI color codes for easier to read output. `plain` uses line based diff with no color. |
| --override [queryShapeHash] | (Optional) Specify what override to use when running a test. When providing the `queryShapeHash` override, it uses the existing corpus of tests but runs explain of the original command instead, extracting the queryShapeHash and asserting that they match the corresponding `file.queryShapeHash.results` file. |
| Option | Description |
| ---------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `-t <path/to/testName>` | Required. Can appear multiple times, each time it appears the test following will be run. All test files should use the `.test` suffix. |
| `--uri <MDBConnString>` | The address at which to connect to a mongod/mongos. Defaults to localhost:27017. Uses the MongoDB URI format |
| `-n <int>` | Run a specific test in the file immediately preceding this -n argument. Invalid if not following a -t <testName> pair |
| `-r <int> <int>` | Run a range of tests in the file immediately preceding this -r argument. Invalid if not following a -t <testName> pair |
| `-v (verbose)` | Only available in compare mode. This appends a summary of failing queries to an unsuccessful test file comparison. |
| `--extractFeatures` | Only available in compare mode, and must be specified with -v (verbose). Extracts metadata about most common features across failed queries for an enriched debugging experience. Note that this uses the [feature-extractor](https://github.com/10gen/feature-extractor), which must be present in the user's home directory. |
| `--drop` | Drops the collections needed by the tests to be run before running. |
| `--load` | Builds indexes and inserts documents into the collections needed by the specified test files. If not specified assumes the collection state is correct |
| `--minimal-index` | Only create the minimal set of indices necessary, currently just geospatial and text indices. |
| `--mode [run, compare, normalize]` | Specify whether to just run the tests, to also compare results (default), or only check that results are normalized. Just running is useful to generate result files. In 'run' mode tests will not fail unless a command fails. |
| `--opt-off` | Disables optimizations (always) and pushing down to the find layer (when possible). Mostly used for generating an initial results file for differential, multiversion testing. This flag requires `--enableTestCommands=true` to be passed to the MongoD. |
| `--out [result, oneline]` | Only available in non-compare modes. **Result:** Generate a new '.results' file from the file being run, with each result in a test's result set appearing on a separate line. Will overwrite existing `.results` files. **Oneline:** Generate a new '.results' file from the file being run, with a test's entire result set appearing on one line. Will overwrite existing `.results` files. All of these apply to every file being run, and will add test numbers to tests if not already present. |
| `--populateAndExit` | Drops current data and loads documents and indexes per specification in the `*.test` file. No tests are run. `--drop` and `--load` are implicitly applied. |
| `--diff [plain, word]` | Specify the type of diff to use when displaying result set differences. Defaults to word-based diff with color if not specified. It is recommended to use the default (`word`) if the terminal `query_tester` is being run in supports ANSI color codes for easier to read output. `plain` uses line based diff with no color. |
| `--override [queryShapeHash]` | (Optional) Specify what override to use when running a test. When providing the `queryShapeHash` override, it uses the existing corpus of tests but runs explain of the original command instead, extracting the queryShapeHash and asserting that they match the corresponding `file.queryShapeHash.results` file. |
## File types and formats

View File

@ -14,7 +14,7 @@ Search queries on views operate differently from standard view queries. Normally
To resolve this, the `$_internalSearchIdLookup` stage applies the view's transformations within its own sub-pipeline. This means the view is applied after the `$_internalSearchMongotRemote` stage but before the rest of the user's pipeline. While this technically violates the rule that a view pipeline must come first, it is permitted because `$_internalSearchMongotRemote` does not modify documents; it only retrieves document IDs from `mongot`.
In summary, `$_internalSearchIdLookup` takes unmodified documents from the `_id` values returned by `$_internalSearchMongotRemote`, applies the view's data transforms, and passes said transformed documents through the rest of the user pipeline [^1].
In summary, `$_internalSearchIdLookup` takes unmodified documents from the `_id` values returned by `$_internalSearchMongotRemote`, applies the view's data transforms, and passes said transformed documents through the rest of the user pipeline.
## Technical Details

View File

@ -374,7 +374,7 @@ There are three locks used in the setFCV command:
- Other operations should [take this lock in shared mode](https://github.com/mongodb/mongo/blob/bd8a8d4d880577302c777ff961f359b03435126a/src/mongo/db/commands/feature_compatibility_version.cpp#L594-L599)
if they want to ensure that the FCV state _does not change at all_ during the operation.
See [example](https://github.com/mongodb/mongo/blob/bd8a8d4d880577302c777ff961f359b03435126a/src/mongo/db/s/config/sharding_catalog_manager_collection_operations.cpp#L489-L490)
- [Global lock]
- [Global lock](/src/mongo/db/shard_role/lock_manager/d_concurrency.h)
- The setFCV command [takes this lock in S mode and then releases it immediately](https://github.com/mongodb/mongo/blob/418028cf4dcf416d5ab87552721ed3559bce5507/src/mongo/db/commands/set_feature_compatibility_version_command.cpp#L551-L557)
shortly after the FCV transitions to a new value (either to the upgrading/downgrading state,
or to the fully upgraded/downgraded state).

View File

@ -119,3 +119,5 @@ behavior of the RAII locking types acquiring locks on resources upon their const
unlocking the lock upon their destruction when going out of scope. Instead, the responsibility of
unlocking the locks is transferred to the WriteUnitOfWork destructor. Note this is only true for
transactions that do writes, and therefore only for code that uses WriteUnitOfWork.
[Multiversion concurrency control]: https://en.wikipedia.org/wiki/Multiversion_concurrency_control

View File

@ -742,7 +742,7 @@ values are ObjectId's.
For more information on time-series collections, see the [timeseries/README][].
[timeseries/README]: ../timeseries/README.md
[timeseries/README]: /src/mongo/db/timeseries/README.md
## Capped clustered collections
@ -804,8 +804,3 @@ requirements for atomicity, consistency, isolation, and durability, storage engi
some form of transaction. In contrast, a multi-document transaction in MongoDB is a user-facing
feature providing similar guarantees across many nodes in a sharded cluster; a storage transaction
only provides guarantees within one node.
[`BSONObj::woCompare`]: https://github.com/mongodb/mongo/blob/v4.4/src/mongo/bson/bsonobj.h#L460
[`BSONElement::compareElements`]: https://github.com/mongodb/mongo/blob/v4.4/src/mongo/bson/bsonelement.cpp#L285
[`Ordering`]: https://github.com/mongodb/mongo/blob/v4.4/src/mongo/bson/ordering.h
[initial sync]: ../repl/README.md#initial-sync

View File

@ -71,7 +71,7 @@ on entering quiesce mode, prompting it to respond to all waiting hello requests.
### helloOk Protocol Negotiation
In order to preserve backwards compatibility with old drivers, mongos currently supports both
the [`isMaster`] command and the [`hello`] command. New drivers and 5.0+ versions of the server
the [isMaster command][] and the [hello command][]. New drivers and 5.0+ versions of the server
will support `hello`. When connecting to a sharded cluster via mongos, a new driver will send
"helloOk: true" as a part of the initial handshake. If mongos supports hello, it will respond
with "helloOk: true" as well. This way, new drivers know that they're communicating with a version
@ -85,7 +85,5 @@ drivers will not specify this flag at all, so the behavior remains the same.
When mongos establishes outgoing connections to mongod nodes in the cluster, it always uses `hello`
rather than `isMaster`.
#### Code references
- [isMaster command](https://github.com/mongodb/mongo/blob/r4.8.0-alpha/src/mongo/s/commands/cluster_is_master_cmd.cpp#L248) for mongos.
- [hello command](https://github.com/mongodb/mongo/blob/r4.8.0-alpha/src/mongo/s/commands/cluster_is_master_cmd.cpp#L64) for mongos.
[isMaster command]: https://github.com/mongodb/mongo/blob/r4.8.0-alpha/src/mongo/s/commands/cluster_is_master_cmd.cpp#L248
[hello command]: https://github.com/mongodb/mongo/blob/r4.8.0-alpha/src/mongo/s/commands/cluster_is_master_cmd.cpp#L64

View File

@ -129,4 +129,7 @@ representation, from lower memory addresses to higher addresses, is the same as
for that type. For example, ASCII strings are binary comparable, but double precision floating point
numbers and little-endian integers are not.
[`BSONObj::woCompare`]: https://github.com/mongodb/mongo/blob/v4.4/src/mongo/bson/bsonobj.h#L460
[`BSONElement::compareElements`]: https://github.com/mongodb/mongo/blob/v4.4/src/mongo/bson/bsonelement.cpp#L285
[`Ordering`]: https://github.com/mongodb/mongo/blob/v4.4/src/mongo/bson/ordering.h
[initial sync]: /src/mongo/db/repl/README.md#initial-sync