SERVER-103116: add smoke test script tests (#34264)

GitOrigin-RevId: c05577ba4aa3620515d8b4c888d5af3ec1222237
This commit is contained in:
Myles 2025-05-07 07:43:44 -05:00 committed by MongoDB Bot
parent 2fba020861
commit 1a7098051c
2 changed files with 576 additions and 0 deletions

View File

@ -0,0 +1,125 @@
# Replication Smoke Tests
# THIS FILE IS GENERATED - DO NOT MODIFY
suites:
streams:
# failed 268/12593 times, success p90 => 2.827s, score => 94.785
- src/mongo/db/modules/enterprise/jstests/streams/infinite_loop.js
# failed 139/12565 times, success p90 => 0.405s, score => 343.136
- src/mongo/db/modules/enterprise/jstests/streams/parse_only.js
# failed 20/12575 times, success p90 => 0.318s, score => 62.934
- src/mongo/db/modules/enterprise/jstests/streams/bad_source.js
# failed 37/12586 times, success p90 => 0.627s, score => 58.973
- src/mongo/db/modules/enterprise/jstests/streams/documents.js
# failed 32/12575 times, success p90 => 0.914s, score => 35.029
- src/mongo/db/modules/enterprise/jstests/streams/sample.js
# failed 33/12576 times, success p90 => 0.983s, score => 33.585
- src/mongo/db/modules/enterprise/jstests/streams/validate.js
# failed 32/12586 times, success p90 => 1.495s, score => 21.408
- src/mongo/db/modules/enterprise/jstests/streams/documents_data_source.js
# failed 30/12587 times, success p90 => 1.724s, score => 17.397
- src/mongo/db/modules/enterprise/jstests/streams/stop_reason.js
# failed 39/13999 times, success p90 => 2.497s, score => 15.619
- src/mongo/db/modules/enterprise/jstests/streams/simple_merge.js
# failed 30/12552 times, success p90 => 1.958s, score => 15.320
- src/mongo/db/modules/enterprise/jstests/streams/start_list.js
# failed 35/12566 times, success p90 => 2.328s, score => 15.032
- src/mongo/db/modules/enterprise/jstests/streams/sample_dlq.js
# failed 44/12579 times, success p90 => 3.040s, score => 14.473
- src/mongo/db/modules/enterprise/jstests/streams/dlq.js
# failed 31/14013 times, success p90 => 2.154s, score => 14.391
- src/mongo/db/modules/enterprise/jstests/streams/duplicate_fields.js
# failed 40/12583 times, success p90 => 2.882s, score => 13.878
- src/mongo/db/modules/enterprise/jstests/streams/group_dlq.js
# failed 41/14003 times, success p90 => 3.019s, score => 13.582
- src/mongo/db/modules/enterprise/jstests/streams/replace_root.js
# failed 51/12570 times, success p90 => 4.249s, score => 12.004
- src/mongo/db/modules/enterprise/jstests/streams/merge_pipeline.js
# failed 3/12577 times, success p90 => 0.262s, score => 11.438
- src/mongo/db/modules/enterprise/jstests/streams/checkpoint_helper.js
# failed 29/14016 times, success p90 => 2.594s, score => 11.178
- src/mongo/db/modules/enterprise/jstests/streams/checkpoint_backwards_compat.js
# failed 31/12566 times, success p90 => 2.779s, score => 11.156
- src/mongo/db/modules/enterprise/jstests/streams/sample_data_source.js
# failed 33/14008 times, success p90 => 3.034s, score => 10.876
- src/mongo/db/modules/enterprise/jstests/streams/set.js
# failed 39/14004 times, success p90 => 3.712s, score => 10.507
- src/mongo/db/modules/enterprise/jstests/streams/replace_with.js
# failed 20/10824 times, success p90 => 2.040s, score => 9.803
- src/mongo/db/modules/enterprise/jstests/streams/stack_trace.js
# failed 30/14004 times, success p90 => 3.540s, score => 8.474
- src/mongo/db/modules/enterprise/jstests/streams/redact.js
# failed 38/12580 times, success p90 => 4.915s, score => 7.731
- src/mongo/db/modules/enterprise/jstests/streams/late_events.js
# failed 34/14007 times, success p90 => 4.499s, score => 7.557
- src/mongo/db/modules/enterprise/jstests/streams/unwind.js
# failed 48/12575 times, success p90 => 6.484s, score => 7.403
- src/mongo/db/modules/enterprise/jstests/streams/stream_meta.js
# failed 2/12566 times, success p90 => 0.277s, score => 7.215
- src/mongo/db/modules/enterprise/jstests/streams/fake_client.js
# failed 29/14007 times, success p90 => 4.077s, score => 7.114
- src/mongo/db/modules/enterprise/jstests/streams/unset.js
# failed 37/12569 times, success p90 => 5.477s, score => 6.756
- src/mongo/db/modules/enterprise/jstests/streams/geospatial.js
# failed 38/14004 times, success p90 => 7.363s, score => 5.161
- src/mongo/db/modules/enterprise/jstests/streams/window_group.js
# failed 34/13998 times, success p90 => 7.354s, score => 4.624
- src/mongo/db/modules/enterprise/jstests/streams/feature_flag.js
# failed 217/14020 times, success p90 => 47.916s, score => 4.529
- src/mongo/db/modules/enterprise/jstests/streams/merge_requires_unique_index.js
# failed 34/12572 times, success p90 => 9.179s, score => 3.704
- src/mongo/db/modules/enterprise/jstests/streams/lookup_pipeline.js
replica_sets_initsync_static_jscore_passthrough:
# failed 53.0/13023 times, success p90 => 1.660s, score => 31.919
- jstests/core/timeseries/query/timeseries_ixscan_clusteredidxscan_union.js
# failed 10.0/10821 times, success p90 => 0.399s, score => 25.077
- jstests/core/replicate_record_ids/disallow_capped.js
# failed 19.0/13032 times, success p90 => 0.766s, score => 24.791
- jstests/core/shell/shelltypes.js
# failed 22.0/13021 times, success p90 => 0.902s, score => 24.393
- jstests/core/txns/disallow_operations_on_prepared_transaction.js
# failed 40.0/3348 times, success p90 => 1.777s, score => 22.513
- jstests/core/query/find_and_modify/find_and_modify_metrics.js
# failed 7.0/10821 times, success p90 => 0.316s, score => 22.119
- jstests/core/replicate_record_ids/replicate_record_ids_collection_creation.js
# failed 19.0/13037 times, success p90 => 0.863s, score => 22.020
- jstests/core/txns/no_writes_to_config_transactions_with_prepared_transaction.js
# failed 7.0/10817 times, success p90 => 0.324s, score => 21.612
- jstests/core/replicate_record_ids/collmod_removes_replicate_record_ids.js
# failed 18.0/13026 times, success p90 => 0.876s, score => 20.547
- jstests/core/txns/commit_prepared_transaction.js
# failed 12.0/10795 times, success p90 => 0.588s, score => 20.420
- src/mongo/db/modules/enterprise/jstests/fle2/basic_create_collection_text.js
# failed 17.0/8699 times, success p90 => 0.838s, score => 20.280
- jstests/core/shell/role_management_helpers.js
# failed 16.0/8696 times, success p90 => 0.796s, score => 20.099
- jstests/core/administrative/roles_info.js
# failed 19.0/13031 times, success p90 => 0.947s, score => 20.060
- jstests/core/index/fts/fts_proj.js
# failed 18.0/13026 times, success p90 => 0.901s, score => 19.971
- jstests/core/write/bulk/bulk_write_timeseries_basic.js
# failed 19.0/13030 times, success p90 => 0.970s, score => 19.590
- jstests/core/timeseries/geo/timeseries_geonear_mindistance_and_maxdistance.js
# failed 19.0/13039 times, success p90 => 0.976s, score => 19.461
- jstests/core/index/fts/fts_score_sort.js
# failed 18.0/13025 times, success p90 => 0.963s, score => 18.687
- jstests/core/index/geo/geo_update_btree2.js
# failed 10.0/7115 times, success p90 => 0.583s, score => 17.164
- src/mongo/db/modules/enterprise/jstests/fle2/bulk_write_insert_text.js
# failed 10.0/10821 times, success p90 => 0.596s, score => 16.767
- jstests/core/timeseries/write/timeseries_update_compressed_buckets.js
# failed 13.0/13022 times, success p90 => 0.792s, score => 16.419
- jstests/core/txns/prepare_prepared_transaction.js
# failed 19.0/2582 times, success p90 => 1.199s, score => 15.849
- jstests/core/catalog/db_stats.js
# failed 14.0/13047 times, success p90 => 0.894s, score => 15.667
- jstests/core/txns/aggregation_in_transaction.js
# failed 12.0/13033 times, success p90 => 0.811s, score => 14.790
- jstests/core/txns/transaction_error_handling.js
# failed 12.0/13046 times, success p90 => 0.815s, score => 14.722
- jstests/core/txns/prepare_nonexistent_transaction.js
# failed 10.0/8692 times, success p90 => 0.706s, score => 14.171
- jstests/core/shell/connection_string_validation.js
# failed 11.0/13036 times, success p90 => 0.793s, score => 13.868
- jstests/core/txns/list_collections_not_blocked_by_txn.js
# failed 11.0/13026 times, success p90 => 0.802s, score => 13.723
- jstests/core/txns/prepare_transaction_fails_on_temp_collections.js

View File

@ -0,0 +1,451 @@
#!/usr/bin/env python3
#
# Replication Team Smoke Tests
#
# To be run prior to submitting evergreen patches.
# Runs the following locally and makes sure they pass:
# * clang format
# * clang tidy
# * build install-dist-test
# * replication unit tests
# * replication smoke tests
#
# By default, notifies the locally configured Evergreen user
# via Slack once the smoke tests are finished.
#
import hashlib
import os
import subprocess
import sys
import time
from collections import defaultdict, deque
from dataclasses import dataclass
from pathlib import Path
from socket import gethostname
from typing import Any, Deque, Dict, List, Optional, Set, Union
from rich.status import Status
# Paths anchored on this file's location within the repository.
REPL = Path(__file__).resolve().parent
ROOT = REPL.parent.parent.parent.parent
MONGO_PYTHON = ROOT.joinpath("python3-venv")
MONGO_PYTHON_INTERPRETER = MONGO_PYTHON.joinpath("bin", "python")
BAZEL = Path("/usr/local/bin/bazel")


def make_unique_name():
    """Return a short, stable identifier for this checkout.

    The last 8 hex digits of the SHA-256 of the resolved repo-root path,
    so the same checkout always maps to the same id while distinct
    checkouts on one host get distinct ids.
    """
    digest = hashlib.sha256(ROOT.resolve().as_posix().encode()).hexdigest()
    return digest[-8:]


REPO_UNIQUE_NAME = make_unique_name()
def ensure_python3_venv():
    """Ensure we are running under the repo's python3 venv interpreter.

    If not, replace the current process with the venv interpreter via
    os.execv (re-running this script from the top). Once running under
    the venv, make the repo root importable for relative imports such
    as buildscripts.
    """
    venv_interpreter = MONGO_PYTHON_INTERPRETER.as_posix()
    if sys.executable != venv_interpreter:
        # execv never returns: the current process image is replaced
        os.execv(
            MONGO_PYTHON_INTERPRETER,
            [MONGO_PYTHON_INTERPRETER, *sys.argv],
        )
    # needed for relative imports for eg: buildscripts
    sys.path.append(ROOT.as_posix())


ensure_python3_venv()
# can import these after verifying we're running with the correct venv
from buildscripts.resmokelib.utils.evergreen_conn import get_evergreen_api
def humanize_duration(x: float):
    """Format a duration in seconds as a fixed-width 'HHh MMm SSs' string.

    Fractional seconds are truncated; hours are not capped at 24.
    """
    total_minutes, seconds = divmod(int(x), 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}h {minutes:02d}m {seconds:02d}s"
@dataclass
class Node:
    """One runnable command in the smoke-test dependency graph.

    Identity (hash / equality / ordering) is based solely on ``name``,
    so names must be unique within a single CommandRunner.
    """

    name: str
    # fully stringified argv for subprocess.Popen
    args: List[str]
    # extra keyword arguments forwarded to subprocess.Popen (e.g. cwd)
    popen_kwargs: Dict[str, str]
    # stdout and stderr of the command are redirected to this file
    log_file: Path
    # nodes that must finish before this one may start
    deps: Set["Node"]
    _start_time: Optional[float] = None
    _finish_time: Optional[float] = None
    _proc: Optional[subprocess.Popen] = None

    def __str__(self):
        return f'Node("{self.name}")'

    def __repr__(self):
        return f'Node("{self.name}")'

    def __hash__(self):
        return hash(self.name)

    def __eq__(self, other: "Node"):
        return self.name == other.name

    def __lt__(self, other: "Node"):
        return self.name < other.name

    def start(self):
        """Spawn the subprocess with stdout/stderr redirected to log_file."""
        self._start_time = time.monotonic()
        logstream = self.log_file.open("w")
        # BUG FIX: popen_kwargs were stored but never forwarded to Popen,
        # so e.g. the cwd=ROOT default set by CommandRunner.command was
        # silently ignored and commands ran in the caller's cwd.
        self._proc = subprocess.Popen(
            self.args,
            stdout=logstream,
            stderr=logstream,
            **self.popen_kwargs,
        )
        # The child process holds its own copy of the file descriptor;
        # close ours so the parent does not leak one handle per command.
        logstream.close()

    def returncode(self):
        """Poll the process; return its exit code, or None while running.

        Also returns None if the node was never started. Records
        _finish_time the first time a completed process is observed, and
        returns the cached code on subsequent calls.
        """
        if self._proc is None:
            return None
        if self._finish_time is not None:
            return self._proc.returncode
        if self._proc.poll() is None:
            return None
        self._finish_time = time.monotonic()
        return self._proc.returncode

    def deps_are_satisfied(self, finished: Set["Node"]):
        """True when every dependency is in the ``finished`` set."""
        return self.deps.issubset(finished)
def normalize_deps(x: Union[None, Node, Set[Node]]):
    """Coerce a deps argument into a set.

    None becomes the empty set, any tuple/list/set is converted with
    set(), and a single node becomes a one-element set.
    """
    if isinstance(x, (tuple, list, set)):
        return set(x)
    return set() if x is None else {x}
def send_slack_notification(nodes: List[Node], total_elapsed: float):
    """Send a summary of all node results to the user's own Slack.

    One line per node: duration + name for finished nodes, just the name
    for nodes that never ran. Details (command line, return code, log
    path) for each failed node are appended after the summary block.
    Nodes that never started/finished count against overall success.
    """
    overall_success = True
    lines = [
        "```",
        f"id={REPO_UNIQUE_NAME} host={gethostname()} root={ROOT}",
    ]
    failure_lines = list()
    for node in nodes:
        rc = node.returncode()
        succeeded = rc == 0
        finished = rc is not None
        command = " ".join(node.args)
        # unfinished nodes (rc is None) also mark the run unsuccessful
        overall_success &= succeeded
        if succeeded:
            elapsed = node._finish_time - node._start_time
            lines.append(f"{humanize_duration(elapsed)} {node.name}")
        elif not finished:
            # node never started or never completed: no duration to show
            lines.append(f" {node.name}")
        else:
            elapsed = node._finish_time - node._start_time
            lines.append(f"{humanize_duration(elapsed)} {node.name}")
            failure_lines.append(f"Command '{node.name}', rc={node._proc.returncode}:")
            failure_lines.append(f"```\n{command}\n```")
            failure_lines.append(f"Log: {node.log_file}")
    lines.append("```")
    lines.extend(failure_lines)
    if overall_success:
        lines.insert(
            0,
            f"SUCCESS - Replication smoke tests passed in {humanize_duration(total_elapsed)}",
        )
    else:
        lines.insert(
            0, f"FAILURE - Replication smoke tests failed in {humanize_duration(total_elapsed)}"
        )
    evg = get_evergreen_api()
    # NOTE(review): relies on the private _auth attribute of the Evergreen
    # client to address the message to "yourself" — confirm this survives
    # client upgrades.
    evg.send_slack_message(
        target=f"@{evg._auth.username}",
        msg="\n".join(lines),
    )
class CommandRunner:
    """Runs a DAG of Node commands with bounded parallelism.

    Nodes whose dependencies have all finished are queued; up to
    ``parallelism`` of them run concurrently. The first node that exits
    non-zero aborts the whole run via subprocess.CalledProcessError.
    """

    def __init__(
        self,
        *,
        log_path: Path,
        notify_slack: bool,
        parallelism: int,
    ):
        # directory where each node's log_file lives
        self._log_path = log_path
        self._parallelism = parallelism
        # reverse edges: dependency -> nodes waiting on it
        self._downstream: Dict[Node, Set[Node]] = defaultdict(set)
        self._nodes: Set[Node] = set()
        self._finished: Set[Node] = set()
        # nodes whose deps are satisfied, awaiting a free run slot
        self._ready: Deque[Node] = deque()
        self._running: Set[Node] = set()
        # rich spinner used for the live progress line
        self._status = Status(status="repl smoke tests")
        self._start_time = time.monotonic()
        self._finish_time: Optional[float] = None
        self._notify_slack = notify_slack

    def _notify(self, event: str, node: Node):
        # only "spawn" events print the command; "ready"/"reap" events
        # just refresh the progress display
        if event == "spawn":
            print(f"{' '.join(node.args)}")
        self._update_display()

    def _update_display(self):
        """Refresh the spinner with running/completed counts and elapsed time."""
        nrun = len(self._running)
        nfin = len(self._finished)
        ntot = len(self._nodes)
        elapsed = time.monotonic() - self._start_time
        self._status.update(
            status=f"running {nrun}, completed {nfin}/{ntot} {humanize_duration(elapsed)}"
        )

    def command(
        self,
        *,
        name: str,
        args: List[Any],
        log_file: str,
        deps: Union[None, Node, Set[Node]] = None,
        **kwargs,
    ) -> Node:
        """Register a command node and return it for use as a dependency.

        ``args`` items are str()-ified, so Path objects are fine.
        ``log_file`` is resolved relative to the runner's log directory.
        Remaining kwargs are forwarded to subprocess.Popen; cwd defaults
        to the repo root.
        """
        log_file = self._log_path.joinpath(log_file)
        kwargs.setdefault("cwd", ROOT)
        node = Node(
            name=name,
            args=list(map(str, args)),
            popen_kwargs=kwargs,
            log_file=log_file,
            deps=normalize_deps(deps),
        )
        self._nodes.add(node)
        if len(node.deps) == 0:
            # no dependencies: runnable immediately
            self._notify("ready", node)
            self._ready.append(node)
        for dep in node.deps:
            self._downstream[dep].add(node)
        return node

    def run(self):
        """Execute the DAG to completion, failing fast on the first error.

        Raises:
            subprocess.CalledProcessError: when any node exits non-zero
                (a failure Slack notification is sent before re-raising).
        """
        print(f"Logging results to {self._log_path}")
        self._status.start()
        try:
            iter_finished: Set[Node] = set()
            while self._finished != self._nodes:
                # fill free slots from the ready queue
                while len(self._running) < self._parallelism and len(self._ready) > 0:
                    node = self._ready.popleft()
                    node.start()
                    self._running.add(node)
                    self._notify("spawn", node)
                # reap any completed processes
                for node in self._running:
                    rc = node.returncode()
                    if rc is not None:
                        self._notify("reap", node)
                        iter_finished.add(node)
                        self._finished.add(node)
                        if rc != 0:
                            # fail fast: first failure aborts the run
                            raise subprocess.CalledProcessError(
                                returncode=rc,
                                cmd=" ".join(node.args),
                                output=f"Log: {node.log_file}",
                            )
                        # schedule nodes unblocked by this completion
                        for down in self._downstream[node]:
                            if down.deps_are_satisfied(self._finished):
                                self._notify("ready", down)
                                self._ready.append(down)
                # remove reaped nodes outside the iteration over _running
                for node in iter_finished:
                    self._running.remove(node)
                if len(iter_finished) == 0:
                    # nothing changed this pass; avoid a busy poll loop
                    time.sleep(0.1)
                iter_finished.clear()
                self._update_display()
            elapsed = time.monotonic() - self._start_time
            if self._notify_slack:
                send_slack_notification(
                    nodes=sorted(self._nodes),
                    total_elapsed=elapsed,
                )
            print(f"Completed {len(self._finished)}/{len(self._nodes)} in {elapsed:.3f}s.")
        except subprocess.CalledProcessError as cpe:
            print(f"""\
Failure:
command {cpe.cmd}
rc {cpe.returncode}
log {cpe.output}""")
            # NOTE(review): failure notifications are sent even when
            # notify_slack is False — presumably intentional (fail
            # loudly); confirm.
            send_slack_notification(
                nodes=sorted(self._nodes),
                total_elapsed=time.monotonic() - self._start_time,
            )
            raise
        finally:
            self._status.stop()
def run_replication_smoke_tests(
    *,
    log_path: Path,
    upstream_branch: str,
    bazel_args: List[str],
    send_slack_notification: bool,
):
    """Build and execute the full smoke-test DAG.

    Stage order: four independent formatters -> install build ->
    {repl smoke tests, repl unittests} -> clang tidy.

    Args:
        log_path: base log directory; a per-checkout subdirectory named
            REPO_UNIQUE_NAME is created beneath it.
        upstream_branch: branch that clang-format diffs against.
        bazel_args: extra arguments forwarded to the bazel build/test
            stages (but not to clang-tidy).
        send_slack_notification: when truthy, report results via Slack.
    """
    # keep logs from different checkouts on the same host separate
    log_path = log_path.joinpath(REPO_UNIQUE_NAME)
    log_path.mkdir(parents=True, exist_ok=True)
    runner = CommandRunner(
        log_path=log_path,
        notify_slack=send_slack_notification,
        parallelism=os.cpu_count(),
    )
    # the four formatters have no dependencies and run concurrently
    formatters = [
        runner.command(
            name="clang format",
            args=[
                MONGO_PYTHON_INTERPRETER,
                ROOT.joinpath("buildscripts", "clang_format.py"),
                "format-my",
                upstream_branch,
            ],
            log_file="clang_format.log",
        ),
        runner.command(
            name="starlark format",
            args=[
                MONGO_PYTHON_INTERPRETER,
                ROOT.joinpath("buildscripts", "buildifier.py"),
                "--generate-report",
                "--binary-dir=./",
                "lint-all",
            ],
            log_file="starlark_format.log",
        ),
        runner.command(
            name="python format",
            args=[
                MONGO_PYTHON_INTERPRETER,
                ROOT.joinpath("buildscripts", "pylinters.py"),
                "fix",
            ],
            log_file="python_format.log",
        ),
        runner.command(
            # catch-all for other bazel-driven formatters
            name="misc. format",
            args=[
                BAZEL,
                "run",
                "//:format",
            ],
            log_file="misc_format.log",
        ),
    ]
    install = runner.command(
        name="build install executables",
        args=[
            BAZEL,
            "build",
            *bazel_args,
            "//:install-dist-test",
        ],
        log_file="build_install_dist_test.log",
        deps=formatters,
    )
    smoke_tests = runner.command(
        name="run repl smoke tests",
        args=[
            MONGO_PYTHON_INTERPRETER,
            ROOT.joinpath("buildscripts", "run_smoke_tests.py"),
            "--suites",
            "replication",
        ],
        log_file="smoke_tests.log",
        # these could run while clang tidy is running, but are believed
        # to conflict with the unittests — hence the serialized ordering
        deps=install,
    )
    unittests = runner.command(
        name="run repl unittests",
        args=[
            BAZEL,
            "test",
            *bazel_args,
            "--test_tag_filters=mongo_unittest",
            "--test_output=summary",
            "//src/mongo/db/repl/...",
        ],
        # NOTE: bazel already stores the real logs somewhere else
        log_file="unittests.log",
        # not a true dep, but bazel access has to be serialized
        deps=install,
    )
    # unfortunately this shuffles bazel stuff around meaning we have to wait
    # for our tests to finish so the executables for the smoke tests are still
    # there
    runner.command(
        name="clang tidy",
        args=[
            BAZEL,
            "build",
            # NOTE: don't use user-provided bazel args for clang-tidy
            "--config=clang-tidy",
            "--verbose_failures",
            "--keep_going",
            "//src/mongo/...",
        ],
        log_file="clang_tidy.log",
        # again not a true dep, just serializing bazel access
        deps=(smoke_tests, unittests),
    )
    runner.run()
def main():
    """CLI entry point.

    Parses the known options; anything unrecognized is passed through
    verbatim to the bazel stages.
    """
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument(
        "--log-path",
        type=Path,
        default=Path("~/.logs/replication_smoke_tests").expanduser(),
        help="Directory to place logs from smoke test stages",
    )
    parser.add_argument(
        "--upstream-branch",
        type=str,
        default="origin/master",
        help="Git branch to format diff against",
    )
    parser.add_argument(
        "--send-slack-notification",
        type=int,
        default=1,
        help='Send a slack notification based on the local evergreen configuration to "yourself"',
    )
    known, passthrough_bazel_args = parser.parse_known_args()
    run_replication_smoke_tests(
        log_path=known.log_path,
        upstream_branch=known.upstream_branch,
        bazel_args=passthrough_bazel_args,
        send_slack_notification=known.send_slack_notification,
    )


if __name__ == "__main__":
    main()