SERVER-103116: add smoke test script tests (#34264)

GitOrigin-RevId: c05577ba4aa3620515d8b4c888d5af3ec1222237
This commit is contained in:
Myles 2025-05-07 07:43:44 -05:00 committed by MongoDB Bot
parent 2fba020861
commit 1a7098051c
2 changed files with 576 additions and 0 deletions

View File

@ -0,0 +1,125 @@
# Replication Smoke Tests
# THIS FILE IS GENERATED - DO NOT MODIFY
suites:
streams:
# failed 268/12593 times, success p90 => 2.827s, score => 94.785
- src/mongo/db/modules/enterprise/jstests/streams/infinite_loop.js
# failed 139/12565 times, success p90 => 0.405s, score => 343.136
- src/mongo/db/modules/enterprise/jstests/streams/parse_only.js
# failed 20/12575 times, success p90 => 0.318s, score => 62.934
- src/mongo/db/modules/enterprise/jstests/streams/bad_source.js
# failed 37/12586 times, success p90 => 0.627s, score => 58.973
- src/mongo/db/modules/enterprise/jstests/streams/documents.js
# failed 32/12575 times, success p90 => 0.914s, score => 35.029
- src/mongo/db/modules/enterprise/jstests/streams/sample.js
# failed 33/12576 times, success p90 => 0.983s, score => 33.585
- src/mongo/db/modules/enterprise/jstests/streams/validate.js
# failed 32/12586 times, success p90 => 1.495s, score => 21.408
- src/mongo/db/modules/enterprise/jstests/streams/documents_data_source.js
# failed 30/12587 times, success p90 => 1.724s, score => 17.397
- src/mongo/db/modules/enterprise/jstests/streams/stop_reason.js
# failed 39/13999 times, success p90 => 2.497s, score => 15.619
- src/mongo/db/modules/enterprise/jstests/streams/simple_merge.js
# failed 30/12552 times, success p90 => 1.958s, score => 15.320
- src/mongo/db/modules/enterprise/jstests/streams/start_list.js
# failed 35/12566 times, success p90 => 2.328s, score => 15.032
- src/mongo/db/modules/enterprise/jstests/streams/sample_dlq.js
# failed 44/12579 times, success p90 => 3.040s, score => 14.473
- src/mongo/db/modules/enterprise/jstests/streams/dlq.js
# failed 31/14013 times, success p90 => 2.154s, score => 14.391
- src/mongo/db/modules/enterprise/jstests/streams/duplicate_fields.js
# failed 40/12583 times, success p90 => 2.882s, score => 13.878
- src/mongo/db/modules/enterprise/jstests/streams/group_dlq.js
# failed 41/14003 times, success p90 => 3.019s, score => 13.582
- src/mongo/db/modules/enterprise/jstests/streams/replace_root.js
# failed 51/12570 times, success p90 => 4.249s, score => 12.004
- src/mongo/db/modules/enterprise/jstests/streams/merge_pipeline.js
# failed 3/12577 times, success p90 => 0.262s, score => 11.438
- src/mongo/db/modules/enterprise/jstests/streams/checkpoint_helper.js
# failed 29/14016 times, success p90 => 2.594s, score => 11.178
- src/mongo/db/modules/enterprise/jstests/streams/checkpoint_backwards_compat.js
# failed 31/12566 times, success p90 => 2.779s, score => 11.156
- src/mongo/db/modules/enterprise/jstests/streams/sample_data_source.js
# failed 33/14008 times, success p90 => 3.034s, score => 10.876
- src/mongo/db/modules/enterprise/jstests/streams/set.js
# failed 39/14004 times, success p90 => 3.712s, score => 10.507
- src/mongo/db/modules/enterprise/jstests/streams/replace_with.js
# failed 20/10824 times, success p90 => 2.040s, score => 9.803
- src/mongo/db/modules/enterprise/jstests/streams/stack_trace.js
# failed 30/14004 times, success p90 => 3.540s, score => 8.474
- src/mongo/db/modules/enterprise/jstests/streams/redact.js
# failed 38/12580 times, success p90 => 4.915s, score => 7.731
- src/mongo/db/modules/enterprise/jstests/streams/late_events.js
# failed 34/14007 times, success p90 => 4.499s, score => 7.557
- src/mongo/db/modules/enterprise/jstests/streams/unwind.js
# failed 48/12575 times, success p90 => 6.484s, score => 7.403
- src/mongo/db/modules/enterprise/jstests/streams/stream_meta.js
# failed 2/12566 times, success p90 => 0.277s, score => 7.215
- src/mongo/db/modules/enterprise/jstests/streams/fake_client.js
# failed 29/14007 times, success p90 => 4.077s, score => 7.114
- src/mongo/db/modules/enterprise/jstests/streams/unset.js
# failed 37/12569 times, success p90 => 5.477s, score => 6.756
- src/mongo/db/modules/enterprise/jstests/streams/geospatial.js
# failed 38/14004 times, success p90 => 7.363s, score => 5.161
- src/mongo/db/modules/enterprise/jstests/streams/window_group.js
# failed 34/13998 times, success p90 => 7.354s, score => 4.624
- src/mongo/db/modules/enterprise/jstests/streams/feature_flag.js
# failed 217/14020 times, success p90 => 47.916s, score => 4.529
- src/mongo/db/modules/enterprise/jstests/streams/merge_requires_unique_index.js
# failed 34/12572 times, success p90 => 9.179s, score => 3.704
- src/mongo/db/modules/enterprise/jstests/streams/lookup_pipeline.js
replica_sets_initsync_static_jscore_passthrough:
# failed 53.0/13023 times, success p90 => 1.660s, score => 31.919
- jstests/core/timeseries/query/timeseries_ixscan_clusteredidxscan_union.js
# failed 10.0/10821 times, success p90 => 0.399s, score => 25.077
- jstests/core/replicate_record_ids/disallow_capped.js
# failed 19.0/13032 times, success p90 => 0.766s, score => 24.791
- jstests/core/shell/shelltypes.js
# failed 22.0/13021 times, success p90 => 0.902s, score => 24.393
- jstests/core/txns/disallow_operations_on_prepared_transaction.js
# failed 40.0/3348 times, success p90 => 1.777s, score => 22.513
- jstests/core/query/find_and_modify/find_and_modify_metrics.js
# failed 7.0/10821 times, success p90 => 0.316s, score => 22.119
- jstests/core/replicate_record_ids/replicate_record_ids_collection_creation.js
# failed 19.0/13037 times, success p90 => 0.863s, score => 22.020
- jstests/core/txns/no_writes_to_config_transactions_with_prepared_transaction.js
# failed 7.0/10817 times, success p90 => 0.324s, score => 21.612
- jstests/core/replicate_record_ids/collmod_removes_replicate_record_ids.js
# failed 18.0/13026 times, success p90 => 0.876s, score => 20.547
- jstests/core/txns/commit_prepared_transaction.js
# failed 12.0/10795 times, success p90 => 0.588s, score => 20.420
- src/mongo/db/modules/enterprise/jstests/fle2/basic_create_collection_text.js
# failed 17.0/8699 times, success p90 => 0.838s, score => 20.280
- jstests/core/shell/role_management_helpers.js
# failed 16.0/8696 times, success p90 => 0.796s, score => 20.099
- jstests/core/administrative/roles_info.js
# failed 19.0/13031 times, success p90 => 0.947s, score => 20.060
- jstests/core/index/fts/fts_proj.js
# failed 18.0/13026 times, success p90 => 0.901s, score => 19.971
- jstests/core/write/bulk/bulk_write_timeseries_basic.js
# failed 19.0/13030 times, success p90 => 0.970s, score => 19.590
- jstests/core/timeseries/geo/timeseries_geonear_mindistance_and_maxdistance.js
# failed 19.0/13039 times, success p90 => 0.976s, score => 19.461
- jstests/core/index/fts/fts_score_sort.js
# failed 18.0/13025 times, success p90 => 0.963s, score => 18.687
- jstests/core/index/geo/geo_update_btree2.js
# failed 10.0/7115 times, success p90 => 0.583s, score => 17.164
- src/mongo/db/modules/enterprise/jstests/fle2/bulk_write_insert_text.js
# failed 10.0/10821 times, success p90 => 0.596s, score => 16.767
- jstests/core/timeseries/write/timeseries_update_compressed_buckets.js
# failed 13.0/13022 times, success p90 => 0.792s, score => 16.419
- jstests/core/txns/prepare_prepared_transaction.js
# failed 19.0/2582 times, success p90 => 1.199s, score => 15.849
- jstests/core/catalog/db_stats.js
# failed 14.0/13047 times, success p90 => 0.894s, score => 15.667
- jstests/core/txns/aggregation_in_transaction.js
# failed 12.0/13033 times, success p90 => 0.811s, score => 14.790
- jstests/core/txns/transaction_error_handling.js
# failed 12.0/13046 times, success p90 => 0.815s, score => 14.722
- jstests/core/txns/prepare_nonexistent_transaction.js
# failed 10.0/8692 times, success p90 => 0.706s, score => 14.171
- jstests/core/shell/connection_string_validation.js
# failed 11.0/13036 times, success p90 => 0.793s, score => 13.868
- jstests/core/txns/list_collections_not_blocked_by_txn.js
# failed 11.0/13026 times, success p90 => 0.802s, score => 13.723
- jstests/core/txns/prepare_transaction_fails_on_temp_collections.js

View File

@ -0,0 +1,451 @@
#!/usr/bin/env python3
#
# Replication Team Smoke Tests
#
# To be run prior to submitting evergreen patches.
# Runs the following locally and makes sure they pass:
# * clang format
# * clang tidy
# * build install-dist-test
# * replication unit tests
# * replication smoke tests
#
# By default, notifies the locally configured Evergreen user
# via Slack once the smoke tests are finished.
#
import hashlib
import os
import subprocess
import sys
import time
from collections import defaultdict, deque
from dataclasses import dataclass
from pathlib import Path
from socket import gethostname
from typing import Any, Deque, Dict, List, Optional, Set, Union
from rich.status import Status
# Paths anchored on this file's location within the repository.
REPL = Path(__file__).resolve().parent
ROOT = REPL.parent.parent.parent.parent
MONGO_PYTHON = ROOT.joinpath("python3-venv")
MONGO_PYTHON_INTERPRETER = MONGO_PYTHON.joinpath("bin", "python")
BAZEL = Path("/usr/local/bin/bazel")


def make_unique_name():
    """Return a short, stable identifier for this checkout.

    The last 8 hex digits of the SHA-256 of the resolved repo-root path,
    so the same checkout always maps to the same id while distinct
    checkouts on one host get distinct ids.
    """
    digest = hashlib.sha256(ROOT.resolve().as_posix().encode()).hexdigest()
    return digest[-8:]


REPO_UNIQUE_NAME = make_unique_name()
def ensure_python3_venv():
    """Ensure we are running under the repo's python3 venv interpreter.

    If not, replace the current process with the venv interpreter via
    os.execv (re-running this script from the top). Once running under
    the venv, make the repo root importable for relative imports such
    as buildscripts.
    """
    venv_interpreter = MONGO_PYTHON_INTERPRETER.as_posix()
    if sys.executable != venv_interpreter:
        # execv never returns: the current process image is replaced
        os.execv(
            MONGO_PYTHON_INTERPRETER,
            [MONGO_PYTHON_INTERPRETER, *sys.argv],
        )
    # needed for relative imports for eg: buildscripts
    sys.path.append(ROOT.as_posix())


ensure_python3_venv()
# can import these after verifying we're running with the correct venv
from buildscripts.resmokelib.utils.evergreen_conn import get_evergreen_api
def humanize_duration(x: float):
    """Format a duration in seconds as a fixed-width 'HHh MMm SSs' string.

    Fractional seconds are truncated; hours are not capped at 24.
    """
    total_minutes, seconds = divmod(int(x), 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}h {minutes:02d}m {seconds:02d}s"
@dataclass
class Node:
    """One runnable command in the smoke-test dependency graph.

    Identity (hash / equality / ordering) is based solely on ``name``,
    so names must be unique within a single CommandRunner.
    """

    name: str
    # fully stringified argv for subprocess.Popen
    args: List[str]
    # extra keyword arguments forwarded to subprocess.Popen (e.g. cwd)
    popen_kwargs: Dict[str, str]
    # stdout and stderr of the command are redirected to this file
    log_file: Path
    # nodes that must finish before this one may start
    deps: Set["Node"]
    _start_time: Optional[float] = None
    _finish_time: Optional[float] = None
    _proc: Optional[subprocess.Popen] = None

    def __str__(self):
        return f'Node("{self.name}")'

    def __repr__(self):
        return f'Node("{self.name}")'

    def __hash__(self):
        return hash(self.name)

    def __eq__(self, other: "Node"):
        return self.name == other.name

    def __lt__(self, other: "Node"):
        return self.name < other.name

    def start(self):
        """Spawn the subprocess with stdout/stderr redirected to log_file."""
        self._start_time = time.monotonic()
        logstream = self.log_file.open("w")
        # BUG FIX: popen_kwargs were stored but never forwarded to Popen,
        # so e.g. the cwd=ROOT default set by CommandRunner.command was
        # silently ignored and commands ran in the caller's cwd.
        self._proc = subprocess.Popen(
            self.args,
            stdout=logstream,
            stderr=logstream,
            **self.popen_kwargs,
        )
        # The child process holds its own copy of the file descriptor;
        # close ours so the parent does not leak one handle per command.
        logstream.close()

    def returncode(self):
        """Poll the process; return its exit code, or None while running.

        Also returns None if the node was never started. Records
        _finish_time the first time a completed process is observed, and
        returns the cached code on subsequent calls.
        """
        if self._proc is None:
            return None
        if self._finish_time is not None:
            return self._proc.returncode
        if self._proc.poll() is None:
            return None
        self._finish_time = time.monotonic()
        return self._proc.returncode

    def deps_are_satisfied(self, finished: Set["Node"]):
        """True when every dependency is in the ``finished`` set."""
        return self.deps.issubset(finished)
def normalize_deps(x: Union[None, Node, Set[Node]]):
    """Coerce a deps argument into a set.

    None becomes the empty set, any tuple/list/set is converted with
    set(), and a single node becomes a one-element set.
    """
    if isinstance(x, (tuple, list, set)):
        return set(x)
    return set() if x is None else {x}
def send_slack_notification(nodes: List[Node], total_elapsed: float):
    """Send a summary of all node results to the user's own Slack.

    One line per node: duration + name for finished nodes, just the name
    for nodes that never ran. Details (command line, return code, log
    path) for each failed node are appended after the summary block.
    Nodes that never started/finished count against overall success.
    """
    overall_success = True
    lines = [
        "```",
        f"id={REPO_UNIQUE_NAME} host={gethostname()} root={ROOT}",
    ]
    failure_lines = list()
    for node in nodes:
        rc = node.returncode()
        succeeded = rc == 0
        finished = rc is not None
        command = " ".join(node.args)
        # unfinished nodes (rc is None) also mark the run unsuccessful
        overall_success &= succeeded
        if succeeded:
            elapsed = node._finish_time - node._start_time
            lines.append(f"{humanize_duration(elapsed)} {node.name}")
        elif not finished:
            # node never started or never completed: no duration to show
            lines.append(f" {node.name}")
        else:
            elapsed = node._finish_time - node._start_time
            lines.append(f"{humanize_duration(elapsed)} {node.name}")
            failure_lines.append(f"Command '{node.name}', rc={node._proc.returncode}:")
            failure_lines.append(f"```\n{command}\n```")
            failure_lines.append(f"Log: {node.log_file}")
    lines.append("```")
    lines.extend(failure_lines)
    if overall_success:
        lines.insert(
            0,
            f"SUCCESS - Replication smoke tests passed in {humanize_duration(total_elapsed)}",
        )
    else:
        lines.insert(
            0, f"FAILURE - Replication smoke tests failed in {humanize_duration(total_elapsed)}"
        )
    evg = get_evergreen_api()
    # NOTE(review): relies on the private _auth attribute of the Evergreen
    # client to address the message to "yourself" — confirm this survives
    # client upgrades.
    evg.send_slack_message(
        target=f"@{evg._auth.username}",
        msg="\n".join(lines),
    )
class CommandRunner:
    """Runs a DAG of Node commands with bounded parallelism.

    Nodes whose dependencies have all finished are queued; up to
    ``parallelism`` of them run concurrently. The first node that exits
    non-zero aborts the whole run via subprocess.CalledProcessError.
    """

    def __init__(
        self,
        *,
        log_path: Path,
        notify_slack: bool,
        parallelism: int,
    ):
        # directory where each node's log_file lives
        self._log_path = log_path
        self._parallelism = parallelism
        # reverse edges: dependency -> nodes waiting on it
        self._downstream: Dict[Node, Set[Node]] = defaultdict(set)
        self._nodes: Set[Node] = set()
        self._finished: Set[Node] = set()
        # nodes whose deps are satisfied, awaiting a free run slot
        self._ready: Deque[Node] = deque()
        self._running: Set[Node] = set()
        # rich spinner used for the live progress line
        self._status = Status(status="repl smoke tests")
        self._start_time = time.monotonic()
        self._finish_time: Optional[float] = None
        self._notify_slack = notify_slack

    def _notify(self, event: str, node: Node):
        # only "spawn" events print the command; "ready"/"reap" events
        # just refresh the progress display
        if event == "spawn":
            print(f"{' '.join(node.args)}")
        self._update_display()

    def _update_display(self):
        """Refresh the spinner with running/completed counts and elapsed time."""
        nrun = len(self._running)
        nfin = len(self._finished)
        ntot = len(self._nodes)
        elapsed = time.monotonic() - self._start_time
        self._status.update(
            status=f"running {nrun}, completed {nfin}/{ntot} {humanize_duration(elapsed)}"
        )

    def command(
        self,
        *,
        name: str,
        args: List[Any],
        log_file: str,
        deps: Union[None, Node, Set[Node]] = None,
        **kwargs,
    ) -> Node:
        """Register a command node and return it for use as a dependency.

        ``args`` items are str()-ified, so Path objects are fine.
        ``log_file`` is resolved relative to the runner's log directory.
        Remaining kwargs are forwarded to subprocess.Popen; cwd defaults
        to the repo root.
        """
        log_file = self._log_path.joinpath(log_file)
        kwargs.setdefault("cwd", ROOT)
        node = Node(
            name=name,
            args=list(map(str, args)),
            popen_kwargs=kwargs,
            log_file=log_file,
            deps=normalize_deps(deps),
        )
        self._nodes.add(node)
        if len(node.deps) == 0:
            # no dependencies: runnable immediately
            self._notify("ready", node)
            self._ready.append(node)
        for dep in node.deps:
            self._downstream[dep].add(node)
        return node

    def run(self):
        """Execute the DAG to completion, failing fast on the first error.

        Raises:
            subprocess.CalledProcessError: when any node exits non-zero
                (a failure Slack notification is sent before re-raising).
        """
        print(f"Logging results to {self._log_path}")
        self._status.start()
        try:
            iter_finished: Set[Node] = set()
            while self._finished != self._nodes:
                # fill free slots from the ready queue
                while len(self._running) < self._parallelism and len(self._ready) > 0:
                    node = self._ready.popleft()
                    node.start()
                    self._running.add(node)
                    self._notify("spawn", node)
                # reap any completed processes
                for node in self._running:
                    rc = node.returncode()
                    if rc is not None:
                        self._notify("reap", node)
                        iter_finished.add(node)
                        self._finished.add(node)
                        if rc != 0:
                            # fail fast: first failure aborts the run
                            raise subprocess.CalledProcessError(
                                returncode=rc,
                                cmd=" ".join(node.args),
                                output=f"Log: {node.log_file}",
                            )
                        # schedule nodes unblocked by this completion
                        for down in self._downstream[node]:
                            if down.deps_are_satisfied(self._finished):
                                self._notify("ready", down)
                                self._ready.append(down)
                # remove reaped nodes outside the iteration over _running
                for node in iter_finished:
                    self._running.remove(node)
                if len(iter_finished) == 0:
                    # nothing changed this pass; avoid a busy poll loop
                    time.sleep(0.1)
                iter_finished.clear()
                self._update_display()
            elapsed = time.monotonic() - self._start_time
            if self._notify_slack:
                send_slack_notification(
                    nodes=sorted(self._nodes),
                    total_elapsed=elapsed,
                )
            print(f"Completed {len(self._finished)}/{len(self._nodes)} in {elapsed:.3f}s.")
        except subprocess.CalledProcessError as cpe:
            print(f"""\
Failure:
command {cpe.cmd}
rc {cpe.returncode}
log {cpe.output}""")
            # NOTE(review): failure notifications are sent even when
            # notify_slack is False — presumably intentional (fail
            # loudly); confirm.
            send_slack_notification(
                nodes=sorted(self._nodes),
                total_elapsed=time.monotonic() - self._start_time,
            )
            raise
        finally:
            self._status.stop()
def run_replication_smoke_tests(
    *,
    log_path: Path,
    upstream_branch: str,
    bazel_args: List[str],
    send_slack_notification: bool,
):
    """Build and execute the full smoke-test DAG.

    Stage order: four independent formatters -> install build ->
    {repl smoke tests, repl unittests} -> clang tidy.

    Args:
        log_path: base log directory; a per-checkout subdirectory named
            REPO_UNIQUE_NAME is created beneath it.
        upstream_branch: branch that clang-format diffs against.
        bazel_args: extra arguments forwarded to the bazel build/test
            stages (but not to clang-tidy).
        send_slack_notification: when truthy, report results via Slack.
    """
    # keep logs from different checkouts on the same host separate
    log_path = log_path.joinpath(REPO_UNIQUE_NAME)
    log_path.mkdir(parents=True, exist_ok=True)
    runner = CommandRunner(
        log_path=log_path,
        notify_slack=send_slack_notification,
        parallelism=os.cpu_count(),
    )
    # the four formatters have no dependencies and run concurrently
    formatters = [
        runner.command(
            name="clang format",
            args=[
                MONGO_PYTHON_INTERPRETER,
                ROOT.joinpath("buildscripts", "clang_format.py"),
                "format-my",
                upstream_branch,
            ],
            log_file="clang_format.log",
        ),
        runner.command(
            name="starlark format",
            args=[
                MONGO_PYTHON_INTERPRETER,
                ROOT.joinpath("buildscripts", "buildifier.py"),
                "--generate-report",
                "--binary-dir=./",
                "lint-all",
            ],
            log_file="starlark_format.log",
        ),
        runner.command(
            name="python format",
            args=[
                MONGO_PYTHON_INTERPRETER,
                ROOT.joinpath("buildscripts", "pylinters.py"),
                "fix",
            ],
            log_file="python_format.log",
        ),
        runner.command(
            # catch-all for other bazel-driven formatters
            name="misc. format",
            args=[
                BAZEL,
                "run",
                "//:format",
            ],
            log_file="misc_format.log",
        ),
    ]
    install = runner.command(
        name="build install executables",
        args=[
            BAZEL,
            "build",
            *bazel_args,
            "//:install-dist-test",
        ],
        log_file="build_install_dist_test.log",
        deps=formatters,
    )
    smoke_tests = runner.command(
        name="run repl smoke tests",
        args=[
            MONGO_PYTHON_INTERPRETER,
            ROOT.joinpath("buildscripts", "run_smoke_tests.py"),
            "--suites",
            "replication",
        ],
        log_file="smoke_tests.log",
        # these could run while clang tidy is running, but are believed
        # to conflict with the unittests — hence the serialized ordering
        deps=install,
    )
    unittests = runner.command(
        name="run repl unittests",
        args=[
            BAZEL,
            "test",
            *bazel_args,
            "--test_tag_filters=mongo_unittest",
            "--test_output=summary",
            "//src/mongo/db/repl/...",
        ],
        # NOTE: bazel already stores the real logs somewhere else
        log_file="unittests.log",
        # not a true dep, but bazel access has to be serialized
        deps=install,
    )
    # unfortunately this shuffles bazel stuff around meaning we have to wait
    # for our tests to finish so the executables for the smoke tests are still
    # there
    runner.command(
        name="clang tidy",
        args=[
            BAZEL,
            "build",
            # NOTE: don't use user-provided bazel args for clang-tidy
            "--config=clang-tidy",
            "--verbose_failures",
            "--keep_going",
            "//src/mongo/...",
        ],
        log_file="clang_tidy.log",
        # again not a true dep, just serializing bazel access
        deps=(smoke_tests, unittests),
    )
    runner.run()
def main():
    """CLI entry point.

    Parses the known options; anything unrecognized is passed through
    verbatim to the bazel stages.
    """
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument(
        "--log-path",
        type=Path,
        default=Path("~/.logs/replication_smoke_tests").expanduser(),
        help="Directory to place logs from smoke test stages",
    )
    parser.add_argument(
        "--upstream-branch",
        type=str,
        default="origin/master",
        help="Git branch to format diff against",
    )
    parser.add_argument(
        "--send-slack-notification",
        type=int,
        default=1,
        help='Send a slack notification based on the local evergreen configuration to "yourself"',
    )
    known, passthrough_bazel_args = parser.parse_known_args()
    run_replication_smoke_tests(
        log_path=known.log_path,
        upstream_branch=known.upstream_branch,
        bazel_args=passthrough_bazel_args,
        send_slack_notification=known.send_slack_notification,
    )


if __name__ == "__main__":
    main()