From 37a3ed4860ea3b40bc7791aff6cec9b0d3183bde Mon Sep 17 00:00:00 2001 From: Sean Lyons Date: Fri, 15 Aug 2025 10:48:25 -0400 Subject: [PATCH] SERVER-102654 Save core dumps as undeclared test output (#40094) GitOrigin-RevId: ba47e626d2c03546af8b76ed7fcfab9891b29b29 --- bazel/resmoke/resmoke_shim.py | 40 +++++++++++++++++++- etc/evergreen_yml_components/definitions.yml | 4 +- evergreen/gather_mongo_coredumps.sh | 11 ++++-- 3 files changed, 47 insertions(+), 8 deletions(-) diff --git a/bazel/resmoke/resmoke_shim.py b/bazel/resmoke/resmoke_shim.py index 5fed28b9a83..513e0b0beb0 100644 --- a/bazel/resmoke/resmoke_shim.py +++ b/bazel/resmoke/resmoke_shim.py @@ -1,12 +1,17 @@ import os import pathlib +import signal import sys from functools import cache +import psutil + REPO_ROOT = pathlib.Path(__file__).parent.parent.parent sys.path.append(str(REPO_ROOT)) from buildscripts.bazel_local_resources import acquire_local_resource from buildscripts.resmokelib import cli +from buildscripts.resmokelib.hang_analyzer.process import signal_python +from buildscripts.resmokelib.logging.loggers import new_resmoke_logger @cache @@ -43,6 +48,36 @@ def add_evergreen_build_info(args): add_volatile_arg(args, "--versionId=", "version_id") add_volatile_arg(args, "--requester=", "requester") +class ResmokeShimContext: + def __init__(self): + self.links = [] + + def __enter__(self): + # Bazel will send SIGTERM on a test timeout. If all processes haven't terminated + # after --local_termination_grace_seconds (default 15s), Bazel will SIGKILL them instead. + signal.signal(signal.SIGTERM, self._handle_interrupt) + + # Symlink source directories because resmoke uses relative paths profusely. 
+ base_dir = os.path.join(os.environ.get("TEST_SRCDIR"), "_main") + working_dir = os.getcwd() + for entry in os.scandir(base_dir): + link = os.path.join(working_dir, entry.name) + self.links.append(link) + os.symlink(entry.path, link) + return self + + def __exit__(self, exception_type, exception_value, exception_traceback): + for link in self.links: + os.unlink(link) + + def _handle_interrupt(self, signum, frame): + # Attempt a clean shutdown, producing python stacktraces and generating core dumps for + # any still running process. It is likely that most programs will have terminated before + # core dumps can be produced, since Bazel sends SIGTERM to all processes, not just this one. + # TODO: SERVER-109274 + pid = os.getpid() + p = psutil.Process(pid) + signal_python(new_resmoke_logger(), p.name(), pid) if __name__ == "__main__": sys.argv[0] = ( @@ -66,6 +101,7 @@ if __name__ == "__main__": resmoke_args.append(f"--dbpathPrefix={os.path.join(undeclared_output_dir,'data')}") resmoke_args.append(f"--taskWorkDir={undeclared_output_dir}") resmoke_args.append(f"--reportFile={os.path.join(undeclared_output_dir,'report.json')}") + os.chdir(undeclared_output_dir) if os.environ.get("TEST_SHARD_INDEX") and os.environ.get("TEST_TOTAL_SHARDS"): shard_count = os.environ.get("TEST_TOTAL_SHARDS") @@ -90,7 +126,7 @@ if __name__ == "__main__": ): resmoke_args.append("--historicTestRuntimes=bazel/resmoke/test_runtimes.json") - - cli.main(resmoke_args) + with ResmokeShimContext() as ctx: + cli.main(resmoke_args) lock.release() diff --git a/etc/evergreen_yml_components/definitions.yml b/etc/evergreen_yml_components/definitions.yml index 89ea48028ae..b3290c20586 100644 --- a/etc/evergreen_yml_components/definitions.yml +++ b/etc/evergreen_yml_components/definitions.yml @@ -2587,8 +2587,8 @@ functions: - "src/evergreen/run_python_script.sh" - "buildscripts/fast_archive.py" - "-f=mongo-coredumps.json" - - "-p=./**.core" - - "-p=./**.mdmp" # Windows: minidumps + - "-p=./*.core" + - 
"-p=./*.mdmp" # Windows: minidumps - "-n=Core Dump" "archive mongo coredumps": &archive_mongo_coredumps diff --git a/evergreen/gather_mongo_coredumps.sh b/evergreen/gather_mongo_coredumps.sh index aa38dc018e6..763543c317d 100755 --- a/evergreen/gather_mongo_coredumps.sh +++ b/evergreen/gather_mongo_coredumps.sh @@ -1,10 +1,13 @@ cd src -# Find all core files and move to src -core_files=$(/usr/bin/find -H .. \( -name "*.core" -o -name "*.mdmp" \) 2>/dev/null) +# Find all core files and symlink them to src +# -H makes find follow symlinks only when they are given on the command line, so bazel-testlogs +# is listed explicitly. This ensures we look in bazel-testlogs, but don't follow other soft-links +# and end up with multiple copies of the same core dump from bazel-testlogs, bazel-out, etc. +core_files=$(/usr/bin/find -H .. bazel-testlogs \( -name "*.core" -o -name "*.mdmp" \) 2>/dev/null) for core_file in $core_files; do base_name=$(echo $core_file | sed "s/.*\///") - # Move file if it does not already exist + # Symlink file if it does not already exist if [ ! -f $base_name ]; then - mv $core_file . + ln -sf "$core_file" "$base_name" fi done