mongo/buildscripts/resmokelib/testing/job.py

616 lines
26 KiB
Python

"""Enable running tests simultaneously by processing them from a multi-consumer queue."""
import logging
import sys
import threading
import time
from collections import namedtuple
from typing import TYPE_CHECKING, List, Optional, Union
from opentelemetry import context, trace
from opentelemetry.context.context import Context
from opentelemetry.trace.status import StatusCode
from buildscripts.resmokelib import config, errors
from buildscripts.resmokelib.hang_analyzer.timeout_for_hang_analyzer import (
TimeoutForHangAnalyzer,
)
from buildscripts.resmokelib.testing import testcases
from buildscripts.resmokelib.testing.fixtures import shardedcluster
from buildscripts.resmokelib.testing.fixtures.interface import Fixture, create_fixture_table
from buildscripts.resmokelib.testing.hook_test_archival import HookTestArchival
from buildscripts.resmokelib.testing.hooks.interface import Hook
from buildscripts.resmokelib.testing.queue_element import QueueElem, QueueElemRepeatTime
from buildscripts.resmokelib.testing.report import TestReport
from buildscripts.resmokelib.testing.testcases import fixture as _fixture
from buildscripts.resmokelib.testing.testcases.interface import TestCase
from buildscripts.resmokelib.utils import queue as _queue
# TODO: if we ever fix the circular deps in resmoke we will be able to get rid of this
if TYPE_CHECKING:
from buildscripts.resmokelib.testing.executor import TestQueue
TRACER = trace.get_tracer("resmoke")
class Job(object):
"""Run tests from a queue."""
def __init__(
self,
job_num: int,
logger: logging.Logger,
fixture: Fixture,
hooks: List[Hook],
report: TestReport,
archival: HookTestArchival,
suite_options: config.SuiteOptions,
test_queue_logger: logging.Logger,
):
"""Initialize the job with the specified fixture and hooks."""
self.logger = logger
self.fixture = fixture
self.hooks = hooks
self.report = report
self.archival = archival
self.suite_options = suite_options
self.manager = FixtureTestCaseManager(test_queue_logger, self.fixture, job_num, self.report)
# Don't check fixture.is_running() when using hooks that kill and restart fixtures, such
# as ContinuousStepdown or KillReplicator. Even if the fixture is still running as
# expected, there is a race where fixture.is_running() could fail if called after the
# primary was killed but before it was restarted.
self._check_if_fixture_running = not any(
hasattr(hook, "STOPS_FIXTURE") and hook.STOPS_FIXTURE for hook in self.hooks
)
@property
def job_num(self) -> int:
"""Forward the job_num option from FixtureTestCaseManager."""
return self.manager.job_num
@staticmethod
def _interrupt_all_jobs(
queue: "TestQueue[Union[QueueElemRepeatTime, QueueElem]]", interrupt_flag: threading.Event
):
# Set the interrupt flag so that other jobs do not start running more tests.
interrupt_flag.set()
# Drain the queue to unblock the main thread.
Job._drain_queue(queue)
def start(
self,
queue: "TestQueue[Union[QueueElemRepeatTime, QueueElem]]",
interrupt_flag: threading.Event,
parent_context: Context,
setup_flag: Optional[threading.Event] = None,
teardown_flag: Optional[threading.Event] = None,
hook_failure_flag: Optional[threading.Event] = None,
):
"""Continuously execute tests from 'queue' and records their details in 'report'.
If 'setup_flag' is not None, then a test to set up the fixture will be run
before running any other test. If an error occurs while setting up the fixture,
then the 'setup_flag' will be set.
If 'teardown_flag' is not None, then a test to tear down the fixture
will be run before this method returns. If an error occurs
while destroying the fixture, then the 'teardown_flag' will be set.
"""
# Since this is called from another thread we need to pass the context in
# This will make it have the correct parent and traceid
context.attach(parent_context)
setup_succeeded = True
if setup_flag is not None:
try:
setup_succeeded = self.manager.setup_fixture(self.logger)
except errors.StopExecution as err:
# Something went wrong when setting up the fixture. Perhaps we couldn't get a
# test_id from logkeeper for where to put the log output. We don't attempt to run
# any tests.
self.logger.error(
"Received a StopExecution exception when setting up the fixture: %s.", err
)
setup_succeeded = False
except:
# Something unexpected happened when setting up the fixture. We don't attempt to run
# any tests.
self.logger.exception("Encountered an error when setting up the fixture.")
setup_succeeded = False
if not setup_succeeded:
setup_flag.set()
self._interrupt_all_jobs(queue, interrupt_flag)
if setup_succeeded:
try:
self._run(queue, interrupt_flag, teardown_flag, hook_failure_flag)
except errors.StopExecution as err:
# Stop running tests immediately.
self.logger.error("Received a StopExecution exception: %s.", err)
self._interrupt_all_jobs(queue, interrupt_flag)
except:
# Unknown error, stop execution.
self.logger.exception("Encountered an error during test execution.")
self._interrupt_all_jobs(queue, interrupt_flag)
if teardown_flag is not None:
try:
teardown_succeeded = self.manager.teardown_fixture(self.logger)
except errors.StopExecution as err:
# Something went wrong when tearing down the fixture. Perhaps we couldn't get a
# test_id from logkeeper for where to put the log output. We indicate back to the
# executor thread that teardown has failed. This likely means resmoke.py is exiting
# without having terminated all of the child processes it spawned.
self.logger.error(
"Received a StopExecution exception when tearing down the fixture: %s.", err
)
teardown_succeeded = False
except:
# Something unexpected happened when tearing down the fixture. We indicate back to
# the executor thread that teardown has failed. This may mean resmoke.py is exiting
# without having terminated all of the child processes it spawned.
self.logger.exception("Encountered an error when tearing down the fixture.")
teardown_succeeded = False
if not teardown_succeeded:
teardown_flag.set()
@staticmethod
def _get_time():
"""Get current time to aid in the unit testing of the _run method."""
return time.time()
def _run(
self,
queue: "TestQueue[Union[QueueElemRepeatTime, QueueElem]]",
interrupt_flag: threading.Event,
teardown_flag: Optional[threading.Event] = None,
hook_failure_flag: Optional[threading.Event] = None,
):
"""Call the before/after suite hooks and continuously execute tests from 'queue'."""
self._run_hooks_before_suite(hook_failure_flag)
while not queue.empty() and not interrupt_flag.is_set():
queue_elem = queue.get_nowait()
queue_elem.job_started()
test_time_start = self._get_time()
test = queue_elem.testcase
self._requeue_test(queue, queue_elem, interrupt_flag)
try:
self._execute_test(test, hook_failure_flag)
finally:
queue_elem.job_completed(self._get_time() - test_time_start)
queue.task_done()
self._run_hooks_after_suite(teardown_flag, hook_failure_flag)
def _log_requeue_test(self, queue_elem: QueueElemRepeatTime):
"""Log the requeue of a test."""
if self.suite_options.time_repeat_tests_secs:
progress = "{} of ({}/{}/{:2.2f} min/max/time)".format(
queue_elem.repeat_num + 1,
self.suite_options.num_repeat_tests_min,
self.suite_options.num_repeat_tests_max,
self.suite_options.time_repeat_tests_secs,
)
else:
progress = "{} of {}".format(
queue_elem.repeat_num + 1, self.suite_options.num_repeat_tests
)
self.logger.info(
("Requeueing test %s %s, cumulative time elapsed %0.2f"),
queue_elem.testcase.test_name,
progress,
queue_elem.repeat_time_elapsed,
)
def _requeue_test(
self,
queue: "TestQueue[Union[QueueElemRepeatTime, QueueElem]]",
queue_elem: QueueElemRepeatTime,
interrupt_flag: threading.Event,
):
"""Requeue a test if it needs to be repeated."""
if not queue_elem.should_requeue():
return
queue_elem.testcase = testcases.make_test_case(
queue_elem.testcase.REGISTERED_NAME,
queue_elem.testcase.logger,
[queue_elem.testcase.test_name],
**queue_elem.test_config,
)
if not interrupt_flag.is_set():
self._log_requeue_test(queue_elem)
queue.put(queue_elem)
@TRACER.start_as_current_span("job._execute_test")
def _execute_test(self, test: TestCase, hook_failure_flag: Optional[threading.Event]):
"""Call the before/after test hooks and execute 'test'."""
common_test_attributes = test.get_test_otel_attributes()
execute_test_span = trace.get_current_span()
execute_test_span.set_attributes(attributes=common_test_attributes)
execute_test_span.set_status(StatusCode.ERROR, "fail_early")
try:
test.configure(self.fixture, config.NUM_CLIENTS_PER_FIXTURE, config.USE_TENANT_CLIENT)
except:
self.logger.error(
"%s marked as a failure because it could not be configured.",
test.short_description(),
)
self._fail_test(test, sys.exc_info(), return_code=2)
raise errors.StopExecution("Could not configure the test %s" % test.short_description())
self._run_hooks_before_tests(test, hook_failure_flag)
self.report.logging_prefix = create_fixture_table(self.fixture)
with TRACER.start_as_current_span("run_test", attributes=common_test_attributes):
if config.TEST_TIMEOUT:
timer = threading.Timer(config.TEST_TIMEOUT, test.on_timeout)
timer.start()
test(self.report)
if config.TEST_TIMEOUT:
timer.cancel()
timer.join()
try:
if test.propagate_error is not None:
raise test.propagate_error
if test.timed_out.is_set():
# Restart the fixture, since it has been killed by the test's timeout handler
self.logger.info(
"Restarting the fixture since it is not running after a test timed out."
)
self.fixture.setup()
self.fixture.await_ready()
# We are intentionally only checking the individual 'test' status and not calling
# report.wasSuccessful() here. It is possible that a thread running in the background as
# part of a hook has added a failed test case to 'self.report'. Checking the individual
# 'test' status ensures self._run_hooks_after_tests() is called if it is a hook's test
# case that has failed and not 'test' that has failed.
if self.suite_options.fail_fast and self.report.find_test_info(test).status != "pass":
self.logger.info("%s failed, so stopping..." % (test.short_description()))
raise errors.StopExecution("%s failed" % (test.short_description()))
if self._check_if_fixture_running and not self.fixture.is_running():
self.logger.error(
"%s marked as a failure because the fixture crashed during the test.",
test.short_description(),
)
self.report.setFailure(
test, return_code=2, reason="the fixture crashed during the test"
)
# Always fail fast if the fixture fails.
raise errors.StopExecution(
"%s not running after %s" % (self.fixture, test.short_description())
)
finally:
success: bool = self.report.find_test_info(test).status == "pass"
if success:
execute_test_span.set_status(StatusCode.OK)
else:
execute_test_span.set_status(StatusCode.ERROR, "fail")
# Stop background hooks first since they can interfere with fixture startup and teardown
# done as part of archival.
self._run_hooks_after_tests(test, hook_failure_flag, background=True)
if self.archival:
result = TestResult(test=test, hook=None, success=success)
self.archival.archive(self.logger, result, self.manager)
self._run_hooks_after_tests(test, hook_failure_flag, background=False)
def _run_hook(
self,
hook: Hook,
hook_function,
test: TestCase,
hook_failure_flag: Optional[threading.Event],
):
"""Provide helper to run hook and archival."""
try:
success = False
TimeoutForHangAnalyzer(
timeout=config.HANG_ANALYZER_HOOK_TIMEOUT,
func=hook_function,
args=(test, self.report),
).run()
success = True
except TimeoutError:
self.logger.error(
f"The '{hook_function.__name__}' of hook {hook.__class__.__name__} did not complete in {config.HANG_ANALYZER_HOOK_TIMEOUT} seconds. The hook may continue to run in the background. When the hang analyzer is called, this timeout is enforced to ensure Resmoke can complete a graceful shutdown."
)
finally:
if not success and hook_failure_flag is not None:
hook_failure_flag.set()
if self.archival:
result = TestResult(test=test, hook=hook, success=success)
self.archival.archive(self.logger, result, self.manager)
@TRACER.start_as_current_span("job._run_hooks_before_suite")
def _run_hooks_before_suite(self, hook_failure_flag: Optional[threading.Event]):
"""Run the before_suite method on each of the hooks."""
run_hooks_before_suite_span = trace.get_current_span()
hooks_failed = True
try:
for hook in self.hooks:
hook.before_suite(self.report)
hooks_failed = False
finally:
if hooks_failed and hook_failure_flag is not None:
hook_failure_flag.set()
run_hooks_before_suite_span.set_status(
StatusCode.ERROR if hooks_failed else StatusCode.OK
)
@TRACER.start_as_current_span("job._run_hooks_after_suite")
def _run_hooks_after_suite(
self, teardown_flag: Optional[threading.Event], hook_failure_flag: Optional[threading.Event]
):
"""Run the after_suite method on each of the hooks."""
run_hooks_after_suite_span = trace.get_current_span()
hooks_failed = True
try:
for hook in self.hooks:
TimeoutForHangAnalyzer(
timeout=config.HANG_ANALYZER_HOOK_TIMEOUT,
func=hook.after_suite,
args=(self.report, teardown_flag),
).run()
hooks_failed = False
except TimeoutError:
self.logger.error(
f"The 'after_suite' of hook {hook.__class__.__name__} did not complete in {config.HANG_ANALYZER_HOOK_TIMEOUT} seconds. The hook may continue to run in the background. When the hang analyzer is called, this timeout is enforced to ensure Resmoke can complete a graceful shutdown."
)
finally:
if hooks_failed and hook_failure_flag is not None:
hook_failure_flag.set()
run_hooks_after_suite_span.set_status(
StatusCode.ERROR if hooks_failed else StatusCode.OK
)
def _run_hooks_before_tests(self, test: TestCase, hook_failure_flag: Optional[threading.Event]):
"""Run the before_test method on each of the hooks.
Swallows any TestFailure exceptions if set to continue on
failure, and reraises any other exceptions.
"""
try:
for hook in self.hooks:
self._run_hook(hook, hook.before_test, test, hook_failure_flag)
except errors.StopExecution:
raise
except errors.ServerFailure:
self.logger.exception(
"%s marked as a failure by a hook's before_test.", test.short_description()
)
self._fail_test(test, sys.exc_info(), return_code=2)
raise errors.StopExecution("A hook's before_test failed")
except errors.TestFailure:
self.logger.exception(
"%s marked as a failure by a hook's before_test.", test.short_description()
)
self._fail_test(test, sys.exc_info(), return_code=1)
if self.suite_options.fail_fast:
raise errors.StopExecution("A hook's before_test failed")
except:
# Record the before_test() error in 'self.report'.
self.report.startTest(test)
self.report.addError(test, sys.exc_info())
self.report.stopTest(test)
raise
def _run_hooks_after_tests(
self, test: TestCase, hook_failure_flag: Optional[threading.Event], background: bool = False
):
"""Run the after_test method on each of the hooks.
Swallows any TestFailure exceptions if set to continue on
failure, and reraises any other exceptions.
@param test: the test after which we run the hooks.
@param background: whether to run background hooks.
"""
suite_with_balancer = (
isinstance(self.fixture, shardedcluster.ShardedClusterFixture)
and self.fixture.enable_balancer
)
if not background and suite_with_balancer:
try:
self.logger.info("Stopping the balancer before running end-test hooks")
self.fixture.stop_balancer()
except:
self.logger.exception(
"%s failed while stopping the balancer for after-test hooks",
test.short_description(),
)
self.report.setFailure(
test,
return_code=2,
reason="the balancer failed to stop before running after-test hooks",
)
if self.archival:
result = TestResult(test=test, hook=None, success=False)
self.archival.archive(self.logger, result, self.manager)
raise errors.StopExecution("stop_balancer failed before running after test hooks")
try:
for hook in self.hooks:
if hook.IS_BACKGROUND == background:
self._run_hook(hook, hook.after_test, test, hook_failure_flag)
except errors.StopExecution:
raise
except errors.ServerFailure:
self.logger.exception(
"%s marked as a failure by a hook's after_test.", test.short_description()
)
self.report.setFailure(
test, return_code=2, reason=f"The hook {hook.REGISTERED_NAME} failed."
)
raise errors.StopExecution("A hook's after_test failed")
except errors.TestFailure:
self.logger.exception(
"%s marked as a failure by a hook's after_test.", test.short_description()
)
self.report.setFailure(
test, return_code=1, reason=f"The hook {hook.REGISTERED_NAME} failed."
)
if self.suite_options.fail_fast:
raise errors.StopExecution("A hook's after_test failed")
except:
self.report.setError(test, sys.exc_info())
raise
if not background and suite_with_balancer:
try:
self.logger.info("Resuming the balancer after running end-test hooks")
self.fixture.start_balancer()
except:
self.logger.exception(
"%s failed while re-starting the balancer after end-test hooks",
test.short_description(),
)
self.report.setFailure(
test,
return_code=2,
reason="the balancer failed to restart after running after test hooks",
)
if self.archival:
result = TestResult(test=test, hook=None, success=False)
self.archival.archive(self.logger, result, self.manager)
raise errors.StopExecution("start_balancer failed after running after test hooks")
def _fail_test(self, test: TestCase, exc_info, return_code=1):
"""Provide helper to record a test as a failure with the provided return code.
This method should not be used if 'test' has already been
started, instead use TestReport.setFailure().
"""
self.report.startTest(test)
test.return_code = return_code
self.report.addFailure(test, exc_info)
self.report.stopTest(test)
@staticmethod
def _drain_queue(queue):
"""Remove all elements from 'queue' without actually doing anything to them.
Necessary to unblock the main thread that is waiting for 'queue' to be empty.
"""
try:
while not queue.empty():
queue.get_nowait()
queue.task_done()
except _queue.Empty:
# Multiple threads may be draining the queue simultaneously, so just ignore the
# exception from the race between queue.empty() being false and failing to get an item.
pass
TestResult = namedtuple("TestResult", ["test", "hook", "success"])
class FixtureTestCaseManager:
"""Class that holds information needed to create new fixture setup/teardown test cases for a single job."""
def __init__(
self, test_queue_logger: logging.Logger, fixture: Fixture, job_num: int, report: TestReport
):
"""
Initialize the test case manager.
:param test_queue_logger: The logger associated with this job's test queue.
:param fixture: The fixture associated with this job.
:param job_num: This job's unique identifier.
:param report: Report object collecting test results.
"""
self.test_queue_logger = test_queue_logger
self.fixture = fixture
self.job_num = job_num
self.report = report
self.times_set_up = 0 # Setups and kills may run multiple times.
def setup_fixture(self, logger: logging.Logger):
"""
Run a test that sets up the job's fixture and waits for it to be ready.
Return True if the setup was successful, False otherwise.
"""
test_case = _fixture.FixtureSetupTestCase(
self.test_queue_logger, self.fixture, "job{}".format(self.job_num), self.times_set_up
)
test_case(self.report)
if self.report.find_test_info(test_case).status != "pass":
logger.error("The setup of %s failed.", self.fixture)
return False
return True
def teardown_fixture(self, logger: logging.Logger, abort: bool = False):
"""
Run a test that tears down the job's fixture.
Return True if the teardown was successful, False otherwise.
"""
try:
test_case: Union[_fixture.FixtureAbortTestCase, _fixture.FixtureTeardownTestCase] = None
if abort:
test_case = _fixture.FixtureAbortTestCase(
self.test_queue_logger,
self.fixture,
"job{}".format(self.job_num),
self.times_set_up,
)
self.times_set_up += 1
else:
test_case = _fixture.FixtureTeardownTestCase(
self.test_queue_logger, self.fixture, "job{}".format(self.job_num)
)
# Refresh the fixture table before teardown to capture changes due to
# CleanEveryN and stepdown hooks.
self.report.logging_prefix = create_fixture_table(self.fixture)
test_case(self.report)
if self.report.find_test_info(test_case).status != "pass":
logger.error("The teardown of %s failed.", self.fixture)
return False
return True
finally:
# This is a failsafe. In the event that 'teardown_fixture' fails,
# any rogue logger handlers will be removed from this fixture.
# If not cleaned up, these will trigger 'setup failures' --
# indicated by exiting with LoggerRuntimeConfigError.EXIT_CODE.
if not isinstance(test_case, _fixture.FixtureAbortTestCase):
for handler in self.fixture.logger.handlers:
# We ignore the cancellation token returned by close_later() since we always
# want the logs to eventually get flushed.
self.fixture.fixturelib.close_loggers(handler)