mongo/buildscripts/powercycle/plugins.py

511 lines
24 KiB
Python

"""Interactions with the undodb tool-suite."""
import getpass
import os
import re
import subprocess
import sys
import shlex
import yaml
from buildscripts.powercycle.run import RemoteOperations, SSHOperation
from buildscripts.resmokelib.plugin import PluginInterface, Subcommand
# Relative path of the YAML expansions file; it is opened relative to the current working
# directory, and its parsed contents drive every powercycle subcommand in this module.
_EXPANSIONS_FILE = 'expansions.yml'
class PowercycleCommand(Subcommand):  # pylint: disable=abstract-method, too-many-instance-attributes
    """
    Base class for remote operations to set up powercycle.

    On construction the expansions file is parsed and the SSH connection settings derived from
    it are stored on the instance, together with a ready-to-use RemoteOperations object.
    """

    def __init__(self):
        """
        Initialize PowercycleCommand.

        :raises KeyError: if a required expansion ('workdir', 'ssh_connection_options' or
            'private_ip_address') is missing from the expansions file.
        """
        # Use a context manager so the file handle is closed as soon as parsing is done.
        with open(_EXPANSIONS_FILE) as expansions_file:
            self.expansions = yaml.safe_load(expansions_file)

        self.exe = ".exe" if self.is_windows() else ""
        self.retries = int(self.expansions.get("ssh_retries", 0))
        self.ssh_identity = self._get_ssh_identity()
        self.ssh_connection_options = (
            f"{self.ssh_identity} {self.expansions['ssh_connection_options']}")
        self.sudo = "" if self.is_windows() else "sudo"
        # The username on the Windows image that powercycle uses is currently the default user.
        self.user = "Administrator" if self.is_windows() else getpass.getuser()
        self.user_host = f"{self.user}@{self.expansions['private_ip_address']}"

        self.remote_op = RemoteOperations(
            user_host=self.user_host,
            ssh_connection_options=self.ssh_connection_options,
            retries=self.retries,
        )

    @staticmethod
    def is_windows() -> bool:
        """:return: True if running on Windows."""
        return sys.platform in ("win32", "cygwin")

    @staticmethod
    def _call(cmd):
        """
        Run a command on the local machine and capture its combined output.

        :param cmd: command line as a single string; it is tokenized with shlex.split and
            executed without a shell.
        :return: tuple of (exit code, stdout and stderr decoded as UTF-8 with replacement).
        """
        argv = shlex.split(cmd)
        # Use a common pipe for stdout & stderr for logging.
        with subprocess.Popen(argv, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process:
            buff_stdout, _ = process.communicate()
        return process.returncode, buff_stdout.decode("utf-8", "replace")

    def _get_posix_workdir(self) -> str:
        """:return: the 'workdir' expansion, with backslashes turned into '/' on Windows."""
        workdir = self.expansions['workdir']
        if self.is_windows():
            workdir = workdir.replace("\\", "/")
        return workdir

    def _get_ssh_identity(self) -> str:
        """:return: the ssh '-i' option pointing at powercycle.pem inside the workdir."""
        pem_file = f"{self._get_posix_workdir()}/powercycle.pem"
        return f"-i {pem_file}"
class SetUpEC2Instance(PowercycleCommand):
    """
    Set up EC2 instance.

    Mounts the data and log drives, copies the test files, creates the virtualenv, enables core
    dumps, sets up the curator stat collectors, opens the mongod ports in the firewall and, on
    Windows, installs NotMyFault.
    """

    COMMAND = "setUpEC2Instance"

    def execute(self) -> None:
        """
        Run every setup operation against the remote host, in order.

        :raises ValueError: if the 'remote_dir' expansion is not set.
        :return: None.
        """
        user_group = self._mount_drives()
        remote_dir = self._create_remote_dir(user_group)
        self._copy_files(remote_dir)
        self._set_up_virtualenv(remote_dir)
        if not self.is_windows():
            self._enable_core_dumps(remote_dir)
            self._print_core_settings()
        self._set_up_curator()
        self._configure_firewall()
        self._install_notmyfault()

    def _mount_drives(self) -> str:
        """Copy mount_drives.sh to the remote and run it; return the 'user:group' owner string."""
        # First operation -
        # Copy mount_drives.sh script to remote host.
        # last arg is "operation_dir", which for the COPY_TO action, is the remote
        # directory. We just have it default to the home directory instead of setting
        # one explicitly.
        self.remote_op.operation(SSHOperation.COPY_TO, 'buildscripts/mount_drives.sh', None)

        # Second operation -
        # Mount /data on the attached drive(s), more than 1 indicates a RAID set.
        script_opts = [f"-d '{self.expansions['data_device_names']}'"]
        if "raid_data_device_name" in self.expansions:
            script_opts.append(f"-r {self.expansions['raid_data_device_name']}")
        if "fstype" in self.expansions:
            script_opts.append(f"-t {self.expansions['fstype']}")
        if "fs_options" in self.expansions:
            script_opts.append(f"-o '{self.expansions['fs_options']}'")
        script_opts.append(f"-l '{self.expansions['log_device_name']}'")

        # The group is looked up with a local 'id' call. Split on any whitespace so that the
        # trailing newline is not kept when the user belongs to a single group.
        _, groups_output = self._call(f"id -Gn {self.user}")
        groups = groups_output.split()
        group = groups[0] if groups else ""
        user_group = f"{self.user}:{group}"
        script_opts.append(f"-u {user_group}")

        data_db = "/data/db"
        log = "/log"
        joined_opts = " ".join(script_opts)
        cmds = f"{self.sudo} bash mount_drives.sh {joined_opts}; mount; ls -ld {data_db} {log}; df"
        self.remote_op.operation(SSHOperation.SHELL, cmds, None)
        return user_group

    def _create_remote_dir(self, user_group: str) -> str:
        """Create the remote directory (unless it is '.') and return the 'remote_dir' expansion."""
        # Third operation -
        # Create remote_dir, if not '.' (pwd).
        if 'remote_dir' not in self.expansions:
            raise ValueError("The 'remote_dir' expansion must be set.")
        remote_dir = self.expansions['remote_dir']
        if remote_dir != ".":
            if self.is_windows():
                set_permission = f"setfacl -s user::rwx,group::rwx,other::rwx {remote_dir}"
            else:
                set_permission = f"chmod 777 {remote_dir}"
            cmds = "; ".join([
                f"{self.sudo} mkdir -p {remote_dir}",
                f"{self.sudo} chown {user_group} {remote_dir}",
                set_permission,
                f"ls -ld {remote_dir}",
            ])
            self.remote_op.operation(SSHOperation.SHELL, cmds, None)
        return remote_dir

    def _copy_files(self, remote_dir: str) -> None:
        """Copy buildscripts, pytests and mongoDB executables to the remote host."""
        # Fourth operation -
        files = ['etc', 'buildscripts', 'pytests']
        files.extend(f"dist-test/bin/{executable}{self.exe}"
                     for executable in ("mongo", "mongod", "mongos"))
        self.remote_op.operation(SSHOperation.COPY_TO, files, remote_dir)

    def _set_up_virtualenv(self, remote_dir: str) -> None:
        """Create the virtualenv on the remote and install the powercycle requirements."""
        # Fifth operation -
        venv = self.expansions.get("virtualenv_dir", "venv")
        python = self.expansions.get("python", "/opt/mongodbtoolchain/v3/bin/python3")
        cmds = "; ".join([
            f"python_loc=$(which {python})",
            f"remote_dir={remote_dir}",
            # $OS must be double-quoted: inside single quotes the shell never expands it, so
            # the Windows branch could not be taken.
            "if [ \"Windows_NT\" = \"$OS\" ]; then python_loc=$(cygpath -w $python_loc); remote_dir=$(cygpath -w $remote_dir); fi",
            f"virtualenv --python $python_loc --system-site-packages {venv}",
            f"activate=$(find {venv} -name 'activate')",
            ". $activate",
            "pip3 install -r $remote_dir/etc/pip/powercycle-requirements.txt",
        ])
        self.remote_op.operation(SSHOperation.SHELL, cmds, None)

    def _enable_core_dumps(self, remote_dir: str) -> None:
        """Enable core dumps on a non-Windows remote host and reboot it."""
        # Sixth operation -
        # The core pattern must specify a directory, since mongod --fork will chdir("/")
        # and cannot generate a core dump there (see SERVER-21635).
        # We need to reboot the host for the core limits to take effect.
        core_pattern = f"{remote_dir}/dump_%e.%p.core"
        sysctl_conf = "/etc/sysctl.conf"
        cmds = "; ".join([
            "ulimit -a",
            f"echo \"{self.user} - core unlimited\" | {self.sudo} tee -a /etc/security/limits.conf",
            f"if [ -f {sysctl_conf} ]",
            f"then grep ^kernel.core_pattern {sysctl_conf}",
            "if [ $? -eq 0 ]",
            # The pattern is interpolated here; no 'core_pattern' shell variable exists remotely.
            f"then {self.sudo} sed -i \"s,kernel.core_pattern=.*,kernel.core_pattern={core_pattern},\" {sysctl_conf}",
            f"else echo \"kernel.core_pattern={core_pattern}\" | {self.sudo} tee -a {sysctl_conf}",
            "fi",
            "else echo Cannot change the core pattern and no core dumps will be generated.",
            "fi",
            # The following line for restarting the machine is based on
            # https://unix.stackexchange.com/a/349558 in order to ensure the ssh client gets a
            # response from the remote machine before it restarts.
            f"nohup {self.sudo} reboot &>/dev/null & exit",
        ])
        self.remote_op.operation(SSHOperation.SHELL, cmds, None)

    def _print_core_settings(self) -> None:
        """Print uptime, the ulimit and kernel.core_pattern of a non-Windows remote host."""
        # Seventh operation -
        # Always exit successfully, as this is just informational.
        cmds = "; ".join([
            "uptime",
            "ulimit -a",
            "if [ -f /sbin/sysctl ]",
            "then /sbin/sysctl kernel.core_pattern",
            "fi",
        ])
        # When 'ssh_retries' is not configured, use a dedicated connection with 3 retries;
        # presumably this lets the command survive the reboot issued just before - confirm.
        if "ssh_retries" in self.expansions:
            remote_op_special_retry = self.remote_op
        else:
            remote_op_special_retry = RemoteOperations(
                user_host=self.user_host, ssh_connection_options=self.ssh_connection_options,
                retries=3, debug=True)
        remote_op_special_retry.operation(SSHOperation.SHELL, cmds, None, True)

    def _set_up_curator(self) -> None:
        """Set up curator to collect system & process stats on remote."""
        # Eighth operation -
        variant = "windows" if self.is_windows() else "ubuntu1604"
        curator_hash = "117d1a65256ff78b6d15ab79a1c7088443b936d0"
        curator_url = f"https://s3.amazonaws.com/boxes.10gen.com/build/curator/curator-dist-{variant}-{curator_hash}.tar.gz"
        steps = [f"curl -s {curator_url} | tar -xzv"]

        monitor_system_file = self.expansions["monitor_system_file"]
        monitor_proc_file = self.expansions["monitor_proc_file"]
        if self.is_windows():
            # Since curator runs as SYSTEM user, ensure the output files can be accessed.
            steps.extend([
                f"touch {monitor_system_file}; chmod 777 {monitor_system_file}",
                f"cygrunsrv --install curator_sys --path curator --chdir $HOME --args 'stat system --file {monitor_system_file}'",
                f"touch {monitor_proc_file}; chmod 777 {monitor_proc_file}",
                f"cygrunsrv --install curator_proc --path curator --chdir $HOME --args 'stat process-all --file {monitor_proc_file}'",
                "cygrunsrv --start curator_sys",
                "cygrunsrv --start curator_proc",
            ])
        else:
            steps.extend([
                f"cmd=\"@reboot cd $HOME && {self.sudo} ./curator stat system >> {monitor_system_file}\"",
                "(crontab -l ; echo \"$cmd\") | crontab -",
                # Use self.sudo like the entry above; a '$sudo' shell variable is never defined.
                f"cmd=\"@reboot cd $HOME && {self.sudo} ./curator stat process-all >> {monitor_proc_file}\"",
                "(crontab -l ; echo \"$cmd\") | crontab -",
                "crontab -l",
                f"{{ {self.sudo} $HOME/curator stat system --file {monitor_system_file} > /dev/null 2>&1 & {self.sudo} $HOME/curator stat process-all --file {monitor_proc_file} > /dev/null 2>&1 & }} & disown",
            ])
        self.remote_op.operation(SSHOperation.SHELL, "; ".join(steps), retry=True)

    def _configure_firewall(self) -> None:
        """Open the ssh and mongod ports in whichever firewall is detected."""
        # Ninth operation -
        # Many systems have the firewall disabled, by default. In case the firewall is
        # enabled we add rules for the mongod ports on the remote.
        # NOTE(review): the firewall flavor is probed with local self._call() invocations while
        # the rules are applied on the remote host - confirm both always run the same platform.
        standard_port = self.expansions["standard_port"]
        secret_port = self.expansions["secret_port"]
        sudo = self.sudo
        iptables_rules = [
            f"{sudo} iptables -I INPUT 1 -p tcp --dport ssh -j ACCEPT",
            f"{sudo} iptables -I INPUT 1 -p tcp --dport {standard_port} -j ACCEPT",
            f"{sudo} iptables -I INPUT 1 -p tcp --dport {secret_port} -j ACCEPT",
        ]

        # RHEL 7 firewall rules
        firewall_cmd = self._call("which firewall-cmd")
        if firewall_cmd[1] and "no firewall-cmd in" not in firewall_cmd[1]:
            steps = [
                f"{sudo} firewall-cmd --permanent --zone=public --add-port=ssh/tcp",
                f"{sudo} firewall-cmd --permanent --zone=public --add-port={standard_port}/tcp",
                f"{sudo} firewall-cmd --permanent --zone=public --add-port={secret_port}/tcp",
                f"{sudo} firewall-cmd --reload",
                f"{sudo} firewall-cmd --list-all",
            ]
        elif self._call(f"{sudo} iptables --list")[1]:
            if os.path.isdir("/etc/iptables"):
                rules_file = "/etc/iptables/iptables.rules"
            elif os.path.isfile("/etc/sysconfig/iptables"):
                rules_file = "/etc/sysconfig/iptables"
            else:
                rules_file = "/etc/iptables.up.rules"
            steps = iptables_rules + [
                f"{sudo} iptables-save | {sudo} tee {rules_file}",
                f"{sudo} iptables --list-rules",
            ]
        elif self._call(f"{sudo} service iptables status")[1]:
            steps = iptables_rules + [
                f"{sudo} service iptables save",
                f"{sudo} service iptables status",
            ]
        # Ubuntu firewall rules
        elif self._call(f"{sudo} ufw status")[1]:
            steps = [
                f"{sudo} ufw allow ssh/tcp",
                f"{sudo} ufw allow {standard_port}/tcp",
                f"{sudo} ufw allow {secret_port}/tcp",
                f"{sudo} ufw reload",
                f"{sudo} ufw status",
            ]
        # SuSE firewall rules
        # TODO: Add firewall rules using SuSEfirewall2
        elif self._call(f"{sudo} /sbin/SuSEfirewall2 help")[1]:
            steps = [
                f"{sudo} /sbin/SuSEfirewall2 stop",
                f"{sudo} /sbin/SuSEfirewall2 off",
            ]
        # Windows firewall rules
        elif self._call("netsh advfirewall show store")[1]:
            add_rule = "netsh advfirewall firewall add rule"
            # NOTE(review): the rules named '... out' are also created with dir=in; this is
            # kept as-is - confirm whether dir=out was intended.
            steps = [
                f"{add_rule} name='MongoDB port {port} {direction}' dir=in action=allow protocol=TCP localport={port}"
                for port in (standard_port, secret_port)
                for direction in ("in", "out")
            ]
            steps.append("netsh advfirewall firewall show rule name=all | grep -A 13 'MongoDB'")
        else:
            print("Firewall not active or unkown firewall command on this platform")
            return
        self.remote_op.operation(SSHOperation.SHELL, "; ".join(steps), None)

    def _install_notmyfault(self) -> None:
        """Install NotMyFault, used to crash Windows, when 'windows_crash_zip' is configured."""
        # Tenth operation -
        if not (self.is_windows() and "windows_crash_zip" in self.expansions):
            return
        windows_crash_zip = self.expansions["windows_crash_zip"]
        windows_crash_dl = self.expansions["windows_crash_dl"]
        windows_crash_dir = self.expansions["windows_crash_dir"]
        cmds = "; ".join([
            f"curl -s -o {windows_crash_zip} {windows_crash_dl}",
            f"unzip -q {windows_crash_zip} -d {windows_crash_dir}",
            f"chmod +x {windows_crash_dir}/*.exe",
        ])
        self.remote_op.operation(SSHOperation.SHELL, cmds, None)
class TarEC2Artifacts(PowercycleCommand):
    """Archive the configured EC2 artifacts into ec2_artifacts.tgz on the remote host."""

    COMMAND = "tarEC2Artifacts"

    def execute(self) -> None:
        """
        Run the tar command remotely.

        Nothing is done when 'ec2_artifacts' is not set or 'ec2_ssh_failure' is present.

        :return: None.
        """
        expansions = self.expansions
        nothing_to_do = "ec2_artifacts" not in expansions or "ec2_ssh_failure" in expansions
        if nothing_to_do:
            return

        # The 'tar' expansion overrides the tar executable when present.
        tar_cmd = expansions.get("tar", "tar")
        artifacts = expansions['ec2_artifacts']
        self.remote_op.operation(SSHOperation.SHELL,
                                 f"{tar_cmd} czf ec2_artifacts.tgz {artifacts}", None)
class CopyEC2Artifacts(PowercycleCommand):
    """Fetch the ec2_artifacts.tgz archive from the remote host."""

    COMMAND = "copyEC2Artifacts"

    def execute(self) -> None:
        """
        Copy ec2_artifacts.tgz from the remote host.

        The copy only happens when 'ec2_artifacts' is set and 'ec2_ssh_failure' is absent.

        :return: None.
        """
        has_artifacts = "ec2_artifacts" in self.expansions
        ssh_failed = "ec2_ssh_failure" in self.expansions
        if has_artifacts and not ssh_failed:
            self.remote_op.operation(SSHOperation.COPY_FROM, "ec2_artifacts.tgz", None)
class GatherRemoteEventLogs(PowercycleCommand):
    """
    Gather remote event logs.

    Windows event logs help diagnose whether something bad happened to the remote machine after
    it was repeatedly crashed during powercycle testing. For example, the Application and System
    event logs have previously revealed that the mongod.exe process abruptly exited because it
    could not open a file, even though the process was successfully restarted and responded to
    network requests.
    """

    COMMAND = "gatherRemoteEventLogs"

    def execute(self) -> None:
        """
        Dump the Application, Security and System event logs into 'event_logpath'.

        Nothing is done off Windows, when the 'aws_ec2_yml' file does not exist, or when
        'ec2_ssh_failure' is set to a truthy value.

        :return: None.
        """
        if not self.is_windows():
            return
        if not os.path.exists(self.expansions.get("aws_ec2_yml", "")):
            return
        if self.expansions.get("ec2_ssh_failure", ""):
            return

        logpath = self.expansions['event_logpath']
        steps = [f"mkdir -p {logpath}"]
        # One wevtutil query per event log, each written to <lower-cased log name>.log.
        for channel in ("Application", "Security", "System"):
            steps.append(
                f"wevtutil qe {channel} /c:10000 /rd:true /f:Text > {logpath}/{channel.lower()}.log"
            )
        self.remote_op.operation(SSHOperation.SHELL, "; ".join(steps), None)
class GatherRemoteMongoCoredumps(PowercycleCommand):
    """Gather Remote Mongo Coredumps."""

    COMMAND = "gatherRemoteMongoCoredumps"

    def execute(self) -> None:
        """
        Move every core file found below the remote directory into the remote directory itself.

        Nothing is done when the 'aws_ec2_yml' file does not exist or 'ec2_ssh_failure' is
        present in the expansions.

        :return: None.
        """
        # Skip when the yml file is absent, matching the guards of the sibling commands; use
        # .get() so a missing expansion means "skip" instead of raising KeyError.
        aws_ec2_yml = self.expansions.get("aws_ec2_yml", "")
        if not os.path.isfile(aws_ec2_yml) or "ec2_ssh_failure" in self.expansions:
            return

        remote_dir = self.expansions.get("remote_dir", ".")
        # Find all core files and move to $remote_dir
        cmds = "; ".join([
            # No grouping parentheses: unescaped they are a shell syntax error inside $(...),
            # and find applies the implicit -print to the whole '-o' expression anyway.
            "core_files=$(/usr/bin/find -H . -name '*.core' -o -name '*.mdmp' 2> /dev/null)",
            "if [ -z \"$core_files\" ]; then exit 0; fi",
            "echo Found remote core files $core_files, moving to $(pwd)",
            "for core_file in $core_files",
            # basename strips the directory part without needing an escaped sed expression.
            "do base_name=$(basename $core_file)",
            "if [ ! -f $base_name ]; then mv $core_file .; fi",
            "done",
        ])
        self.remote_op.operation(SSHOperation.SHELL, cmds, remote_dir)
class CopyRemoteMongoCoredumps(PowercycleCommand):
    """Copy Remote Mongo Coredumps."""

    COMMAND = "copyRemoteMongoCoredumps"

    def execute(self) -> None:
        """
        Copy the core files (*.mdmp on Windows, *.core elsewhere) from the remote directory.

        Nothing is done when the 'aws_ec2_yml' file does not exist or 'ec2_ssh_failure' is set
        to a truthy value.

        :return: None.
        """
        if not os.path.exists(self.expansions.get("aws_ec2_yml", "")) or self.expansions.get(
                "ec2_ssh_failure", ""):
            return

        core_suffix = "mdmp" if self.is_windows() else "core"
        remote_dir = self.expansions.get('remote_dir', '.')
        # The parameter is a file glob, not a command, so it must be fetched with COPY_FROM
        # rather than executed with SHELL.
        # Core file may not exist so we ignore the return code.
        self.remote_op.operation(SSHOperation.COPY_FROM, f"{remote_dir}/*.{core_suffix}", None,
                                 True)
class CopyEC2MonitorFiles(PowercycleCommand):
    """Archive the EC2 monitor files on the remote host and fetch the archive."""

    COMMAND = "copyEC2MonitorFiles"

    def execute(self) -> None:
        """
        Tar the 'ec2_monitor_files' remotely, then copy ec2_monitor_files.tgz back.

        :return: None.
        """
        archive = 'ec2_monitor_files.tgz'
        # The 'tar' expansion overrides the tar executable when present.
        tar_cmd = self.expansions.get("tar", "tar")
        monitor_files = self.expansions['ec2_monitor_files']

        self.remote_op.operation(SSHOperation.SHELL, f"{tar_cmd} czf {archive} {monitor_files}",
                                 None)
        self.remote_op.operation(SSHOperation.COPY_FROM, archive, None)
class RunHangAnalyzerOnRemoteInstance(PowercycleCommand):
    """Run the hang-analyzer on a remote instance."""

    COMMAND = "runHangAnalyzerOnRemoteInstance"

    def execute(self) -> None:  # pylint: disable=too-many-locals
        """
        Copy local debug symbols to the remote, run the hang-analyzer there and fetch its output.

        :raises KeyError: if the VIRTUAL_ENV environment variable is not set.
        :raises RuntimeError: if no 'activate' script is found below the local virtualenv.
        :return: None.
        """
        if "private_ip_address" not in self.expansions:
            return

        hang_analyzer_processes = self.expansions.get(
            "hang_analyzer_processes", "dbtest,java,mongo,mongod,mongos,python,_test")
        hang_analyzer_option = f"-o file -o stdout -p {hang_analyzer_processes}"
        # NOTE(review): any truthy value enables core dumps, including a quoted "false" string
        # - confirm how the expansion is written.
        if self.expansions.get("hang_analyzer_dump_core", True):
            hang_analyzer_option = f"-c {hang_analyzer_option}"

        core_ext = "mdmp" if self.is_windows() else "core"
        remote_dir = self.expansions.get("remote_dir", ".")

        # Pick the debug symbol files by suffix. The regex previously used with match() was
        # anchored at the start of the name, so names like 'mongod.debug' were never selected.
        local_files = self._call("ls")[1].splitlines()
        debug_files = [name for name in local_files if name.endswith((".debug", ".dSYM", ".pdb"))]
        if debug_files:
            self.remote_op.operation(SSHOperation.COPY_TO, debug_files, remote_dir)

        # Activate virtualenv on remote host. The virtualenv bin_dir is different for Linux and
        # Windows.
        venv = self.expansions.get("virtualenv_dir", "venv")
        virtual_env = os.environ["VIRTUAL_ENV"]
        _, activate_loc = self._call(f"find {virtual_env} -name activate")
        bin_dir_match = re.match(f"{re.escape(virtual_env)}/(.*)/activate", activate_loc)
        if bin_dir_match is None:
            raise RuntimeError(
                f"Could not locate an 'activate' script under {virtual_env}: {activate_loc!r}")
        bin_dir = bin_dir_match[1]

        # In the 'cmds' variable we pass to remote host, use 'python' instead of '$python' since
        # we don't want to evaluate the local python variable, but instead pass the python string
        # so the remote host will use the right python when the virtualenv is sourced.
        cmds = "; ".join([
            f". {venv}/{bin_dir}/activate",
            f"cd {remote_dir}",
            f"PATH=\"/opt/mongodbtoolchain/gdb/bin:$PATH\" python buildscripts/resmoke.py hang-analyzer {hang_analyzer_option}",
        ])
        self.remote_op.operation(SSHOperation.SHELL, cmds, None)

        # Fetch the debugger output and any core dumps.
        file_param = [f"{remote_dir}/debugger*.*", f"{remote_dir}/*.{core_ext}"]
        self.remote_op.operation(SSHOperation.COPY_FROM, file_param, None)
class NoOp(Subcommand):
    """Subcommand whose execution performs no work."""

    def execute(self) -> None:
        """
        Do nothing.

        :return: None.
        """
        return None
class PowercyclePlugin(PluginInterface):
    """Interact with powercycle_operations."""

    # Every subcommand class this plugin registers and dispatches to, in registration order.
    _SUBCOMMANDS = (
        SetUpEC2Instance,
        TarEC2Artifacts,
        CopyEC2Artifacts,
        GatherRemoteEventLogs,
        GatherRemoteMongoCoredumps,
        CopyRemoteMongoCoredumps,
        CopyEC2MonitorFiles,
        RunHangAnalyzerOnRemoteInstance,
    )

    def add_subcommand(self, subparsers):
        """
        Add 'powercycle_operations' subcommand.

        :param subparsers: argparse parser to add to
        :return: None
        """
        for command_class in self._SUBCOMMANDS:
            subparsers.add_parser(command_class.COMMAND)

    def parse(self, subcommand, parser, parsed_args, **kwargs):
        """
        Return powercycle_operation if command is one we recognize.

        :param subcommand: equivalent to parsed_args.command
        :param parser: parser used
        :param parsed_args: output of parsing
        :param kwargs: additional args
        :return: None for an unrecognized subcommand; a NoOp when the expansions file has not
            been written; otherwise an instance of the matching Subcommand
        """
        commands = {command_class.COMMAND: command_class for command_class in self._SUBCOMMANDS}
        command_class = commands.get(subcommand)
        # Check recognition first, so that a subcommand this plugin does not own yields None
        # even when the expansions file is missing.
        if command_class is None:
            return None

        # Only return subcommand if expansion file has been written.
        if not os.path.exists(_EXPANSIONS_FILE):
            print(f"Did not find {_EXPANSIONS_FILE}, skipping {subcommand}.")
            return NoOp()

        return command_class()