mongo/buildscripts/gdb/mongo_lock.py

458 lines
16 KiB
Python

"""Mongo lock module."""
import re
import sys
import os
from pathlib import Path
import gdb
import gdb.printing
if not gdb:
sys.path.insert(0, str(Path(os.path.abspath(__file__)).parent.parent.parent))
from buildscripts.gdb.mongo import get_current_thread_name, get_thread_id, RegisterMongoCommand
if sys.version_info[0] < 3:
raise gdb.GdbError(
"MongoDB gdb extensions only support Python 3. Your GDB was compiled against Python 2")
class NonExecutingThread(object):
"""NonExecutingThread class.
Idle multi-statement transactions can hold locks that are not associated with an active
thread. In order to generate meaningful digraphs that include these locks, we create an
object that implements the "Thread" class interface but populates with LockerId rather than
thread::id. This allows us to uniquely identify each idle transaction.
"""
def __init__(self, locker_id):
"""Initialize Thread."""
self.locker_id = locker_id
def __eq__(self, other):
if isinstance(other, NonExecutingThread):
return self.locker_id == other.locker_id
return NotImplemented
def __ne__(self, other):
return not self == other
def __str__(self):
return "Idle Transaction (LockerId {})".format(self.locker_id)
def key(self):
"""Return NonExecutingThread key."""
return "LockerId {}".format(self.locker_id)
class Thread(object):
"""Thread class."""
def __init__(self, thread_id, lwpid, thread_name):
"""Initialize Thread."""
self.thread_id = thread_id
self.lwpid = lwpid
self.name = thread_name
def __eq__(self, other):
if isinstance(other, Thread):
return self.thread_id == other.thread_id
return NotImplemented
def __ne__(self, other):
return not self == other
def __str__(self):
return "{} (Thread 0x{:012x} (LWP {}))".format(self.name, self.thread_id, self.lwpid)
def key(self):
"""Return thread key."""
return "Thread 0x{:012x}".format(self.thread_id)
class Lock(object):
"""Lock class."""
def __init__(self, addr, resource):
"""Initialize Lock."""
self.addr = addr
self.resource = resource
def __eq__(self, other):
if isinstance(other, Lock):
return self.addr == other.addr
return NotImplemented
def __ne__(self, other):
return not self == other
def __str__(self):
return "Lock 0x{:012x} ({})".format(self.addr, self.resource)
def key(self):
"""Return lock key."""
return "Lock 0x{:012x}".format(self.addr)
class Graph(object):
"""Graph class.
The Graph is a dict with the following structure:
{'node_key': {'node': {id: val}, 'next_nodes': [node_key_1, ...]}}
Example graph:
{
'Lock 1': {'node': {1: 'MongoDB lock'}, 'next_nodes': ['Thread 1']},
'Lock 2': {'node': {2: 'MongoDB lock'}, 'next_nodes': ['Thread 2']},
'Thread 1': {'node': {1: 123}, 'next_nodes': ['Lock 2']},
'Thread 2': {'node': {2: 456}, 'next_nodes': ['Lock 1']}
}
"""
def __init__(self):
"""Initialize Graph."""
self.nodes = {}
def is_empty(self):
"""Return True if graph is empty."""
return not bool(self.nodes)
def add_node(self, node):
"""Add node to graph."""
if not self.find_node(node):
self.nodes[node.key()] = {'node': node, 'next_nodes': []}
def find_node(self, node):
"""Find node in graph."""
if node.key() in self.nodes:
return self.nodes[node.key()]
return None
def find_from_node(self, from_node):
"""Find from node."""
for node_key in self.nodes:
node = self.nodes[node_key]
for next_node in node['next_nodes']:
if next_node == from_node['node'].key():
return node
return None
def remove_nodes_without_edge(self):
"""Remove nodes without edge."""
# Rebuild graph by removing any nodes which do not have any incoming or outgoing edges.
temp_nodes = {}
for node_key in self.nodes:
node = self.nodes[node_key]
if node['next_nodes'] or self.find_from_node(node) is not None:
temp_nodes[node_key] = self.nodes[node_key]
self.nodes = temp_nodes
def add_edge(self, from_node, to_node):
"""Add edge."""
f_node = self.find_node(from_node)
if f_node is None:
self.add_node(from_node)
f_node = self.nodes[from_node.key()]
t_node = self.find_node(to_node)
if t_node is None:
self.add_node(to_node)
t_node = self.nodes[to_node.key()]
for n_node in f_node['next_nodes']:
if n_node == to_node.key():
return
self.nodes[from_node.key()]['next_nodes'].append(to_node.key())
def print(self):
"""Print graph."""
for node_key in self.nodes:
print("Node", self.nodes[node_key]['node'])
for to_node in self.nodes[node_key]['next_nodes']:
print(" ->", self.nodes[to_node]['node'])
def _get_node_escaped(self, node_key):
"""Return the name of the node with any double quotes escaped.
The DOT language requires that literal double quotes be escaped using a backslash character.
"""
return str(self.nodes[node_key]['node']).replace('"', '\\"')
def to_graph(self, nodes=None, message=None):
"""Return the 'to_graph'."""
sb = []
sb.append('# Legend:')
sb.append('# Thread 1 -> Lock C (MODE_IX) indicates Thread 1 is waiting on Lock C and'
' Lock C is currently held in MODE_IX')
sb.append('# Lock C (MODE_IX) -> Thread 2 indicates Lock C is held by Thread 2 in'
' MODE_IX')
if message is not None:
sb.append(message)
sb.append('digraph "mongod+lock-status" {')
# Draw the graph from left to right. There can be hundreds of threads blocked by the same
# resource, but only a few resources involved in a deadlock, so we prefer a long graph
# than a super wide one. Long resource / thread names would make a wide graph even wider.
sb.append(' rankdir=LR;')
for node_key in self.nodes:
for next_node_key in self.nodes[node_key]['next_nodes']:
sb.append(' "{}" -> "{}";'.format(
self._get_node_escaped(node_key), self._get_node_escaped(next_node_key)))
for node_key in self.nodes:
color = ""
if nodes and node_key in nodes:
color = "color = red"
sb.append(' "{0}" [label="{0}" {1}]'.format(self._get_node_escaped(node_key), color))
sb.append("}")
return "\n".join(sb)
def depth_first_search(self, node_key, nodes_visited, nodes_in_cycle=None):
"""Perform depth first search and return the list of nodes in the cycle or None.
The nodes_visited is a set of nodes which indicates it has been visited.
The node_in_cycle is a list of nodes in the potential cycle.
"""
if nodes_in_cycle is None:
nodes_in_cycle = []
nodes_visited.add(node_key)
nodes_in_cycle.append(node_key)
for node in self.nodes[node_key]['next_nodes']:
if node in nodes_in_cycle:
# The graph cycle starts at the index of node in nodes_in_cycle.
return nodes_in_cycle[nodes_in_cycle.index(node):]
if node not in nodes_visited:
dfs_nodes = self.depth_first_search(node, nodes_visited, nodes_in_cycle)
if dfs_nodes:
return dfs_nodes
# This node_key is not part of the graph cycle.
nodes_in_cycle.pop()
return None
def detect_cycle(self):
"""If a cycle is detected, returns a list of nodes in the cycle or None."""
nodes_visited = set()
for node in self.nodes:
if node not in nodes_visited:
cycle_path = self.depth_first_search(node, nodes_visited)
if cycle_path:
return [str(self.nodes[node_key]['node']) for node_key in cycle_path]
return None
def find_thread(thread_dict, search_thread_id):
"""Find thread."""
for (_, thread) in list(thread_dict.items()):
if thread.thread_id == search_thread_id:
return thread
return None
def find_func_block(block):
"""Find func block."""
while block:
if block.function:
return block
block = block.superblock
return None
def find_frame(function_name_pattern):
"""Find frame."""
frame = gdb.newest_frame()
while frame:
block = None
try:
block = frame.block()
except RuntimeError as err:
if err.args[0] != "Cannot locate block for frame.":
raise
block = find_func_block(block)
if block and re.match(function_name_pattern, block.function.name):
return frame
try:
frame = frame.older()
except gdb.error as err:
print("Ignoring GDB error '%s' in find_frame" % str(err))
break
return None
def find_mutex_holder(graph, thread_dict, show):
"""Find mutex holder."""
frame = find_frame(r'std::mutex::lock\(\)')
if frame is None:
return
frame.select()
# Waiting for mutex locking!
mutex_this, _ = gdb.lookup_symbol("this", frame.block())
mutex_value = mutex_this.value(frame)
# The mutex holder is a LWPID
mutex_holder_lwpid = int(mutex_value["_M_mutex"]["__data"]["__owner"])
# At time thread_dict was initialized, the mutex holder may not have been found.
# Use the thread LWP as a substitute for showing output or generating the graph.
if mutex_holder_lwpid not in thread_dict:
print("Warning: Mutex at {} held by thread with LWP {}"
" not found in thread_dict. Using LWP to track thread.".format(
mutex_value, mutex_holder_lwpid))
mutex_holder = Thread(mutex_holder_lwpid, mutex_holder_lwpid, '"[unknown]"')
else:
mutex_holder = thread_dict[mutex_holder_lwpid]
(_, mutex_waiter_lwpid, _) = gdb.selected_thread().ptid
mutex_waiter = thread_dict[mutex_waiter_lwpid]
if show:
print("Mutex at {} held by {} waited on by {}".format(mutex_value, mutex_holder,
mutex_waiter))
if graph:
graph.add_edge(mutex_waiter, Lock(int(mutex_value), "Mutex"))
graph.add_edge(Lock(int(mutex_value), "Mutex"), mutex_holder)
def find_lock_manager_holders(graph, thread_dict, show):
"""Find lock manager holders."""
frame = find_frame(r'mongo::LockerImpl::')
if not frame:
return
frame.select()
(_, lock_waiter_lwpid, _) = gdb.selected_thread().ptid
lock_waiter = thread_dict[lock_waiter_lwpid]
locker_ptr_type = gdb.lookup_type("mongo::LockerImpl").pointer()
# Do not call mongo::getGlobalLockManager() due to the compiler optimizing this function in a very weird way
# See SERVER-72816 for more context
lock_head = gdb.parse_and_eval(
"mongo::LockManager::get((mongo::ServiceContext*) mongo::getGlobalServiceContext())->_getBucket(resId)->findOrInsert(resId)"
)
granted_list = lock_head.dereference()["grantedList"]
lock_request_ptr = granted_list["_front"]
while lock_request_ptr:
lock_request = lock_request_ptr.dereference()
locker_ptr = lock_request["locker"]
locker_ptr = locker_ptr.cast(locker_ptr_type)
locker = locker_ptr.dereference()
lock_holder_id = int(locker["_threadId"]["_M_thread"])
if lock_holder_id == 0:
locker_id = int(locker["_id"])
lock_holder = NonExecutingThread(locker_id)
else:
lock_holder = find_thread(thread_dict, lock_holder_id)
if show:
print("MongoDB Lock at {} held by {} ({}) waited on by {}".format(
lock_head, lock_holder, lock_request["mode"], lock_waiter))
if graph:
graph.add_edge(lock_waiter, Lock(int(lock_head), lock_request["mode"]))
graph.add_edge(Lock(int(lock_head), lock_request["mode"]), lock_holder)
lock_request_ptr = lock_request["next"]
def get_locks(graph, thread_dict, show=False):
"""Get locks."""
for thread in gdb.selected_inferior().threads():
try:
if not thread.is_valid():
continue
thread.switch()
find_mutex_holder(graph, thread_dict, show)
find_lock_manager_holders(graph, thread_dict, show)
except gdb.error as err:
print("Ignoring GDB error '%s' in get_locks" % str(err))
def get_threads_info():
"""Get threads info."""
thread_dict = {}
for thread in gdb.selected_inferior().threads():
try:
if not thread.is_valid():
continue
thread.switch()
# PTID is a tuple: Process ID (PID), Lightweight Process ID (LWPID), Thread ID (TID)
(_, lwpid, _) = thread.ptid
thread_num = thread.num
thread_name = get_current_thread_name()
thread_id = get_thread_id()
if not thread_id:
print("Unable to retrieve thread_info for thread %d" % thread_num)
continue
thread_dict[lwpid] = Thread(thread_id, lwpid, thread_name)
except gdb.error as err:
print("Ignoring GDB error '%s' in get_threads_info" % str(err))
return thread_dict
class MongoDBShowLocks(gdb.Command):
"""Show MongoDB locks & pthread mutexes."""
def __init__(self):
"""Initialize MongoDBShowLocks."""
RegisterMongoCommand.register(self, "mongodb-show-locks", gdb.COMMAND_DATA)
def invoke(self, *_):
"""Invoke mongodb_show_locks."""
self.mongodb_show_locks()
@staticmethod
def mongodb_show_locks():
"""GDB in-process python supplement."""
try:
thread_dict = get_threads_info()
get_locks(graph=None, thread_dict=thread_dict, show=True)
except gdb.error as err:
print("Ignoring GDB error '%s' in mongodb_show_locks" % str(err))
MongoDBShowLocks()
class MongoDBWaitsForGraph(gdb.Command):
"""Create MongoDB WaitsFor lock graph [graph_file]."""
def __init__(self):
"""Initialize MongoDBWaitsForGraph."""
RegisterMongoCommand.register(self, "mongodb-waitsfor-graph", gdb.COMMAND_DATA)
def invoke(self, arg, *_):
"""Invoke mongodb_waitsfor_graph."""
self.mongodb_waitsfor_graph(arg)
@staticmethod
def mongodb_waitsfor_graph(graph_file=None):
"""GDB in-process python supplement."""
graph = Graph()
try:
thread_dict = get_threads_info()
get_locks(graph=graph, thread_dict=thread_dict, show=False)
graph.remove_nodes_without_edge()
if graph.is_empty():
print("Not generating the digraph, since the lock graph is empty")
return
cycle_message = "# No cycle detected in the graph"
cycle_nodes = graph.detect_cycle()
if cycle_nodes:
cycle_message = "# Cycle detected in the graph nodes %s" % cycle_nodes
if graph_file:
print("Saving digraph to %s" % graph_file)
with open(graph_file, 'w') as fh:
fh.write(graph.to_graph(nodes=cycle_nodes, message=cycle_message))
print(cycle_message.split("# ")[1])
else:
print(graph.to_graph(nodes=cycle_nodes, message=cycle_message))
except gdb.error as err:
print("Ignoring GDB error '%s' in mongod_deadlock_graph" % str(err))
MongoDBWaitsForGraph()
print("MongoDB Lock analysis commands loaded")