import os import sys from pathlib import Path from pprint import pprint import bson import gdb if not gdb: sys.path.insert(0, str(Path(os.path.abspath(__file__)).parent.parent.parent)) from buildscripts.gdb.mongo import lookup_type DEBUGGING = False """ Public API to be called by users. The input `ident` is a string of the form: 'collection-2--4547167393143767234'. From within gdb type: python dump_pages_for_table('collection-2--4547167393143767234') Some behaviors/limitations: * Disk images of data are not deserialized into their separate key/value pairs. * If update chain WT_UPDATEs are valid bson, the values will be parsed and output as BSON maps. * If updates are not bson (e.g: index entries), they will be output as a raw byte array. * WT_UPDATE structures have a pretty printer registered. Disabling pretty printers will result in more raw output. * Any `file:*.wt` can be output, e.g: `_mdb_catalog` or `WiredTiger`. Though the output may be less supported/of lower quality. """ def dump_pages_for_table(ident): conn_impl_type = lookup_type("WT_CONNECTION_IMPL") if not conn_impl_type: print( "WT_CONNECTION_IMPL type not found. Try invoking this function from a different \ thread and frame." ) return conn_impl_ptr_type = conn_impl_type.pointer() dbg("impl", conn_impl_ptr_type) conn_ptr = None try: conn_ptr = gdb.parse_and_eval("session->iface->connection") except gdb.error: pass if not conn_ptr or not conn_ptr.address: print( "Failed to find a suitable `WT_SESSION session` object to extract a connection object \ from. Try finding an eviction thread and frame, e.g: `__wt_evict_thread_run`. If the session is \ optimized out, try going up stack frames until the variable is in a local scope rather than a \ function input." ) return conn = conn_ptr.reinterpret_cast(conn_impl_ptr_type).dereference() dbg("conn", conn) data_handle, all_dhs = get_data_handle(conn, "file:{}.wt".format(ident)) if not data_handle: print("Data handle not found for ident. Ident: `{}`".format(ident)) print("All known data handles:") pprint(all_dhs) return dump_handle(data_handle) # Private API. def dbg(ident, var): if not DEBUGGING: return print("----------") if type(var) == gdb.Value: print("{}: ({}*){}".format(ident, var.type, var.address)) else: print(ident) print(" " + str(type(var))) methods = dir(var) out = [name for name in methods if not name.startswith("__")] for item in out: print(" " + item) if type(var) == gdb.Value: print("\n Fields:") print("\t" + "\n\t".join(str(var).split("\n"))) def walk_wt_list(lst): ret = [] node = lst["tqh_first"] dbg("node", node) while True: if not node: break ret.append(node.dereference()) node = node["q"]["tqe_next"] return ret def get_data_handle(conn, handle_name): dbg("datahandles", conn["dhqh"]) ret = None all_file_dhs = [] for handle in walk_wt_list(conn["dhqh"]): if handle["name"].string().startswith("file:"): all_file_dhs.append(handle["name"].string()[5:-3]) if handle["name"].string() == handle_name: ret = handle return ret, all_file_dhs def get_btree_handle(dhandle): btree = lookup_type("WT_BTREE").pointer() return dhandle["handle"].reinterpret_cast(btree).dereference() def dump_update_chain(update_chain): while True: if not update_chain: print(" λ (End of update chain)") break dbg("update", update_chain) wt_val = update_chain.dereference() obj = None dbg("wt_val", wt_val) val_bytes = gdb.selected_inferior().read_memory(wt_val["data"], wt_val["size"]) can_bson = wt_val["type"] == 3 if can_bson: try: obj = bson.decode_all(val_bytes)[0] except: pass print(" " + "\n ".join(str(wt_val).split("\n")) + " " + str(obj) + " =>") update_chain = update_chain["next"] def dump_insert_list(wt_insert): key_struct = wt_insert["u"]["key"] key = ( gdb.selected_inferior() .read_memory(int(wt_insert.address) + key_struct["offset"], key_struct["size"]) .tobytes() ) print("Key: " + str(key)) print("Value:") update_chain = wt_insert["upd"] dump_update_chain(update_chain) def dump_skip_list(wt_insert_head): if not wt_insert_head["head"].address: return wt_insert = wt_insert_head["head"][0] idx = 0 while True: if not wt_insert: break dump_insert_list(wt_insert.dereference()) dbg("insert" + str(idx), wt_insert.dereference()) idx += 1 wt_insert = wt_insert["next"][0] def dump_modified(leaf_page): print("Modify:") if not leaf_page["modify"]: print("No modifies") return leaf_modify = leaf_page["modify"].dereference() dbg("modify", leaf_modify) row_leaf_insert = leaf_modify["u2"]["row_leaf"]["insert"] dbg("row store", row_leaf_insert) if not row_leaf_insert: print("No insert list") else: print("Insert list:") dump_skip_list(row_leaf_insert.dereference().dereference()) row_leaf_update = leaf_modify["u2"]["row_leaf"]["update"] if not row_leaf_update: print("No update list") else: print("Update list:") leaf_num_entries = int(leaf_page["entries"]) for i in range(0, leaf_num_entries): dump_update_chain(row_leaf_update[i]) def dump_disk(leaf_page): dbg("in-memory page:", leaf_page) dsk = leaf_page["dsk"].dereference() if int(dsk.address) == 0: print("No page loaded from disk.") return dbg("on-disk page:", dsk) wt_page_header_size = 28 wt_block_header_size = 12 page_bytes = ( gdb.selected_inferior() .read_memory( int(dsk.address) + wt_page_header_size + wt_block_header_size, int(dsk["mem_size"]) ) .tobytes() ) print("Dsk:\n" + str(page_bytes)) def dump_handle(dhandle): print("Dumping: " + dhandle["name"].string()) btree = get_btree_handle(dhandle) root = btree["root"] root_page = root["page"].dereference() dbg("btree", btree) dbg("root", btree["root"]) dbg("root page", root_page) rpindex = root_page["u"]["intl"]["__index"].dereference() leaf_num_entries = int(rpindex["entries"]) for idx in range(0, leaf_num_entries): dbg("rpindex", rpindex) dbg("rp-pre-index", rpindex["index"].dereference().dereference()) leaf_page = rpindex["index"][idx].dereference()["page"].dereference() dbg("leaf", leaf_page) dump_disk(leaf_page) dump_modified(leaf_page)