mongo/modules_poc/mod_scanner.py

940 lines
34 KiB
Python
Executable File

#!/usr/bin/env python3
import dataclasses
import functools
import itertools
import json
import os
import sys
from dataclasses import dataclass
from datetime import datetime
from functools import cache, cached_property
from glob import glob
from pathlib import Path # if you haven't already done so
from typing import Literal, NoReturn
import pyzstd
import regex as re
import yaml
try:
from yaml import CDumper as Dumper
except ImportError:
raise RuntimeError("Why no cYaml?")
# Absolute path of this script, its directory, and the directory one level above that.
file = Path(__file__).resolve()
parent, root = file.parent, file.parents[1]
# Put the script's own directory on sys.path; presumably so that the cindex and
# mod_mapping imports below resolve to the copies next to this file (TODO confirm).
sys.path.append(str(file.parent))
# os.chdir(parent.parent) # repo root (uncomment for python debugger)
import cindex as clang
from cindex import (
AccessSpecifier,
Config,
Cursor,
CursorKind,
Index,
LinkageKind,
RefQualifierKind,
TranslationUnit,
)
from mod_mapping import mod_for_file, normpath_for_file
def perr(*values):
    """Write ``values`` to standard error, space-separated and newline-terminated."""
    sys.stderr.write(" ".join(map(str, values)) + "\n")
def perr_exit(*values) -> NoReturn:
    """Report ``values`` on standard error via perr(), then exit with status 1."""
    perr(*values)
    raise SystemExit(1)
# Monkey patch some features into clang's python binding. Keeping commented out for now in case we decide not to use modified lib.
# clang.functionList.append(("clang_File_isEqual", [ClangFile, ClangFile], ctypes.c_int))
# clang.functionList.append(("clang_Cursor_hasAttrs", [Cursor], ctypes.c_uint))
# clang.Cursor.__hash__ = lambda self: self.hash
# clang.File.__eq__ = lambda self, other: other is not None and bool(
# clang.conf.lib.clang_File_isEqual(self, other)
# )
# def get_specialized_template(node: Cursor):
# return Cursor.from_cursor_result(clang.conf.lib.clang_getSpecializedCursorTemplate(node), node)
# def has_attrs(node: Cursor):
# return node.has_attrs()
def is_tu(c: Cursor | CursorKind):
    """Return whether ``c`` (a cursor or a bare cursor kind) is a translation unit."""
    kind = c.kind if isinstance(c, Cursor) else c
    return kind == CursorKind.TRANSLATION_UNIT
# Output path taken from the MOD_SCANNER_OUTPUT environment variable; None when it is unset.
out_from_env = os.environ.get("MOD_SCANNER_OUTPUT", None)
# True when no output path came from the environment. Timer.mark() only prints, and main()
# only honours DUMP_AST, when this is set.
is_local = out_from_env is None
class DecoratedCursor(Cursor):
    """Cursor subclass that adds cached parent, USR and definition helpers.

    Instances are built from an existing Cursor; the cached properties below are
    stored per instance, which is why several helpers take care to hand back an
    already existing object instead of wrapping the same cursor again.
    """

    # All USRs start with 'c:'. Local USRs then have a filename+'@' followed by
    # an optional number+'@'. Global USRs just start with 'c:@'
    _USR_GLOBALIZER_REGEX = re.compile(r"c:[\w\.\-]+@(\d+@)?")

    # CursorKinds that represent types. For these we prefer definition locations.
    # This was decided by manually examining the unique kinds from the output.
    TYPE_KINDS = {
        CursorKind.ENUM_DECL,
        CursorKind.STRUCT_DECL,
        CursorKind.UNION_DECL,
        CursorKind.CLASS_DECL,
        CursorKind.CLASS_TEMPLATE,
        CursorKind.CLASS_TEMPLATE_PARTIAL_SPECIALIZATION,
        # Unsure about these:
        # CursorKind.TYPE_ALIAS_DECL,
        # CursorKind.TYPEDEF_DECL,
        # CursorKind.TYPE_ALIAS_TEMPLATE_DECL,
    }

    def __init__(self, c: Cursor):
        """Build a decorated copy of ``c`` that shares its translation unit."""
        # Unfortunately, need to decompose and have Cursor constructor recompose.
        super().__init__(c._kind_id, c.xdata, c.data)
        self._tu = c._tu

    @staticmethod
    @cache
    def normalize(c: Cursor):
        """Map ``c`` to the single declaration used to represent it in the output.

        Template instantiations are walked back to their template, and the result is
        the canonical declaration, except that types prefer their definition when
        that definition lives in a ``.h`` file. Must not be called on namespaces.
        """
        assert c.kind != CursorKind.NAMESPACE  # Should not be called with this.

        # unresolve implicit instantiations
        while templ := c.specialized_template:
            # Clang unfortunate behavior: The method to "unspecialize" a template
            # will both go from implicit instantiation to the template *and* go from
            # an explicit specialization to the primary template. Ideally, we
            # would only do the first, but that isn't an option. So we try to fake
            # it by only using the result if the locations are the same. However,
            # in some cases (notably including template methods of a template class),
            # clang will jump from the definition to the declaration, and neither
            # orig.canonical or result.get_definition() works to get the same location.
            # So we compromise: fully unspecialize non-type templates (variables and
            # functions), but require the locs to match on types. This is important
            # because class specializations can have different members than their
            # primary template and we want to handle those correctly. We ignore all
            # child declarations of functions, so that isn't a problem there.
            # This still chokes on explicit and extern template instantiations, but
            # it isn't clear how to fix that.
            if c.kind in DecoratedCursor.TYPE_KINDS:
                if templ.location != c.location and templ.extent != c.extent:
                    templ_def = templ.get_definition()
                    if not templ_def or templ_def.location != c.location:
                        break
            c = templ

        usr = c.get_usr()
        definition = c.get_definition()
        # In clang terms, the "canonical" declaration is the first one seen in the TU.
        canonical = c.canonical
        assert canonical
        if c.kind not in (
            CursorKind.TYPEDEF_DECL,
            CursorKind.TYPE_ALIAS_DECL,
        ):  # Hit a clang bug :(
            assert canonical.get_usr() == usr
            if definition:
                assert definition.get_usr() == usr

        # For types, prefer the definition if it is in a header, otherwise use the canonical decl.
        c = canonical
        if c.kind in DecoratedCursor.TYPE_KINDS:
            if definition and definition.location.file.name.endswith(".h"):
                c = definition
        return DecoratedCursor(c)

    @cached_property
    def raw_parent(self):
        """The semantic parent as a DecoratedCursor, or None directly under the TU."""
        if is_tu(self.semantic_parent):
            # We never want to treat TUs as parents.
            return None
        assert self.semantic_parent
        return DecoratedCursor(self.semantic_parent)

    @cached_property
    def normalized_parent(self):
        """The parent after normalize(); namespaces are returned as-is. None if no parent."""
        if not self.raw_parent:
            return None
        if self.raw_parent.kind == CursorKind.NAMESPACE:
            return self.raw_parent  # Note: returning same object to share cached properties.
        return DecoratedCursor.normalize(self.raw_parent)

    @property
    def normalized_parents(self):
        """Yield each normalized ancestor, innermost first."""
        p = self.normalized_parent
        while p and not is_tu(p):
            yield p
            p = p.normalized_parent

    @cached_property
    def raw_usr(self):
        """The USR exactly as reported by get_usr()."""
        return self.get_usr()

    @cached_property
    def globalized_usr(self):
        """
        Removes the file and unique number clang adds to some USRs without external linkage.
        This includes (among other cases) anything that has a lambda as part of its type,
        and namespace-scope constant integers. This interferes with our normalizing of USRs
        because it breaks the rule that everything's USR starts with its parent's USR.
        Globalizing restores that property.

        I have manually verified that this does not cause problematic collisions between USRs.
        There were only 4 groups of declarations that ended up with the same USR after
        globalizing. 3 were all function-local lambdas that get filtered out with other
        function-local declarations, and the last was the decay operator for lambdas
        used to build a hand-rolled VTable in a class's private section.
        """
        usr = DecoratedCursor._USR_GLOBALIZER_REGEX.sub("c:@", self.raw_usr)
        return usr

    @cached_property
    def normalized_usr(self):
        """
        Like globalized_usr, but replaces the raw_parent's USR prefix with the
        normalized_parent's USR
        """
        usr = self.globalized_usr
        if not usr or self.kind == CursorKind.NAMESPACE or not self.raw_parent:
            # Namespaces don't undergo any normalization, so we can break the cycle here.
            return usr

        assert usr.startswith(self.raw_parent.globalized_usr)
        return self.normalized_parent.normalized_usr + usr[len(self.raw_parent.globalized_usr) :]

    @cached_property
    def definition(self):
        """The cursor's definition as a DecoratedCursor, or None if none is visible.

        Returns ``self`` when this cursor already is the definition so that the
        cached properties computed on it are reused.
        """
        d = self.get_definition()
        if not d:
            return None
        if d == self:
            return self  # keep cache
        # Wrap the definition that was found. Wrapping ``self`` here would make every
        # declaration look like its own definition to callers.
        return DecoratedCursor(d)

    @property  # no need to cache
    def has_definition(self):
        """Whether a definition for this cursor is visible."""
        return self.definition is not None

    @cached_property
    def string_for_context(self):
        """Short "<KIND> <qualified spelling>" label used to describe a usage context."""
        if self.kind == CursorKind.STATIC_ASSERT:
            return self.kind.name
        else:
            return f"{self.kind.name} {fully_qualified(self, 'spelling')}"
# Pattern for "detail"/"internal" namespace names (with an optional trailing "s"), anchored
# at the end of the string. get_visibility() applies it to a namespace's spelling.
DETAIL_REGEX = re.compile(r"(detail|internal)s?$")
@dataclass
class GetVisibilityResult:
    """Outcome of get_visibility() for one cursor."""

    # Visibility name: the annotation's attribute ("open", "public", "private", "file_private",
    # "needs_replacement" or "use_replacement"), a default, or "UNKNOWN" when nothing applied.
    attr: str
    # Replacement API text that accompanies "use_replacement"; None otherwise.
    alt: str | None
    # Cursor whose annotation or default produced this result.
    parent: DecoratedCursor | None  # only None for UNKNOWN
    # Most recent non-namespace cursor seen on the way from the queried cursor to ``parent``.
    non_ns_parent: DecoratedCursor | None
def get_visibility(
    c: DecoratedCursor, scanning_parent=False, last_non_ns_parent=None
) -> GetVisibilityResult:
    """Work out which visibility applies to ``c``.

    The steps, in priority order:
      1. A ``mongo::mod::...`` annotation found among ``c``'s direct children.
      2. High-priority defaults for cursors whose file is in ``complete_headers``
         (internal/detail namespaces, ``_forTest`` names, private members).
      3. Whatever applies to ``c.normalized_parent``, found by recursing with
         ``scanning_parent=True``.
      4. A "private" default for an otherwise-unmarked cursor in a complete header.

    Args:
        c: The cursor to classify.
        scanning_parent: True while walking up from the originally queried cursor.
            "shallow" annotations are skipped and "open" is downgraded to "public"
            in that mode.
        last_non_ns_parent: Most recent non-namespace cursor seen so far on the walk.

    Returns:
        A GetVisibilityResult. Its ``attr`` is "UNKNOWN" (with no parents recorded)
        when the walk reaches a cursor without a normalized parent and nothing applied.
    """
    if c.kind != CursorKind.NAMESPACE:
        last_non_ns_parent = c

    # NOTE(review): Pattern.match() anchors at the start of the spelling, so together with the
    # "$" in DETAIL_REGEX only a namespace whose whole name is detail(s)/internal(s) matches,
    # while the diagnostic below says "ending in". Confirm which of the two is intended.
    is_internal_namespace = c.kind == CursorKind.NAMESPACE and DETAIL_REGEX.match(c.spelling)
    in_complete_header = normpath_for_file(c) in complete_headers

    # ideally this would be in an if c.has_attrs() block, but that seems to not work in all cases.
    # TODO: try again when on a newer clang. Also might be worth seeing if we can narrow down
    # the cases where it doesn't work.
    for child in c.get_children():
        if child.kind != CursorKind.ANNOTATE_ATTR:
            continue
        terms = child.spelling.split("::")
        # Only annotations spelled "mongo::mod::<attr>[::...]" are relevant. The pops strip
        # the two leading terms as a side effect of the check.
        if not (len(terms) >= 3 and terms.pop(0) == "mongo" and terms.pop(0) == "mod"):
            continue
        if terms[0] == "shallow":
            terms.pop(0)
            assert terms
            if scanning_parent:
                continue  # shallow doesn't apply to children
        attr = terms.pop(0)
        if is_internal_namespace and not attr.endswith("private"):
            perr(
                pretty_location(c.location)
                + ": namespaces ending in 'detail(s)' or 'internal(s)' are implicitly private."
                + " Attributes other than MONGO_MOD_FILE_PRIVATE are ignored."
            )
            # Stop looking at annotations and fall through to the defaults below.
            break
        if terms:
            # Anything left after the attribute name is the replacement API.
            alt = "::".join(terms)
            assert attr in ("use_replacement",)
            # Must specify an alternate API
            if not alt or alt.isspace():
                perr_exit(
                    pretty_location(c.location)
                    + ": MONGO_MOD_USE_REPLACEMENT() must specify the replacement API"
                )
        else:
            alt = None
            assert attr in (
                "open",
                "public",
                "private",
                "file_private",
                "needs_replacement",
            )
        if attr == "open" and scanning_parent:
            # "open" only applies to the current class and makes semantic children public.
            attr = "public"
        return GetVisibilityResult(attr, alt, c, last_non_ns_parent)

    # Apply high-priority defaults that override parent's visibility.
    if in_complete_header:
        # details and internal namespaces
        if is_internal_namespace:
            return GetVisibilityResult("private", None, c, last_non_ns_parent)

        if c.spelling.endswith("_forTest"):
            return GetVisibilityResult("file_private", None, c, last_non_ns_parent)

        if not scanning_parent:
            # TODO consider making PROTECTED also default to module private
            if c.access_specifier == AccessSpecifier.PRIVATE:
                return GetVisibilityResult("private", None, c, last_non_ns_parent)

    if c.normalized_parent:
        parent_vis = get_visibility(
            c.normalized_parent, scanning_parent=True, last_non_ns_parent=last_non_ns_parent
        )
    else:
        parent_vis = GetVisibilityResult("UNKNOWN", None, None, None)  # break recursion

    # Apply low-priority defaults that defer to parent's visibility
    if not scanning_parent and parent_vis.attr == "UNKNOWN" and in_complete_header:
        return GetVisibilityResult("private", None, c, last_non_ns_parent)

    return parent_vis
# Normalized paths of included files whose text mentions "mongo/util/modules.h".
# Filled in by parseTU(); get_visibility() applies its defaults only to these.
complete_headers = set[str]()
# Normalized paths of included files whose text instead mentions
# "mongo/util/modules_incompletely_marked_header.h". Filled in by parseTU().
incomplete_headers = set[str]()
def make_vis_from(c: DecoratedCursor | None):
    """Summarize the cursor a visibility came from as a plain dict, or None if there is none."""
    if c:
        return dict(
            usr=c.normalized_usr,
            display_name=fully_qualified(c),
            kind=c.kind.name,
            loc=pretty_location(c.location),
            mod=mod_for_file(c.location.file),
        )
    return None
@dataclass
class Decl:
    """Flat, serializable record describing one declaration and where it is used from."""

    display_name: str
    usr: str
    raw_usr: str
    # mangled_name: str
    loc: str
    kind: str
    mod: str | None
    linkage: str
    defined: bool
    spelling: str
    visibility: str
    # NOTE: from_cursor() stores None in the following five fields when there is nothing to
    # record, even though the annotations do not say so.
    alt: str
    vis_from: dict[str, str]
    vis_from_non_ns: dict[str, str]
    sem_par: str
    lex_par: str
    # Maps a using module to the set of usage descriptions; excluded from equality.
    used_from: dict[str, set[str]] = dataclasses.field(default_factory=dict, compare=False)

    def def_or_decled(self) -> str:
        """Return "defined" or "declared", for use in diagnostics."""
        if self.defined:
            return "defined"
        return "declared"

    @staticmethod
    def from_cursor(c: Cursor, mod=None):
        """Build a Decl from ``c``; ``mod`` overrides the module looked up from c's file."""
        cursor = c if isinstance(c, DecoratedCursor) else DecoratedCursor(c)
        vis = get_visibility(cursor)

        semantic = cursor.normalized_parent
        lexical = cursor.lexical_parent
        fields = dict(
            display_name=fully_qualified(cursor),
            spelling=cursor.spelling,
            usr=cursor.normalized_usr,
            raw_usr=cursor.raw_usr,
            # mangled_name=cursor.mangled_name,
            loc=pretty_location(cursor.location),
            linkage=cursor.linkage.name,
            kind=cursor.kind.name,
            mod=mod or mod_for_file(cursor.location.file),
            defined=cursor.has_definition,
            visibility=vis.attr,
            alt=vis.alt,
            vis_from=make_vis_from(vis.parent),
            vis_from_non_ns=make_vis_from(vis.non_ns_parent),
            sem_par=semantic.normalized_usr if semantic else None,
            # The translation unit itself is never recorded as a parent.
            lex_par=None if is_tu(lexical) else DecoratedCursor(lexical).normalized_usr,
        )
        return Decl(**fields)
def pretty_location(loc: clang.SourceLocation | clang.Cursor):
    """Format a source location (or a cursor's location) as ``file:line:column``."""
    if isinstance(loc, Cursor):
        cursor = loc
        # Clang bug: For some reason, usages of conversion operators lack a
        # location, but have an extent. Use the start of the extent instead.
        loc = cursor.location if cursor.location.file else cursor.extent.start

    # NOTE: not using normpath_for_file() here because we don't want to convert
    # bazel-out/blah/src/mongo/beep to src/mongo/beep. All paths output by pretty_location
    # should be relative to the repo root. This is important for the browser to be able to
    # load the file. We still want to use os.path.normpath to fix up foo/bar/../baz to foo/baz.
    if loc.file:
        name = os.path.normpath(loc.file.name)
    else:
        name = "<unknown>"
    # return f"{name}({loc.line},{loc.column})"  # MSVC format
    return f"{name}:{loc.line}:{loc.column}"  # gcc format
# Every Decl recorded so far, keyed by its normalized USR. main() serializes the values.
decls = dict[str, Decl]()
def fully_qualified(c: DecoratedCursor, kind: Literal["displayname", "spelling"] = "displayname"):
    """Return the ``::``-joined qualified name of ``c``.

    ``kind`` selects which cursor attribute supplies each component. A leading
    ``mongo`` component is dropped; any other outermost scope gets a leading ``::``.
    """
    components = []
    for scope in itertools.chain((c,), c.normalized_parents):
        text = getattr(scope, kind)
        if not text:
            continue
        # Methods carry their cv/ref qualifiers as part of the name.
        if scope.is_const_method():
            text += " const"
        qualifier = scope.type.get_ref_qualifier()
        if qualifier == RefQualifierKind.LVALUE:
            text += " &"
        elif qualifier == RefQualifierKind.RVALUE:
            text += " &&"
        components.append(text)

    if not components:
        return ""

    # Components are innermost-first here, so the last one is the outermost scope.
    if components[-1] == "mongo":
        del components[-1]
    else:
        components.append("")
    return "::".join(reversed(components))
def add_decl(d: Decl):
    """Record ``d`` in ``decls``, reconciling it with any Decl already stored for its USR.

    A first sighting is stored as-is. For a repeat sighting, a warning is written to
    stderr if the module differs, and a defined Decl replaces a previously stored
    undefined one (inheriting its recorded usages).
    """
    if d.usr not in decls:
        decls[d.usr] = d
        return

    old = decls[d.usr]
    if old.mod != d.mod:
        perr(
            f"{d.loc}:warning: {d.kind} {d.display_name} {d.def_or_decled()} in module {d.mod} "
            + f"after previously being {old.def_or_decled()} in module {old.mod}"
        )
        perr(f"{old.loc}:note: prior definition here")

    if d.defined and old.defined:
        # print(d.kind)
        # print(d.kind == CursorKind.TYPEDEF_DECL)
        # if d.kind == CursorKind.TYPEDEF_DECL:
        #     return  # TODO: how to handle this?
        if d == old:
            return  # it doesn't matter, ignore it
        # NOTE(review): as written, a differing duplicate is ignored unless its name matches
        # one of the listed special cases or is anonymous, and only those fall through to the
        # report and the assert below. That reads as inverted; confirm the intent.
        if not any(
            special_case in d.display_name
            for special_case in ("(unnamed ", "UFDeductionHelper", "<IsConst, IndexScanStats>")
        ) and not d.spelling.startswith("(anonymous "):
            return  # ignore

        print("detected duplicate definitions!")
        print(d.loc, d)
        print(old.loc, old)
        assert not (d.defined and old.defined)

    if d.defined and not old.defined:
        # Prefer the definition, carrying over usages recorded against the declaration.
        assert not d.used_from
        d.used_from = old.used_from
        decls[d.usr] = d
    # TODO consider merging otherwise?
# These are completely skipped during decl finding
# (find_decls() does not descend into children of these kinds).
skip_kinds = {
    # parameters
    CursorKind.PARM_DECL,
    CursorKind.TEMPLATE_TYPE_PARAMETER,
    CursorKind.TEMPLATE_NON_TYPE_PARAMETER,
    CursorKind.TEMPLATE_TEMPLATE_PARAMETER,
    # Function bodies
    CursorKind.COMPOUND_STMT,
    CursorKind.CXX_TRY_STMT,
    # Useless
    CursorKind.CXX_ACCESS_SPEC_DECL,  # doesn't have children
    CursorKind.STATIC_ASSERT,
    #
    # TODO Consider for future for things like hidden friends
    CursorKind.FRIEND_DECL,
}
# Module names excluded from recording in find_usages() and from the output in main().
# Empty in this file.
skip_mods: tuple[str, ...] = ()
def find_decls(mod: str, c: Cursor):
    """Recursively record every named, non-namespace declaration under ``c`` via add_decl()."""
    source_file = c.location.file
    if source_file:
        assert mod_for_file(source_file) == mod  # maybe

    is_named_decl = c.kind.is_declaration() and c.kind != CursorKind.NAMESPACE and c.spelling
    if is_named_decl:
        add_decl(Decl.from_cursor(c))

    # Alias templates are recorded but never descended into.
    if c.kind == CursorKind.TYPE_ALIAS_TEMPLATE_DECL:
        return

    for child in c.get_children():
        if child.kind in skip_kinds or child.kind.is_attribute():
            continue
        find_decls(mod, child)
# Cursor kinds treated as functions: is_local_decl() uses them to detect function-local
# declarations, and they are part of context_kinds.
function_kinds = {
    CursorKind.CONSTRUCTOR,
    CursorKind.CONVERSION_FUNCTION,
    CursorKind.CXX_METHOD,
    CursorKind.DESTRUCTOR,
    CursorKind.FUNCTION_DECL,
    CursorKind.FUNCTION_TEMPLATE,
}
def is_local_decl(c: Cursor):
    """Return True if ``c`` is declared inside a function (at any nesting depth).

    Only cursors with no linkage or internal linkage can qualify; everything else
    returns False without walking the parent chain.
    """
    # Checking linkage first avoids doing expensive check for things we know can't be local.
    if c.linkage not in (LinkageKind.NO_LINKAGE, LinkageKind.INTERNAL):
        return False

    # The original `assert c.kind.is_declaration` lacked the call parentheses, so it tested a
    # bound method and could never fail. It is checked here, after the linkage filter, because
    # libclang reports invalid linkage for non-declarations: referents such as label statements
    # therefore keep returning False above instead of tripping the assertion.
    assert c.kind.is_declaration()

    # Important: this skips over the input c itself, since we don't want to consider
    # functions as local decls, unless they are inside of another function.
    while (c := c.semantic_parent) and not is_tu(c):
        if c.kind in function_kinds:
            return True
    return False
# Cursor kinds that become the "context" reported for usages found beneath them
# (see find_usages()).
context_kinds = (
    function_kinds
    | DecoratedCursor.TYPE_KINDS
    | {  # Type Aliases
        CursorKind.TYPE_ALIAS_DECL,
        CursorKind.TYPEDEF_DECL,
        CursorKind.TYPE_ALIAS_TEMPLATE_DECL,
    }
    | {  # Misc
        CursorKind.FIELD_DECL,  # Member variables
        CursorKind.UNEXPOSED_DECL,  # template variables
        CursorKind.CONCEPT_DECL,
    }
)
# These are only considered for context at namespace scope (when context is None)
namespace_scope_context_kinds = {
    CursorKind.VAR_DECL,
    CursorKind.STATIC_ASSERT,
}
def find_usages(mod: str, c: Cursor, context: DecoratedCursor | None):
    """Recursively record which declarations are referenced from ``c`` and its children.

    Children are visited first. If ``c`` itself references a declaration that passes
    all of the filters below, a "<location> <context>" string is added to that
    declaration's ``used_from[mod]`` set, creating its Decl in ``decls`` on first use.

    Args:
        mod: Module that the code being scanned belongs to.
        c: Cursor to scan.
        context: Nearest enclosing cursor of a context kind, or None at namespace scope.

    Exits the process (via perr_exit) when a MONGO_MOD annotation appears in a file that is
    in neither header set, or on a disallowed cross-module base class.
    """
    if c.kind == CursorKind.ANNOTATE_ATTR and c.spelling.startswith("mongo::mod::"):
        if not any(normpath_for_file(c) in s for s in (complete_headers, incomplete_headers)):
            perr_exit(
                f"{pretty_location(c)}:ERROR: usage of MONGO_MOD macro without directly including "
                + '"mongo/util/modules.h" or modules_incompletely_marked_header.h'
            )

    # Entering a context kind replaces the context handed down to the children.
    if c.kind in context_kinds or (context is None and c.kind in namespace_scope_context_kinds):
        context = DecoratedCursor(c)

    ref = c.referenced

    # Handle children first. This makes it possible to use early returns below
    for child in c.get_children():
        # Don't count friendship as a "usage". This causes problems since the friend decl
        # becomes the canonical decl for the type for any TU that doesn't see the definition.
        # "Hidden friend" definitions *are* traversed.
        if c.kind == CursorKind.FRIEND_DECL and not child.is_definition():
            return

        assert child != c
        assert ref is None or child != ref or ref.kind == CursorKind.OVERLOADED_DECL_REF
        find_usages(mod, child, context)

    # Nothing referenced, or the cursor only refers to itself.
    if ref is None or ref == c:
        return

    # Referents of these kinds are never recorded.
    if ref.kind in (
        CursorKind.NAMESPACE,
        CursorKind.NAMESPACE_ALIAS,
        CursorKind.TEMPLATE_TEMPLATE_PARAMETER,
        CursorKind.TEMPLATE_TYPE_PARAMETER,
        CursorKind.TEMPLATE_NON_TYPE_PARAMETER,
        CursorKind.PARM_DECL,
        CursorKind.NO_DECL_FOUND,
    ):
        return

    if ref.kind == CursorKind.OVERLOADED_DECL_REF:
        # These come up when parsing a dependently-typed call. Unfortunately they
        # are not very useful, so they are one of many cases where we can't get
        # good info out of templates.
        assert not ref.get_usr()
        return

    # NOTE: This is for templated variables and their specializations. Ideally these
    # would be tracked, but we only have 27 template variables (4 of which are used
    # cross-module) and they are generating thousands of unique declarations because
    # libclang doesn't expose enough info for us to merge them well. This massively
    # skews the results because they are 10% of all decls!
    # TODO: we should at least check that private decls aren't used from the wrong mod
    #       before returning.
    if ref.kind == CursorKind.UNEXPOSED_DECL:
        return

    if not ref.canonical.location.file:
        # These are pre-declared in the compiler with no source location. In some cases,
        # they are redeclared in the stdlib, but canonicalization points them back
        # at the internal declaration. Make sure that this isn't causing us to skip
        # any first-party declarations.
        assert not ref.location.file or mod_for_file(ref.location.file) is None
        return

    if c.kind == CursorKind.CALL_EXPR and c.referenced.kind != CursorKind.CONSTRUCTOR:
        # For a call expression like a.b(c) or a::b(c) the whole thing is considered a reference of
        # a.b, but unfortunately the reported location is that of a, while we'd really want it to be
        # that of b. This was frequently resulting in two usage locations being reported for each
        # method call. Luckily, in most cases, the first child of the call expression (or one of its
        # transitive children) is the sub-expression a.b, which has a reference to b with the right
        # location. So we can safely ignore the call expression and rely on the post-order traversal
        # already adding a relevant used_from reference for this expression. The one exception is
        # that for constructor calls, the child refers to the *type* a::b, rather than the specific
        # constructor a::b::b() chosen, so we still need to add this location (even if not ideal) to
        # ensure that we mark the constructor's usage. I ran this with an assert to check that this
        # doesn't cause us to lose any usages, but it is too slow to keep (O(n^2) for n calls to a
        # method in a TU, causing some TUs to take several minutes).
        return

    if is_local_decl(ref):
        return

    # Unfortunately libclang's c api doesn't handle implicitly declared methods
    # well. In particular it often points at a location of a forward decl of the
    # class rather than the definition, even if both are visible. And then the
    # rest of our handling doesn't work correctly. And it also doesn't have a
    # way to distinguish implicit methods from explicitly defaulted ones. So we
    # just resolve all defaulted methods to the type and continue from there.
    if ref.is_default_method():
        ref = ref.semantic_parent

    # assert not c.location.file or mod_for_file(c.location.file) == mod
    ref = DecoratedCursor.normalize(ref)

    # Ignore any declarations not declared in a header.
    # TODO what if a local type is passed to a template? For now doesn't matter because we
    # don't look at usages from instantiations.
    if ref.location.file.name.endswith(".cpp") or ref.location.file.name.endswith(".cc"):
        return

    usr = ref.normalized_usr
    if not usr:
        return

    if usr in decls:
        # We've already done the work to get the info for this decl.
        d = decls[usr]
    else:
        decl_mod = mod_for_file(ref.location.file)
        if not decl_mod or decl_mod in skip_mods:
            return

        d = Decl.from_cursor(ref, decl_mod)
        decls[usr] = d
        # Only checked when the Decl is first created, since decl_mod exists only here.
        if ref.definition and ref != ref.definition:
            def_mod = mod_for_file(ref.definition.location.file)
            # Note def_mod is None means third_party, not __NONE__ module
            if def_mod != decl_mod and def_mod is not None:
                print(f"WARNING: {d.display_name} is declared and defined in different modules")
                print(f"  decl: {pretty_location(ref)} ({decl_mod})")
                print(f"  defn: {pretty_location(ref.definition)} ({def_mod})")

    # ignore usages from the same module
    # if d.mod == mod or mod.startswith(d.mod):
    #     return

    # if this fails, something is missing in context_kinds or namespace_scope_context_kinds
    assert context
    usage = f"{pretty_location(c.location)} {context.string_for_context}"
    d.used_from.setdefault(mod, set()).add(usage)

    # Check that cross-module inheritance only uses "open" bases that explicitly chose to allow it.
    # For now we also allow "UNKNOWN" visibility, because that means that the base class hasn't been
    # marked for visibility. When the owner of that class gets to it, they see errors if other modules
    # inherit from it, and can then decide whether to mark it as open or not.
    # NOTE: the way that this check is implemented, it allows trivially bypassing by using something like
    # class Bad : public std::type_identity_t<ClosedBase> {}; but we are trying to catch accidental misuses,
    # not malicious ones. If we find this is a problem, we can probably improve it by asking clang to
    # resolve any typedefs. It won't help if the consumer is a template and the base is a dependent type,
    # but that is a general problem with the scanner.
    if c.kind == CursorKind.CXX_BASE_SPECIFIER and not (
        d.visibility in ("open", "UNKNOWN") or d.mod == mod or mod.startswith(d.mod)
    ):
        perr_exit(
            f"ERROR: {d.display_name} is used as a base class of {fully_qualified(context)}, "
            f"but they are in different modules. This is only allowed for classes marked MONGO_MOD_OPEN."
            f"\n    base: {d.loc} ({d.mod})"
            f"\n    child: {pretty_location(c)} ({mod})"
            f"\n    If you think that the base should be open, please contact the owner."
            # TODO extract the slack channel from the module metadata and add it here
        )
# Cursors that ast() has already expanded; a repeat visit gets only a short stub entry.
seen = set[Cursor]()
def ast(node: Cursor):
    """Convert ``node`` and everything reachable from it into nested dicts for debugging.

    A node that was already expanded (tracked in ``seen``) is emitted as a short stub.
    Children are limited to those with no file or under ``src/mongo/``, and compound
    statements are left out.
    """
    specialized = node.specialized_template
    node_usr = node.get_usr()
    if node in seen:
        return {
            "b_kind": node.kind.name,
            "c_usr": node_usr,
            "d_display": node.displayname,
            "e_location": pretty_location(node.location),
        }
    seen.add(node)

    filter_children = True  # toggle filtering

    def wanted(child: Cursor):
        # Children without a file are always kept; the rest must be first-party and
        # must not be function bodies.
        if child.location.file is None:
            return True
        if "src/mongo/" not in child.location.file.name:
            return False
        return child.kind != CursorKind.COMPOUND_STMT

    # Children are expanded before the reference and template entries below, which
    # determines which occurrence of a shared node gets the full expansion.
    children = [
        ast(child) for child in node.get_children() if not filter_children or wanted(child)
    ]

    sem_parent = node.semantic_parent
    referenced = node.referenced
    return {
        "b_kind": node.kind.name,
        "c_par_usr": str(sem_parent.get_usr() if sem_parent else None),
        "c_usr": str(node_usr),
        "d_display": str(node.displayname),
        "d_spelling": str(node.spelling),
        "e_location": pretty_location(node.location),
        "ee_mod": mod_for_file(node.location.file),
        # "f_extent.start": str(node.extent.start),
        # "g_extent.end": str(node.extent.end),
        "h_is_definition": node.is_definition(),
        "h_is_decl": node.kind.is_declaration(),
        "h_linkage": node.linkage.name,
        "z_ref": ast(referenced) if referenced and referenced != node else None,
        "z_templ": ast(specialized) if specialized else None,
        "zz_children": children,
    }
class Timer:
    """Reports time elapsed since construction at labelled checkpoints."""

    def __init__(self):
        self.start = datetime.now()

    def mark(self, label: str):
        """Print ``label`` and the elapsed time; silent unless ``is_local`` is set."""
        if not is_local:
            return
        print(f"{label}: {datetime.now() - self.start}")
# Process-wide timer, started when the module is loaded; the functions below call timer.mark().
timer = Timer()
def parseTU(args: list[str] | str):
    """Parse one translation unit with libclang and classify the headers it includes.

    Args:
        args: A source file name (a string or a one-element list), whose compile
            command is looked up in the compilation database in the current
            directory, or a full compiler argument list.

    Returns:
        The parsed TranslationUnit. As a side effect, ``complete_headers`` and
        ``incomplete_headers`` are filled in for included files under ``src/mongo``.

    Raises:
        RuntimeError: if libclang returns no translation unit.

    Exits the process (via perr_exit) if libclang.so cannot be found or if the file
    does not have exactly one compile command.
    """
    if not Config.loaded:
        Config.set_compatibility_check(False)
        external = "external" if os.path.exists("external") else "bazel-out/../../../external"
        paths_to_try = [
            f"{external}/mongo_toolchain_v5/v5/lib/libclang.so",
            f"{external}/mongo_toolchain_v4/v4/lib/libclang.so",
            f"{external}/mongo_toolchain/v4/lib/libclang.so",
        ]
        for path in paths_to_try:
            if os.path.exists(path):
                Config.set_library_file(path)
                break
        else:
            path_lines = "\n\t".join(paths_to_try)  # can't have \ in f-string expr
            perr_exit(f"Unable to find libclang.so. Paths tried:\n\t{path_lines}")
        # Config.set_library_file("/home/ubuntu/clang+llvm-19.1.1-aarch64-linux-gnu/lib/libclang.so")

    # isinstance() rather than an exact type comparison, so str subclasses are accepted too.
    if isinstance(args, str):
        args = [args]

    if len(args) == 1:
        # A single argument is a file name: fetch its compiler invocation.
        compdb = clang.CompilationDatabase.fromDirectory(".")
        commands = compdb.getCompileCommands(args[0])
        if commands is None:
            perr_exit(f"no compile commands for {args[0]}")
        if len(commands) != 1:
            perr_exit(f"too many compile commands for {args[0]}", commands)
        # print(" ".join(commands[0].arguments))
        args = list(commands[0].arguments)[1:]  # skip executable

    # somehow clang implicitly adds args that it warns about
    cleanArgs = ["-Wno-unused-command-line-argument"]
    for arg in args:
        # Drop dependency-file generation flags and the .d path that accompanies them.
        if arg in ("-MD", "-MMD", "-MF"):
            continue
        if arg.endswith(".d"):
            continue
        cleanArgs.append(arg)
        # print(arg)

    # Disable all warnings. Don't waste time on them when parsing.
    cleanArgs.append("-w")

    index = Index.create()
    timer.mark("preparse")
    tu = index.parse(None, cleanArgs)
    if not tu:
        raise RuntimeError("unable to load input")

    for d in tu.diagnostics:
        perr(d)
    timer.mark("parsed")

    for include in tu.get_includes():
        header = include.include
        # Guard against an inclusion without a file, as dump_unused_inputs() does.
        if not header or "src/mongo" not in header.name:
            continue

        # Note: using bytes to avoid unicode handling overhead since the
        # needles we are looking for are ascii-only.
        content = Path(header.name).read_bytes()
        if b'"mongo/util/modules.h"' in content:
            complete_headers.add(normpath_for_file(header))
        elif b'"mongo/util/modules_incompletely_marked_header.h"' in content:
            incomplete_headers.add(normpath_for_file(header))
    timer.mark("checked header completeness")

    return tu
def dump_unused_inputs(outPath: str, tu: TranslationUnit):
    """Write to ``outPath`` the sorted ``src/mongo`` headers that ``tu`` did not include."""
    # only looking in src/mongo to cut down on resources, and to reduce the risk of accidentally
    # including some file we shouldn't. Assumption is that third_party and generated sources won't
    # change in a tight feedback loop.
    candidates = set(glob("src/mongo/**/*.h", recursive=True))
    timer.mark("globbed")

    included = {inc.include.name for inc in tu.get_includes() if inc.include}
    unused = sorted(candidates - included)

    with open(outPath, "w") as out:
        out.write("\n".join(unused))
    timer.mark("outfile written")
def main():
    """Scan one translation unit and write the declarations it uses to the output file.

    The command line is either a single source file name or a full compiler
    argument list (see parseTU()). Environment variables read here:

      MOD_SCANNER_UNUSED: if set, path that receives the list of unused headers.
      DUMP_AST: if set (and no output path came from the environment), dump the
          AST to ast.yaml for debugging.

    The output goes to the path in ``out_from_env``, or ``decls.yaml`` by default.
    A ``.zst`` suffix selects zstd compression; the name without that suffix must
    end in ``.json`` or ``.yaml`` and selects the format.
    """
    args = sys.argv[1:]
    if not args:
        perr_exit("invalid number of arguments")

    if len(args) == 1 and args[0].startswith("--"):
        perr_exit(f"{sys.argv[0]} doesn't support runtime options")

    tu = parseTU(args)

    if unused_input_path := os.environ.get("MOD_SCANNER_UNUSED", None):
        dump_unused_inputs(unused_input_path, tu)

    assert is_tu(tu.cursor)

    if "DUMP_AST" in os.environ and is_local:  # useful for debugging (never on bazel)
        out = ast(tu.cursor)
        timer.mark("ast processed")
        with open("ast.yaml", "w") as f:
            yaml.dump(out, f, Dumper=Dumper)
        timer.mark("ast dumped")

    # for top_level in tu.cursor.get_children():
    #     if "src/mongo/" not in top_level.location.file.name:
    #         continue
    #     find_decls(mod_for_file(top_level.location.file), top_level)
    # timer.mark("found decls")

    for top_level in tu.cursor.get_children():
        top_level_file = top_level.location.file
        # A cursor without a file cannot be first-party code; skip it rather than
        # failing on the attribute access.
        if top_level_file is None or "src/mongo/" not in top_level_file.name:
            continue
        find_usages(mod_for_file(top_level_file), top_level, None)
    timer.mark("found usages")

    out_file_name = out_from_env if out_from_env else "decls.yaml"
    compressed = out_file_name.endswith(".zst")
    if compressed:
        uncompressed_file_name = out_file_name[: -len(".zst")]
        open_func = functools.partial(pyzstd.ZstdFile, write_size=2 * 1024 * 1024)
    else:
        uncompressed_file_name = out_file_name
        open_func = open

    with open_func(out_file_name, "w") as f:
        out = [dict(d.__dict__) for d in decls.values() if d.mod not in skip_mods]
        for decl in out:
            # del decl["spelling"]
            del decl["linkage"]
            del decl["raw_usr"]  # Can be helpful when debugging but not worth aggregating.
            # del decl["defined"]
            # Sets are not serializable; sorting also makes the output deterministic.
            decl["used_from"] = {k: sorted(v) for k, v in decl["used_from"].items()}

        # This makes us only output decls used cross-module. It makes merging much faster,
        # but, it means that we can mask some cross-module usages if something is forward
        # declared in the wrong module. Also this hides definitions from the
        # merger so it can't choose canonical versions. There is still the problem of
        # definitions not used from any TU where they are defined.
        if 0:
            for decl in out:
                if decl["mod"] in decl["used_from"]:
                    del decl["used_from"][decl["mod"]]
            out = list(filter(lambda d: d["used_from"], out))
        timer.mark("processed")

        if uncompressed_file_name.endswith(".json"):
            payload = json.dumps(out)
            # The zstd stream takes bytes, but a plain open(..., "w") file is in text
            # mode and takes str; writing bytes to it raises TypeError.
            f.write(payload.encode() if compressed else payload)
        else:
            # Check the name without ".zst"; checking out_file_name rejected every
            # compressed YAML output.
            assert uncompressed_file_name.endswith(".yaml")
            yaml.dump(out, f, Dumper=Dumper)

    timer.mark("dumped")
# Run the scanner only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
# cspell: words perr decled displayname cindex templ defn