mongo/modules_poc/merge_decls.py

212 lines
6.9 KiB
Python
Executable File

#!/usr/bin/env python3
import glob
import json
import multiprocessing
import os
from concurrent.futures import ProcessPoolExecutor, as_completed, wait
from datetime import datetime
from typing import Any, TypedDict
import typer # nicer error dump on exceptions
import yaml
from progressbar import ProgressBar, progressbar
try:
from yaml import CDumper as Dumper
from yaml import CLoader as Loader # noqa: F401 Not used right now but may be again
except ImportError:
raise RuntimeError("Why no cYaml?")
# from yaml import Loader, Dumper
class _DeclOptionalFields(TypedDict, total=False):
    """Keys of a ``Decl`` that may be absent from a record."""

    # Maps a module name to a set of locations. The merging code reads this
    # key with ``decl.get("other_mods", {})`` and only stores it when the
    # merged mapping is non-empty, so it must not be a required key.
    other_mods: dict[str, set[str]]  # merged


class Decl(_DeclOptionalFields):
    """One declaration record, as loaded from JSON and merged by ``merge_decl``."""

    # Name shown for the declaration; sort key for the unowned listing.
    display_name: str
    # Declaration kind string, e.g. "CXX_METHOD" or "FIELD_DECL".
    kind: str
    # Location of the declaration; sort key for the YAML listing.
    loc: str
    # Module the declaration is assigned to ("__NONE__" marks an unowned decl).
    mod: str
    # Maps a module name to the set of locations the decl is used from.
    used_from: dict[str, set[str]]  # merged
    # Identifier used as the key when merging records for the same declaration.
    usr: str
    # A record with defined=True replaces one with defined=False as the primary.
    defined: bool
# Process-wide table of merged declarations, keyed by each decl's "usr".
# merge_decl() fills it in; worker() returns its values to the parent process
# and main() reads it to build the output files.
all_decls: dict[str, Decl] = {}
def merge_decls(decls: list[Decl]):
    """Merge every record in ``decls`` into ``all_decls``, one at a time."""
    pending = iter(decls)
    # Drop this frame's reference to the input list before doing any work so
    # that the whole list is not shown as a local if merge_decl() raises.
    decls = []  # hide from traceback
    for record in pending:
        merge_decl(record)
def merge_decl(decl: Decl):
    """Fold one declaration record into the module-level ``all_decls`` table.

    Records are keyed by ``decl["usr"]``. The first record seen for a key is
    stored as-is (after normalizing ``used_from``). Later records for the same
    key have their ``used_from`` and ``other_mods`` mappings unioned into the
    stored record. If the new record is ``defined`` and the stored one is not,
    the new record becomes the primary and inherits the merged mappings.

    ``decl`` is mutated in place and may end up stored in ``all_decls``.

    Raises:
        AssertionError: if the new and stored records disagree on ``kind``,
            apart from a few known display-name prefixes.
    """
    other_mods = decl.get("other_mods", {})
    used_from = decl["used_from"]
    usr = decl["usr"]

    if usr not in all_decls:
        # First time seeing this decl - no merging needed, but bring
        # used_from into the same shape the merge path below produces:
        # every value a set, and a falsy module key stored as "__NONE__".
        for mod, locs in used_from.items():
            if not isinstance(locs, set):
                used_from[mod] = set(locs)
        for mod in [m for m in used_from if not m]:
            used_from.setdefault("__NONE__", set()).update(used_from.pop(mod))
        all_decls[usr] = decl
        return

    old = all_decls[usr]

    # Merge used_from into old_used_from
    old_used_from = old["used_from"]
    for mod, locs in used_from.items():
        old_used_from.setdefault(mod or "__NONE__", set()).update(locs)

    # Merge other_mods into old_other_mods. A value may be a set of locations
    # or a single location.
    old_other_mods = old.get("other_mods", {})
    for other, val in other_mods.items():
        bucket = old_other_mods.setdefault(other, set())
        if isinstance(val, set):
            bucket.update(val)
        else:
            bucket.add(val)

    mod = decl["mod"]
    if decl["defined"] and not old["defined"]:
        # Make this the primary decl, even if from same mod
        all_decls[usr] = decl
        decl["used_from"] = old_used_from
        if mod != old["mod"]:
            old_other_mods.pop(mod, None)  # we are not an "other"
            # The previous primary is now one of the "other" modules.
            old_other_mods.setdefault(old["mod"], set()).add(old["loc"])
        if old_other_mods:
            decl["other_mods"] = old_other_mods
    else:
        if mod != old["mod"]:
            # Keep the stored primary; remember where the other module declared it.
            old_other_mods.setdefault(mod, set()).add(decl["loc"])
        if old_other_mods:
            old["other_mods"] = old_other_mods

    # Not asserted: decl["loc"] == old["loc"].
    assert (
        decl["kind"] == old["kind"]
        # These are weird special cases where it sometimes ends up on
        # CLASS_DECL rather than the CLASS_TEMPLATE. Not sure why?
        or decl["display_name"].startswith(
            ("StackBufBuilderBase", "Sorter", "SortIteratorInterface")
        )
    )
    # Not asserted: decl["display_name"] == old["display_name"].
    # TODO ugh sometimes mongo:: screws it up
class Timer:
    """Prints how much time has passed since the timer was created."""

    def __init__(self) -> None:
        # Reference point against which every mark() is measured.
        self.start = datetime.now()

    def mark(self, label: str) -> None:
        """Print ``label`` followed by the time elapsed since construction."""
        print(f"{label}: {datetime.now() - self.start}")
def worker(paths: list[bytes]):
    """Merge input files taken from ``paths`` until it is empty.

    ``paths`` is consumed with ``pop()`` rather than iterated, because main()
    passes the same managed list to every worker so that each file is taken
    by exactly one of them.

    Returns:
        The values of this process's ``all_decls`` table as a list, once
        ``paths`` has no entries left.
    """
    while True:
        try:
            path = paths.pop()
        except IndexError:
            # Nothing left to take: hand back everything merged so far.
            return list(all_decls.values())

        # Read bytes, not text: json detects the UTF-8/16/32 encoding itself,
        # so parsing does not depend on the locale's default text encoding.
        with open(path, "rb") as f:
            merge_decls(json.load(f))
def main(
    jobs: int = typer.Option(os.cpu_count() or 1, "--jobs", "-j"),
    intra_module: bool = typer.Option(False, help="Include intra-module accesses"),
    generate_yaml: bool = False,
):
    """Merge all ``*.mod_scanner_decls.json`` files under ``bazel-bin``.

    Writes ``merged_decls.json`` (decls that have at least one use),
    optionally ``merged_decls.yaml``, and ``unowned.yaml`` (decls whose module
    is ``"__NONE__"``) into the current directory, printing timings as it goes.

    Args:
        jobs: Number of worker processes; values of 1 or less process the
            inputs serially in this process.
        intra_module: Keep uses that come from the decl's own module instead
            of dropping them.
        generate_yaml: Also write ``merged_decls.yaml``.
    """
    timer = Timer()

    paths = glob.glob(b"bazel-bin/**/*.mod_scanner_decls.json", recursive=True)
    num_paths = len(paths)
    timer.mark("globbed")

    if jobs > 1:
        with multiprocessing.Manager() as manager:
            with ProcessPoolExecutor(jobs) as pool:
                # Every worker pops from the same managed list, so each input
                # file is handled by exactly one worker.
                shared_paths = manager.list(paths)
                workers = {
                    pool.submit(worker, shared_paths)  # type:ignore
                    for _ in range(jobs)
                }

                with ProgressBar(max_value=num_paths, prefix="processing inputs: ") as bar:
                    while True:
                        done, _ = wait(workers, timeout=0.1, return_when="FIRST_EXCEPTION")
                        for d in done:
                            exc = d.exception()
                            if exc is not None:
                                raise exc

                        # Estimate: files not yet taken, plus one file assumed
                        # to be in flight for each worker that has not finished.
                        remaining_files = len(shared_paths) + jobs - len(done)
                        # The estimate can exceed num_paths (e.g. before the
                        # workers have taken their first file), so never report
                        # negative progress.
                        bar.update(max(0, num_paths - remaining_files))
                        if remaining_files == 0:
                            break

                timer.mark("all paths consumed")
                for result in as_completed(workers):
                    merge_decls(result.result())
                timer.mark("merged results")
    else:
        for path in progressbar(paths):
            # Read bytes so json picks the encoding instead of the locale.
            with open(path, "rb") as f:
                merge_decls(json.load(f))
        timer.mark("processed input")

    if not intra_module:
        for decl in all_decls.values():
            if decl["mod"] in decl["used_from"]:
                del decl["used_from"][decl["mod"]]

    out: Any = [d for d in all_decls.values() if d["used_from"]]
    for decl in out:
        # go from {$MOD: $LOCS} map to [{mod: $MOD, locs: $LOCS}] list of
        # objects which is easier to work with in mongo aggregations
        decl["used_from"] = [{"mod": k, "locs": sorted(v)} for k, v in decl["used_from"].items()]  # type: ignore
        if "other_mods" in decl:
            decl["other_mods"] = {k: sorted(v) for k, v in decl["other_mods"].items()}  # type: ignore
    timer.mark("massaged output for json")

    with open("merged_decls.json", "w") as f:
        json.dump(out, f)
    timer.mark("dumped json")

    if generate_yaml:
        # The YAML output uses the {$MOD: $LOCS} map form again.
        for decl in out:
            decl["used_from"] = {u["mod"]: u["locs"] for u in decl["used_from"]}  # type: ignore

        # sort by file to make it easier to use
        out.sort(key=lambda d: d["loc"])
        timer.mark("massaged output for yaml")

        with open("merged_decls.yaml", "w") as f:
            yaml.dump(out, f, Dumper=Dumper, width=1000000)
        timer.mark("dumped yaml")

    # These are parts of other things (classes and enums) that should already be included.
    nested_kinds = ("CXX_METHOD", "CONSTRUCTOR", "ENUM_CONSTANT_DECL", "FIELD_DECL")
    # NOTE(review): decls that had no uses were not converted above, so their
    # "other_mods" values are still sets here while the others are sorted
    # lists - confirm whether unowned.yaml should be made uniform.
    out = [
        {k: v for k, v in d.items() if k != "used_from"}
        for d in all_decls.values()
        if d["mod"] == "__NONE__" and d["kind"] not in nested_kinds
    ]
    out.sort(key=lambda d: d["display_name"])
    timer.mark("massaged output for unowned.yaml")

    with open("unowned.yaml", "w") as f:
        yaml.dump(out, f, Dumper=Dumper, width=1000000)
    timer.mark("dumped unowned.yaml")
# Run main() through typer only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    typer.run(main)