# mongo/buildscripts/bazel_custom_formatter.py
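"""Custom Bazel lint checks for the mongo repository.

Validates that //:clang_tidy_config_files lists every .clang-tidy file under src/mongo,
that unittest targets carry the correct mongo_unittest_<group>_group tag, that
idl_generator targets follow the <stem>_gen / <stem>.idl naming convention, and that
header files are not listed in the srcs of mongo_cc_* targets.

Intended to be run from the repository root, e.g.
python buildscripts/bazel_custom_formatter.py --generate-report --fix
where --generate-report emits CI reports and --fix applies buildozer edits for the
checks that support auto-fixing.
"""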
import argparse
import hashlib
import json
import os
import platform
import stat
import subprocess
import sys
import time
import urllib.request
from collections import deque
from pathlib import Path
from typing import Dict, List
from retry import retry
sys.path.append(".")
from buildscripts.install_bazel import install_bazel
from buildscripts.simple_report import make_report, put_report, try_combine_reports
RELEASE_URL = "https://github.com/bazelbuild/buildtools/releases/download/v7.3.1/"
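# Ordinal rank for each unittest group name; used to iterate the groups in a stable
# first..eighth order when validating tags.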
groups_sort_keys = {
"first": 1,
"second": 2,
"third": 3,
"fourth": 4,
"fifth": 5,
"sixth": 6,
"seventh": 7,
"eighth": 8,
}
@retry(tries=3, delay=5)
def _download_with_retry(*args, **kwargs):
return urllib.request.urlretrieve(*args, **kwargs)
def determine_platform():
syst = platform.system()
pltf = None
if syst == "Darwin":
pltf = "darwin"
elif syst == "Windows":
pltf = "windows"
elif syst == "Linux":
pltf = "linux"
else:
raise RuntimeError("Platform cannot be inferred.")
return pltf
def determine_architecture():
arch = None
machine = platform.machine()
if machine in ("AMD64", "x86_64"):
arch = "amd64"
elif machine in ("arm", "arm64", "aarch64"):
arch = "arm64"
else:
raise RuntimeError(f"Detected architecture is not supported: {machine}")
return arch
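# Download the buildozer binary matching the host platform/architecture from RELEASE_URL
# and mark it executable for the current user.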
def download_buildozer(download_location: str = "./"):
operating_system = determine_platform()
architecture = determine_architecture()
if operating_system == "windows" and architecture == "arm64":
raise RuntimeError("There are no published arm windows releases for buildozer.")
extension = ".exe" if operating_system == "windows" else ""
binary_name = f"buildozer-{operating_system}-{architecture}{extension}"
url = f"{RELEASE_URL}{binary_name}"
file_location = os.path.join(download_location, f"buildozer{extension}")
_download_with_retry(url, file_location)
os.chmod(file_location, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
return file_location
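# Unittest targets are bucketed into eight groups by the first hex digit of the sha256 of
# their normalized path (leading "lib" and the file extension stripped before hashing), so
# a test's group assignment does not depend on platform-specific library prefixes or
# extensions.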
def find_group(unittest_paths):
groups = {
# group1
"0": "first",
"1": "first",
# group2
"2": "second",
"3": "second",
# group3
"4": "third",
"5": "third",
# group4
"6": "fourth",
"7": "fourth",
# group5
"8": "fifth",
"9": "fifth",
# group6
"a": "sixth",
"b": "sixth",
# group7
"c": "seventh",
"d": "seventh",
# group8
"e": "eighth",
"f": "eighth",
}
group_to_path: Dict[str, List[str]] = {}
for path in unittest_paths:
norm_path = path.replace(":", "/").replace("\\", "/")
if norm_path.startswith("//"):
norm_path = norm_path[2:]
if not norm_path.startswith("src/"):
print(f"ERROR: {path} not relative to mongo repo root")
sys.exit(1)
basename = os.path.basename(norm_path)
if basename.startswith("lib"):
basename = basename[3:]
ext = basename.find(".")
if ext != -1:
basename = basename[:ext]
dirname = os.path.dirname(norm_path)
hash_path = os.path.join(dirname, basename).replace("\\", "/")
first_char = hashlib.sha256(hash_path.encode()).hexdigest()[0]
group = groups[first_char]
if group not in group_to_path:
group_to_path[group] = []
group_to_path[group].append(path)
return json.dumps(group_to_path, indent=4)
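# Return every group whose membership list contains `test`; a test tagged in more than one
# group yields multiple entries.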
def find_multiple_groups(test, groups):
tagged_groups = []
for group in groups:
if test in groups[group]:
tagged_groups.append(group)
return tagged_groups
def iter_clang_tidy_files(root: str | Path) -> list[Path]:
"""Return a list of repo-relative Paths to '.clang-tidy' files.
- Uses os.scandir for speed
- Does NOT follow symlinks
"""
root = Path(root).resolve()
results: list[Path] = []
stack = deque([root])
while stack:
current = stack.pop()
try:
with os.scandir(current) as it:
for entry in it:
name = entry.name
if entry.is_dir(follow_symlinks=False):
stack.append(Path(entry.path))
elif entry.is_file(follow_symlinks=False) and name == ".clang-tidy":
# path relative to root
results.append(Path(entry.path).resolve().relative_to(root))
except PermissionError:
continue
return results
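# Keep the srcs of //:clang_tidy_config_files in sync with the .clang-tidy files on disk:
# every directory under src/mongo containing a .clang-tidy file is expected to contribute a
# //<dir>:clang_tidy_config target. The current srcs are read back with buildozer, whose
# "print label srcs" output is parsed below assuming a single line roughly of the form
#   //:clang_tidy_config_files [//src/mongo/db:clang_tidy_config //src/mongo/util:clang_tidy_config]
# (the example labels are illustrative).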
def validate_clang_tidy_configs(generate_report, fix):
buildozer = download_buildozer()
mongo_dir = "src/mongo"
tidy_files = iter_clang_tidy_files(mongo_dir)
p = subprocess.run(
[buildozer, "print label srcs", "//:clang_tidy_config_files"],
capture_output=True,
text=True,
)
tidy_targets = None
for line in p.stdout.splitlines():
if line.startswith("//") and line.endswith("]"):
tokens = line.split("[")
tidy_targets = tokens[1][:-1].split(" ")
break
if tidy_targets is None:
print(p.stderr)
raise Exception(f"could not parse tidy config targets from '{p.stdout}'")
if tidy_targets == [""]:
tidy_targets = []
all_targets = []
for tidy_file in tidy_files:
tidy_file_target = (
"//" + os.path.dirname(os.path.join(mongo_dir, tidy_file)) + ":clang_tidy_config"
)
all_targets.append(tidy_file_target)
if all_targets != tidy_targets:
msg = f"Incorrect clang tidy config targets: {all_targets} != {tidy_targets}"
print(msg)
if generate_report:
report = make_report("//:clang_tidy_config_files", msg, 1)
try_combine_reports(report)
put_report(report)
if fix:
subprocess.run(
[buildozer, f"set srcs {' '.join(all_targets)}", "//:clang_tidy_config_files"]
)
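# Cross-check the mongo_unittest_<group>_group tags in BUILD files against the groups
# computed by find_group(): every mongo_unittest target must carry exactly the tag for its
# hash-assigned group. With --fix, mismatches are corrected via buildozer
# "add tags ..." / "remove tags ..." commands.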
def validate_bazel_groups(generate_report, fix):
buildozer = download_buildozer()
bazel_bin = install_bazel(".")
query_opts = [
"--implicit_deps=False",
"--tool_deps=False",
"--include_aspects=False",
"--bes_backend=",
"--bes_results_url=",
]
try:
start = time.time()
sys.stdout.write("Query all unittest binaries... ")
sys.stdout.flush()
query_proc = subprocess.run(
[
bazel_bin,
"query",
r'kind(extract_debug, attr(tags, "[\[ ]mongo_unittest[,\]]", //src/...))',
]
+ query_opts,
capture_output=True,
text=True,
check=True,
)
bazel_unittests = query_proc.stdout.splitlines()
sys.stdout.write("{:0.2f}s\n".format(time.time() - start))
except subprocess.CalledProcessError as exc:
print("BAZEL ERROR:")
print(exc.stdout)
print(exc.stderr)
sys.exit(exc.returncode)
buildozer_update_cmds = []
groups = json.loads(find_group(bazel_unittests))
failures = []
for group in sorted(groups, key=lambda x: groups_sort_keys[x]):
try:
start = time.time()
sys.stdout.write(f"Query all mongo_unittest_{group}_group unittests... ")
sys.stdout.flush()
query_proc = subprocess.run(
[
bazel_bin,
"query",
rf'kind(extract_debug, attr(tags, "[\[ ]mongo_unittest_{group}_group[,\]]", //src/...))',
]
+ query_opts,
capture_output=True,
text=True,
check=True,
)
sys.stdout.write("{:0.2f}s\n".format(time.time() - start))
group_tests = query_proc.stdout.splitlines()
except subprocess.CalledProcessError as exc:
print("BAZEL ERROR:")
print(exc.stdout)
print(exc.stderr)
sys.exit(exc.returncode)
if groups[group] != group_tests:
for test in group_tests:
if test not in bazel_unittests:
failures.append(
[
test + " tag",
f"{test} not a 'mongo_unittest' but has 'mongo_unittest_{group}_group' tag.",
]
)
print(failures[-1][1])
if fix:
buildozer_update_cmds += [
[f"remove tags mongo_unittest_{group}_group", test]
]
for test in groups[group]:
if test not in group_tests:
failures.append(
[test + " tag", f"{test} missing 'mongo_unittest_{group}_group'"]
)
print(failures[-1][1])
if fix:
buildozer_update_cmds += [[f"add tags mongo_unittest_{group}_group", test]]
for test in group_tests:
if test not in groups[group]:
failures.append(
[
test + " tag",
f"{test} is tagged in the wrong group.",
]
)
print(failures[-1][1])
if fix:
buildozer_update_cmds += [
[f"remove tags mongo_unittest_{group}_group", test]
]
if fix:
for cmd in buildozer_update_cmds:
subprocess.run([buildozer] + cmd)
if failures:
for failure in failures:
if generate_report:
report = make_report(failure[0], failure[1], 1)
try_combine_reports(report)
put_report(report)
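# The parsing below assumes bazel's `query --output=xml` shape, where each rule is a
# <rule name="//pkg:target"> element whose attributes appear as child nodes such as
# <string name="..." value="..."/>, <label name="src" value="..."/> and <list name="srcs">,
# and whose declared outputs appear as <rule-output name="..."/> children.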
def validate_idl_naming(generate_report: bool, fix: bool) -> None:
"""
Enforce:
idl_generator(
name = "<stem>_gen",
src = "<stem>.idl" | ":gen_target" # where gen_target produces exactly one .idl
)
Single `bazel query --output=xml`, parse in-process. Also resolves src labels to generators.
"""
import xml.etree.ElementTree as ET
bazel_bin = install_bazel(".")
qopts = [
"--implicit_deps=False",
"--tool_deps=False",
"--include_aspects=False",
"--bes_backend=",
"--bes_results_url=",
]
# One narrowed query: only rules created by the idl_generator macro
try:
proc = subprocess.run(
[
bazel_bin,
"query",
"attr(generator_function, idl_generator, //src/...)",
"--output=xml",
]
+ qopts,
capture_output=True,
text=True,
check=True,
)
except subprocess.CalledProcessError as exc:
print("BAZEL ERROR (narrowed xml):")
print(exc.stdout)
print(exc.stderr)
sys.exit(exc.returncode)
root = ET.fromstring(proc.stdout)
failures: list[tuple[str, str]] = []
def _val(rule, kind, attr):
n = rule.find(f'./{kind}[@name="{attr}"]')
return n.get("value") if n is not None else None
# Prepass: map rule label -> outputs so we can resolve src labels that generate an .idl
outputs_by_rule: dict[str, list[str]] = {}
for r in root.findall(".//rule"):
rname = r.get("name")
if not rname:
continue
outs = [n.get("name") for n in r.findall("./rule-output") if n.get("name")]
outputs_by_rule[rname] = outs
for rule in root.findall(".//rule"):
# Already narrowed, but keep the sentinel check cheap
if _val(rule, "string", "generator_function") != "idl_generator":
continue
rlabel = rule.get("name") or ""
if not (rlabel.startswith("//") and ":" in rlabel):
failures.append((rlabel or "<unknown>", "Malformed idl_generator rule label"))
continue
pkg, name = rlabel[2:].split(":", 1)
# Resolve src from label/string/srcs list
src_val = _val(rule, "label", "src") or _val(rule, "string", "src")
if not src_val:
srcs_vals = []
for lst in rule.findall('./list[@name="srcs"]'):
srcs_vals += [n.get("value") for n in lst.findall("./label") if n.get("value")]
srcs_vals += [n.get("value") for n in lst.findall("./string") if n.get("value")]
if len(srcs_vals) == 1:
src_val = srcs_vals[0]
else:
failures.append(
(rlabel, f"'src'/'srcs' must have exactly one entry, got: {srcs_vals}")
)
continue
src = src_val.replace("\\", "/")
src_base: str | None = None
if src.startswith("//"):
spkg, sname = src[2:].split(":")
if spkg != pkg:
failures.append((rlabel, f"'src' must be in same package '{pkg}', got '{src}'"))
if sname.endswith(".idl"):
src_base = os.path.basename(sname)
else:
idl_outs = [o for o in outputs_by_rule.get(src, []) if o.endswith(".idl")]
if len(idl_outs) != 1:
failures.append(
(
rlabel,
f"'src' '{src}' must produce exactly one .idl, got: {idl_outs or outputs_by_rule.get(src, [])}",
)
)
continue
src_base = os.path.basename(idl_outs[0].split(":", 1)[1])
elif src.startswith(":"):
sname = src[1:]
if sname.endswith(".idl"):
src_base = os.path.basename(sname)
else:
abs_label = f"//{pkg}:{sname}"
idl_outs = [o for o in outputs_by_rule.get(abs_label, []) if o.endswith(".idl")]
if len(idl_outs) != 1:
failures.append(
(
rlabel,
f"'src' '{src}' must produce exactly one .idl, got: {idl_outs or outputs_by_rule.get(abs_label, [])}",
)
)
continue
src_base = os.path.basename(idl_outs[0].split(":", 1)[1])
else:
if src.startswith("../") or "/../" in src:
failures.append((rlabel, f"'src' must be within package '{pkg}', got '{src}'"))
src_base = os.path.basename(src)
if not (src_base and src_base.endswith(".idl")):
failures.append((rlabel, f"'src' must resolve to a .idl file, got: {src_base or src}"))
continue
if not name.endswith("_gen"):
failures.append((rlabel, "Target name must end with '_gen'"))
stem_from_name = name[:-4] if name.endswith("_gen") else name
stem_from_src = src_base[:-4]
if stem_from_name != stem_from_src:
failures.append(
(
rlabel,
f"Stem mismatch: name '{name}' vs src '{src_base}'. "
f"Expected src basename '{stem_from_name}.idl'.",
)
)
if failures:
for lbl, msg in failures:
print(f"IDL naming violation: {lbl}: {msg}")
if generate_report:
report = make_report(lbl, msg, 1)
try_combine_reports(report)
put_report(report)
if fix and failures:
sys.exit(1)
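# Scan the srcs of every mongo_cc_* macro target (via concurrent "buildozer print label srcs"
# calls) for header files. Concrete header entries can be auto-fixed with --fix, which issues
# buildozer commands of the form (labels illustrative):
#   add private_hdrs //src/mongo/foo:bar.h   //src/mongo/foo:some_target
#   remove srcs //src/mongo/foo:bar.h        //src/mongo/foo:some_target
# Headers that only appear inside select()/glob() expressions are reported but not fixed.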
def validate_private_headers(generate_report: bool, fix: bool) -> None:
"""
Fast header linter/fixer using concurrent buildozer reads:
buildozer print label srcs //<scope>:%<macro>
- Lints if any header appears anywhere in the printed block (including select()/glob()).
- Auto-fixes ONLY concrete items in the first [...] (top-level list).
- Fails the run if a non-concrete header is detected (select()/glob()).
"""
import re
import subprocess
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from shlex import split as shlex_split
# ---- Config ----
HEADER_EXTS = (".h", ".hh", ".hpp", ".hxx")
HEADER_RE = re.compile(r"\.(h|hh|hpp|hxx)\b")
PUBLIC_KEEP = {
"//src/mongo/platform:basic.h",
"//src/mongo/platform:windows_basic.h",
}
SCOPE = "//src/mongo/..." # limit to your subtree
MACRO_SELECTORS = [
"%mongo_cc_library",
"%mongo_cc_binary",
"%mongo_cc_unit_test",
"%mongo_cc_benchmark",
"%mongo_cc_integration_test",
"%mongo_cc_fuzzer_test",
"%mongo_cc_extension_shared_library",
]
SKIP_SUFFIXES = ("_shared_archive", "_hdrs_wrap")
SKIP_PKG_SUBSTR = "/third_party/"
# If True, exit(1) whenever a header is found only via select()/glob()
FAIL_ON_STRUCTURED = True
buildozer = download_buildozer()
def _run_print(selector: str) -> tuple[str, str]:
"""Run one buildozer print invocation; return (selector, stdout)."""
try:
out = subprocess.run(
[buildozer, "print label srcs", f"{SCOPE}:{selector}"],
capture_output=True,
text=True,
check=True,
).stdout
return selector, out
except subprocess.CalledProcessError as exc:
# surface error and keep going (treated as empty output)
print(f"BUILDOZER ERROR (print label srcs) for selector {selector}:", file=sys.stderr)
print(exc.stdout, file=sys.stderr)
print(exc.stderr, file=sys.stderr)
return selector, ""
# 1) Run all macro prints concurrently
outputs: list[str] = []
with ThreadPoolExecutor(max_workers=min(4, max(1, len(MACRO_SELECTORS)))) as ex:
futs = [ex.submit(_run_print, sel) for sel in MACRO_SELECTORS]
for fut in as_completed(futs):
_, stdout = fut.result()
if stdout:
outputs.append(stdout)
if not outputs:
return
combined = "\n".join(outputs)
# 2) Parse into target blocks: start at lines beginning with //src/mongo...
target_line_re = re.compile(r"^//src/mongo/[^:\s\[]+:[^\s\[]+")
lines = combined.splitlines()
blocks: list[tuple[str, list[str]]] = []
cur_target: str | None = None
cur_buf: list[str] = []
def flush():
nonlocal cur_target, cur_buf
if cur_target is not None:
blocks.append((cur_target, cur_buf))
cur_target, cur_buf = None, []
for line in lines:
if target_line_re.match(line):
flush()
cur_target = line.split()[0]
cur_buf = [line]
elif cur_target is not None:
cur_buf.append(line)
flush()
failures: list[tuple[str, str]] = []
fixes: list[tuple[str, str]] = [] # (cmd, target)
structured_fail_found = False # to enforce FAIL_ON_STRUCTURED
def pkg_of(label: str) -> str:
return label[2:].split(":", 1)[0]
def normalize_token(pkg: str, tok: str) -> str | None:
t = tok.strip().strip(",")
if not t:
return None
if t.startswith(("select(", "glob(")):
return None
if t.startswith("//"):
return t
if t.startswith(":"):
return f"//{pkg}:{t[1:]}"
# bare filename/path → pkg-local
if not any(ch in t for ch in " []{}:\t\n"):
return f"//{pkg}:{t}"
return None
for target, buf in blocks:
if target.endswith(SKIP_SUFFIXES) or SKIP_PKG_SUBSTR in target:
continue
text = "\n".join(buf)
# quick lint: any .h* anywhere?
if not HEADER_RE.search(text):
continue
# first [...] only (top-level list)
m = re.search(r"\[(.*?)\]", text, flags=re.DOTALL)
top_tokens: list[str] = []
if m:
inner = m.group(1).replace("\n", " ").strip()
if inner:
try:
top_tokens = shlex_split(inner)
except ValueError:
top_tokens = inner.split()
pkg = pkg_of(target)
concrete_headers: list[str] = []
for tok in top_tokens:
norm = normalize_token(pkg, tok)
if not norm:
continue
if norm in PUBLIC_KEEP:
continue
base = norm.split(":", 1)[1]
if base.endswith(HEADER_EXTS):
concrete_headers.append(norm)
structured_has_hdr = False
if not concrete_headers:
# If there were headers somewhere but none in first [...], we assume select()/glob()
structured_has_hdr = True
if not concrete_headers and not structured_has_hdr:
continue
canon_target = target.replace("_with_debug", "")
parts = []
if concrete_headers:
parts.append(f"concrete headers: {concrete_headers}")
if structured_has_hdr:
parts.append("headers via select()/glob() (not auto-fixed)")
structured_fail_found = True
msg = f"{canon_target} has headers in srcs: " + "; ".join(parts)
print(msg)
failures.append((canon_target, msg))
if fix and concrete_headers:
for h in concrete_headers:
fixes.append((f"add private_hdrs {h}", canon_target))
fixes.append((f"remove srcs {h}", canon_target))
# 3) Apply fixes (dedupe)
if fix and fixes:
seen = set()
for cmd, tgt in fixes:
key = (cmd, tgt)
if key in seen:
continue
seen.add(key)
subprocess.run([buildozer, cmd, tgt])
# 4) CI reports
if failures and generate_report:
for tlabel, msg in failures:
report = make_report(tlabel, msg, 1)
try_combine_reports(report)
put_report(report)
# 5) Exit non-zero on failure:
# - if any violation was found and we are not auto-fixing, or
# - if a header was only reachable via select()/glob() and FAIL_ON_STRUCTURED is set.
if (failures and not fix) or (structured_fail_found and FAIL_ON_STRUCTURED):
sys.exit(1)
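# Run all validators in sequence. --generate-report writes CI-consumable reports via
# buildscripts.simple_report; --fix applies buildozer edits where a check supports
# auto-fixing.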
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--generate-report", default=False, action="store_true")
parser.add_argument("--fix", default=False, action="store_true")
args = parser.parse_args()
validate_clang_tidy_configs(args.generate_report, args.fix)
validate_bazel_groups(args.generate_report, args.fix)
validate_idl_naming(args.generate_report, args.fix)
validate_private_headers(args.generate_report, args.fix)
if __name__ == "__main__":
main()