This commit is contained in:
konstin 2025-09-21 22:18:58 +02:00
parent 835753242b
commit 88681c97b5
1 changed file with 72 additions and 63 deletions

View File

@ -33,10 +33,56 @@ class Summary:
time: float time: float
def run_uv(
    cmd: list[str], package: str, output_dir: Path, version: str | None
) -> Summary:
    """Run a uv subprocess and record its output and resource usage.

    The requirement (`package`, or `package==version` when `version` is set) is
    written to the child's stdin. The pipes are drained by `communicate`, which
    avoids deadlocks from full pipes and does not reap the child, so that
    `os.wait4` can collect the max RSS afterwards.

    `stdout.txt`, `stderr.txt` and `summary.json` are written to
    `output_dir/<package>/`.

    Returns:
        A `Summary` with the decoded exit code (negative signal number if the
        child was killed by a signal), the max RSS as reported by `ru_maxrss`
        (0 where rusage is unavailable), and the elapsed time in seconds.
    """
    # Monotonic clock: elapsed time must not be affected by wall-clock changes.
    start = time.perf_counter()

    process = subprocess.Popen(
        cmd,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )

    stdin = f"{package}=={version}" if version else package
    stdout, stderr = communicate(process, stdin)

    # At this point, the process is a zombie, so has called `exit()`, but we
    # haven't reaped it with `wait4` yet.

    # rusage is only available on unix
    if os.name == "posix":
        # Wait for process and get resource usage
        _pid, status, rusage = os.wait4(process.pid, 0)
        # `wait4` yields an encoded wait status, not an exit code; decode it so
        # the value matches what `Popen.wait()` returns on other platforms.
        exit_code = os.waitstatus_to_exitcode(status)
        # We reaped the child ourselves, so tell the `Popen` object about it
        # instead of leaving it believing the process is still running.
        process.returncode = exit_code
        max_rss = rusage.ru_maxrss
    else:
        exit_code = process.wait()
        max_rss = 0

    package_dir = output_dir.joinpath(package)
    package_dir.mkdir(parents=True, exist_ok=True)
    package_dir.joinpath("stdout.txt").write_text(stdout)
    package_dir.joinpath("stderr.txt").write_text(stderr)
    summary = Summary(
        package=package,
        exit_code=exit_code,
        max_rss=max_rss,
        time=time.perf_counter() - start,
    )
    package_dir.joinpath("summary.json").write_text(json.dumps(summary.__dict__))
    return summary


def communicate(process: subprocess.Popen, stdin: str) -> tuple[str, str]:
"""Like `Popen.communicate`, but without the `os.wait` call.
Start threads to drain the pipes to avoid blocking on full pipes, but don't use
libc's `wait` so we can use `os.wait4` later. libc's `wait` so we can use `os.wait4` later.
""" """
process.stdin.write(stdin) process.stdin.write(stdin)
@ -64,46 +110,6 @@ def communicate(process: subprocess.Popen, stdin: str) -> tuple[str, str]:
return stdout[0], stderr[0] return stdout[0], stderr[0]
def run_uv(
    cmd: list[str], package: str, output_dir: Path, version: str | None
) -> Summary:
    """Run a uv subprocess and record its output and resource usage.

    The requirement (`package`, or `package==version` when `version` is set) is
    written to the child's stdin. The pipes are drained by `communicate`, which
    avoids deadlocks from full pipes and leaves the child unreaped, so that
    `os.wait4` can capture the max RSS afterwards.

    `stdout.txt`, `stderr.txt` and `summary.json` are written to
    `output_dir/<package>/`.

    Returns:
        A `Summary` with the decoded exit code (negative signal number if the
        child was killed by a signal), the max RSS as reported by `ru_maxrss`
        (0 where rusage is unavailable), and the elapsed time in seconds.
    """
    # Monotonic clock: elapsed time must not be affected by wall-clock changes.
    start = time.perf_counter()

    process = subprocess.Popen(
        cmd,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )

    stdin = f"{package}=={version}" if version else package
    stdout, stderr = communicate(process, stdin)

    # At this point, the process is a zombie, so has called `exit()`, but we
    # haven't reaped it with `wait4` yet.

    if os.name == "posix":
        # Wait for process and get resource usage; `wait4` only exists on unix.
        _pid, status, rusage = os.wait4(process.pid, 0)
        # `wait4` yields an encoded wait status (e.g. 256 for exit code 1), so
        # decode it into a real exit code before storing it.
        exit_code = os.waitstatus_to_exitcode(status)
        # We reaped the child ourselves, so tell the `Popen` object about it
        # instead of leaving it believing the process is still running.
        process.returncode = exit_code
        max_rss = rusage.ru_maxrss
    else:
        # No rusage available here; fall back to a plain wait.
        exit_code = process.wait()
        max_rss = 0

    package_dir = output_dir.joinpath(package)
    package_dir.mkdir(parents=True, exist_ok=True)
    package_dir.joinpath("stdout.txt").write_text(stdout)
    package_dir.joinpath("stderr.txt").write_text(stderr)
    summary = Summary(
        package=package,
        exit_code=exit_code,
        max_rss=max_rss,
        time=time.perf_counter() - start,
    )
    package_dir.joinpath("summary.json").write_text(json.dumps(summary.__dict__))
    return summary


def main(): def main():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--python", "-p", type=str, default="3.13") parser.add_argument("--python", "-p", type=str, default="3.13")
@ -119,27 +125,27 @@ def main():
top_15k_pypi = [pkg["project"] for pkg in top_15k_pypi["rows"]] top_15k_pypi = [pkg["project"] for pkg in top_15k_pypi["rows"]]
if args.latest: if args.latest:
latest_versions = cwd.joinpath("package_versions.csv").read_text() with cwd.joinpath("package_versions.csv").open() as f:
latest_versions = { latest_versions = {
row["package_name"]: row["latest_version"] row["package_name"]: row["latest_version"] for row in csv.DictReader(f)
for row in csv.DictReader(latest_versions.splitlines()) }
}
else: else:
latest_versions = None latest_versions = None
# 5000 releases, no solution excluded_packages = [
top_15k_pypi.remove("nucliadb") # 5000 releases, no solution
# Remove slow packages "nucliadb",
for slow in [
# These packages have many non-small versions # These packages have many non-small versions
"tf-models-nightly", "tf-models-nightly",
"mtmtrain", "mtmtrain",
"llm-dialog-manager", "llm-dialog-manager",
"edx-enterprise", # Doesn't solve # Slow and have no solution
"edx-enterprise",
"kcli", "kcli",
"emmet-api", "emmet-api",
]: ]
top_15k_pypi.remove(slow) for package in excluded_packages:
top_15k_pypi.remove(package)
output_dir = cwd.joinpath(args.output_dir) output_dir = cwd.joinpath(args.output_dir)
if output_dir.exists(): if output_dir.exists():
@ -183,8 +189,8 @@ def main():
version = None version = None
packages_pending.append(package) packages_pending.append(package)
tasks.append(executor.submit(run_uv, cmd, package, output_dir, version)) tasks.append(executor.submit(run_uv, cmd, package, output_dir, version))
total = len(packages_pending) total = len(packages_pending)
with tqdm(total=total) as progress_bar: with tqdm(total=total) as progress_bar:
for result in concurrent.futures.as_completed(tasks): for result in concurrent.futures.as_completed(tasks):
summary = result.result() summary = result.result()
@ -199,15 +205,9 @@ def main():
if summary.exit_code == 0: if summary.exit_code == 0:
success += 1 success += 1
print(f"Success: {success}/{total}") print(f"Success: {success}/{total} ({success / total:.0%})")
successes = [summary for summary in all_results if summary.exit_code == 0] successes = [summary for summary in all_results if summary.exit_code == 0]
print("\n# top 5 max RSS for successes")
largest_rss = sorted(successes, key=lambda x: x.max_rss, reverse=True)[:5]
for summary in largest_rss:
print(
f"{summary.package}: {summary.max_rss / 1024:.1f} MB (exit code: {summary.exit_code})"
)
print("\n# top 5 slowest resolutions for successes") print("\n# top 5 slowest resolutions for successes")
slowest = sorted(successes, key=lambda x: x.time, reverse=True)[:5] slowest = sorted(successes, key=lambda x: x.time, reverse=True)[:5]
@ -216,6 +216,15 @@ def main():
f"{summary.package}: {summary.time:.2f}s (exit code: {summary.exit_code})" f"{summary.package}: {summary.time:.2f}s (exit code: {summary.exit_code})"
) )
if os.name == "posix":
print("\n# top 5 max RSS for successes")
largest_rss = sorted(successes, key=lambda x: x.max_rss, reverse=True)[:5]
for summary in largest_rss:
# Only linux, max RSS is in KB
print(
f"{summary.package}: {summary.max_rss / 1024:.1f} MB (exit code: {summary.exit_code})"
)
if __name__ == "__main__": if __name__ == "__main__":
main() main()