konstin 2025-09-21 11:33:27 +02:00
parent fc7c2f8b50
commit 34f00f8945
5 changed files with 90335 additions and 0 deletions


@@ -0,0 +1,244 @@
#!/usr/bin/env python3
# NB: LLM code ahead
# /// script
# requires-python = ">=3.14"
# dependencies = [
#     "psutil",
#     "tqdm",
# ]
# ///
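# This is a PEP 723 inline-metadata script: `uv run` on this file creates an
# environment with the declared dependencies and executes it, no manual venv needed.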
import argparse
import concurrent.futures
import csv
import json
import os
import shutil
import signal
import subprocess
import time
from collections import deque
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from threading import Thread

from tqdm.auto import tqdm

cwd = Path(__file__).parent

# Global flag to track Ctrl+C
should_stop = False


def signal_handler(signum, frame):
    """Handle Ctrl+C gracefully."""
    global should_stop
    print("\nReceived interrupt signal. Stopping gracefully...")
    should_stop = True
    # Send SIGTERM to all processes in our process group; this reaches the
    # spawned uv subprocesses as well as this script itself.
    try:
        os.killpg(0, signal.SIGTERM)
    except ProcessLookupError:
        pass


@dataclass
class Summary:
    package: str
    exit_code: int
    max_rss: int  # peak RSS from ru_maxrss (KiB on Linux)
    time: float


def run_uv(
    cmd: list[str], package: str, output_dir: Path, version: str | None
) -> Summary:
    start = time.time()
    process = subprocess.Popen(
        cmd,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    # Feed the requirement on stdin (`uv pip compile -` reads from stdin)
    process.stdin.write(f"{package}=={version}" if version else package)
    process.stdin.close()

    # Use thread-safe deques to collect output from threads
    stdout_lines = deque()
    stderr_lines = deque()

    def read_stdout():
        for line in iter(process.stdout.readline, ""):
            stdout_lines.append(line)
        process.stdout.close()

    def read_stderr():
        for line in iter(process.stderr.readline, ""):
            stderr_lines.append(line)
        process.stderr.close()

    # Start threads to drain the pipes so the child never blocks on a full pipe
    stdout_thread = Thread(target=read_stdout, daemon=True)
    stderr_thread = Thread(target=read_stderr, daemon=True)
    stdout_thread.start()
    stderr_thread.start()

    # Reap the child and capture its resource usage in one call. os.wait4
    # returns (pid, wait status, rusage); the raw status must be decoded
    # before it can be compared against regular exit codes.
    _pid, status, rusage = os.wait4(process.pid, 0)
    exit_code = os.waitstatus_to_exitcode(status)

    # Wait for threads to finish reading
    stdout_thread.join()
    stderr_thread.join()

    stdout = "".join(stdout_lines)
    stderr = "".join(stderr_lines)
    max_rss = rusage.ru_maxrss

    package_dir = output_dir.joinpath(package)
    package_dir.mkdir(parents=True, exist_ok=True)
    package_dir.joinpath("stdout.txt").write_text(stdout)
    package_dir.joinpath("stderr.txt").write_text(stderr)
    summary = Summary(
        package=package, exit_code=exit_code, max_rss=max_rss, time=time.time() - start
    )
    # if max_rss > 1000 * 1024:
    #     print(f"{package} exit code:{exit_code}, {max_rss / 1024:.0f} MB")
    package_dir.joinpath("summary.json").write_text(json.dumps(summary.__dict__))
    return summary


def main():
    # Register signal handler for Ctrl+C
    signal.signal(signal.SIGINT, signal_handler)

    parser = argparse.ArgumentParser()
    parser.add_argument("--python", type=str, default="3.13")
    parser.add_argument("--output-dir", type=Path, default="output")
    parser.add_argument("--uv", type=Path, default=Path("uv"))
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument("--cache", type=Path, default=cwd.joinpath("cache"))
    parser.add_argument("--offline", action="store_true")
    parser.add_argument("--latest", action="store_true")
    args = parser.parse_args()

    top_15k_pypi = json.loads(cwd.joinpath("top-pypi-packages.json").read_text())
    top_15k_pypi = [pkg["project"] for pkg in top_15k_pypi["rows"]]

    if args.latest:
        latest_versions = cwd.joinpath("package_versions.csv").read_text()
        latest_versions = {
            row["package_name"]: row["latest_version"]
            for row in csv.DictReader(latest_versions.splitlines())
        }
    else:
        latest_versions = None

    # nucliadb: 5000 releases, no solution
    top_15k_pypi.remove("nucliadb")
    # Remove slow packages
    for slow in [
        # These packages have many non-small versions
        "tf-models-nightly",
        "mtmtrain",
        "llm-dialog-manager",
        "edx-enterprise",  # Doesn't solve
        "kcli",
        "emmet-api",
    ]:
        top_15k_pypi.remove(slow)

    output_dir = cwd.joinpath(args.output_dir)
    if output_dir.exists():
        shutil.rmtree(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Ignore everything in the output directory for version control
    output_dir.joinpath(".gitignore").write_text("*")

    cmd = [
        args.uv,
        "pip",
        "compile",
        "-p",
        args.python,
        "-",
        "--no-build",
        "--cache-dir",
        args.cache,
        "--color",
        "never",
        "--no-header",
        "--no-annotate",
    ]
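    # With the defaults this amounts to (illustrative):
    #   uv pip compile -p 3.13 - --no-build --cache-dir <cache> \
    #     --color never --no-header --no-annotate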
    if args.offline:
        cmd.append("--offline")

    success = 0
    all_results = []  # Track all results for analysis
    interrupted = False
    max_package_len = max(len(package) for package in top_15k_pypi[: args.limit])

    try:
        with ThreadPoolExecutor(max_workers=os.cpu_count() * 2) as executor:
            tasks = []
            packages_pending = []
            for package in top_15k_pypi[: args.limit]:
                if latest_versions:
                    version = latest_versions.get(package)
                    if not version:
                        tqdm.write(f"Missing version: {package}")
                        continue
                else:
                    version = None
                tasks.append(executor.submit(run_uv, cmd, package, output_dir, version))
                packages_pending.append(package)

            progress_bar = tqdm(total=len(packages_pending))
            for result in concurrent.futures.as_completed(tasks):
                summary = result.result()
                all_results.append(summary)  # Collect all results
                progress_bar.update(1)
                packages_pending.remove(summary.package)
                if len(packages_pending) > 0:
                    # Show one still-pending package, right-padded for a stable layout
                    progress_bar.set_postfix_str(
                        f"{packages_pending[0]:>{max_package_len}}"
                    )
                if summary.exit_code == 0:
                    success += 1
            progress_bar.close()
    except KeyboardInterrupt:
        print("\nInterrupted. Cleaning up...")
        interrupted = True

    if interrupted or should_stop:
        print(f"Interrupted. Success: {success}/{len(all_results)} (completed tasks)")
    else:
        print(f"Success: {success}/{len(top_15k_pypi[: args.limit])}")

    successes = [summary for summary in all_results if summary.exit_code == 0]

    print("\ntop 5 max RSS")
    largest_rss = sorted(successes, key=lambda x: x.max_rss, reverse=True)[:5]
    for summary in largest_rss:
        print(
            f"{summary.package}: {summary.max_rss / 1024:.1f} MB (exit code: {summary.exit_code})"
        )

    print("\ntop 5 slowest resolutions")
    slowest = sorted(successes, key=lambda x: x.time, reverse=True)[:5]
    for summary in slowest:
        print(
            f"{summary.package}: {summary.time:.2f}s (exit code: {summary.exit_code})"
        )


if __name__ == "__main__":
    main()
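# Example invocation (illustrative; the file name depends on the checkout):
#   uv run <this script> --latest --limit 1000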


@@ -0,0 +1,66 @@
#!/usr/bin/env python3
# NB: LLM code ahead
# /// script
# requires-python = ">=3.13"
# dependencies = ["httpx", "orjson", "tqdm"]
# ///
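# Fetches the latest version of every package in the top-PyPI list and writes
# package_versions.csv, which the resolver script above consumes via --latest.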
import asyncio
import csv
from pathlib import Path

import httpx
import orjson
from tqdm.asyncio import tqdm


async def get_latest_version(
    client: httpx.AsyncClient, package_name: str
) -> tuple[str, str | None]:
    """Return (package_name, latest_version); the version is None on failure."""
    try:
        response = await client.get(f"https://pypi.org/pypi/{package_name}/json")
        if response.status_code == 200:
            data = orjson.loads(response.content)
            return package_name, data["info"]["version"]
        else:
            return package_name, None
    except Exception:
        # Treat network errors and timeouts the same as a missing package.
        return package_name, None


async def main() -> None:
    input_file = Path("scripts/ecosystem-testing/top-pypi-packages.csv")

    # Read package names
    with open(input_file) as f:
        package_names: list[str] = [row["project"] for row in csv.DictReader(f)]
    print(f"Processing {len(package_names)} packages...")

    # Fetch versions concurrently, capped at 50 in-flight requests to PyPI
    results: dict[str, str | None] = {}
    async with httpx.AsyncClient() as client:
        semaphore = asyncio.Semaphore(50)

        async def fetch(pkg: str) -> tuple[str, str | None]:
            async with semaphore:
                return await get_latest_version(client, pkg)

        tasks = [fetch(pkg) for pkg in package_names]
        for future in tqdm(asyncio.as_completed(tasks), total=len(package_names)):
            name, version = await future
            results[name] = version

    # Write results
    with open("package_versions.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["package_name", "latest_version"])
        for name in package_names:
            writer.writerow([name, results.get(name, "")])

    success_count = sum(1 for v in results.values() if v)
    print(f"Completed: {success_count}/{len(package_names)} successful")


if __name__ == "__main__":
    asyncio.run(main())
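# Note: the output lands in the current working directory, while the resolver
# script above looks for package_versions.csv next to its own file, so run
# this script (or move the file) accordingly.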

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large