This commit is contained in:
konstin 2025-09-22 13:38:03 +02:00
parent 4432b2daf0
commit 98628667f0
7 changed files with 2107 additions and 60126 deletions

View File

@ -15,7 +15,7 @@ def main():
parser = argparse.ArgumentParser()
parser.add_argument("base", type=Path)
parser.add_argument("branch", type=Path)
parser.add_argument("--project", action="store_true")
parser.add_argument("--mode", choices=["compile", "lock", "pyproject-toml"])
parser.add_argument(
"--markdown",
action="store_true",
@ -53,21 +53,21 @@ def main():
# also `uv.lock` doesn't exist for failed resolutions
continue
if args.project:
if args.mode == "compile":
resolution = package_dir.joinpath("stdout.txt").read_text()
else:
resolution = package_dir.joinpath("uv.lock").read_text()
if package_dir.joinpath("stdout.txt").read_text().strip():
raise RuntimeError(f"Stdout not empty (base): {package}")
else:
resolution = package_dir.joinpath("stdout.txt").read_text()
stderr = package_dir.joinpath("stderr.txt").read_text()
stderr = redact_time.sub(r"[TIME]", stderr)
if args.project:
if args.mode == "compile":
resolution_branch = package_branch.joinpath("stdout.txt").read_text()
else:
resolution_branch = package_branch.joinpath("uv.lock").read_text()
if package_branch.joinpath("stdout.txt").read_text().strip():
raise RuntimeError(f"Stdout not empty (branch): {package}")
else:
resolution_branch = package_branch.joinpath("stdout.txt").read_text()
stderr_branch = package_branch.joinpath("stderr.txt").read_text()
stderr_branch = redact_time.sub(r"[TIME]", stderr_branch)
@ -77,19 +77,34 @@ def main():
)
if args.markdown:
print("# Ecosystem testing report")
print(
f"Dataset: "
f"`{'uv pip compile' if not parameters['project'] else 'uv lock'}` with `--no-build` "
f"on each of the top 15k PyPI packages on Python {parameters['python']} "
"pinned to the latest package version. "
if parameters["latest"]
else ". "
"A handful of pathological cases were filtered out. "
"Only success resolutions can be compared.\n"
"## Ecosystem testing report "
f"({args.mode.replace('pyproject-toml', 'pyproject.toml')})"
)
print(f"Successfully resolved packages: {successful}/{total}\n")
print(f"Different packages: {len(differences)}/{total}\n")
if args.mode == "pyproject-toml":
print(
" * Dataset: A set of top level `pyproject.toml` from GitHub projects popular in 2025. "
+ "Only `pyproject.toml` files with a `[project]` section and static dependencies are included."
)
else:
print(
" * Dataset: The top 15k PyPI packages. A handful of pathological cases were filtered out."
)
print(
" * Command: "
+ f"`{'uv pip compile' if args.mode == 'compile' else 'uv lock'}` with `--no-build` "
+ f"on Python {parameters['python']} "
+ (
"pinned to the latest package version. "
if parameters["latest"]
else ". "
)
)
print(
f" * Successfully resolved packages: {successful}/{total} ({successful / total:.0%}). "
+ "Only success resolutions can be compared."
)
print(f" * Different packages: {len(differences)}/{successful}")
for (
package,
@ -98,10 +113,10 @@ def main():
stderr,
stderr_branch,
) in differences:
if args.project:
context_window = 3
else:
if args.mode == "compile":
context_window = 999999
else:
context_window = 3
print(f"\n<details>\n<summary>{package}</summary>\n")
if resolution != resolution_branch:
print("```diff")
@ -129,7 +144,7 @@ def main():
)
)
print("```")
print("</details>")
print("</details>\n")
else:
for (
package,
@ -159,9 +174,9 @@ def main():
)
)
print(
f"Successfully resolved packages: {successful}/{total} ({successful}/{total}:.0%)"
f"Successfully resolved packages: {successful}/{total} ({successful / total:.0%})"
)
print(f"Different packages: {len(differences)}/{total}")
print(f"Different packages: {len(differences)}/{successful}")
if __name__ == "__main__":

View File

@ -3,7 +3,8 @@
# /// script
# requires-python = ">=3.13"
# dependencies = [
# "tqdm>=4,<5",
# "tomli-w>=1.2.0,<2.0.0",
# "tqdm>=4.67.1,<5.0.0",
# ]
# ///
@ -16,11 +17,13 @@ import platform
import shutil
import subprocess
import time
import tomllib
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from threading import Thread
import tomli_w
from tqdm.auto import tqdm
cwd = Path(__file__).parent
@ -35,63 +38,36 @@ class Summary:
def run_uv(
package: str,
specification: str,
uv: Path,
project: bool,
mode: str,
python: str,
cache: Path,
offline: bool,
package: str,
output_dir: Path,
version: str | None,
output: Path,
) -> Summary:
"""Run a uv subprocess.
"""Resolve in a uv subprocess.
The logic captures the max RSS from the process and avoids deadlocks from full
pipes.
"""
package_dir = output.joinpath(package)
package_dir.mkdir()
command = prepare_uv_command(
specification,
uv,
mode,
cache,
offline,
package_dir,
python,
)
start = time.time()
requirement = f"{package}=={version}" if version else package
shared_args = [
"--no-build",
"--cache-dir",
cache,
"--color",
"never",
]
if offline:
shared_args.append("--offline")
package_dir = output_dir.joinpath(package)
package_dir.mkdir(parents=True, exist_ok=True)
if project:
package_dir.joinpath("pyproject.toml").write_text(
f"""
[project]
name = "testing"
version = "0.1.0"
requires-python = ">={python}"
dependencies = ["{requirement}"]
"""
)
cmd = [uv, "lock", *shared_args]
else:
cmd = [
uv,
"pip",
"compile",
"-",
"-p",
python,
# The results are more reproducible if they are platform independent
"--universal",
"--no-header",
"--no-annotate",
*shared_args,
]
process = subprocess.Popen(
cmd,
command,
cwd=package_dir,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
@ -99,7 +75,7 @@ def run_uv(
text=True,
)
stdout, stderr = communicate(process, requirement if not project else None)
stdout, stderr = communicate(process, specification if mode == "compile" else None)
# At this point, the process is a zombie, so has called `exit()`, but we haven't reaped it with `wait4` yet.
@ -122,6 +98,57 @@ def run_uv(
return summary
def prepare_uv_command(
specification: str,
uv: Path,
mode: str,
cache: Path,
offline: bool,
package_dir: Path,
python: str,
) -> list[Path | str]:
shared_args = [
"--no-build",
"--cache-dir",
cache,
"--color",
"never",
]
if offline:
shared_args.append("--offline")
if mode == "pyproject-toml":
package_dir.joinpath("pyproject.toml").write_text(specification)
command = [uv, "lock", *shared_args]
elif mode == "lock":
package_dir.joinpath("pyproject.toml").write_text(
f"""
[project]
name = "testing"
version = "0.1.0"
requires-python = ">={python}"
dependencies = ["{specification}"]
"""
)
command = [uv, "lock", *shared_args]
elif mode == "compile":
command = [
uv,
"pip",
"compile",
"-",
"-p",
python,
# The results are more reproducible if they are platform independent
"--universal",
"--no-header",
"--no-annotate",
*shared_args,
]
else:
raise ValueError(f"Unknown mode: {mode}")
return command
def communicate(process: subprocess.Popen, stdin: str | None) -> tuple[str, str]:
"""Like `Popen.communicate`, but without the `os.wait` call.
@ -157,12 +184,18 @@ def communicate(process: subprocess.Popen, stdin: str | None) -> tuple[str, str]
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--project",
action="store_true",
help="Use `uv lock` instead of `uv pip compile`",
"--input", type=Path, default=cwd.joinpath("top-pypi-packages.json")
)
parser.add_argument(
"--mode",
choices=["compile", "lock", "pyproject-toml"],
default="compile",
help="`compile`: `uv pip compile`, "
"`lock`: `uv lock` from a single requirement"
"`pyproject-toml`: `uv lock` from a directory of `pyproject.toml` files",
)
parser.add_argument("--python", "-p", type=str, default="3.13")
parser.add_argument("--output-dir", type=Path, default="output")
parser.add_argument("--output", type=Path, default="output")
parser.add_argument("--uv", type=Path, default=Path("uv"))
parser.add_argument("--limit", type=int, default=None)
parser.add_argument("--cache", type=Path, default=cwd.joinpath("cache"))
@ -170,16 +203,62 @@ def main():
parser.add_argument("--latest", action="store_true")
args = parser.parse_args()
top_15k_pypi = json.loads(cwd.joinpath("top-pypi-packages.json").read_text())
top_15k_pypi = [pkg["project"] for pkg in top_15k_pypi["rows"]]
if args.mode == "pyproject-toml":
project_tomls = sorted((file.stem, file) for file in args.input.iterdir())
jobs = {}
no_project = 0
dynamic_dependencies = 0
for package, file in project_tomls:
if len(jobs) >= args.limit:
break
if file.suffix != ".toml":
continue
project_toml = file.read_text()
data = tomllib.loads(project_toml)
project = data.get("project")
if not project:
no_project += 1
continue
if dynamic := project.get("dynamic"):
if "dependencies" in dynamic:
dynamic_dependencies += 1
continue
if "version" in dynamic:
dynamic.remove("version")
# Usually there are no cycles back to the current project, so any version works
project["version"] = "1.0.0"
if args.latest:
with cwd.joinpath("package_versions.csv").open() as f:
latest_versions = {
row["package_name"]: row["latest_version"] for row in csv.DictReader(f)
}
jobs[package] = tomli_w.dumps(data)
print(f"`pyproject.toml`s without `[project]`: {no_project}")
print(
f"`pyproject.toml`s with `dynamic = ['dependencies']`: {dynamic_dependencies}"
)
if args.latest:
raise ValueError("Latest versions are not supported in pyproject-toml mode")
else:
latest_versions = None
project_names = json.loads(args.input.read_text())
project_names = sorted(pkg["project"] for pkg in project_names["rows"])
if args.latest:
with cwd.joinpath("package_versions.csv").open() as f:
latest_versions = {
row["package_name"]: row["latest_version"]
for row in csv.DictReader(f)
}
else:
latest_versions = None
jobs = {}
for package in project_names[: args.limit]:
if latest_versions:
if version := latest_versions.get(package):
jobs[package] = f"{package}=={version}"
else:
tqdm.write(f"Missing version: {package}")
continue
else:
jobs[package] = package
excluded_packages = [
# 5000 releases, no solution
@ -188,53 +267,47 @@ def main():
"tf-models-nightly",
"mtmtrain",
"llm-dialog-manager",
"python-must",
# Slow and have no solution
"edx-enterprise",
"kcli",
"emmet-api",
]
for package in excluded_packages:
top_15k_pypi.remove(package)
jobs.pop(package, None)
if args.output_dir.exists():
shutil.rmtree(args.output_dir)
args.output_dir.mkdir(parents=True, exist_ok=True)
args.output_dir.joinpath(".gitignore").write_text("*")
if args.output.exists():
shutil.rmtree(args.output)
args.output.mkdir(parents=True)
args.output.joinpath(".gitignore").write_text("*")
parameters = {
"project": args.project,
"mode": args.mode,
"python": args.python,
"latest": args.latest,
}
args.output_dir.joinpath("parameters.json").write_text(json.dumps(parameters))
args.output.joinpath("parameters.json").write_text(json.dumps(parameters))
success = 0
all_results = [] # Track all results for analysis
max_package_len = max(len(package) for package in top_15k_pypi[: args.limit])
max_package_len = max(len(package) for package in jobs)
with ThreadPoolExecutor(max_workers=os.cpu_count() * 2) as executor:
tasks = []
packages_pending = []
for package in top_15k_pypi[: args.limit]:
if latest_versions:
if version := latest_versions.get(package):
pass
else:
tqdm.write(f"Missing version: {package}")
continue
else:
version = None
for package, specification in jobs.items():
packages_pending.append(package)
tasks.append(
executor.submit(
run_uv,
package,
specification,
args.uv,
args.project,
args.mode,
args.python,
args.cache,
args.offline,
package,
args.output_dir,
version,
args.output,
)
)
total = len(packages_pending)

View File

@ -0,0 +1,107 @@
# /// script
# requires-python = ">=3.13"
# dependencies = [
# "httpx>=0.28.1,<0.29.0",
# "tqdm>=4.67.1,<5.0.0",
# ]
# ///
import argparse
import asyncio
import csv
import shutil
from dataclasses import dataclass
from pathlib import Path
import httpx
from httpx import AsyncClient
from tqdm.auto import tqdm
@dataclass
class Repository:
    """A GitHub repository reference used to locate its top-level `pyproject.toml`."""

    # Repository owner (first path segment of the CSV's `repo_name` column).
    org: str
    # Repository name (second path segment of `repo_name`).
    repo: str
    # Git ref to fetch from, taken from the input CSV's `ref` column.
    ref: str
async def fetch_pyproject(
    client: AsyncClient, repository: Repository, output_dir: Path
):
    """Download a repository's top-level `pyproject.toml` into `output_dir`.

    Tries the ref recorded in the dataset first, then falls back to
    `refs/heads/main` (the BigQuery data sometimes predates a master -> main
    rename). Returns True on success and None on failure.
    """
    url = f"https://raw.githubusercontent.com/{repository.org}/{repository.repo}/{repository.ref}/pyproject.toml"
    try:
        response = await client.get(url)
        response.raise_for_status()
    except httpx.HTTPError as e:
        # The bigquery data is sometimes missing the master -> main transition
        url = f"https://raw.githubusercontent.com/{repository.org}/{repository.repo}/refs/heads/main/pyproject.toml"
        try:
            response = await client.get(url)
            response.raise_for_status()
        except httpx.HTTPError:
            # Report the ORIGINAL error; ignore the error from the main fallback.
            # Only `httpx.HTTPStatusError` carries a `.response` attribute — a
            # transport failure (timeout, DNS error) does not — so check the
            # type explicitly instead of `hasattr(e, "response")`.
            if isinstance(e, httpx.HTTPStatusError) and e.response.status_code == 404:
                tqdm.write(
                    f"Not found: https://github.com/{repository.org}/{repository.repo}"
                )
            else:
                tqdm.write(
                    f"Error for https://github.com/{repository.org}/{repository.repo}: {e}"
                )
            return None
    output_dir.joinpath(f"{repository.repo}.toml").write_text(response.text)
    return True
async def main():
    """Fetch `pyproject.toml` files for every repository in the input CSV."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", type=Path, default=Path("top500_2025_gh_stars.csv"))
    parser.add_argument("--output", type=Path, default=Path("pyproject_toml"))
    args = parser.parse_args()

    # Collect repositories, deduplicating by full name while keeping CSV order.
    repositories: list[Repository] = []
    seen: set[str] = set()
    with args.input.open() as f:
        for row in csv.DictReader(f):
            name = row["repo_name"]
            if name in seen:
                continue
            seen.add(name)
            parts = name.split("/")
            repositories.append(
                Repository(org=parts[0], repo=parts[1], ref=row["ref"])
            )

    # Start from a clean, git-ignored output directory.
    if args.output.exists():
        shutil.rmtree(args.output)
    args.output.mkdir(parents=True)
    args.output.joinpath(".gitignore").write_text("*")

    # Bound the number of concurrent downloads.
    semaphore = asyncio.Semaphore(50)

    async def fetch_with_semaphore(
        client: AsyncClient, repository: Repository, output_dir: Path
    ):
        async with semaphore:
            return await fetch_pyproject(client, repository, output_dir)

    async with httpx.AsyncClient() as client:
        with tqdm(total=len(repositories)) as pbar:
            tasks = [
                fetch_with_semaphore(client, repository, args.output)
                for repository in repositories
            ]
            results = []
            for future in asyncio.as_completed(tasks):
                results.append(await future)
                pbar.update(1)

    success = sum(1 for result in results if result is True)
    print(f"Successes: {success}/{len(repositories)}")


if __name__ == "__main__":
    asyncio.run(main())

View File

@ -0,0 +1,18 @@
#!/bin/bash
# Compare two uv binaries ($1: base, $2: branch) across all three resolution
# modes, then assemble a combined markdown report next to this script.
set -ex

script_dir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
limit=50000

uv run "$script_dir/ecosystem_testing.py" --uv "$1" --mode compile --output "$script_dir/base-compile" --limit $limit
uv run "$script_dir/ecosystem_testing.py" --uv "$2" --mode compile --output "$script_dir/branch-compile" --limit $limit
uv run "$script_dir/ecosystem_testing.py" --uv "$1" --mode lock --output "$script_dir/base-lock" --limit $limit
uv run "$script_dir/ecosystem_testing.py" --uv "$2" --mode lock --output "$script_dir/branch-lock" --limit $limit
uv run "$script_dir/ecosystem_testing.py" --uv "$1" --mode pyproject-toml --input "$script_dir/pyproject_toml" --output "$script_dir/base-pyproject-toml" --limit $limit
uv run "$script_dir/ecosystem_testing.py" --uv "$2" --mode pyproject-toml --input "$script_dir/pyproject_toml" --output "$script_dir/branch-pyproject-toml" --limit $limit

# -f: a plain `rm` aborts the script under `set -e` when no report exists yet
# (e.g. on the first run).
rm -f "$script_dir/report.md"
uv run "$script_dir/create_report.py" "$script_dir/base-compile" "$script_dir/branch-compile" --mode compile --markdown >> "$script_dir/report.md"
uv run "$script_dir/create_report.py" "$script_dir/base-lock" "$script_dir/branch-lock" --mode lock --markdown >> "$script_dir/report.md"
uv run "$script_dir/create_report.py" "$script_dir/base-pyproject-toml" "$script_dir/branch-pyproject-toml" --mode pyproject-toml --markdown >> "$script_dir/report.md"

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,20 @@
# BigQuery SQL for top5k_pyproject_toml_2025_gh_stars.csv
# Run in https://console.cloud.google.com/bigquery
SELECT
f.repo_name,
f.ref,
COUNT(e.id) AS stars
FROM
`bigquery-public-data.github_repos.files` f
JOIN
`githubarchive.month.2025*` e
ON
f.repo_name = e.repo.name
WHERE
f.path = 'pyproject.toml'
AND e.type = 'WatchEvent'
GROUP BY
f.repo_name, f.ref
ORDER BY
stars DESC
LIMIT 5000;