This commit is contained in:
konstin 2025-09-22 13:38:03 +02:00
parent 4432b2daf0
commit 98628667f0
7 changed files with 2107 additions and 60126 deletions

View File

@ -15,7 +15,7 @@ def main():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("base", type=Path) parser.add_argument("base", type=Path)
parser.add_argument("branch", type=Path) parser.add_argument("branch", type=Path)
parser.add_argument("--project", action="store_true") parser.add_argument("--mode", choices=["compile", "lock", "pyproject-toml"])
parser.add_argument( parser.add_argument(
"--markdown", "--markdown",
action="store_true", action="store_true",
@ -53,21 +53,21 @@ def main():
# also `uv.lock` doesn't exist for failed resolutions # also `uv.lock` doesn't exist for failed resolutions
continue continue
if args.project: if args.mode == "compile":
resolution = package_dir.joinpath("stdout.txt").read_text()
else:
resolution = package_dir.joinpath("uv.lock").read_text() resolution = package_dir.joinpath("uv.lock").read_text()
if package_dir.joinpath("stdout.txt").read_text().strip(): if package_dir.joinpath("stdout.txt").read_text().strip():
raise RuntimeError(f"Stdout not empty (base): {package}") raise RuntimeError(f"Stdout not empty (base): {package}")
else:
resolution = package_dir.joinpath("stdout.txt").read_text()
stderr = package_dir.joinpath("stderr.txt").read_text() stderr = package_dir.joinpath("stderr.txt").read_text()
stderr = redact_time.sub(r"[TIME]", stderr) stderr = redact_time.sub(r"[TIME]", stderr)
if args.project: if args.mode == "compile":
resolution_branch = package_branch.joinpath("stdout.txt").read_text()
else:
resolution_branch = package_branch.joinpath("uv.lock").read_text() resolution_branch = package_branch.joinpath("uv.lock").read_text()
if package_branch.joinpath("stdout.txt").read_text().strip(): if package_branch.joinpath("stdout.txt").read_text().strip():
raise RuntimeError(f"Stdout not empty (branch): {package}") raise RuntimeError(f"Stdout not empty (branch): {package}")
else:
resolution_branch = package_branch.joinpath("stdout.txt").read_text()
stderr_branch = package_branch.joinpath("stderr.txt").read_text() stderr_branch = package_branch.joinpath("stderr.txt").read_text()
stderr_branch = redact_time.sub(r"[TIME]", stderr_branch) stderr_branch = redact_time.sub(r"[TIME]", stderr_branch)
@ -77,19 +77,34 @@ def main():
) )
if args.markdown: if args.markdown:
print("# Ecosystem testing report")
print( print(
f"Dataset: " "## Ecosystem testing report "
f"`{'uv pip compile' if not parameters['project'] else 'uv lock'}` with `--no-build` " f"({args.mode.replace('pyproject-toml', 'pyproject.toml')})"
f"on each of the top 15k PyPI packages on Python {parameters['python']} "
"pinned to the latest package version. "
if parameters["latest"]
else ". "
"A handful of pathological cases were filtered out. "
"Only success resolutions can be compared.\n"
) )
print(f"Successfully resolved packages: {successful}/{total}\n") if args.mode == "pyproject-toml":
print(f"Different packages: {len(differences)}/{total}\n") print(
" * Dataset: A set of top level `pyproject.toml` from GitHub projects popular in 2025. "
+ "Only `pyproject.toml` files with a `[project]` section and static dependencies are included."
)
else:
print(
" * Dataset: The top 15k PyPI packages. A handful of pathological cases were filtered out."
)
print(
" * Command: "
+ f"`{'uv pip compile' if args.mode == 'compile' else 'uv lock'}` with `--no-build` "
+ f"on Python {parameters['python']} "
+ (
"pinned to the latest package version. "
if parameters["latest"]
else ". "
)
)
print(
f" * Successfully resolved packages: {successful}/{total} ({successful / total:.0%}). "
+ "Only success resolutions can be compared."
)
print(f" * Different packages: {len(differences)}/{successful}")
for ( for (
package, package,
@ -98,10 +113,10 @@ def main():
stderr, stderr,
stderr_branch, stderr_branch,
) in differences: ) in differences:
if args.project: if args.mode == "compile":
context_window = 3
else:
context_window = 999999 context_window = 999999
else:
context_window = 3
print(f"\n<details>\n<summary>{package}</summary>\n") print(f"\n<details>\n<summary>{package}</summary>\n")
if resolution != resolution_branch: if resolution != resolution_branch:
print("```diff") print("```diff")
@ -129,7 +144,7 @@ def main():
) )
) )
print("```") print("```")
print("</details>") print("</details>\n")
else: else:
for ( for (
package, package,
@ -159,9 +174,9 @@ def main():
) )
) )
print( print(
f"Successfully resolved packages: {successful}/{total} ({successful}/{total}:.0%)" f"Successfully resolved packages: {successful}/{total} ({successful / total:.0%})"
) )
print(f"Different packages: {len(differences)}/{total}") print(f"Different packages: {len(differences)}/{successful}")
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -3,7 +3,8 @@
# /// script # /// script
# requires-python = ">=3.13" # requires-python = ">=3.13"
# dependencies = [ # dependencies = [
# "tqdm>=4,<5", # "tomli-w>=1.2.0,<2.0.0",
# "tqdm>=4.67.1,<5.0.0",
# ] # ]
# /// # ///
@ -16,11 +17,13 @@ import platform
import shutil import shutil
import subprocess import subprocess
import time import time
import tomllib
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from threading import Thread from threading import Thread
import tomli_w
from tqdm.auto import tqdm from tqdm.auto import tqdm
cwd = Path(__file__).parent cwd = Path(__file__).parent
@ -35,63 +38,36 @@ class Summary:
def run_uv( def run_uv(
package: str,
specification: str,
uv: Path, uv: Path,
project: bool, mode: str,
python: str, python: str,
cache: Path, cache: Path,
offline: bool, offline: bool,
package: str, output: Path,
output_dir: Path,
version: str | None,
) -> Summary: ) -> Summary:
"""Run a uv subprocess. """Resolve in a uv subprocess.
The logic captures the max RSS from the process and avoids deadlocks from full The logic captures the max RSS from the process and avoids deadlocks from full
pipes. pipes.
""" """
package_dir = output.joinpath(package)
package_dir.mkdir()
command = prepare_uv_command(
specification,
uv,
mode,
cache,
offline,
package_dir,
python,
)
start = time.time() start = time.time()
requirement = f"{package}=={version}" if version else package
shared_args = [
"--no-build",
"--cache-dir",
cache,
"--color",
"never",
]
if offline:
shared_args.append("--offline")
package_dir = output_dir.joinpath(package)
package_dir.mkdir(parents=True, exist_ok=True)
if project:
package_dir.joinpath("pyproject.toml").write_text(
f"""
[project]
name = "testing"
version = "0.1.0"
requires-python = ">={python}"
dependencies = ["{requirement}"]
"""
)
cmd = [uv, "lock", *shared_args]
else:
cmd = [
uv,
"pip",
"compile",
"-",
"-p",
python,
# The results are more reproducible if they are platform independent
"--universal",
"--no-header",
"--no-annotate",
*shared_args,
]
process = subprocess.Popen( process = subprocess.Popen(
cmd, command,
cwd=package_dir, cwd=package_dir,
stdin=subprocess.PIPE, stdin=subprocess.PIPE,
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
@ -99,7 +75,7 @@ def run_uv(
text=True, text=True,
) )
stdout, stderr = communicate(process, requirement if not project else None) stdout, stderr = communicate(process, specification if mode == "compile" else None)
# At this point, the process is a zombie, so has called `exit()`, but we haven't reaped it with `wait4` yet. # At this point, the process is a zombie, so has called `exit()`, but we haven't reaped it with `wait4` yet.
@ -122,6 +98,57 @@ def run_uv(
return summary return summary
def prepare_uv_command(
specification: str,
uv: Path,
mode: str,
cache: Path,
offline: bool,
package_dir: Path,
python: str,
) -> list[Path | str]:
shared_args = [
"--no-build",
"--cache-dir",
cache,
"--color",
"never",
]
if offline:
shared_args.append("--offline")
if mode == "pyproject-toml":
package_dir.joinpath("pyproject.toml").write_text(specification)
command = [uv, "lock", *shared_args]
elif mode == "lock":
package_dir.joinpath("pyproject.toml").write_text(
f"""
[project]
name = "testing"
version = "0.1.0"
requires-python = ">={python}"
dependencies = ["{specification}"]
"""
)
command = [uv, "lock", *shared_args]
elif mode == "compile":
command = [
uv,
"pip",
"compile",
"-",
"-p",
python,
# The results are more reproducible if they are platform independent
"--universal",
"--no-header",
"--no-annotate",
*shared_args,
]
else:
raise ValueError(f"Unknown mode: {mode}")
return command
def communicate(process: subprocess.Popen, stdin: str | None) -> tuple[str, str]: def communicate(process: subprocess.Popen, stdin: str | None) -> tuple[str, str]:
"""Like `Popen.communicate`, but without the `os.wait` call. """Like `Popen.communicate`, but without the `os.wait` call.
@ -157,12 +184,18 @@ def communicate(process: subprocess.Popen, stdin: str | None) -> tuple[str, str]
def main(): def main():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument( parser.add_argument(
"--project", "--input", type=Path, default=cwd.joinpath("top-pypi-packages.json")
action="store_true", )
help="Use `uv lock` instead of `uv pip compile`", parser.add_argument(
"--mode",
choices=["compile", "lock", "pyproject-toml"],
default="compile",
help="`compile`: `uv pip compile`, "
"`lock`: `uv lock` from a single requirement"
"`pyproject-toml`: `uv lock` from a directory of `pyproject.toml` files",
) )
parser.add_argument("--python", "-p", type=str, default="3.13") parser.add_argument("--python", "-p", type=str, default="3.13")
parser.add_argument("--output-dir", type=Path, default="output") parser.add_argument("--output", type=Path, default="output")
parser.add_argument("--uv", type=Path, default=Path("uv")) parser.add_argument("--uv", type=Path, default=Path("uv"))
parser.add_argument("--limit", type=int, default=None) parser.add_argument("--limit", type=int, default=None)
parser.add_argument("--cache", type=Path, default=cwd.joinpath("cache")) parser.add_argument("--cache", type=Path, default=cwd.joinpath("cache"))
@ -170,16 +203,62 @@ def main():
parser.add_argument("--latest", action="store_true") parser.add_argument("--latest", action="store_true")
args = parser.parse_args() args = parser.parse_args()
top_15k_pypi = json.loads(cwd.joinpath("top-pypi-packages.json").read_text()) if args.mode == "pyproject-toml":
top_15k_pypi = [pkg["project"] for pkg in top_15k_pypi["rows"]] project_tomls = sorted((file.stem, file) for file in args.input.iterdir())
jobs = {}
no_project = 0
dynamic_dependencies = 0
for package, file in project_tomls:
if len(jobs) >= args.limit:
break
if file.suffix != ".toml":
continue
project_toml = file.read_text()
data = tomllib.loads(project_toml)
project = data.get("project")
if not project:
no_project += 1
continue
if dynamic := project.get("dynamic"):
if "dependencies" in dynamic:
dynamic_dependencies += 1
continue
if "version" in dynamic:
dynamic.remove("version")
# Usually there are no cycles back to the current project, so any version works
project["version"] = "1.0.0"
if args.latest: jobs[package] = tomli_w.dumps(data)
with cwd.joinpath("package_versions.csv").open() as f:
latest_versions = { print(f"`pyproject.toml`s without `[project]`: {no_project}")
row["package_name"]: row["latest_version"] for row in csv.DictReader(f) print(
} f"`pyproject.toml`s with `dynamic = ['dependencies']`: {dynamic_dependencies}"
)
if args.latest:
raise ValueError("Latest versions are not supported in pyproject-toml mode")
else: else:
latest_versions = None project_names = json.loads(args.input.read_text())
project_names = sorted(pkg["project"] for pkg in project_names["rows"])
if args.latest:
with cwd.joinpath("package_versions.csv").open() as f:
latest_versions = {
row["package_name"]: row["latest_version"]
for row in csv.DictReader(f)
}
else:
latest_versions = None
jobs = {}
for package in project_names[: args.limit]:
if latest_versions:
if version := latest_versions.get(package):
jobs[package] = f"{package}=={version}"
else:
tqdm.write(f"Missing version: {package}")
continue
else:
jobs[package] = package
excluded_packages = [ excluded_packages = [
# 5000 releases, no solution # 5000 releases, no solution
@ -188,53 +267,47 @@ def main():
"tf-models-nightly", "tf-models-nightly",
"mtmtrain", "mtmtrain",
"llm-dialog-manager", "llm-dialog-manager",
"python-must",
# Slow and have no solution # Slow and have no solution
"edx-enterprise", "edx-enterprise",
"kcli", "kcli",
"emmet-api", "emmet-api",
] ]
for package in excluded_packages: for package in excluded_packages:
top_15k_pypi.remove(package) jobs.pop(package, None)
if args.output_dir.exists(): if args.output.exists():
shutil.rmtree(args.output_dir) shutil.rmtree(args.output)
args.output_dir.mkdir(parents=True, exist_ok=True) args.output.mkdir(parents=True)
args.output_dir.joinpath(".gitignore").write_text("*") args.output.joinpath(".gitignore").write_text("*")
parameters = { parameters = {
"project": args.project, "mode": args.mode,
"python": args.python, "python": args.python,
"latest": args.latest, "latest": args.latest,
} }
args.output_dir.joinpath("parameters.json").write_text(json.dumps(parameters)) args.output.joinpath("parameters.json").write_text(json.dumps(parameters))
success = 0 success = 0
all_results = [] # Track all results for analysis all_results = [] # Track all results for analysis
max_package_len = max(len(package) for package in top_15k_pypi[: args.limit]) max_package_len = max(len(package) for package in jobs)
with ThreadPoolExecutor(max_workers=os.cpu_count() * 2) as executor: with ThreadPoolExecutor(max_workers=os.cpu_count() * 2) as executor:
tasks = [] tasks = []
packages_pending = [] packages_pending = []
for package in top_15k_pypi[: args.limit]: for package, specification in jobs.items():
if latest_versions:
if version := latest_versions.get(package):
pass
else:
tqdm.write(f"Missing version: {package}")
continue
else:
version = None
packages_pending.append(package) packages_pending.append(package)
tasks.append( tasks.append(
executor.submit( executor.submit(
run_uv, run_uv,
package,
specification,
args.uv, args.uv,
args.project, args.mode,
args.python, args.python,
args.cache, args.cache,
args.offline, args.offline,
package, args.output,
args.output_dir,
version,
) )
) )
total = len(packages_pending) total = len(packages_pending)

View File

@ -0,0 +1,107 @@
# /// script
# requires-python = ">=3.13"
# dependencies = [
# "httpx>=0.28.1,<0.29.0",
# "tqdm>=4.67.1,<5.0.0",
# ]
# ///
import argparse
import asyncio
import csv
import shutil
from dataclasses import dataclass
from pathlib import Path
import httpx
from httpx import AsyncClient
from tqdm.auto import tqdm
@dataclass
class Repository:
    """A GitHub repository plus the git ref its `pyproject.toml` is fetched from."""

    # Owner part of the `org/repo` name
    org: str
    # Repository part of the `org/repo` name
    repo: str
    # Git ref that is inserted into the raw file URL
    ref: str
async def fetch_pyproject(
    client: AsyncClient, repository: Repository, output_dir: Path
):
    """Download the top-level `pyproject.toml` of a repository into `output_dir`.

    The file is first requested at `repository.ref`. If that request fails, the
    `main` branch is tried once as a fallback. On success, the content is stored
    as `<repo>.toml` in `output_dir`.

    :return: `True` if the file was downloaded and written, `None` if both
        requests failed (a message is printed through `tqdm.write`).
    """
    url = f"https://raw.githubusercontent.com/{repository.org}/{repository.repo}/{repository.ref}/pyproject.toml"
    try:
        response = await client.get(url)
        response.raise_for_status()
    except httpx.HTTPError as e:
        # The bigquery data is sometimes missing the master -> main transition
        url = f"https://raw.githubusercontent.com/{repository.org}/{repository.repo}/refs/heads/main/pyproject.toml"
        try:
            response = await client.get(url)
            response.raise_for_status()
        except httpx.HTTPError:
            # Ignore the error from the main fallback if it didn't work,
            # and report the error of the original request instead.
            if hasattr(e, "response") and e.response.status_code == 404:
                tqdm.write(
                    f"Not found: https://github.com/{repository.org}/{repository.repo}"
                )
            else:
                tqdm.write(
                    f"Error for https://github.com/{repository.org}/{repository.repo}: {e}"
                )
            return None

    # TOML is UTF-8 by specification. Relying on the locale encoding can raise
    # `UnicodeEncodeError` for non-ASCII metadata and abort the whole run.
    output_dir.joinpath(f"{repository.repo}.toml").write_text(
        response.text, encoding="utf-8"
    )
    return True
async def main():
    """Fetch the `pyproject.toml` of every repository listed in the input CSV.

    The CSV needs a `repo_name` column (`org/repo`) and a `ref` column; only
    the first row per `repo_name` is used. The output directory is recreated
    from scratch, and at most 50 downloads run concurrently.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", type=Path, default=Path("top500_2025_gh_stars.csv"))
    parser.add_argument("--output", type=Path, default=Path("pyproject_toml"))
    args = parser.parse_args()

    # The csv module requires files to be opened with `newline=""`, otherwise
    # newlines embedded in quoted fields are not interpreted correctly.
    with args.input.open(newline="") as f:
        repositories = []
        seen = set()
        for row in csv.DictReader(f):
            repo_name = row["repo_name"]
            # The same repository can be listed for several refs, keep the first one.
            if repo_name in seen:
                continue
            seen.add(repo_name)
            parts = repo_name.split("/")
            repositories.append(
                Repository(
                    org=parts[0],
                    repo=parts[1],
                    ref=row["ref"],
                )
            )

    if args.output.exists():
        shutil.rmtree(args.output)
    args.output.mkdir(parents=True)
    # Keep the downloaded files out of version control
    args.output.joinpath(".gitignore").write_text("*")

    semaphore = asyncio.Semaphore(50)

    async def fetch_with_semaphore(
        client: AsyncClient, repository: Repository, output_dir: Path
    ):
        async with semaphore:
            return await fetch_pyproject(client, repository, output_dir)

    async with httpx.AsyncClient() as client:
        with tqdm(total=len(repositories)) as pbar:
            tasks = [
                fetch_with_semaphore(client, repository, args.output)
                for repository in repositories
            ]
            results = []
            for future in asyncio.as_completed(tasks):
                results.append(await future)
                pbar.update(1)

    # `fetch_pyproject` returns `True` on success and `None` on failure
    success = sum(1 for result in results if result is True)
    print(f"Successes: {success}/{len(repositories)}")


if __name__ == "__main__":
    asyncio.run(main())

View File

@ -0,0 +1,18 @@
#!/bin/bash
# Run the ecosystem tests in all three modes for two uv binaries and write a
# combined markdown report to `report.md` next to this script.
#
# Usage: <this script> <base uv binary> <branch uv binary>

set -ex

if [ "$#" -lt 2 ]; then
  echo "Usage: $0 <base-uv> <branch-uv>" >&2
  exit 1
fi

script_dir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

limit=50000

# Quote all paths so that directories containing spaces don't get word-split.
uv run "$script_dir/ecosystem_testing.py" --uv "$1" --mode compile --output "$script_dir/base-compile" --limit "$limit"
uv run "$script_dir/ecosystem_testing.py" --uv "$2" --mode compile --output "$script_dir/branch-compile" --limit "$limit"
uv run "$script_dir/ecosystem_testing.py" --uv "$1" --mode lock --output "$script_dir/base-lock" --limit "$limit"
uv run "$script_dir/ecosystem_testing.py" --uv "$2" --mode lock --output "$script_dir/branch-lock" --limit "$limit"
uv run "$script_dir/ecosystem_testing.py" --uv "$1" --mode pyproject-toml --input "$script_dir/pyproject_toml" --output "$script_dir/base-pyproject-toml" --limit "$limit"
uv run "$script_dir/ecosystem_testing.py" --uv "$2" --mode pyproject-toml --input "$script_dir/pyproject_toml" --output "$script_dir/branch-pyproject-toml" --limit "$limit"

# `-f`: On the first run there is no previous report, and a failing `rm` would
# abort the script under `set -e` after all resolutions have already finished.
rm -f "$script_dir/report.md"
uv run "$script_dir/create_report.py" "$script_dir/base-compile" "$script_dir/branch-compile" --mode compile --markdown >> "$script_dir/report.md"
uv run "$script_dir/create_report.py" "$script_dir/base-lock" "$script_dir/branch-lock" --mode lock --markdown >> "$script_dir/report.md"
uv run "$script_dir/create_report.py" "$script_dir/base-pyproject-toml" "$script_dir/branch-pyproject-toml" --mode pyproject-toml --markdown >> "$script_dir/report.md"

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,20 @@
# BigQuery SQL for top5k_pyproject_toml_2025_gh_stars.csv
# Run in https://console.cloud.google.com/bigquery
#
# Lists the 5000 (repo_name, ref) pairs with the most `WatchEvent`s in the
# 2025 githubarchive tables, restricted to refs that have a file whose path is
# exactly `pyproject.toml`. The output columns are `repo_name`, `ref` and `stars`.
#
# NOTE(review): the fetch script's default `--input` is
# `top500_2025_gh_stars.csv`, confirm which file name is the current one.
SELECT
f.repo_name,
f.ref,
COUNT(e.id) AS stars
FROM
`bigquery-public-data.github_repos.files` f
JOIN
# The wildcard matches every monthly table whose name starts with `2025`
`githubarchive.month.2025*` e
ON
f.repo_name = e.repo.name
WHERE
f.path = 'pyproject.toml'
AND e.type = 'WatchEvent'
GROUP BY
f.repo_name, f.ref
ORDER BY
stars DESC
LIMIT 5000;