uv/scripts/ecosystem-testing/fetch_pyproject_toml.py

108 lines
3.2 KiB
Python

# /// script
# requires-python = ">=3.13"
# dependencies = [
# "httpx>=0.28.1,<0.29.0",
# "tqdm>=4.67.1,<5.0.0",
# ]
# ///
import argparse
import asyncio
import csv
import shutil
from dataclasses import dataclass
from pathlib import Path
import httpx
from httpx import AsyncClient
from tqdm.auto import tqdm
@dataclass
class Repository:
    """A GitHub repository pinned to a git ref.

    The three fields are interpolated, in this order, into the
    ``raw.githubusercontent.com`` URL used to download ``pyproject.toml``.
    """

    # Owner part of the ``org/repo`` name.
    org: str
    # Repository part of the ``org/repo`` name.
    repo: str
    # Git ref (taken from the ``ref`` column of the input CSV).
    ref: str
async def fetch_pyproject(
    client: AsyncClient, repository: Repository, output_dir: Path
) -> bool | None:
    """Download a repository's ``pyproject.toml`` into ``output_dir``.

    The file is first requested at ``repository.ref``. If that request fails
    with any ``httpx.HTTPError``, a second attempt is made against
    ``refs/heads/main``.

    Args:
        client: Shared HTTP client used for both requests.
        repository: Repository (org, repo, ref) to fetch from.
        output_dir: Existing directory; the file is written as ``<repo>.toml``.

    Returns:
        ``True`` when the file was downloaded and written, ``None`` when both
        attempts failed (a message is emitted through ``tqdm.write``).
    """
    raw_base = (
        f"https://raw.githubusercontent.com/{repository.org}/{repository.repo}"
    )
    try:
        response = await client.get(f"{raw_base}/{repository.ref}/pyproject.toml")
        response.raise_for_status()
    except httpx.HTTPError as e:
        # The bigquery data is sometimes missing the master -> main transition
        try:
            response = await client.get(f"{raw_base}/refs/heads/main/pyproject.toml")
            response.raise_for_status()
        except httpx.HTTPError:
            # Ignore the error from the main fallback if it didn't work and
            # report the error of the first attempt instead.
            # Only HTTPStatusError carries a response (and thus a status code).
            if isinstance(e, httpx.HTTPStatusError) and e.response.status_code == 404:
                tqdm.write(
                    f"Not found: https://github.com/{repository.org}/{repository.repo}"
                )
            else:
                tqdm.write(
                    f"Error for https://github.com/{repository.org}/{repository.repo}: {e}"
                )
            return None

    # NOTE(review): the file name only uses the repo name, so two repositories
    # with the same name under different orgs overwrite each other — confirm
    # whether that is acceptable for consumers of the output directory.
    # TOML files are UTF-8; write them as such regardless of the platform's
    # default locale encoding.
    output_dir.joinpath(f"{repository.repo}.toml").write_text(
        response.text, encoding="utf-8"
    )
    return True
async def main() -> None:
    """Fetch ``pyproject.toml`` for every repository listed in the input CSV.

    Command-line options:
        --input: CSV file with a ``repo_name`` column (``org/repo``) and a
            ``ref`` column. Rows repeating an already seen ``repo_name`` are
            skipped.
        --output: Directory receiving the downloaded files. It is deleted and
            recreated on every run, and gets a ``.gitignore`` ignoring
            everything in it.

    Prints the number of successful downloads at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", type=Path, default=Path("top500_2025_gh_stars.csv"))
    parser.add_argument("--output", type=Path, default=Path("pyproject_toml"))
    args = parser.parse_args()

    repositories = []
    seen = set()
    # The csv module requires files to be opened with newline="" so that it
    # can handle newlines embedded in fields itself.
    with args.input.open(newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            repo_name = row["repo_name"]
            if repo_name in seen:
                continue
            seen.add(repo_name)
            # ``repo_name`` has the shape ``org/repo``; split it only once.
            parts = repo_name.split("/")
            repositories.append(
                Repository(org=parts[0], repo=parts[1], ref=row["ref"])
            )

    # Start from an empty output directory on every run.
    if args.output.exists():
        shutil.rmtree(args.output)
    args.output.mkdir(parents=True)
    args.output.joinpath(".gitignore").write_text("*")

    # At most 50 fetches are in flight at the same time.
    semaphore = asyncio.Semaphore(50)

    async def fetch_with_semaphore(
        client: AsyncClient, repository: Repository, output_dir: Path
    ):
        """Run ``fetch_pyproject`` while holding a slot of the semaphore."""
        async with semaphore:
            return await fetch_pyproject(client, repository, output_dir)

    async with httpx.AsyncClient() as client:
        with tqdm(total=len(repositories)) as pbar:
            tasks = [
                fetch_with_semaphore(client, repository, args.output)
                for repository in repositories
            ]
            results = []
            # Advance the progress bar as each fetch finishes, in completion
            # order rather than submission order.
            for future in asyncio.as_completed(tasks):
                results.append(await future)
                pbar.update(1)

    success = sum(1 for result in results if result is True)
    print(f"Successes: {success}/{len(repositories)}")
# Script entry point: drive the async ``main`` coroutine to completion when
# the file is executed directly (not when it is imported).
if __name__ == "__main__":
    asyncio.run(main())