# /// script
# requires-python = ">=3.13"
# dependencies = [
#     "httpx",
# ]
# ///
"""
Use the GitHub Code Search API to find instances of `uvx <package>` in:

- README files (*.md)
- Shell scripts (*.sh, *.bash, *.zsh)

Requirements:
- A GitHub Personal Access Token (PAT) with `public_repo` scope
- Set the GITHUB_TOKEN environment variable or pass --token

Usage:
    python scripts/uvx_usage_on_gh/fetch_uvx_usage.py --output crates/uv/src/commands/tool/top_packages.txt
"""

import argparse
import asyncio
import logging
import os
import re
import sys
import time
from collections import Counter
from pathlib import Path
from typing import Any, NamedTuple, Optional

import httpx

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# GitHub API configuration
GITHUB_API_BASE = "https://api.github.com"
CODE_SEARCH_ENDPOINT = f"{GITHUB_API_BASE}/search/code"
# Rate limiting configuration
RATE_LIMIT_DELAY = 6.1 # seconds between requests (slightly more than 60/10)
# GitHub Code Search API limits
GITHUB_CODE_SEARCH_MAX_RESULTS = 1000 # Hard limit: only first 1000 results accessible
GITHUB_CODE_SEARCH_MAX_PAGE = 10 # Page 10 = results 901-1000, page 11+ returns 422
# Retry configuration
MAX_RETRIES = 5
INITIAL_RETRY_DELAY = 10 # seconds
MAX_RETRY_DELAY = 300 # 5 minutes max delay
# PyPI check concurrency
PYPI_CONCURRENT_CHECKS = 20 # Number of concurrent PyPI checks
# PyPI API endpoint
PYPI_JSON_API_TEMPLATE = "https://pypi.org/pypi/{package}/json"
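# check_pypi_package_exists() below treats an HTTP 200 from this endpoint as
# "exists on PyPI" and anything else (typically a 404) as "not found".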


class RateLimitInfo(NamedTuple):
    remaining: int | None
    reset_time: int | None


class GitHubSearchResponse(NamedTuple):
    items: list[dict[str, Any]]
    total_count: int
    rate_limit: RateLimitInfo


# Regex patterns for extracting package names
PACKAGE_PATTERN_FROM = re.compile(
    r"\buvx\s+(?:--\w+(?:\s+\S+)?\s+)*--from\s+([a-z0-9](?:[a-z0-9._-]*[a-z0-9])?)(?:@\S+)?",
    re.IGNORECASE,
)
PACKAGE_PATTERN_NORMAL = re.compile(
    r"\buvx\s+(?:--\w+(?:\s+\S+)?\s+)+([a-z0-9](?:[a-z0-9._-]*[a-z0-9])?)(?:@\S+)?",
    re.IGNORECASE,
)
PACKAGE_PATTERN_SIMPLE = re.compile(
    r"\buvx\s+([a-z0-9](?:[a-z0-9._-]*[a-z0-9])?)(?:@\S+)?",
    re.IGNORECASE,
)
URL_PATTERN = re.compile(
    r"\buvx\s+(?:--\w+(?:\s+\S+)?\s+)*--from\s+(git\+[a-z]+://|git://|https?://)",
    re.IGNORECASE,
)
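# How the patterns resolve for typical matches:
#   "uvx ruff check ."                -> PACKAGE_PATTERN_SIMPLE captures "ruff"
#   "uvx --python 3.12 textual-demo"  -> PACKAGE_PATTERN_NORMAL captures "textual-demo"
#   "uvx --from httpie http"          -> PACKAGE_PATTERN_FROM captures "httpie"
#   "uvx --from git+https://..."      -> URL_PATTERN matches, so the hit is skipped

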
def extract_package_name(match_text: str) -> Optional[str]:
    """
    Extract package name from a match.

    Handles patterns like:
    - uvx ruff
    - uvx --from httpie http (extracts "httpie")
    - uvx --python 3.12 textual-demo
    - uvx black@latest
    - uvx pytest --version
    - uvx streamlit run streamlit_app/dashboard.py

    Skips patterns like:
    - uvx --from git+https://... (URLs are not package names)
    - uvx --from http://... (URLs are not package names)
    """
    # Skip URLs after --from
    if URL_PATTERN.search(match_text):
        return None

    # Try patterns in order: --from, flags, simple
    match = (
        PACKAGE_PATTERN_FROM.search(match_text)
        or PACKAGE_PATTERN_NORMAL.search(match_text)
        or PACKAGE_PATTERN_SIMPLE.search(match_text)
    )
    if not match:
        return None

    package = match.group(1).lower()

    # Remove version specifiers (e.g., @latest, @1.0.0)
    if "@" in package:
        package = package.split("@")[0]

    # Validation checks
    if (
        package.startswith("--")
        or "/" in package
        or "\\" in package
        or len(package) < 2
    ):
        return None

    return package


def _calculate_retry_delay(
    status_code: int,
    retry_count: int,
    response_headers: httpx.Headers,
) -> int:
    """Calculate delay for retry based on status code and headers."""
    if status_code in (403, 429):
        # Try Retry-After header first
        retry_after = response_headers.get("Retry-After")
        if retry_after:
            try:
                return int(retry_after) + 2  # Add 2 second buffer
            except ValueError:
                pass
        # Fall back to X-RateLimit-Reset
        reset_time_str = response_headers.get("X-RateLimit-Reset")
        if reset_time_str:
            try:
                reset_time = int(reset_time_str)
                current_time = int(time.time())
                return max(reset_time - current_time + 2, 10)  # At least 10 seconds
            except ValueError:
                pass
    # Default: exponential backoff
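    # With INITIAL_RETRY_DELAY=10 and MAX_RETRY_DELAY=300 this yields
    # 10s, 20s, 40s, 80s, and 160s across the five permitted retries.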
    return min(INITIAL_RETRY_DELAY * (2**retry_count), MAX_RETRY_DELAY)


def search_github_code(
    query: str,
    token: str,
    page: int,
    per_page: int = 100,
    retry_count: int = 0,
) -> GitHubSearchResponse:
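    """
    Run a single GitHub Code Search request and return its items, total count,
    and rate-limit information.

    Retries with backoff on 403/429 rate limits, transient 422s, and network
    errors, up to MAX_RETRIES attempts. The call is synchronous, so async
    callers block while it runs.
    """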
    headers = {
        "Accept": "application/vnd.github.text-match+json",
        "Authorization": f"Bearer {token}",
    }
    params = {
        "q": query,
        "page": page,
        "per_page": min(per_page, 100),
    }
    logger.info(f"Searching GitHub: {query} (page {page}, attempt {retry_count + 1})")
    try:
        response = httpx.get(
            CODE_SEARCH_ENDPOINT,
            headers=headers,
            params=params,
            timeout=30.0,
        )
        response.raise_for_status()

        # Extract rate limit info
        remaining_str = response.headers.get("X-RateLimit-Remaining")
        reset_time_str = response.headers.get("X-RateLimit-Reset")
        rate_limit = RateLimitInfo(
            remaining=int(remaining_str) if remaining_str else None,
            reset_time=int(reset_time_str) if reset_time_str else None,
        )
        logger.debug(
            f"Rate limit remaining: {rate_limit.remaining}, reset at: {rate_limit.reset_time}"
        )

        data = response.json()
        total_count = data.get("total_count", 0)
        logger.info(f"Total result count: {total_count}")
        return GitHubSearchResponse(
            items=data.get("items", []),
            total_count=total_count,
            rate_limit=rate_limit,
        )
    except httpx.HTTPStatusError as e:
        status_code = e.response.status_code

        # 422 on page 11+ is likely the hard 1000 result limit
        if status_code == 422 and page > GITHUB_CODE_SEARCH_MAX_PAGE:
            logger.info(
                f"422 error on page {page} - likely hit GitHub's 1000 result limit. "
                f"Code Search API only returns first {GITHUB_CODE_SEARCH_MAX_RESULTS} results."
            )
            raise ValueError(
                f"Reached GitHub Code Search API limit (page {page} > {GITHUB_CODE_SEARCH_MAX_PAGE})"
            ) from e

        # Retryable errors
        if status_code in (403, 422, 429) and retry_count < MAX_RETRIES:
            delay = _calculate_retry_delay(status_code, retry_count, e.response.headers)
            if status_code == 403:
                logger.warning(
                    f"Rate limit exceeded (403). Retrying in {delay}s "
                    f"(attempt {retry_count + 1}/{MAX_RETRIES})"
                )
            elif status_code == 429:
                logger.warning(
                    f"Rate limit exceeded (429). Retrying in {delay}s "
                    f"(attempt {retry_count + 1}/{MAX_RETRIES})"
                )
            elif status_code == 422:
                logger.warning(
                    f"Validation error (422) - may be transient. Retrying in {delay}s "
                    f"(attempt {retry_count + 1}/{MAX_RETRIES})"
                )
            time.sleep(delay)
            return search_github_code(query, token, page, per_page, retry_count + 1)

        # Non-retryable or max retries reached
        if status_code == 403:
            logger.error(
                "Rate limit exceeded or authentication failed after retries. "
                "Check your token and wait before retrying."
            )
        elif status_code == 422:
            logger.error(f"Invalid query after retries: {query}")
        else:
            logger.error(f"HTTP error {status_code} after retries")
        # Re-raise so the caller can give up on this query; otherwise the function
        # would implicitly return None instead of a GitHubSearchResponse.
        raise
    except httpx.RequestError as e:
        # Network errors are retryable
        if retry_count < MAX_RETRIES:
            delay = min(INITIAL_RETRY_DELAY * (2**retry_count), MAX_RETRY_DELAY)
            logger.warning(
                f"Request failed: {e}. Retrying in {delay}s "
                f"(attempt {retry_count + 1}/{MAX_RETRIES})"
            )
            time.sleep(delay)
            return search_github_code(query, token, page, per_page, retry_count + 1)
        logger.error(f"Request failed after retries: {e}")
        raise


async def wait_for_rate_limit(rate_limit: RateLimitInfo) -> None:
    """
    Wait if we're approaching the rate limit or need to wait until reset.

    Args:
        rate_limit: Rate limit information from the previous request
    """
    if rate_limit.remaining is None or rate_limit.reset_time is None:
        await asyncio.sleep(RATE_LIMIT_DELAY)
        return

    # If running low on requests, wait until reset
    if rate_limit.remaining <= 2:
        wait_time = rate_limit.reset_time - int(time.time()) + 2  # Add 2 second buffer
        if wait_time > 0:
            logger.info(
                f"Rate limit low ({rate_limit.remaining} remaining). "
                f"Waiting {wait_time}s until reset at {rate_limit.reset_time}"
            )
            await asyncio.sleep(wait_time)
        else:
            await asyncio.sleep(RATE_LIMIT_DELAY)
    else:
        await asyncio.sleep(RATE_LIMIT_DELAY)


def build_size_query(
    base_query: str, start_bytes: int, end_bytes: Optional[int]
) -> str:
    """Build a GitHub Code Search query with a size filter."""
    if end_bytes is None:
        return f"{base_query} size:>={start_bytes}"
    return f"{base_query} size:{start_bytes}..{end_bytes}"


async def check_pypi_package_exists(
    package: str,
    cache: dict[str, bool],
    client: httpx.AsyncClient,
) -> tuple[str, bool]:
    """
    Check if a single package exists on PyPI.

    Args:
        package: Package name to check
        cache: Dictionary to cache results (modified in-place)
        client: httpx async client instance

    Returns:
        Tuple of (package_name, exists)
    """
    # Check cache first
    if package in cache:
        return (package, cache[package])

    url = PYPI_JSON_API_TEMPLATE.format(package=package)
    try:
        response = await client.get(url, timeout=10.0, follow_redirects=True)
        exists = response.status_code == 200
        cache[package] = exists
        if exists:
            logger.debug(f"{package} exists on PyPI")
        else:
            logger.debug(f"{package} not found on PyPI")
        return (package, exists)
    except httpx.RequestError as e:
        logger.debug(f"Error checking {package} on PyPI: {e}")
        cache[package] = False
        return (package, False)


async def check_packages_batch(
    packages: list[str],
    cache: dict[str, bool],
    semaphore: asyncio.Semaphore,
) -> dict[str, bool]:
    """
    Check a batch of packages against PyPI concurrently.

    Args:
        packages: List of package names to check
        cache: Dictionary to cache results (modified in-place)
        semaphore: Semaphore to limit concurrent requests

    Returns:
        Dictionary mapping package names to their existence status
    """
    async def check_one(package: str) -> tuple[str, bool]:
        async with semaphore:
            async with httpx.AsyncClient() as client:
                return await check_pypi_package_exists(package, cache, client)

    tasks = [check_one(pkg) for pkg in packages]
    results = await asyncio.gather(*tasks, return_exceptions=False)
    return dict(results)


def extract_packages_from_items(items: list[dict[str, Any]]) -> Counter[str]:
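    """
    Extract candidate package names from one page of search results, counting
    occurrences in text-match fragments and in file paths that mention uvx.
    """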
    page_packages = Counter()
    for item in items:
        # Extract from text_matches (code snippets)
        text_matches = item.get("text_matches", [])
        for match in text_matches:
            fragment = match.get("fragment", "")
            package = extract_package_name(fragment)
            if package:
                page_packages[package] += 1
                logger.debug(f"Found package: {package}")

        # Also check file path/name
        path = item.get("path", "")
        if "uvx" in path.lower():
            package = extract_package_name(path)
            if package:
                page_packages[package] += 1

    return page_packages


async def search_uvx_usage(
    token: str, max_pages: int = 10
) -> tuple[Counter[str], dict[str, bool]]:
    """
    Search for uvx usage across GitHub and extract package names.

    Processes packages incrementally and checks PyPI concurrently.

    Args:
        token: GitHub Personal Access Token
        max_pages: Maximum number of pages to fetch per query (default: 10)

    Returns:
        Tuple of (Counter of valid package names with counts, updated PyPI cache)
    """
    pypi_cache: dict[str, bool] = {}
    valid_package_counts: Counter[str] = Counter()
    all_package_counts: Counter[str] = Counter()
    unknown_packages_queue: list[str] = []
    semaphore = asyncio.Semaphore(PYPI_CONCURRENT_CHECKS)
    current_rate_limit = RateLimitInfo(None, None)

    # Size buckets to work around GitHub's 1000 result limit
    # It would be way smarter to do this dynamically (query a given size range and do a
    # binary/proportional split on the number of results) but I already got this far
    # so I'm not going to change it for now.
    markdown_size_buckets = [
        (0, 1025),
        (1025, 1250),
        (1250, 1500),
        (1500, 1750),
        (1750, 2000),
        (2000, 2500),
        (2500, 3500),
        (3500, 4500),
        (4500, 5500),
        (5500, 6250),
        (6250, 7000),
        (7000, 7750),
        (7750, 8500),
        (8500, 9250),
        (9250, 10000),
        (10000, 10750),
        (10750, 11750),
        (11750, 13000),
        (13000, 14000),
        (14000, 15250),
        (15250, 16250),
        (16250, 17500),
        (17500, 18750),
        (18750, 20000),
        (20000, 22000),
        (22000, 24000),
        (24000, 26000),
        (26000, 28000),
        (28000, 30000),
        (30000, 33000),
        (33000, 36000),
        (36000, 39000),
        (39000, 42000),
        (42000, 45000),
        (45000, 50000),
        (50000, 60000),
        (60000, 70000),
        (70000, 80000),
        (80000, 100000),
        (100000, 120000),
        (120000, 140000),
        (140000, 160000),
        (160000, 180000),
        (180000, 200000),
        (200000, 250000),
        (250000, 300000),
        (300000, None),
    ]
    shell_size_buckets = [
        (0, 2800),
        (2800, 6000),
        (6000, 15000),
        (15000, 32000),
        (32000, None),
    ]
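    # One query per (language, size bucket) pair: 47 Markdown buckets plus 5 Shell
    # buckets, 52 searches in total, each paginated and throttled below.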
    queries = [
        build_size_query("uvx AND language:Markdown in:file", start, end)
        for start, end in markdown_size_buckets
    ]
    queries.extend(
        build_size_query("uvx AND language:Shell in:file", start, end)
        for start, end in shell_size_buckets
    )

    async def process_unknown_packages() -> None:
        """Process queued unknown packages against PyPI."""
        if not unknown_packages_queue:
            return

        packages_to_check = list(set(unknown_packages_queue))
        unknown_packages_queue.clear()
        logger.info(
            f"Checking {len(packages_to_check)} unknown packages against PyPI..."
        )
        results = await check_packages_batch(packages_to_check, pypi_cache, semaphore)

        # Update valid package counts based on results
        for package, exists in results.items():
            if exists:
                count = all_package_counts.get(package, 0)
                if count > 0:
                    valid_package_counts[package] = count
                    logger.debug(
                        f"Added {package} to valid packages ({count} occurrences)"
                    )
                else:
                    logger.warning(f"Package {package} validated but has no count")

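    # Main loop: one GitHub query per size bucket, up to effective_max_pages pages
    # each. Unknown packages are validated against PyPI in the pauses between
    # GitHub requests.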
    for query_idx, query in enumerate(queries):
        page = 1
        effective_max_pages = min(max_pages, GITHUB_CODE_SEARCH_MAX_PAGE)

        # Wait before starting a new query (except the first one)
        if query_idx > 0:
            logger.debug("Waiting before starting new query...")
            await wait_for_rate_limit(current_rate_limit)
            await process_unknown_packages()

        while page <= effective_max_pages:
            try:
                # Rate limiting: wait between page requests (except for the first page)
                if page > 1:
                    logger.debug("Waiting before next page...")
                    await wait_for_rate_limit(current_rate_limit)
                    await process_unknown_packages()

                response = search_github_code(query, token, page=page)

                # Update rate limit state from response
                current_rate_limit = response.rate_limit
                items = response.items
                if not items:
                    logger.info(f"No more results for query: {query}")
                    break

                logger.info(f"Found {len(items)} results on page {page}")

                # Extract package names from this page
                page_packages = extract_packages_from_items(items)

                # Process packages from this page
                for package, count in page_packages.items():
                    all_package_counts[package] += count
                    # Check cache first
                    if package in pypi_cache:
                        if pypi_cache[package]:
                            valid_package_counts[package] = all_package_counts[package]
                            logger.debug(
                                f"Known valid: {package} (total: {all_package_counts[package]})"
                            )
                    else:
                        unknown_packages_queue.append(package)

                # Process unknown packages while we have time before next GitHub request
                if unknown_packages_queue:
                    await process_unknown_packages()

                # Check if there are more pages
                effective_total = min(
                    response.total_count, GITHUB_CODE_SEARCH_MAX_RESULTS
                )
                if len(items) < 100 or page * 100 >= effective_total:
                    logger.info(
                        f"Reached end of results for query: {query} "
                        f"(page {page}, total: {response.total_count})"
                    )
                    break

                page += 1
            except ValueError as e:
                # This is raised when we hit the 1000 result limit
                logger.info(f"Hit GitHub Code Search API limit: {e}")
                break
            except Exception as e:
                logger.error(f"Error processing page {page} of query '{query}': {e}")
                break

        # Process any remaining unknown packages after each query
        await process_unknown_packages()

    # Final processing of any remaining unknown packages
    await process_unknown_packages()

    logger.info(
        f"Found {len(valid_package_counts)} valid PyPI packages "
        f"out of {len(all_package_counts)} total"
    )
    return valid_package_counts, pypi_cache


def write_top_packages(
    package_counts: Counter[str],
    output_path: Path,
    debug_output_path: Path,
    min_count: int = 2,
) -> None:
    """
    Write top packages to files, sorted by frequency.

    Packages are written in buckets by threshold (100+, 25+, 10+, 5+, min_count+).

    Args:
        package_counts: Counter of package names and counts
        output_path: Path to output file (main packages list)
        debug_output_path: Path to debug output file (with counts)
        min_count: Minimum occurrence count to include (default: 2)
    """
    thresholds = [min_count, 5, 10, 25, 100]
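    # With the default min_count=2 the buckets are [2, 5), [5, 10), [10, 25),
    # [25, 100), and [100, inf); they are written highest-threshold first.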
    # Filter packages into buckets by threshold
    buckets = []
    for i, threshold in enumerate(thresholds):
        next_threshold = thresholds[i + 1] if i + 1 < len(thresholds) else float("inf")
        bucket_packages = {
            pkg: count
            for pkg, count in package_counts.items()
            if threshold <= count < next_threshold
        }
        buckets.append({"threshold": threshold, "packages": bucket_packages})

    with open(output_path, "w") as f, open(debug_output_path, "w") as f_debug:
        for bucket in reversed(buckets):
            threshold = bucket["threshold"]
            packages = bucket["packages"]
            logger.info(
                f"Greater than or equal to {threshold} mentions: {len(packages)} packages"
            )
            # Sort by count descending, then alphabetically
            sorted_packages = sorted(packages.items(), key=lambda x: (-x[1], x[0]))
            for package, count in sorted_packages:
                f.write(f"{package}\n")
                f_debug.write(f"{package}: {count}\n")

    logger.info(f"Successfully wrote top packages to {output_path}")


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Fetch popular packages from GitHub by searching for uvx usage"
    )
    parser.add_argument(
        "--token",
        type=str,
        help="GitHub Personal Access Token (or set GITHUB_TOKEN env var)",
        default=os.getenv("GITHUB_TOKEN"),
    )
    parser.add_argument(
        "--output",
        type=Path,
        help="Output file path (default: top_packages.txt)",
        default=None,
    )
    parser.add_argument(
        "--debug-output",
        type=Path,
        help="Debug output file path (default: top_packages_debug.txt)",
        default=None,
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=10,
        help="Maximum pages to fetch per query (default: 10)",
    )
    parser.add_argument(
        "--min-count",
        type=int,
        default=2,
        help="Minimum occurrence count to include (default: 2)",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging",
    )
    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    if not args.token:
        logger.error(
            "GitHub token is required. Set GITHUB_TOKEN environment variable "
            "or pass --token. Create a token at: https://github.com/settings/tokens"
        )
        sys.exit(1)

    # Set default output paths
    if args.output is None or args.debug_output is None:
        script_dir = Path(__file__).parent
        project_root = script_dir.parent.parent
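        # i.e. scripts/uvx_usage_on_gh/ -> scripts/ -> repository root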
        if args.output is None:
            args.output = (
                project_root
                / "crates"
                / "uv"
                / "src"
                / "commands"
                / "tool"
                / "top_packages.txt"
            )
        if args.debug_output is None:
            args.debug_output = (
                project_root
                / "crates"
                / "uv"
                / "src"
                / "commands"
                / "tool"
                / "top_packages_debug.txt"
            )

    logger.info("Starting GitHub search for uvx usage...")
    logger.info(f"Output will be written to: {args.output}")
    logger.info(f"Debug output will be written to: {args.debug_output}")

    valid_packages, pypi_cache = asyncio.run(
        search_uvx_usage(args.token, max_pages=args.max_pages)
    )

    if not valid_packages:
        logger.warning("No valid PyPI packages found.")
        sys.exit(1)

    logger.info(f"Found {len(valid_packages)} valid PyPI packages")
    logger.info(f"Top 10 valid packages: {valid_packages.most_common(10)}")
    logger.info(f"PyPI cache contains {len(pypi_cache)} entries")

    write_top_packages(
        valid_packages, args.output, args.debug_output, min_count=args.min_count
    )


if __name__ == "__main__":
    main()