diff --git a/crates/uv/src/commands/tool/top_packages.txt b/crates/uv/src/commands/tool/top_packages.txt new file mode 100644 index 000000000..307973527 --- /dev/null +++ b/crates/uv/src/commands/tool/top_packages.txt @@ -0,0 +1,167 @@ +ruff +agent-starter-pack +pre-commit +marimo +args +cookiecutter +pyright +ngrok +pytest +hatch +llm +markitdown +mcp-server-fetch +ty +copier +command +mcp-server-git +mcpo +uvx +black +streamdown +autoflake +run +nox +files-to-prompt +mypy +tox +yt-dlp +split_markdown4gpt +mcp-server-time +basic-memory +synth-ai +gac +twine +are +codetoprompt +open-webui +browser-use +pip +wpull +build +with +blender-remote +https +migrate-to-uv +bandit +mcp-atlassian +mkdocs +playwright +everything +maturin +cruft +mcp-feedback-enhanced +smithery +aignostics +databricks +jupyterlab +tool +echo +elevenlabs-mcp +ipython +package +pycowsay +support +completions +jupyter +mini-swe-agent +openhands-ai +pyenvsearch +rogue-ai +uv +uv-dynamic-versioning +ansible-core +class +gcovr +handles +mcp +mcp-proxy +mcp-scan +mcp-server-sqlite +streamlit +blender-mcp +create-mcp-server +eyelet +mcp-server-qdrant +mcp-wiki +on +path +pytest-watch +setup +textual-demo +voice-mode +automagik-tools +bump-my-version +kedro +schemathesis +huggingface_hub +integration +jupyter-core +jupytext +llm-discovery +mcp-sse-shim +openhands +reachy-mini +runs +usage +automagik-hive +business-use-core +cookieplone +cowsay +create-dagster +distribution +eval +healthyselfjournal +httpie +markitdown-mcp +speaches-cli +strip-tags +autocompletion +chroma +cmake +compatibility +detect-secrets +dist +evalgate +iowarp-mcps +isort +juv +lean-lsp-mcp +llamactl +nearc +ngiab_data_preprocess +nvitop +prek +requests +runner +samstacks +semgrep-mcp +specifyplus +sqlite-utils +the +uv-publish +wandb +agentprobe +audible-cli +cakemail-api-docs-mcp +datasette +fonttools +git-cliff +goldenverba +honcho +internetarchive +mcp-google-sheets +modal +modern +pdm +poe +pyinstaller +pyrefly +radon +should +sniffly +specify +superclaude +tools +workspace-mcp +yt-mpv diff --git a/scripts/uvx_usage_on_gh/fetch_uvx_usage.py b/scripts/uvx_usage_on_gh/fetch_uvx_usage.py new file mode 100755 index 000000000..54e84a635 --- /dev/null +++ b/scripts/uvx_usage_on_gh/fetch_uvx_usage.py @@ -0,0 +1,753 @@ +# /// script +# requires-python = ">=3.13" +# dependencies = [ +# "httpx" +# ] +# /// + +""" +Use the GitHub Code Search API to find instances of `uvx ` in: +- README files (*.md) +- Shell scripts (*.sh, *.bash, *.zsh) + +Requirements: + - A GitHub Personal Access Token (PAT) with `public_repo` scope + - Set the GITHUB_TOKEN environment variable or pass --token + +Usage: + python scripts/uvx_usage_on_gh/fetch_uvx_usage.py --output crates/uv/src/commands/tool/top_packages.txt +""" + +import argparse +import asyncio +import logging +import os +import re +import sys +import time +from collections import Counter +from pathlib import Path +from typing import Any, NamedTuple, Optional + +import httpx + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", +) +logger = logging.getLogger(__name__) + +# GitHub API configuration +GITHUB_API_BASE = "https://api.github.com" +CODE_SEARCH_ENDPOINT = f"{GITHUB_API_BASE}/search/code" + +# Rate limiting configuration +RATE_LIMIT_DELAY = 6.1 # seconds between requests (slightly more than 60/10) + +# GitHub Code Search API limits +GITHUB_CODE_SEARCH_MAX_RESULTS = 1000 # Hard limit: only first 1000 results accessible +GITHUB_CODE_SEARCH_MAX_PAGE = 
10 # Page 10 = results 901-1000, page 11+ returns 422 + +# Retry configuration +MAX_RETRIES = 5 +INITIAL_RETRY_DELAY = 10 # seconds +MAX_RETRY_DELAY = 300 # 5 minutes max delay + +# PyPI check concurrency +PYPI_CONCURRENT_CHECKS = 20 # Number of concurrent PyPI checks + +# PyPI API endpoint +PYPI_JSON_API_TEMPLATE = "https://pypi.org/pypi/{package}/json" + + +class RateLimitInfo(NamedTuple): + remaining: int | None + reset_time: int | None + + +class GitHubSearchResponse(NamedTuple): + items: list[dict[str, Any]] + total_count: int + rate_limit: RateLimitInfo + + +# Regex patterns for extracting package names +PACKAGE_PATTERN_FROM = re.compile( + r"\buvx\s+(?:--\w+(?:\s+\S+)?\s+)*--from\s+([a-z0-9](?:[a-z0-9._-]*[a-z0-9])?)(?:@\S+)?", + re.IGNORECASE, +) +PACKAGE_PATTERN_NORMAL = re.compile( + r"\buvx\s+(?:--\w+(?:\s+\S+)?\s+)+([a-z0-9](?:[a-z0-9._-]*[a-z0-9])?)(?:@\S+)?", + re.IGNORECASE, +) +PACKAGE_PATTERN_SIMPLE = re.compile( + r"\buvx\s+([a-z0-9](?:[a-z0-9._-]*[a-z0-9])?)(?:@\S+)?", + re.IGNORECASE, +) +URL_PATTERN = re.compile( + r"\buvx\s+(?:--\w+(?:\s+\S+)?\s+)*--from\s+(git\+[a-z]+://|git://|https?://)", + re.IGNORECASE, +) + + +def extract_package_name(match_text: str) -> Optional[str]: + """ + Extract package name from a match. + + Handles patterns like: + - uvx ruff + - uvx --from httpie http (extracts "httpie") + - uvx --python 3.12 textual-demo + - uvx black@latest + - uvx pytest --version + - uvx streamlit run streamlit_app/dashboard.py + + Skips patterns like: + - uvx --from git+https://... (URLs are not package names) + - uvx --from http://... (URLs are not package names) + """ + # Skip URLs after --from + if URL_PATTERN.search(match_text): + return None + + # Try patterns in order: --from, flags, simple + match = ( + PACKAGE_PATTERN_FROM.search(match_text) + or PACKAGE_PATTERN_NORMAL.search(match_text) + or PACKAGE_PATTERN_SIMPLE.search(match_text) + ) + + if not match: + return None + + package = match.group(1).lower() + + # Remove version specifiers (e.g., @latest, @1.0.0) + if "@" in package: + package = package.split("@")[0] + + # Validation checks + if package.startswith("--") or "/" in package or "\\" in package or len(package) < 2: + return None + + return package + + +def _calculate_retry_delay( + status_code: int, + retry_count: int, + response_headers: httpx.Headers, +) -> int: + """Calculate delay for retry based on status code and headers.""" + if status_code in (403, 429): + # Try Retry-After header first + retry_after = response_headers.get("Retry-After") + if retry_after: + try: + return int(retry_after) + 2 # Add 2 second buffer + except ValueError: + pass + + # Fall back to X-RateLimit-Reset + reset_time_str = response_headers.get("X-RateLimit-Reset") + if reset_time_str: + try: + reset_time = int(reset_time_str) + current_time = int(time.time()) + return max(reset_time - current_time + 2, 10) # At least 10 seconds + + except ValueError: + pass + + # Default: exponential backoff + return min(INITIAL_RETRY_DELAY * (2**retry_count), MAX_RETRY_DELAY) + + +def search_github_code( + query: str, + token: str, + page: int, + per_page: int = 100, + retry_count: int = 0, +) -> GitHubSearchResponse: + headers = { + "Accept": "application/vnd.github.text-match+json", + "Authorization": f"Bearer {token}", + } + + params = { + "q": query, + "page": page, + "per_page": min(per_page, 100), + } + + logger.info(f"Searching GitHub: {query} (page {page}, attempt {retry_count + 1})") + + try: + response = httpx.get( + CODE_SEARCH_ENDPOINT, + headers=headers, + params=params, 
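+            # Requesting the text-match media type in the Accept header above is
+            # what makes GitHub include `text_matches` fragments in each item;
+            # those fragments are what package names are later extracted from.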
+ timeout=30.0, + ) + response.raise_for_status() + + # Extract rate limit info + remaining_str = response.headers.get("X-RateLimit-Remaining") + reset_time_str = response.headers.get("X-RateLimit-Reset") + rate_limit = RateLimitInfo( + remaining=int(remaining_str) if remaining_str else None, + reset_time=int(reset_time_str) if reset_time_str else None, + ) + + logger.debug( + f"Rate limit remaining: {rate_limit.remaining}, reset at: {rate_limit.reset_time}" + ) + + data = response.json() + total_count = data.get("total_count", 0) + logger.info(f"Count of total results: {total_count}") + + return GitHubSearchResponse( + items=data.get("items", []), + total_count=total_count, + rate_limit=rate_limit, + ) + + except httpx.HTTPStatusError as e: + status_code = e.response.status_code + + # 422 on page 11+ is likely the hard 1000 result limit + if status_code == 422 and page > GITHUB_CODE_SEARCH_MAX_PAGE: + logger.info( + f"422 error on page {page} - likely hit GitHub's 1000 result limit. " + f"Code Search API only returns first {GITHUB_CODE_SEARCH_MAX_RESULTS} results." + ) + raise ValueError( + f"Reached GitHub Code Search API limit (page {page} > {GITHUB_CODE_SEARCH_MAX_PAGE})" + ) from e + + # Retryable errors + if status_code in (403, 422, 429) and retry_count < MAX_RETRIES: + delay = _calculate_retry_delay(status_code, retry_count, e.response.headers) + + if status_code == 403: + logger.warning( + f"Rate limit exceeded (403). Retrying in {delay}s " + f"(attempt {retry_count + 1}/{MAX_RETRIES})" + ) + elif status_code == 429: + logger.warning( + f"Rate limit exceeded (429). Retrying in {delay}s " + f"(attempt {retry_count + 1}/{MAX_RETRIES})" + ) + elif status_code == 422: + logger.warning( + f"Validation error (422) - may be transient. Retrying in {delay}s " + f"(attempt {retry_count + 1}/{MAX_RETRIES})" + ) + + time.sleep(delay) + return search_github_code(query, token, page, per_page, retry_count + 1) + + # Non-retryable or max retries reached + if status_code == 403: + logger.error( + "Rate limit exceeded or authentication failed after retries. " + "Check your token and wait before retrying." + ) + elif status_code == 422: + logger.error(f"Invalid query after retries: {query}") + else: + logger.error(f"HTTP error {status_code} after retries") + + except httpx.RequestError as e: + # Network errors are retryable + if retry_count < MAX_RETRIES: + delay = min(INITIAL_RETRY_DELAY * (2**retry_count), MAX_RETRY_DELAY) + logger.warning( + f"Request failed: {e}. Retrying in {delay}s " + f"(attempt {retry_count + 1}/{MAX_RETRIES})" + ) + time.sleep(delay) + return search_github_code(query, token, page, per_page, retry_count + 1) + + logger.error(f"Request failed after retries: {e}") + raise + + +async def wait_for_rate_limit(rate_limit: RateLimitInfo) -> None: + """ + Wait if we're approaching rate limit or need to wait until reset. + + Args: + rate_limit: Rate limit information from previous request + """ + if rate_limit.remaining is None or rate_limit.reset_time is None: + await asyncio.sleep(RATE_LIMIT_DELAY) + return + + # If running low on requests, wait until reset + if rate_limit.remaining <= 2: + wait_time = rate_limit.reset_time - int(time.time()) + 2 # Add 2 second buffer + if wait_time > 0: + logger.info( + f"Rate limit low ({rate_limit.remaining} remaining). 
" + f"Waiting {wait_time}s until reset at {rate_limit.reset_time}" + ) + await asyncio.sleep(wait_time) + else: + await asyncio.sleep(RATE_LIMIT_DELAY) + else: + await asyncio.sleep(RATE_LIMIT_DELAY) + + +def build_size_query(base_query: str, start_bytes: int, end_bytes: Optional[int]) -> str: + """Build a GitHub Code Search query with size filter.""" + if end_bytes is None: + return f"{base_query} size:>={start_bytes}" + return f"{base_query} size:{start_bytes}..{end_bytes}" + + +async def check_pypi_package_exists( + package: str, + cache: dict[str, bool], + client: httpx.AsyncClient, +) -> tuple[str, bool]: + """ + Check if a single package exists on PyPI. + + Args: + package: Package name to check + cache: Dictionary to cache results (modified in-place) + client: httpx async client instance + + Returns: + Tuple of (package_name, exists) + """ + # Check cache first + if package in cache: + return (package, cache[package]) + + url = PYPI_JSON_API_TEMPLATE.format(package=package) + + try: + response = await client.get(url, timeout=10.0, follow_redirects=True) + exists = response.status_code == 200 + cache[package] = exists + + if exists: + logger.debug(f"✓ {package} exists on PyPI") + else: + logger.debug(f"✗ {package} not found on PyPI") + + return (package, exists) + except httpx.RequestError as e: + logger.debug(f"Error checking {package} on PyPI: {e}") + cache[package] = False + return (package, False) + + +async def check_packages_batch( + packages: list[str], + cache: dict[str, bool], + semaphore: asyncio.Semaphore, +) -> dict[str, bool]: + """ + Check a batch of packages against PyPI concurrently. + + Args: + packages: List of package names to check + cache: Dictionary to cache results (modified in-place) + semaphore: Semaphore to limit concurrent requests + + Returns: + Dictionary mapping package names to their existence status + """ + async def check_one(package: str) -> tuple[str, bool]: + async with semaphore: + async with httpx.AsyncClient() as client: + return await check_pypi_package_exists(package, cache, client) + + tasks = [check_one(pkg) for pkg in packages] + results = await asyncio.gather(*tasks, return_exceptions=False) + return dict(results) + + +def extract_packages_from_items(items: list[dict[str, Any]]) -> Counter: + page_packages = Counter() + + for item in items: + # Extract from text_matches (code snippets) + text_matches = item.get("text_matches", []) + for match in text_matches: + fragment = match.get("fragment", "") + package = extract_package_name(fragment) + if package: + page_packages[package] += 1 + logger.debug(f"Found package: {package}") + + # Also check file path/name + path = item.get("path", "") + if "uvx" in path.lower(): + package = extract_package_name(path) + if package: + page_packages[package] += 1 + + return page_packages + + +async def search_uvx_usage( + token: str, max_pages: int = 10 +) -> tuple[Counter[str], dict[str, bool]]: + """ + Search for uvx usage across GitHub and extract package names. + + Processes packages incrementally and checks PyPI concurrently. 
+ + Args: + token: GitHub Personal Access Token + max_pages: Maximum number of pages to fetch per query (default: 10) + + Returns: + Tuple of (Counter of valid package names with counts, updated PyPI cache) + """ + pypi_cache: dict[str, bool] = {} + valid_package_counts: Counter[str] = Counter() + all_package_counts: Counter[str] = Counter() + unknown_packages_queue: list[str] = [] + + semaphore = asyncio.Semaphore(PYPI_CONCURRENT_CHECKS) + current_rate_limit = RateLimitInfo(None, None) + + # Size buckets to work around GitHub's 1000 result limit + # It would be way smarter to do this dynamically (query a given size range and do a + # binary/proportional split on the number of results) but I already got this far + # so I'm not going to change it for now. + markdown_size_buckets = [ + (0, 1025), + (1025, 1250), + (1250, 1500), + (1500, 1750), + (1750, 2000), + (2000, 2500), + (2500, 3500), + (3500, 4500), + (4500, 5500), + (5500, 6250), + (6250, 7000), + (7000, 7750), + (7750, 8500), + (8500, 9250), + (9250, 10000), + (10000, 10750), + (10750, 11750), + (11750, 13000), + (13000, 14000), + (14000, 15250), + (15250, 16250), + (16250, 17500), + (17500, 18750), + (18750, 20000), + (20000, 22000), + (22000, 24000), + (24000, 26000), + (26000, 28000), + (28000, 30000), + (30000, 33000), + (33000, 36000), + (36000, 39000), + (39000, 42000), + (42000, 45000), + (45000, 50000), + (50000, 60000), + (60000, 70000), + (70000, 80000), + (80000, 100000), + (100000, 120000), + (120000, 140000), + (140000, 160000), + (160000, 180000), + (180000, 200000), + (200000, 250000), + (250000, 300000), + (300000, None), + ] + + shell_size_buckets = [ + (0, 2800), + (2800, 6000), + (6000, 15000), + (15000, 32000), + (32000, None), + ] + + queries = [ + build_size_query("uvx AND language:Markdown in:file", start, end) + for start, end in markdown_size_buckets + ] + queries.extend( + build_size_query("uvx AND language:Shell in:file", start, end) + for start, end in shell_size_buckets + ) + + async def process_unknown_packages() -> None: + """Process queued unknown packages against PyPI.""" + if not unknown_packages_queue: + return + + packages_to_check = list(set(unknown_packages_queue)) + unknown_packages_queue.clear() + + logger.info(f"Checking {len(packages_to_check)} unknown packages against PyPI...") + results = await check_packages_batch(packages_to_check, pypi_cache, semaphore) + + # Update valid package counts based on results + for package, exists in results.items(): + if exists: + count = all_package_counts.get(package, 0) + if count > 0: + valid_package_counts[package] = count + logger.debug(f"Added {package} to valid packages ({count} occurrences)") + else: + logger.warning(f"Package {package} validated but has no count") + + for query_idx, query in enumerate(queries): + page = 1 + effective_max_pages = min(max_pages, GITHUB_CODE_SEARCH_MAX_PAGE) + + # Wait before starting a new query (except the first one) + if query_idx > 0: + logger.debug("Waiting before starting new query...") + await wait_for_rate_limit(current_rate_limit) + await process_unknown_packages() + + while page <= effective_max_pages: + try: + # Rate limiting: wait between page requests (except for the first page) + if page > 1: + logger.debug("Waiting before next page...") + await wait_for_rate_limit(current_rate_limit) + await process_unknown_packages() + + response = search_github_code(query, token, page=page) + + # Update rate limit state from response + current_rate_limit = response.rate_limit + + items = response.items + if not items: 
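+                    # An empty page means this size-bucket query is exhausted;
+                    # break out and move on to the next bucket.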
+ logger.info(f"No more results for query: {query}") + break + + logger.info(f"Found {len(items)} results on page {page}") + + # Extract package names from this page + page_packages = extract_packages_from_items(items) + + # Process packages from this page + for package, count in page_packages.items(): + all_package_counts[package] += count + + # Check cache first + if package in pypi_cache: + if pypi_cache[package]: + valid_package_counts[package] = all_package_counts[package] + logger.debug( + f"Known valid: {package} (total: {all_package_counts[package]})" + ) + else: + unknown_packages_queue.append(package) + + # Process unknown packages while we have time before next GitHub request + if unknown_packages_queue: + await process_unknown_packages() + + # Check if there are more pages + effective_total = min( + response.total_count, GITHUB_CODE_SEARCH_MAX_RESULTS + ) + + if len(items) < 100 or page * 100 >= effective_total: + logger.info( + f"Reached end of results for query: {query} " + f"(page {page}, total: {response.total_count})" + ) + break + + page += 1 + + except ValueError as e: + # This is raised when we hit the 1000 result limit + logger.info(f"Hit GitHub Code Search API limit: {e}") + break + except Exception as e: + logger.error(f"Error processing page {page} of query '{query}': {e}") + break + + # Process any remaining unknown packages after each query + await process_unknown_packages() + + # Final processing of any remaining unknown packages + await process_unknown_packages() + + logger.info( + f"Found {len(valid_package_counts)} valid PyPI packages " + f"out of {len(all_package_counts)} total" + ) + + return valid_package_counts, pypi_cache + + +def write_top_packages( + package_counts: Counter[str], + output_path: Path, + debug_output_path: Path, + min_count: int = 2, +) -> None: + """ + Write top packages to files, sorted by frequency. + + Packages are written in buckets by threshold (100+, 25+, 10+, 5+, min_count+). 
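+
+    Buckets are written from the highest threshold down, and packages within a
+    bucket are sorted by count (descending) and then name, so the most popular
+    packages appear first in both output files.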
+ + Args: + package_counts: Counter of package names and counts + output_path: Path to output file (main packages list) + debug_output_path: Path to debug output file (with counts) + min_count: Minimum occurrence count to include (default: 2) + """ + thresholds = [min_count, 5, 10, 25, 100] + + # Filter packages into buckets by threshold + buckets = [] + for i, threshold in enumerate(thresholds): + next_threshold = thresholds[i + 1] if i + 1 < len(thresholds) else float("inf") + bucket_packages = { + pkg: count + for pkg, count in package_counts.items() + if threshold <= count < next_threshold + } + buckets.append({"threshold": threshold, "packages": bucket_packages}) + + with open(output_path, "w") as f, open(debug_output_path, "w") as f_debug: + for bucket in reversed(buckets): + threshold = bucket["threshold"] + packages = bucket["packages"] + logger.info( + f"Greater than or equal to {threshold} mentions: {len(packages)} packages" + ) + + # Sort by count descending, then alphabetically + sorted_packages = sorted( + packages.items(), key=lambda x: (-x[1], x[0]) + ) + + for package, count in sorted_packages: + f.write(f"{package}\n") + f_debug.write(f"{package}: {count}\n") + + logger.info(f"Successfully wrote top packages to {output_path}") + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Fetch popular packages from GitHub by searching for uvx usage" + ) + parser.add_argument( + "--token", + type=str, + help="GitHub Personal Access Token (or set GITHUB_TOKEN env var)", + default=os.getenv("GITHUB_TOKEN"), + ) + parser.add_argument( + "--output", + type=Path, + help="Output file path (default: top_packages.txt)", + default=None, + ) + parser.add_argument( + "--debug-output", + type=Path, + help="Debug output file path (default: top_packages_debug.txt)", + default=None, + ) + parser.add_argument( + "--max-pages", + type=int, + default=10, + help="Maximum pages to fetch per query (default: 10)", + ) + parser.add_argument( + "--min-count", + type=int, + default=2, + help="Minimum occurrence count to include (default: 2)", + ) + parser.add_argument( + "--verbose", + "-v", + action="store_true", + help="Enable verbose logging", + ) + + args = parser.parse_args() + + if args.verbose: + logging.getLogger().setLevel(logging.DEBUG) + + if not args.token: + logger.error( + "GitHub token is required. Set GITHUB_TOKEN environment variable " + "or pass --token. 
Create a token at: https://github.com/settings/tokens" + ) + sys.exit(1) + + # Set default output paths + if args.output is None or args.debug_output is None: + script_dir = Path(__file__).parent + project_root = script_dir.parent.parent + if args.output is None: + args.output = ( + project_root + / "crates" + / "uv" + / "src" + / "commands" + / "tool" + / "top_packages.txt" + ) + if args.debug_output is None: + args.debug_output = ( + project_root + / "crates" + / "uv" + / "src" + / "commands" + / "tool" + / "top_packages_debug.txt" + ) + + logger.info("Starting GitHub search for uvx usage...") + logger.info(f"Output will be written to: {args.output}") + logger.info(f"Debug output will be written to: {args.debug_output}") + + valid_packages, pypi_cache = asyncio.run( + search_uvx_usage(args.token, max_pages=args.max_pages) + ) + + if not valid_packages: + logger.warning("No valid PyPI packages found.") + sys.exit(1) + + logger.info(f"Found {len(valid_packages)} valid PyPI packages") + logger.info(f"Top 10 valid packages: {valid_packages.most_common(10)}") + logger.info(f"PyPI cache contains {len(pypi_cache)} entries") + + write_top_packages( + valid_packages, args.output, args.debug_output, min_count=args.min_count + ) + + +if __name__ == "__main__": + main()