# /// script
# requires-python = ">=3.13"
# dependencies = [
#     "httpx"
# ]
# ///

"""
Use the GitHub Code Search API to find instances of `uvx <package>` in:
- README files (*.md)
- Shell scripts (*.sh, *.bash, *.zsh)

Requirements:
- A GitHub Personal Access Token (PAT) with `public_repo` scope
- Set the GITHUB_TOKEN environment variable or pass --token

Usage:
    python scripts/uvx_usage_on_gh/fetch_uvx_usage.py --output crates/uv/src/commands/tool/top_packages.txt
"""

import argparse
import asyncio
import logging
import os
import re
import sys
import time
from collections import Counter
from pathlib import Path
from typing import Any, NamedTuple, Optional

import httpx

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# GitHub API configuration
GITHUB_API_BASE = "https://api.github.com"
CODE_SEARCH_ENDPOINT = f"{GITHUB_API_BASE}/search/code"

# Rate limiting configuration
RATE_LIMIT_DELAY = 6.1  # seconds between requests (slightly more than 60/10)

# GitHub Code Search API limits
GITHUB_CODE_SEARCH_MAX_RESULTS = 1000  # Hard limit: only first 1000 results accessible
GITHUB_CODE_SEARCH_MAX_PAGE = 10  # Page 10 = results 901-1000, page 11+ returns 422

# Retry configuration
MAX_RETRIES = 5
INITIAL_RETRY_DELAY = 10  # seconds
MAX_RETRY_DELAY = 300  # 5 minutes max delay

# PyPI check concurrency
PYPI_CONCURRENT_CHECKS = 20  # Number of concurrent PyPI checks

# PyPI API endpoint
PYPI_JSON_API_TEMPLATE = "https://pypi.org/pypi/{package}/json"
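
# For example, PYPI_JSON_API_TEMPLATE.format(package="ruff") yields
# "https://pypi.org/pypi/ruff/json".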


class RateLimitInfo(NamedTuple):
    """Remaining request budget and reset timestamp taken from GitHub's rate-limit headers."""

    remaining: int | None
    reset_time: int | None


class GitHubSearchResponse(NamedTuple):
    """One page of GitHub Code Search results plus the rate-limit state of that response."""

    items: list[dict[str, Any]]
    total_count: int
    rate_limit: RateLimitInfo


# Regex patterns for extracting package names
PACKAGE_PATTERN_FROM = re.compile(
    r"\buvx\s+(?:--\w+(?:\s+\S+)?\s+)*--from\s+([a-z0-9](?:[a-z0-9._-]*[a-z0-9])?)(?:@\S+)?",
    re.IGNORECASE,
)
PACKAGE_PATTERN_NORMAL = re.compile(
    r"\buvx\s+(?:--\w+(?:\s+\S+)?\s+)+([a-z0-9](?:[a-z0-9._-]*[a-z0-9])?)(?:@\S+)?",
    re.IGNORECASE,
)
PACKAGE_PATTERN_SIMPLE = re.compile(
    r"\buvx\s+([a-z0-9](?:[a-z0-9._-]*[a-z0-9])?)(?:@\S+)?",
    re.IGNORECASE,
)
URL_PATTERN = re.compile(
    r"\buvx\s+(?:--\w+(?:\s+\S+)?\s+)*--from\s+(git\+[a-z]+://|git://|https?://)",
    re.IGNORECASE,
)


def extract_package_name(match_text: str) -> Optional[str]:
    """
    Extract package name from a match.

    Handles patterns like:
    - uvx ruff
    - uvx --from httpie http (extracts "httpie")
    - uvx --python 3.12 textual-demo
    - uvx black@latest
    - uvx pytest --version
    - uvx streamlit run streamlit_app/dashboard.py

    Skips patterns like:
    - uvx --from git+https://... (URLs are not package names)
    - uvx --from http://... (URLs are not package names)
    """
    # Skip URLs after --from
    if URL_PATTERN.search(match_text):
        return None

    # Try patterns in order: --from, flags, simple
    match = (
        PACKAGE_PATTERN_FROM.search(match_text)
        or PACKAGE_PATTERN_NORMAL.search(match_text)
        or PACKAGE_PATTERN_SIMPLE.search(match_text)
    )

    if not match:
        return None

    package = match.group(1).lower()

    # Remove version specifiers (e.g., @latest, @1.0.0)
    if "@" in package:
        package = package.split("@")[0]

    # Validation checks
    if (
        package.startswith("--")
        or "/" in package
        or "\\" in package
        or len(package) < 2
    ):
        return None

    return package
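
# Illustrative examples (not exercised by the script itself):
#   extract_package_name("uvx ruff check .")        -> "ruff"
#   extract_package_name("uvx --from httpie http")  -> "httpie"
#   extract_package_name("uvx --from git+https://github.com/astral-sh/ruff ruff")  -> None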


def _calculate_retry_delay(
    status_code: int,
    retry_count: int,
    response_headers: httpx.Headers,
) -> int:
    """Calculate delay for retry based on status code and headers."""
    if status_code in (403, 429):
        # Try Retry-After header first
        retry_after = response_headers.get("Retry-After")
        if retry_after:
            try:
                return int(retry_after) + 2  # Add 2 second buffer
            except ValueError:
                pass

        # Fall back to X-RateLimit-Reset
        reset_time_str = response_headers.get("X-RateLimit-Reset")
        if reset_time_str:
            try:
                reset_time = int(reset_time_str)
                current_time = int(time.time())
                return max(reset_time - current_time + 2, 10)  # At least 10 seconds

            except ValueError:
                pass

    # Default: exponential backoff
    return min(INITIAL_RETRY_DELAY * (2**retry_count), MAX_RETRY_DELAY)
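
# For example, a 422 with retry_count=2 backs off min(10 * 2**2, 300) = 40 seconds, while a
# 403 carrying "Retry-After: 30" waits int("30") + 2 = 32 seconds.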


def search_github_code(
    query: str,
    token: str,
    page: int,
    per_page: int = 100,
    retry_count: int = 0,
) -> GitHubSearchResponse:
    """Execute a single Code Search request, retrying on rate limits and transient failures."""
    headers = {
        "Accept": "application/vnd.github.text-match+json",
        "Authorization": f"Bearer {token}",
    }

    params = {
        "q": query,
        "page": page,
        "per_page": min(per_page, 100),
    }

    logger.info(f"Searching GitHub: {query} (page {page}, attempt {retry_count + 1})")

    try:
        response = httpx.get(
            CODE_SEARCH_ENDPOINT,
            headers=headers,
            params=params,
            timeout=30.0,
        )
        response.raise_for_status()

        # Extract rate limit info
        remaining_str = response.headers.get("X-RateLimit-Remaining")
        reset_time_str = response.headers.get("X-RateLimit-Reset")
        rate_limit = RateLimitInfo(
            remaining=int(remaining_str) if remaining_str else None,
            reset_time=int(reset_time_str) if reset_time_str else None,
        )

        logger.debug(
            f"Rate limit remaining: {rate_limit.remaining}, reset at: {rate_limit.reset_time}"
        )

        data = response.json()
        total_count = data.get("total_count", 0)
        logger.info(f"Total results: {total_count}")

        return GitHubSearchResponse(
            items=data.get("items", []),
            total_count=total_count,
            rate_limit=rate_limit,
        )

    except httpx.HTTPStatusError as e:
        status_code = e.response.status_code

        # 422 on page 11+ is likely the hard 1000-result limit
        if status_code == 422 and page > GITHUB_CODE_SEARCH_MAX_PAGE:
            logger.info(
                f"422 error on page {page} - likely hit GitHub's 1000 result limit. "
                f"Code Search API only returns the first {GITHUB_CODE_SEARCH_MAX_RESULTS} results."
            )
            raise ValueError(
                f"Reached GitHub Code Search API limit (page {page} > {GITHUB_CODE_SEARCH_MAX_PAGE})"
            ) from e

        # Retryable errors
        if status_code in (403, 422, 429) and retry_count < MAX_RETRIES:
            delay = _calculate_retry_delay(status_code, retry_count, e.response.headers)

            if status_code == 403:
                logger.warning(
                    f"Rate limit exceeded (403). Retrying in {delay}s "
                    f"(attempt {retry_count + 1}/{MAX_RETRIES})"
                )
            elif status_code == 429:
                logger.warning(
                    f"Rate limit exceeded (429). Retrying in {delay}s "
                    f"(attempt {retry_count + 1}/{MAX_RETRIES})"
                )
            elif status_code == 422:
                logger.warning(
                    f"Validation error (422) - may be transient. Retrying in {delay}s "
                    f"(attempt {retry_count + 1}/{MAX_RETRIES})"
                )

            time.sleep(delay)
            return search_github_code(query, token, page, per_page, retry_count + 1)

        # Non-retryable or max retries reached
        if status_code == 403:
            logger.error(
                "Rate limit exceeded or authentication failed after retries. "
                "Check your token and wait before retrying."
            )
        elif status_code == 422:
            logger.error(f"Invalid query after retries: {query}")
        else:
            logger.error(f"HTTP error {status_code} after retries")
        # Re-raise so the caller never receives an implicit None
        raise

    except httpx.RequestError as e:
        # Network errors are retryable
        if retry_count < MAX_RETRIES:
            delay = min(INITIAL_RETRY_DELAY * (2**retry_count), MAX_RETRY_DELAY)
            logger.warning(
                f"Request failed: {e}. Retrying in {delay}s "
                f"(attempt {retry_count + 1}/{MAX_RETRIES})"
            )
            time.sleep(delay)
            return search_github_code(query, token, page, per_page, retry_count + 1)

        logger.error(f"Request failed after retries: {e}")
        raise


async def wait_for_rate_limit(rate_limit: RateLimitInfo) -> None:
    """
    Wait if we're approaching the rate limit or need to wait until reset.

    Args:
        rate_limit: Rate limit information from the previous request
    """
    if rate_limit.remaining is None or rate_limit.reset_time is None:
        await asyncio.sleep(RATE_LIMIT_DELAY)
        return

    # If running low on requests, wait until reset
    if rate_limit.remaining <= 2:
        wait_time = rate_limit.reset_time - int(time.time()) + 2  # Add 2 second buffer
        if wait_time > 0:
            logger.info(
                f"Rate limit low ({rate_limit.remaining} remaining). "
                f"Waiting {wait_time}s until reset at {rate_limit.reset_time}"
            )
            await asyncio.sleep(wait_time)
        else:
            await asyncio.sleep(RATE_LIMIT_DELAY)
    else:
        await asyncio.sleep(RATE_LIMIT_DELAY)
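
# Rough behavior (assumed numbers): with remaining=1 and a reset 30 seconds away, this sleeps
# about 32 seconds; with no rate-limit headers it falls back to the fixed RATE_LIMIT_DELAY.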


def build_size_query(
    base_query: str, start_bytes: int, end_bytes: Optional[int]
) -> str:
    """Build a GitHub Code Search query with size filter."""
    if end_bytes is None:
        return f"{base_query} size:>={start_bytes}"
    return f"{base_query} size:{start_bytes}..{end_bytes}"


async def check_pypi_package_exists(
    package: str,
    cache: dict[str, bool],
    client: httpx.AsyncClient,
) -> tuple[str, bool]:
    """
    Check if a single package exists on PyPI.

    Args:
        package: Package name to check
        cache: Dictionary to cache results (modified in-place)
        client: httpx async client instance

    Returns:
        Tuple of (package_name, exists)
    """
    # Check cache first
    if package in cache:
        return (package, cache[package])

    url = PYPI_JSON_API_TEMPLATE.format(package=package)

    try:
        response = await client.get(url, timeout=10.0, follow_redirects=True)
        exists = response.status_code == 200
        cache[package] = exists

        if exists:
            logger.debug(f"✓ {package} exists on PyPI")
        else:
            logger.debug(f"✗ {package} not found on PyPI")

        return (package, exists)
    except httpx.RequestError as e:
        logger.debug(f"Error checking {package} on PyPI: {e}")
        cache[package] = False
        return (package, False)


async def check_packages_batch(
    packages: list[str],
    cache: dict[str, bool],
    semaphore: asyncio.Semaphore,
) -> dict[str, bool]:
    """
    Check a batch of packages against PyPI concurrently.

    Args:
        packages: List of package names to check
        cache: Dictionary to cache results (modified in-place)
        semaphore: Semaphore to limit concurrent requests

    Returns:
        Dictionary mapping package names to their existence status
    """

    async def check_one(package: str) -> tuple[str, bool]:
        async with semaphore:
            async with httpx.AsyncClient() as client:
                return await check_pypi_package_exists(package, cache, client)

    tasks = [check_one(pkg) for pkg in packages]
    results = await asyncio.gather(*tasks, return_exceptions=False)
    return dict(results)
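
# Illustrative usage (the second package name is hypothetical): awaiting
#   check_packages_batch(["ruff", "not-a-real-package-xyz"], cache, asyncio.Semaphore(PYPI_CONCURRENT_CHECKS))
# would return roughly {"ruff": True, "not-a-real-package-xyz": False}, with `cache` updated in place.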


def extract_packages_from_items(items: list[dict[str, Any]]) -> Counter[str]:
    """Extract candidate package names from a page of search result items.

    Scans the `text_matches` fragments (available thanks to the text-match media type
    requested above) as well as each item's file path for `uvx <package>` occurrences.
    """
    page_packages: Counter[str] = Counter()

    for item in items:
        # Extract from text_matches (code snippets)
        text_matches = item.get("text_matches", [])
        for match in text_matches:
            fragment = match.get("fragment", "")
            package = extract_package_name(fragment)
            if package:
                page_packages[package] += 1
                logger.debug(f"Found package: {package}")

        # Also check file path/name
        path = item.get("path", "")
        if "uvx" in path.lower():
            package = extract_package_name(path)
            if package:
                page_packages[package] += 1

    return page_packages


async def search_uvx_usage(
    token: str, max_pages: int = 10
) -> tuple[Counter[str], dict[str, bool]]:
    """
    Search for uvx usage across GitHub and extract package names.

    Processes packages incrementally and checks PyPI concurrently.

    Args:
        token: GitHub Personal Access Token
        max_pages: Maximum number of pages to fetch per query (default: 10)

    Returns:
        Tuple of (Counter of valid package names with counts, updated PyPI cache)
    """
    pypi_cache: dict[str, bool] = {}
    valid_package_counts: Counter[str] = Counter()
    all_package_counts: Counter[str] = Counter()
    unknown_packages_queue: list[str] = []

    semaphore = asyncio.Semaphore(PYPI_CONCURRENT_CHECKS)
    current_rate_limit = RateLimitInfo(None, None)

    # Size buckets to work around GitHub's 1000 result limit
    # It would be way smarter to do this dynamically (query a given size range and do a
    # binary/proportional split on the number of results) but I already got this far
    # so I'm not going to change it for now.
    markdown_size_buckets = [
        (0, 1025),
        (1025, 1250),
        (1250, 1500),
        (1500, 1750),
        (1750, 2000),
        (2000, 2500),
        (2500, 3500),
        (3500, 4500),
        (4500, 5500),
        (5500, 6250),
        (6250, 7000),
        (7000, 7750),
        (7750, 8500),
        (8500, 9250),
        (9250, 10000),
        (10000, 10750),
        (10750, 11750),
        (11750, 13000),
        (13000, 14000),
        (14000, 15250),
        (15250, 16250),
        (16250, 17500),
        (17500, 18750),
        (18750, 20000),
        (20000, 22000),
        (22000, 24000),
        (24000, 26000),
        (26000, 28000),
        (28000, 30000),
        (30000, 33000),
        (33000, 36000),
        (36000, 39000),
        (39000, 42000),
        (42000, 45000),
        (45000, 50000),
        (50000, 60000),
        (60000, 70000),
        (70000, 80000),
        (80000, 100000),
        (100000, 120000),
        (120000, 140000),
        (140000, 160000),
        (160000, 180000),
        (180000, 200000),
        (200000, 250000),
        (250000, 300000),
        (300000, None),
    ]

    shell_size_buckets = [
        (0, 2800),
        (2800, 6000),
        (6000, 15000),
        (15000, 32000),
        (32000, None),
    ]

    queries = [
        build_size_query("uvx AND language:Markdown in:file", start, end)
        for start, end in markdown_size_buckets
    ]
    queries.extend(
        build_size_query("uvx AND language:Shell in:file", start, end)
        for start, end in shell_size_buckets
    )

    async def process_unknown_packages() -> None:
        """Process queued unknown packages against PyPI."""
        if not unknown_packages_queue:
            return

        packages_to_check = list(set(unknown_packages_queue))
        unknown_packages_queue.clear()

        logger.info(
            f"Checking {len(packages_to_check)} unknown packages against PyPI..."
        )
        results = await check_packages_batch(packages_to_check, pypi_cache, semaphore)

        # Update valid package counts based on results
        for package, exists in results.items():
            if exists:
                count = all_package_counts.get(package, 0)
                if count > 0:
                    valid_package_counts[package] = count
                    logger.debug(
                        f"Added {package} to valid packages ({count} occurrences)"
                    )
                else:
                    logger.warning(f"Package {package} validated but has no count")

    for query_idx, query in enumerate(queries):
        page = 1
        effective_max_pages = min(max_pages, GITHUB_CODE_SEARCH_MAX_PAGE)

        # Wait before starting a new query (except the first one)
        if query_idx > 0:
            logger.debug("Waiting before starting new query...")
            await wait_for_rate_limit(current_rate_limit)
            await process_unknown_packages()

        while page <= effective_max_pages:
            try:
                # Rate limiting: wait between page requests (except for the first page)
                if page > 1:
                    logger.debug("Waiting before next page...")
                    await wait_for_rate_limit(current_rate_limit)
                    await process_unknown_packages()

                response = search_github_code(query, token, page=page)

                # Update rate limit state from response
                current_rate_limit = response.rate_limit

                items = response.items
                if not items:
                    logger.info(f"No more results for query: {query}")
                    break

                logger.info(f"Found {len(items)} results on page {page}")

                # Extract package names from this page
                page_packages = extract_packages_from_items(items)

                # Process packages from this page
                for package, count in page_packages.items():
                    all_package_counts[package] += count

                    # Check cache first
                    if package in pypi_cache:
                        if pypi_cache[package]:
                            valid_package_counts[package] = all_package_counts[package]
                            logger.debug(
                                f"Known valid: {package} (total: {all_package_counts[package]})"
                            )
                    else:
                        unknown_packages_queue.append(package)

                # Process unknown packages while we have time before next GitHub request
                if unknown_packages_queue:
                    await process_unknown_packages()

                # Check if there are more pages
                effective_total = min(
                    response.total_count, GITHUB_CODE_SEARCH_MAX_RESULTS
                )

                if len(items) < 100 or page * 100 >= effective_total:
                    logger.info(
                        f"Reached end of results for query: {query} "
                        f"(page {page}, total: {response.total_count})"
                    )
                    break

                page += 1

            except ValueError as e:
                # This is raised when we hit the 1000 result limit
                logger.info(f"Hit GitHub Code Search API limit: {e}")
                break
            except Exception as e:
                logger.error(f"Error processing page {page} of query '{query}': {e}")
                break

        # Process any remaining unknown packages after each query
        await process_unknown_packages()

    # Final processing of any remaining unknown packages
    await process_unknown_packages()

    logger.info(
        f"Found {len(valid_package_counts)} valid PyPI packages "
        f"out of {len(all_package_counts)} total"
    )

    return valid_package_counts, pypi_cache


def write_top_packages(
    package_counts: Counter[str],
    output_path: Path,
    debug_output_path: Path,
    min_count: int = 2,
) -> None:
    """
    Write top packages to files, sorted by frequency.

    Packages are written in buckets by threshold (100+, 25+, 10+, 5+, min_count+).

    Args:
        package_counts: Counter of package names and counts
        output_path: Path to output file (main packages list)
        debug_output_path: Path to debug output file (with counts)
        min_count: Minimum occurrence count to include (default: 2)
    """
    thresholds = [min_count, 5, 10, 25, 100]

    # Filter packages into buckets by threshold
    buckets = []
    for i, threshold in enumerate(thresholds):
        next_threshold = thresholds[i + 1] if i + 1 < len(thresholds) else float("inf")
        bucket_packages = {
            pkg: count
            for pkg, count in package_counts.items()
            if threshold <= count < next_threshold
        }
        buckets.append({"threshold": threshold, "packages": bucket_packages})

    with open(output_path, "w") as f, open(debug_output_path, "w") as f_debug:
        for bucket in reversed(buckets):
            threshold = bucket["threshold"]
            packages = bucket["packages"]
            logger.info(
                f"Greater than or equal to {threshold} mentions: {len(packages)} packages"
            )

            # Sort by count descending, then alphabetically
            sorted_packages = sorted(packages.items(), key=lambda x: (-x[1], x[0]))

            for package, count in sorted_packages:
                f.write(f"{package}\n")
                f_debug.write(f"{package}: {count}\n")

    logger.info(f"Successfully wrote top packages to {output_path}")


def main() -> None:
    """Parse CLI arguments, run the GitHub search, and write the package lists."""
    parser = argparse.ArgumentParser(
        description="Fetch popular packages from GitHub by searching for uvx usage"
    )
    parser.add_argument(
        "--token",
        type=str,
        help="GitHub Personal Access Token (or set GITHUB_TOKEN env var)",
        default=os.getenv("GITHUB_TOKEN"),
    )
    parser.add_argument(
        "--output",
        type=Path,
        help="Output file path (default: top_packages.txt)",
        default=None,
    )
    parser.add_argument(
        "--debug-output",
        type=Path,
        help="Debug output file path (default: top_packages_debug.txt)",
        default=None,
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=10,
        help="Maximum pages to fetch per query (default: 10)",
    )
    parser.add_argument(
        "--min-count",
        type=int,
        default=2,
        help="Minimum occurrence count to include (default: 2)",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging",
    )

    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    if not args.token:
        logger.error(
            "GitHub token is required. Set GITHUB_TOKEN environment variable "
            "or pass --token. Create a token at: https://github.com/settings/tokens"
        )
        sys.exit(1)

    # Set default output paths
    if args.output is None or args.debug_output is None:
        script_dir = Path(__file__).parent
        project_root = script_dir.parent.parent
        if args.output is None:
            args.output = (
                project_root
                / "crates"
                / "uv"
                / "src"
                / "commands"
                / "tool"
                / "top_packages.txt"
            )
        if args.debug_output is None:
            args.debug_output = (
                project_root
                / "crates"
                / "uv"
                / "src"
                / "commands"
                / "tool"
                / "top_packages_debug.txt"
            )

    logger.info("Starting GitHub search for uvx usage...")
    logger.info(f"Output will be written to: {args.output}")
    logger.info(f"Debug output will be written to: {args.debug_output}")

    valid_packages, pypi_cache = asyncio.run(
        search_uvx_usage(args.token, max_pages=args.max_pages)
    )

    if not valid_packages:
        logger.warning("No valid PyPI packages found.")
        sys.exit(1)

    logger.info(f"Found {len(valid_packages)} valid PyPI packages")
    logger.info(f"Top 10 valid packages: {valid_packages.most_common(10)}")
    logger.info(f"PyPI cache contains {len(pypi_cache)} entries")

    write_top_packages(
        valid_packages, args.output, args.debug_output, min_count=args.min_count
    )


if __name__ == "__main__":
    main()