mirror of https://github.com/astral-sh/uv
top-packages generation script
Signed-off-by: Mikayla Thompson <mrt@mikayla.codes>
parent 2642acc80b
commit 559d494884

crates/uv/src/commands/tool/top_packages.txt
@@ -0,0 +1,167 @@
ruff
agent-starter-pack
pre-commit
marimo
args
cookiecutter
pyright
ngrok
pytest
hatch
llm
markitdown
mcp-server-fetch
ty
copier
command
mcp-server-git
mcpo
uvx
black
streamdown
autoflake
run
nox
files-to-prompt
mypy
tox
yt-dlp
split_markdown4gpt
mcp-server-time
basic-memory
synth-ai
gac
twine
are
codetoprompt
open-webui
browser-use
pip
wpull
build
with
blender-remote
https
migrate-to-uv
bandit
mcp-atlassian
mkdocs
playwright
everything
maturin
cruft
mcp-feedback-enhanced
smithery
aignostics
databricks
jupyterlab
tool
echo
elevenlabs-mcp
ipython
package
pycowsay
support
completions
jupyter
mini-swe-agent
openhands-ai
pyenvsearch
rogue-ai
uv
uv-dynamic-versioning
ansible-core
class
gcovr
handles
mcp
mcp-proxy
mcp-scan
mcp-server-sqlite
streamlit
blender-mcp
create-mcp-server
eyelet
mcp-server-qdrant
mcp-wiki
on
path
pytest-watch
setup
textual-demo
voice-mode
automagik-tools
bump-my-version
kedro
schemathesis
huggingface_hub
integration
jupyter-core
jupytext
llm-discovery
mcp-sse-shim
openhands
reachy-mini
runs
usage
automagik-hive
business-use-core
cookieplone
cowsay
create-dagster
distribution
eval
healthyselfjournal
httpie
markitdown-mcp
speaches-cli
strip-tags
autocompletion
chroma
cmake
compatibility
detect-secrets
dist
evalgate
iowarp-mcps
isort
juv
lean-lsp-mcp
llamactl
nearc
ngiab_data_preprocess
nvitop
prek
requests
runner
samstacks
semgrep-mcp
specifyplus
sqlite-utils
the
uv-publish
wandb
agentprobe
audible-cli
cakemail-api-docs-mcp
datasette
fonttools
git-cliff
goldenverba
honcho
internetarchive
mcp-google-sheets
modal
modern
pdm
poe
pyinstaller
pyrefly
radon
should
sniffly
specify
superclaude
tools
workspace-mcp
yt-mpv

scripts/uvx_usage_on_gh/fetch_uvx_usage.py
@@ -0,0 +1,753 @@
# /// script
# requires-python = ">=3.13"
# dependencies = [
#     "httpx"
# ]
# ///

"""
Use the GitHub Code Search API to find instances of `uvx <package>` in:
- README files (*.md)
- Shell scripts (*.sh, *.bash, *.zsh)

Requirements:
- A GitHub Personal Access Token (PAT) with `public_repo` scope
- Set the GITHUB_TOKEN environment variable or pass --token

Usage:
    python scripts/uvx_usage_on_gh/fetch_uvx_usage.py --output crates/uv/src/commands/tool/top_packages.txt
"""
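
# For example: a README line such as `uvx ruff check .` is surfaced by the Markdown
# search below, extract_package_name() pulls "ruff" out of the matched fragment, the
# name is confirmed to exist on PyPI, and only then does it count toward the
# generated top_packages.txt.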

import argparse
import asyncio
import logging
import os
import re
import sys
import time
from collections import Counter
from pathlib import Path
from typing import Any, NamedTuple, Optional

import httpx

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# GitHub API configuration
GITHUB_API_BASE = "https://api.github.com"
CODE_SEARCH_ENDPOINT = f"{GITHUB_API_BASE}/search/code"

# Rate limiting configuration
RATE_LIMIT_DELAY = 6.1  # seconds between requests (slightly more than 60/10)

# GitHub Code Search API limits
GITHUB_CODE_SEARCH_MAX_RESULTS = 1000  # Hard limit: only first 1000 results accessible
GITHUB_CODE_SEARCH_MAX_PAGE = 10  # Page 10 = results 901-1000, page 11+ returns 422

# Retry configuration
MAX_RETRIES = 5
INITIAL_RETRY_DELAY = 10  # seconds
MAX_RETRY_DELAY = 300  # 5 minutes max delay

# PyPI check concurrency
PYPI_CONCURRENT_CHECKS = 20  # Number of concurrent PyPI checks

# PyPI API endpoint
PYPI_JSON_API_TEMPLATE = "https://pypi.org/pypi/{package}/json"


class RateLimitInfo(NamedTuple):
    remaining: int | None
    reset_time: int | None


class GitHubSearchResponse(NamedTuple):
    items: list[dict[str, Any]]
    total_count: int
    rate_limit: RateLimitInfo


# Regex patterns for extracting package names
PACKAGE_PATTERN_FROM = re.compile(
    r"\buvx\s+(?:--\w+(?:\s+\S+)?\s+)*--from\s+([a-z0-9](?:[a-z0-9._-]*[a-z0-9])?)(?:@\S+)?",
    re.IGNORECASE,
)
PACKAGE_PATTERN_NORMAL = re.compile(
    r"\buvx\s+(?:--\w+(?:\s+\S+)?\s+)+([a-z0-9](?:[a-z0-9._-]*[a-z0-9])?)(?:@\S+)?",
    re.IGNORECASE,
)
PACKAGE_PATTERN_SIMPLE = re.compile(
    r"\buvx\s+([a-z0-9](?:[a-z0-9._-]*[a-z0-9])?)(?:@\S+)?",
    re.IGNORECASE,
)
URL_PATTERN = re.compile(
    r"\buvx\s+(?:--\w+(?:\s+\S+)?\s+)*--from\s+(git\+[a-z]+://|git://|https?://)",
    re.IGNORECASE,
)
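
# Illustrative inputs and the name each pattern should capture (group 1):
#   "uvx ruff check ."                -> PACKAGE_PATTERN_SIMPLE captures "ruff"
#   "uvx black@latest"                -> PACKAGE_PATTERN_SIMPLE captures "black"
#   "uvx --python 3.12 textual-demo"  -> PACKAGE_PATTERN_NORMAL captures "textual-demo"
#   "uvx --from httpie http"          -> PACKAGE_PATTERN_FROM captures "httpie"
#   "uvx --from git+https://..."      -> URL_PATTERN matches, so the match is discarded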


def extract_package_name(match_text: str) -> Optional[str]:
    """
    Extract package name from a match.

    Handles patterns like:
    - uvx ruff
    - uvx --from httpie http (extracts "httpie")
    - uvx --python 3.12 textual-demo
    - uvx black@latest
    - uvx pytest --version
    - uvx streamlit run streamlit_app/dashboard.py

    Skips patterns like:
    - uvx --from git+https://... (URLs are not package names)
    - uvx --from http://... (URLs are not package names)
    """
    # Skip URLs after --from
    if URL_PATTERN.search(match_text):
        return None

    # Try patterns in order: --from, flags, simple
    match = (
        PACKAGE_PATTERN_FROM.search(match_text)
        or PACKAGE_PATTERN_NORMAL.search(match_text)
        or PACKAGE_PATTERN_SIMPLE.search(match_text)
    )

    if not match:
        return None

    package = match.group(1).lower()

    # Remove version specifiers (e.g., @latest, @1.0.0)
    if "@" in package:
        package = package.split("@")[0]

    # Validation checks
    if package.startswith("--") or "/" in package or "\\" in package or len(package) < 2:
        return None

    return package


def _calculate_retry_delay(
    status_code: int,
    retry_count: int,
    response_headers: httpx.Headers,
) -> int:
    """Calculate delay for retry based on status code and headers."""
    if status_code in (403, 429):
        # Try Retry-After header first
        retry_after = response_headers.get("Retry-After")
        if retry_after:
            try:
                return int(retry_after) + 2  # Add 2 second buffer
            except ValueError:
                pass

        # Fall back to X-RateLimit-Reset
        reset_time_str = response_headers.get("X-RateLimit-Reset")
        if reset_time_str:
            try:
                reset_time = int(reset_time_str)
                current_time = int(time.time())
                return max(reset_time - current_time + 2, 10)  # At least 10 seconds
            except ValueError:
                pass

    # Default: exponential backoff
    return min(INITIAL_RETRY_DELAY * (2**retry_count), MAX_RETRY_DELAY)
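
# With INITIAL_RETRY_DELAY = 10 and MAX_RETRY_DELAY = 300, the exponential-backoff
# fallback works out to 10, 20, 40, 80, 160 seconds for retry_count 0 through 4
# (MAX_RETRIES is 5), and would cap at 300 seconds if further retries were allowed.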


def search_github_code(
    query: str,
    token: str,
    page: int,
    per_page: int = 100,
    retry_count: int = 0,
) -> GitHubSearchResponse:
    """Execute one Code Search request, retrying on rate limits and transient errors."""
    headers = {
        "Accept": "application/vnd.github.text-match+json",
        "Authorization": f"Bearer {token}",
    }

    params = {
        "q": query,
        "page": page,
        "per_page": min(per_page, 100),
    }

    logger.info(f"Searching GitHub: {query} (page {page}, attempt {retry_count + 1})")

    try:
        response = httpx.get(
            CODE_SEARCH_ENDPOINT,
            headers=headers,
            params=params,
            timeout=30.0,
        )
        response.raise_for_status()

        # Extract rate limit info
        remaining_str = response.headers.get("X-RateLimit-Remaining")
        reset_time_str = response.headers.get("X-RateLimit-Reset")
        rate_limit = RateLimitInfo(
            remaining=int(remaining_str) if remaining_str else None,
            reset_time=int(reset_time_str) if reset_time_str else None,
        )

        logger.debug(
            f"Rate limit remaining: {rate_limit.remaining}, reset at: {rate_limit.reset_time}"
        )

        data = response.json()
        total_count = data.get("total_count", 0)
        logger.info(f"Count of total results: {total_count}")

        return GitHubSearchResponse(
            items=data.get("items", []),
            total_count=total_count,
            rate_limit=rate_limit,
        )

    except httpx.HTTPStatusError as e:
        status_code = e.response.status_code

        # 422 on page 11+ is likely the hard 1000 result limit
        if status_code == 422 and page > GITHUB_CODE_SEARCH_MAX_PAGE:
            logger.info(
                f"422 error on page {page} - likely hit GitHub's 1000 result limit. "
                f"Code Search API only returns first {GITHUB_CODE_SEARCH_MAX_RESULTS} results."
            )
            raise ValueError(
                f"Reached GitHub Code Search API limit (page {page} > {GITHUB_CODE_SEARCH_MAX_PAGE})"
            ) from e

        # Retryable errors
        if status_code in (403, 422, 429) and retry_count < MAX_RETRIES:
            delay = _calculate_retry_delay(status_code, retry_count, e.response.headers)

            if status_code == 403:
                logger.warning(
                    f"Rate limit exceeded (403). Retrying in {delay}s "
                    f"(attempt {retry_count + 1}/{MAX_RETRIES})"
                )
            elif status_code == 429:
                logger.warning(
                    f"Rate limit exceeded (429). Retrying in {delay}s "
                    f"(attempt {retry_count + 1}/{MAX_RETRIES})"
                )
            elif status_code == 422:
                logger.warning(
                    f"Validation error (422) - may be transient. Retrying in {delay}s "
                    f"(attempt {retry_count + 1}/{MAX_RETRIES})"
                )

            time.sleep(delay)
            return search_github_code(query, token, page, per_page, retry_count + 1)

        # Non-retryable or max retries reached
        if status_code == 403:
            logger.error(
                "Rate limit exceeded or authentication failed after retries. "
                "Check your token and wait before retrying."
            )
        elif status_code == 422:
            logger.error(f"Invalid query after retries: {query}")
        else:
            logger.error(f"HTTP error {status_code} after retries")
        # Re-raise so the caller sees the failure instead of an implicit None return
        raise

    except httpx.RequestError as e:
        # Network errors are retryable
        if retry_count < MAX_RETRIES:
            delay = min(INITIAL_RETRY_DELAY * (2**retry_count), MAX_RETRY_DELAY)
            logger.warning(
                f"Request failed: {e}. Retrying in {delay}s "
                f"(attempt {retry_count + 1}/{MAX_RETRIES})"
            )
            time.sleep(delay)
            return search_github_code(query, token, page, per_page, retry_count + 1)

        logger.error(f"Request failed after retries: {e}")
        raise


async def wait_for_rate_limit(rate_limit: RateLimitInfo) -> None:
    """
    Wait if we're approaching rate limit or need to wait until reset.

    Args:
        rate_limit: Rate limit information from previous request
    """
    if rate_limit.remaining is None or rate_limit.reset_time is None:
        await asyncio.sleep(RATE_LIMIT_DELAY)
        return

    # If running low on requests, wait until reset
    if rate_limit.remaining <= 2:
        wait_time = rate_limit.reset_time - int(time.time()) + 2  # Add 2 second buffer
        if wait_time > 0:
            logger.info(
                f"Rate limit low ({rate_limit.remaining} remaining). "
                f"Waiting {wait_time}s until reset at {rate_limit.reset_time}"
            )
            await asyncio.sleep(wait_time)
        else:
            await asyncio.sleep(RATE_LIMIT_DELAY)
    else:
        await asyncio.sleep(RATE_LIMIT_DELAY)
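
# For example: with 50 requests remaining this just sleeps RATE_LIMIT_DELAY (6.1 s);
# with 2 or fewer remaining and a reset 30 seconds away, it sleeps roughly 32 seconds
# (time to reset plus the 2-second buffer) before the next Code Search request.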


def build_size_query(base_query: str, start_bytes: int, end_bytes: Optional[int]) -> str:
    """Build a GitHub Code Search query with size filter."""
    if end_bytes is None:
        return f"{base_query} size:>={start_bytes}"
    return f"{base_query} size:{start_bytes}..{end_bytes}"
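
# For example:
#   build_size_query("uvx AND language:Markdown in:file", 0, 1025)
#     -> "uvx AND language:Markdown in:file size:0..1025"
#   build_size_query("uvx AND language:Shell in:file", 32000, None)
#     -> "uvx AND language:Shell in:file size:>=32000"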


async def check_pypi_package_exists(
    package: str,
    cache: dict[str, bool],
    client: httpx.AsyncClient,
) -> tuple[str, bool]:
    """
    Check if a single package exists on PyPI.

    Args:
        package: Package name to check
        cache: Dictionary to cache results (modified in-place)
        client: httpx async client instance

    Returns:
        Tuple of (package_name, exists)
    """
    # Check cache first
    if package in cache:
        return (package, cache[package])

    url = PYPI_JSON_API_TEMPLATE.format(package=package)

    try:
        response = await client.get(url, timeout=10.0, follow_redirects=True)
        exists = response.status_code == 200
        cache[package] = exists

        if exists:
            logger.debug(f"✓ {package} exists on PyPI")
        else:
            logger.debug(f"✗ {package} not found on PyPI")

        return (package, exists)
    except httpx.RequestError as e:
        logger.debug(f"Error checking {package} on PyPI: {e}")
        cache[package] = False
        return (package, False)


async def check_packages_batch(
    packages: list[str],
    cache: dict[str, bool],
    semaphore: asyncio.Semaphore,
) -> dict[str, bool]:
    """
    Check a batch of packages against PyPI concurrently.

    Args:
        packages: List of package names to check
        cache: Dictionary to cache results (modified in-place)
        semaphore: Semaphore to limit concurrent requests

    Returns:
        Dictionary mapping package names to their existence status
    """
    async def check_one(package: str) -> tuple[str, bool]:
        async with semaphore:
            async with httpx.AsyncClient() as client:
                return await check_pypi_package_exists(package, cache, client)

    tasks = [check_one(pkg) for pkg in packages]
    results = await asyncio.gather(*tasks, return_exceptions=False)
    return dict(results)
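
# With PYPI_CONCURRENT_CHECKS = 20, a batch of, say, 100 newly seen names makes at
# most 20 simultaneous requests to https://pypi.org/pypi/<name>/json; results go into
# the shared cache, so each name is only checked once per run.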


def extract_packages_from_items(items: list[dict[str, Any]]) -> Counter:
    page_packages = Counter()

    for item in items:
        # Extract from text_matches (code snippets)
        text_matches = item.get("text_matches", [])
        for match in text_matches:
            fragment = match.get("fragment", "")
            package = extract_package_name(fragment)
            if package:
                page_packages[package] += 1
                logger.debug(f"Found package: {package}")

        # Also check file path/name
        path = item.get("path", "")
        if "uvx" in path.lower():
            package = extract_package_name(path)
            if package:
                page_packages[package] += 1

    return page_packages


async def search_uvx_usage(
    token: str, max_pages: int = 10
) -> tuple[Counter[str], dict[str, bool]]:
    """
    Search for uvx usage across GitHub and extract package names.

    Processes packages incrementally and checks PyPI concurrently.

    Args:
        token: GitHub Personal Access Token
        max_pages: Maximum number of pages to fetch per query (default: 10)

    Returns:
        Tuple of (Counter of valid package names with counts, updated PyPI cache)
    """
    pypi_cache: dict[str, bool] = {}
    valid_package_counts: Counter[str] = Counter()
    all_package_counts: Counter[str] = Counter()
    unknown_packages_queue: list[str] = []

    semaphore = asyncio.Semaphore(PYPI_CONCURRENT_CHECKS)
    current_rate_limit = RateLimitInfo(None, None)

    # Size buckets to work around GitHub's 1000 result limit
    # It would be way smarter to do this dynamically (query a given size range and do a
    # binary/proportional split on the number of results) but I already got this far
    # so I'm not going to change it for now.
    markdown_size_buckets = [
        (0, 1025),
        (1025, 1250),
        (1250, 1500),
        (1500, 1750),
        (1750, 2000),
        (2000, 2500),
        (2500, 3500),
        (3500, 4500),
        (4500, 5500),
        (5500, 6250),
        (6250, 7000),
        (7000, 7750),
        (7750, 8500),
        (8500, 9250),
        (9250, 10000),
        (10000, 10750),
        (10750, 11750),
        (11750, 13000),
        (13000, 14000),
        (14000, 15250),
        (15250, 16250),
        (16250, 17500),
        (17500, 18750),
        (18750, 20000),
        (20000, 22000),
        (22000, 24000),
        (24000, 26000),
        (26000, 28000),
        (28000, 30000),
        (30000, 33000),
        (33000, 36000),
        (36000, 39000),
        (39000, 42000),
        (42000, 45000),
        (45000, 50000),
        (50000, 60000),
        (60000, 70000),
        (70000, 80000),
        (80000, 100000),
        (100000, 120000),
        (120000, 140000),
        (140000, 160000),
        (160000, 180000),
        (180000, 200000),
        (200000, 250000),
        (250000, 300000),
        (300000, None),
    ]

    shell_size_buckets = [
        (0, 2800),
        (2800, 6000),
        (6000, 15000),
        (15000, 32000),
        (32000, None),
    ]

    queries = [
        build_size_query("uvx AND language:Markdown in:file", start, end)
        for start, end in markdown_size_buckets
    ]
    queries.extend(
        build_size_query("uvx AND language:Shell in:file", start, end)
        for start, end in shell_size_buckets
    )
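
    # The 47 Markdown buckets and 5 Shell buckets above expand to 52 size-scoped
    # queries; each query can surface at most GITHUB_CODE_SEARCH_MAX_RESULTS (1000)
    # files, so one run inspects on the order of 50,000 files at most.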

    async def process_unknown_packages() -> None:
        """Process queued unknown packages against PyPI."""
        if not unknown_packages_queue:
            return

        packages_to_check = list(set(unknown_packages_queue))
        unknown_packages_queue.clear()

        logger.info(f"Checking {len(packages_to_check)} unknown packages against PyPI...")
        results = await check_packages_batch(packages_to_check, pypi_cache, semaphore)

        # Update valid package counts based on results
        for package, exists in results.items():
            if exists:
                count = all_package_counts.get(package, 0)
                if count > 0:
                    valid_package_counts[package] = count
                    logger.debug(f"Added {package} to valid packages ({count} occurrences)")
                else:
                    logger.warning(f"Package {package} validated but has no count")

    for query_idx, query in enumerate(queries):
        page = 1
        effective_max_pages = min(max_pages, GITHUB_CODE_SEARCH_MAX_PAGE)

        # Wait before starting a new query (except the first one)
        if query_idx > 0:
            logger.debug("Waiting before starting new query...")
            await wait_for_rate_limit(current_rate_limit)
            await process_unknown_packages()

        while page <= effective_max_pages:
            try:
                # Rate limiting: wait between page requests (except for the first page)
                if page > 1:
                    logger.debug("Waiting before next page...")
                    await wait_for_rate_limit(current_rate_limit)
                    await process_unknown_packages()

                response = search_github_code(query, token, page=page)

                # Update rate limit state from response
                current_rate_limit = response.rate_limit

                items = response.items
                if not items:
                    logger.info(f"No more results for query: {query}")
                    break

                logger.info(f"Found {len(items)} results on page {page}")

                # Extract package names from this page
                page_packages = extract_packages_from_items(items)

                # Process packages from this page
                for package, count in page_packages.items():
                    all_package_counts[package] += count

                    # Check cache first
                    if package in pypi_cache:
                        if pypi_cache[package]:
                            valid_package_counts[package] = all_package_counts[package]
                            logger.debug(
                                f"Known valid: {package} (total: {all_package_counts[package]})"
                            )
                    else:
                        unknown_packages_queue.append(package)

                # Process unknown packages while we have time before next GitHub request
                if unknown_packages_queue:
                    await process_unknown_packages()

                # Check if there are more pages
                effective_total = min(
                    response.total_count, GITHUB_CODE_SEARCH_MAX_RESULTS
                )

                if len(items) < 100 or page * 100 >= effective_total:
                    logger.info(
                        f"Reached end of results for query: {query} "
                        f"(page {page}, total: {response.total_count})"
                    )
                    break

                page += 1

            except ValueError as e:
                # This is raised when we hit the 1000 result limit
                logger.info(f"Hit GitHub Code Search API limit: {e}")
                break
            except Exception as e:
                logger.error(f"Error processing page {page} of query '{query}': {e}")
                break

        # Process any remaining unknown packages after each query
        await process_unknown_packages()

    # Final processing of any remaining unknown packages
    await process_unknown_packages()

    logger.info(
        f"Found {len(valid_package_counts)} valid PyPI packages "
        f"out of {len(all_package_counts)} total"
    )

    return valid_package_counts, pypi_cache


def write_top_packages(
    package_counts: Counter[str],
    output_path: Path,
    debug_output_path: Path,
    min_count: int = 2,
) -> None:
    """
    Write top packages to files, sorted by frequency.

    Packages are written in buckets by threshold (100+, 25+, 10+, 5+, min_count+).

    Args:
        package_counts: Counter of package names and counts
        output_path: Path to output file (main packages list)
        debug_output_path: Path to debug output file (with counts)
        min_count: Minimum occurrence count to include (default: 2)
    """
    thresholds = [min_count, 5, 10, 25, 100]

    # Filter packages into buckets by threshold
    buckets = []
    for i, threshold in enumerate(thresholds):
        next_threshold = thresholds[i + 1] if i + 1 < len(thresholds) else float("inf")
        bucket_packages = {
            pkg: count
            for pkg, count in package_counts.items()
            if threshold <= count < next_threshold
        }
        buckets.append({"threshold": threshold, "packages": bucket_packages})

    with open(output_path, "w") as f, open(debug_output_path, "w") as f_debug:
        for bucket in reversed(buckets):
            threshold = bucket["threshold"]
            packages = bucket["packages"]
            logger.info(
                f"Greater than or equal to {threshold} mentions: {len(packages)} packages"
            )

            # Sort by count descending, then alphabetically
            sorted_packages = sorted(
                packages.items(), key=lambda x: (-x[1], x[0])
            )

            for package, count in sorted_packages:
                f.write(f"{package}\n")
                f_debug.write(f"{package}: {count}\n")

    logger.info(f"Successfully wrote top packages to {output_path}")
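
# For illustration: with hypothetical counts {"ruff": 150, "mypy": 30, "cowsay": 3} and
# min_count=2, buckets are written highest threshold first, so the output order is
# ruff (100+ bucket), then mypy (25+ bucket), then cowsay (2+ bucket), and the debug
# file records "ruff: 150", "mypy: 30", "cowsay: 3" alongside.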


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Fetch popular packages from GitHub by searching for uvx usage"
    )
    parser.add_argument(
        "--token",
        type=str,
        help="GitHub Personal Access Token (or set GITHUB_TOKEN env var)",
        default=os.getenv("GITHUB_TOKEN"),
    )
    parser.add_argument(
        "--output",
        type=Path,
        help="Output file path (default: top_packages.txt)",
        default=None,
    )
    parser.add_argument(
        "--debug-output",
        type=Path,
        help="Debug output file path (default: top_packages_debug.txt)",
        default=None,
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=10,
        help="Maximum pages to fetch per query (default: 10)",
    )
    parser.add_argument(
        "--min-count",
        type=int,
        default=2,
        help="Minimum occurrence count to include (default: 2)",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging",
    )

    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    if not args.token:
        logger.error(
            "GitHub token is required. Set GITHUB_TOKEN environment variable "
            "or pass --token. Create a token at: https://github.com/settings/tokens"
        )
        sys.exit(1)

    # Set default output paths
    if args.output is None or args.debug_output is None:
        script_dir = Path(__file__).parent
        project_root = script_dir.parent.parent
        if args.output is None:
            args.output = (
                project_root
                / "crates"
                / "uv"
                / "src"
                / "commands"
                / "tool"
                / "top_packages.txt"
            )
        if args.debug_output is None:
            args.debug_output = (
                project_root
                / "crates"
                / "uv"
                / "src"
                / "commands"
                / "tool"
                / "top_packages_debug.txt"
            )

    logger.info("Starting GitHub search for uvx usage...")
    logger.info(f"Output will be written to: {args.output}")
    logger.info(f"Debug output will be written to: {args.debug_output}")

    valid_packages, pypi_cache = asyncio.run(
        search_uvx_usage(args.token, max_pages=args.max_pages)
    )

    if not valid_packages:
        logger.warning("No valid PyPI packages found.")
        sys.exit(1)

    logger.info(f"Found {len(valid_packages)} valid PyPI packages")
    logger.info(f"Top 10 valid packages: {valid_packages.most_common(10)}")
    logger.info(f"PyPI cache contains {len(pypi_cache)} entries")

    write_top_packages(
        valid_packages, args.output, args.debug_output, min_count=args.min_count
    )


if __name__ == "__main__":
    main()