mirror of https://github.com/astral-sh/uv

top-packages generation script

Signed-off-by: Mikayla Thompson <mrt@mikayla.codes>

parent 2642acc80b
commit 559d494884

@@ -0,0 +1,167 @@
ruff
agent-starter-pack
pre-commit
marimo
args
cookiecutter
pyright
ngrok
pytest
hatch
llm
markitdown
mcp-server-fetch
ty
copier
command
mcp-server-git
mcpo
uvx
black
streamdown
autoflake
run
nox
files-to-prompt
mypy
tox
yt-dlp
split_markdown4gpt
mcp-server-time
basic-memory
synth-ai
gac
twine
are
codetoprompt
open-webui
browser-use
pip
wpull
build
with
blender-remote
https
migrate-to-uv
bandit
mcp-atlassian
mkdocs
playwright
everything
maturin
cruft
mcp-feedback-enhanced
smithery
aignostics
databricks
jupyterlab
tool
echo
elevenlabs-mcp
ipython
package
pycowsay
support
completions
jupyter
mini-swe-agent
openhands-ai
pyenvsearch
rogue-ai
uv
uv-dynamic-versioning
ansible-core
class
gcovr
handles
mcp
mcp-proxy
mcp-scan
mcp-server-sqlite
streamlit
blender-mcp
create-mcp-server
eyelet
mcp-server-qdrant
mcp-wiki
on
path
pytest-watch
setup
textual-demo
voice-mode
automagik-tools
bump-my-version
kedro
schemathesis
huggingface_hub
integration
jupyter-core
jupytext
llm-discovery
mcp-sse-shim
openhands
reachy-mini
runs
usage
automagik-hive
business-use-core
cookieplone
cowsay
create-dagster
distribution
eval
healthyselfjournal
httpie
markitdown-mcp
speaches-cli
strip-tags
autocompletion
chroma
cmake
compatibility
detect-secrets
dist
evalgate
iowarp-mcps
isort
juv
lean-lsp-mcp
llamactl
nearc
ngiab_data_preprocess
nvitop
prek
requests
runner
samstacks
semgrep-mcp
specifyplus
sqlite-utils
the
uv-publish
wandb
agentprobe
audible-cli
cakemail-api-docs-mcp
datasette
fonttools
git-cliff
goldenverba
honcho
internetarchive
mcp-google-sheets
modal
modern
pdm
poe
pyinstaller
pyrefly
radon
should
sniffly
specify
superclaude
tools
workspace-mcp
yt-mpv
@@ -0,0 +1,753 @@
# /// script
# requires-python = ">=3.13"
# dependencies = [
#     "httpx"
# ]
# ///

"""
Use the GitHub Code Search API to find instances of `uvx <package>` in:
- README files (*.md)
- Shell scripts (*.sh, *.bash, *.zsh)

Requirements:
- A GitHub Personal Access Token (PAT) with `public_repo` scope
- Set the GITHUB_TOKEN environment variable or pass --token

Usage:
    python scripts/uvx_usage_on_gh/fetch_uvx_usage.py --output crates/uv/src/commands/tool/top_packages.txt
"""

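# Because of the inline script metadata block above (PEP 723), this script can likely
# also be run as `uv run scripts/uvx_usage_on_gh/fetch_uvx_usage.py ...`, letting uv
# resolve the httpx dependency automatically; the docstring shows plain `python` usage.
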
import argparse
import asyncio
import logging
import os
import re
import sys
import time
from collections import Counter
from pathlib import Path
from typing import Any, NamedTuple, Optional

import httpx

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# GitHub API configuration
GITHUB_API_BASE = "https://api.github.com"
CODE_SEARCH_ENDPOINT = f"{GITHUB_API_BASE}/search/code"

# Rate limiting configuration
RATE_LIMIT_DELAY = 6.1  # seconds between requests (slightly more than 60/10)
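# The code search endpoint allows roughly 10 authenticated requests per minute, so
# 60s / 10 = 6s between requests; 6.1s adds a small safety buffer.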

# GitHub Code Search API limits
GITHUB_CODE_SEARCH_MAX_RESULTS = 1000  # Hard limit: only first 1000 results accessible
GITHUB_CODE_SEARCH_MAX_PAGE = 10  # Page 10 = results 901-1000, page 11+ returns 422

# Retry configuration
MAX_RETRIES = 5
INITIAL_RETRY_DELAY = 10  # seconds
MAX_RETRY_DELAY = 300  # 5 minutes max delay

# PyPI check concurrency
PYPI_CONCURRENT_CHECKS = 20  # Number of concurrent PyPI checks

# PyPI API endpoint
PYPI_JSON_API_TEMPLATE = "https://pypi.org/pypi/{package}/json"
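# For example, PYPI_JSON_API_TEMPLATE.format(package="ruff") gives
# "https://pypi.org/pypi/ruff/json"; a 200 response is treated as "exists on PyPI".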


class RateLimitInfo(NamedTuple):
    remaining: int | None
    reset_time: int | None


class GitHubSearchResponse(NamedTuple):
    items: list[dict[str, Any]]
    total_count: int
    rate_limit: RateLimitInfo


# Regex patterns for extracting package names
PACKAGE_PATTERN_FROM = re.compile(
    r"\buvx\s+(?:--\w+(?:\s+\S+)?\s+)*--from\s+([a-z0-9](?:[a-z0-9._-]*[a-z0-9])?)(?:@\S+)?",
    re.IGNORECASE,
)
PACKAGE_PATTERN_NORMAL = re.compile(
    r"\buvx\s+(?:--\w+(?:\s+\S+)?\s+)+([a-z0-9](?:[a-z0-9._-]*[a-z0-9])?)(?:@\S+)?",
    re.IGNORECASE,
)
PACKAGE_PATTERN_SIMPLE = re.compile(
    r"\buvx\s+([a-z0-9](?:[a-z0-9._-]*[a-z0-9])?)(?:@\S+)?",
    re.IGNORECASE,
)
URL_PATTERN = re.compile(
    r"\buvx\s+(?:--\w+(?:\s+\S+)?\s+)*--from\s+(git\+[a-z]+://|git://|https?://)",
    re.IGNORECASE,
)
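
# Rough expectations for these patterns (illustrative, derived from the regexes above):
#   "uvx ruff check ."                        -> PACKAGE_PATTERN_SIMPLE captures "ruff"
#   "uvx --from httpie http GET example.org"  -> PACKAGE_PATTERN_FROM captures "httpie"
#   "uvx --python 3.12 textual-demo"          -> PACKAGE_PATTERN_NORMAL captures "textual-demo"
#   "uvx --from git+https://github.com/x/y z" -> URL_PATTERN matches, so the hit is skipped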


def extract_package_name(match_text: str) -> Optional[str]:
    """
    Extract package name from a match.

    Handles patterns like:
    - uvx ruff
    - uvx --from httpie http (extracts "httpie")
    - uvx --python 3.12 textual-demo
    - uvx black@latest
    - uvx pytest --version
    - uvx streamlit run streamlit_app/dashboard.py

    Skips patterns like:
    - uvx --from git+https://... (URLs are not package names)
    - uvx --from http://... (URLs are not package names)
    """
    # Skip URLs after --from
    if URL_PATTERN.search(match_text):
        return None

    # Try patterns in order: --from, flags, simple
    match = (
        PACKAGE_PATTERN_FROM.search(match_text)
        or PACKAGE_PATTERN_NORMAL.search(match_text)
        or PACKAGE_PATTERN_SIMPLE.search(match_text)
    )

    if not match:
        return None

    package = match.group(1).lower()

    # Remove version specifiers (e.g., @latest, @1.0.0)
    if "@" in package:
        package = package.split("@")[0]

    # Validation checks
    if package.startswith("--") or "/" in package or "\\" in package or len(package) < 2:
        return None

    return package


def _calculate_retry_delay(
    status_code: int,
    retry_count: int,
    response_headers: httpx.Headers,
) -> int:
    """Calculate delay for retry based on status code and headers."""
    if status_code in (403, 429):
        # Try Retry-After header first
        retry_after = response_headers.get("Retry-After")
        if retry_after:
            try:
                return int(retry_after) + 2  # Add 2 second buffer
            except ValueError:
                pass

        # Fall back to X-RateLimit-Reset
        reset_time_str = response_headers.get("X-RateLimit-Reset")
        if reset_time_str:
            try:
                reset_time = int(reset_time_str)
                current_time = int(time.time())
                return max(reset_time - current_time + 2, 10)  # At least 10 seconds
            except ValueError:
                pass

    # Default: exponential backoff
    return min(INITIAL_RETRY_DELAY * (2**retry_count), MAX_RETRY_DELAY)
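
# With INITIAL_RETRY_DELAY = 10 and MAX_RETRY_DELAY = 300, the exponential fallback
# works out to 10, 20, 40, 80, 160 seconds across the five allowed retries (capped at 300).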


def search_github_code(
    query: str,
    token: str,
    page: int,
    per_page: int = 100,
    retry_count: int = 0,
) -> GitHubSearchResponse:
    headers = {
        "Accept": "application/vnd.github.text-match+json",
        "Authorization": f"Bearer {token}",
    }

    params = {
        "q": query,
        "page": page,
        "per_page": min(per_page, 100),
    }

    logger.info(f"Searching GitHub: {query} (page {page}, attempt {retry_count + 1})")

    try:
        response = httpx.get(
            CODE_SEARCH_ENDPOINT,
            headers=headers,
            params=params,
            timeout=30.0,
        )
        response.raise_for_status()

        # Extract rate limit info
        remaining_str = response.headers.get("X-RateLimit-Remaining")
        reset_time_str = response.headers.get("X-RateLimit-Reset")
        rate_limit = RateLimitInfo(
            remaining=int(remaining_str) if remaining_str else None,
            reset_time=int(reset_time_str) if reset_time_str else None,
        )

        logger.debug(
            f"Rate limit remaining: {rate_limit.remaining}, reset at: {rate_limit.reset_time}"
        )

        data = response.json()
        total_count = data.get("total_count", 0)
        logger.info(f"Count of total results: {total_count}")

        return GitHubSearchResponse(
            items=data.get("items", []),
            total_count=total_count,
            rate_limit=rate_limit,
        )

    except httpx.HTTPStatusError as e:
        status_code = e.response.status_code

        # 422 on page 11+ is likely the hard 1000 result limit
        if status_code == 422 and page > GITHUB_CODE_SEARCH_MAX_PAGE:
            logger.info(
                f"422 error on page {page} - likely hit GitHub's 1000 result limit. "
                f"Code Search API only returns first {GITHUB_CODE_SEARCH_MAX_RESULTS} results."
            )
            raise ValueError(
                f"Reached GitHub Code Search API limit (page {page} > {GITHUB_CODE_SEARCH_MAX_PAGE})"
            ) from e

        # Retryable errors
        if status_code in (403, 422, 429) and retry_count < MAX_RETRIES:
            delay = _calculate_retry_delay(status_code, retry_count, e.response.headers)

            if status_code == 403:
                logger.warning(
                    f"Rate limit exceeded (403). Retrying in {delay}s "
                    f"(attempt {retry_count + 1}/{MAX_RETRIES})"
                )
            elif status_code == 429:
                logger.warning(
                    f"Rate limit exceeded (429). Retrying in {delay}s "
                    f"(attempt {retry_count + 1}/{MAX_RETRIES})"
                )
            elif status_code == 422:
                logger.warning(
                    f"Validation error (422) - may be transient. Retrying in {delay}s "
                    f"(attempt {retry_count + 1}/{MAX_RETRIES})"
                )

            time.sleep(delay)
            return search_github_code(query, token, page, per_page, retry_count + 1)

        # Non-retryable or max retries reached
        if status_code == 403:
            logger.error(
                "Rate limit exceeded or authentication failed after retries. "
                "Check your token and wait before retrying."
            )
        elif status_code == 422:
            logger.error(f"Invalid query after retries: {query}")
        else:
            logger.error(f"HTTP error {status_code} after retries")
        # Re-raise so the caller sees the failure instead of a silent `None` return
        raise

    except httpx.RequestError as e:
        # Network errors are retryable
        if retry_count < MAX_RETRIES:
            delay = min(INITIAL_RETRY_DELAY * (2**retry_count), MAX_RETRY_DELAY)
            logger.warning(
                f"Request failed: {e}. Retrying in {delay}s "
                f"(attempt {retry_count + 1}/{MAX_RETRIES})"
            )
            time.sleep(delay)
            return search_github_code(query, token, page, per_page, retry_count + 1)

        logger.error(f"Request failed after retries: {e}")
        raise


async def wait_for_rate_limit(rate_limit: RateLimitInfo) -> None:
    """
    Wait if we're approaching rate limit or need to wait until reset.

    Args:
        rate_limit: Rate limit information from previous request
    """
    if rate_limit.remaining is None or rate_limit.reset_time is None:
        await asyncio.sleep(RATE_LIMIT_DELAY)
        return

    # If running low on requests, wait until reset
    if rate_limit.remaining <= 2:
        wait_time = rate_limit.reset_time - int(time.time()) + 2  # Add 2 second buffer
        if wait_time > 0:
            logger.info(
                f"Rate limit low ({rate_limit.remaining} remaining). "
                f"Waiting {wait_time}s until reset at {rate_limit.reset_time}"
            )
            await asyncio.sleep(wait_time)
        else:
            await asyncio.sleep(RATE_LIMIT_DELAY)
    else:
        await asyncio.sleep(RATE_LIMIT_DELAY)
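
# For example, with 2 or fewer code-search requests remaining and a reset 30 seconds
# away, this sleeps roughly 32 seconds (reset + 2s buffer); otherwise it sleeps
# RATE_LIMIT_DELAY (6.1s) between requests.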


def build_size_query(base_query: str, start_bytes: int, end_bytes: Optional[int]) -> str:
    """Build a GitHub Code Search query with size filter."""
    if end_bytes is None:
        return f"{base_query} size:>={start_bytes}"
    return f"{base_query} size:{start_bytes}..{end_bytes}"
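
# For example (derived from the branches above):
#   build_size_query("uvx AND language:Markdown in:file", 0, 1025)
#       -> "uvx AND language:Markdown in:file size:0..1025"
#   build_size_query("uvx AND language:Shell in:file", 32000, None)
#       -> "uvx AND language:Shell in:file size:>=32000"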


async def check_pypi_package_exists(
    package: str,
    cache: dict[str, bool],
    client: httpx.AsyncClient,
) -> tuple[str, bool]:
    """
    Check if a single package exists on PyPI.

    Args:
        package: Package name to check
        cache: Dictionary to cache results (modified in-place)
        client: httpx async client instance

    Returns:
        Tuple of (package_name, exists)
    """
    # Check cache first
    if package in cache:
        return (package, cache[package])

    url = PYPI_JSON_API_TEMPLATE.format(package=package)

    try:
        response = await client.get(url, timeout=10.0, follow_redirects=True)
        exists = response.status_code == 200
        cache[package] = exists

        if exists:
            logger.debug(f"✓ {package} exists on PyPI")
        else:
            logger.debug(f"✗ {package} not found on PyPI")

        return (package, exists)
    except httpx.RequestError as e:
        logger.debug(f"Error checking {package} on PyPI: {e}")
        cache[package] = False
        return (package, False)


async def check_packages_batch(
    packages: list[str],
    cache: dict[str, bool],
    semaphore: asyncio.Semaphore,
) -> dict[str, bool]:
    """
    Check a batch of packages against PyPI concurrently.

    Args:
        packages: List of package names to check
        cache: Dictionary to cache results (modified in-place)
        semaphore: Semaphore to limit concurrent requests

    Returns:
        Dictionary mapping package names to their existence status
    """
    async def check_one(package: str) -> tuple[str, bool]:
        async with semaphore:
            async with httpx.AsyncClient() as client:
                return await check_pypi_package_exists(package, cache, client)

    tasks = [check_one(pkg) for pkg in packages]
    results = await asyncio.gather(*tasks, return_exceptions=False)
    return dict(results)


def extract_packages_from_items(items: list[dict[str, Any]]) -> Counter:
    page_packages = Counter()

    for item in items:
        # Extract from text_matches (code snippets)
        text_matches = item.get("text_matches", [])
        for match in text_matches:
            fragment = match.get("fragment", "")
            package = extract_package_name(fragment)
            if package:
                page_packages[package] += 1
                logger.debug(f"Found package: {package}")

        # Also check file path/name
        path = item.get("path", "")
        if "uvx" in path.lower():
            package = extract_package_name(path)
            if package:
                page_packages[package] += 1

    return page_packages


async def search_uvx_usage(
    token: str, max_pages: int = 10
) -> tuple[Counter[str], dict[str, bool]]:
    """
    Search for uvx usage across GitHub and extract package names.

    Processes packages incrementally and checks PyPI concurrently.

    Args:
        token: GitHub Personal Access Token
        max_pages: Maximum number of pages to fetch per query (default: 10)

    Returns:
        Tuple of (Counter of valid package names with counts, updated PyPI cache)
    """
    pypi_cache: dict[str, bool] = {}
    valid_package_counts: Counter[str] = Counter()
    all_package_counts: Counter[str] = Counter()
    unknown_packages_queue: list[str] = []

    semaphore = asyncio.Semaphore(PYPI_CONCURRENT_CHECKS)
    current_rate_limit = RateLimitInfo(None, None)

    # Size buckets to work around GitHub's 1000 result limit
    # It would be way smarter to do this dynamically (query a given size range and do a
    # binary/proportional split on the number of results) but I already got this far
    # so I'm not going to change it for now.
    markdown_size_buckets = [
        (0, 1025),
        (1025, 1250),
        (1250, 1500),
        (1500, 1750),
        (1750, 2000),
        (2000, 2500),
        (2500, 3500),
        (3500, 4500),
        (4500, 5500),
        (5500, 6250),
        (6250, 7000),
        (7000, 7750),
        (7750, 8500),
        (8500, 9250),
        (9250, 10000),
        (10000, 10750),
        (10750, 11750),
        (11750, 13000),
        (13000, 14000),
        (14000, 15250),
        (15250, 16250),
        (16250, 17500),
        (17500, 18750),
        (18750, 20000),
        (20000, 22000),
        (22000, 24000),
        (24000, 26000),
        (26000, 28000),
        (28000, 30000),
        (30000, 33000),
        (33000, 36000),
        (36000, 39000),
        (39000, 42000),
        (42000, 45000),
        (45000, 50000),
        (50000, 60000),
        (60000, 70000),
        (70000, 80000),
        (80000, 100000),
        (100000, 120000),
        (120000, 140000),
        (140000, 160000),
        (160000, 180000),
        (180000, 200000),
        (200000, 250000),
        (250000, 300000),
        (300000, None),
    ]

    shell_size_buckets = [
        (0, 2800),
        (2800, 6000),
        (6000, 15000),
        (15000, 32000),
        (32000, None),
    ]

    queries = [
        build_size_query("uvx AND language:Markdown in:file", start, end)
        for start, end in markdown_size_buckets
    ]
    queries.extend(
        build_size_query("uvx AND language:Shell in:file", start, end)
        for start, end in shell_size_buckets
    )
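
    # Taken together this builds 47 Markdown-size queries and 5 Shell-size queries
    # (52 in total); each one can surface at most ~1,000 results (10 pages x 100),
    # which is what the size bucketing works around.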

    async def process_unknown_packages() -> None:
        """Process queued unknown packages against PyPI."""
        if not unknown_packages_queue:
            return

        packages_to_check = list(set(unknown_packages_queue))
        unknown_packages_queue.clear()

        logger.info(f"Checking {len(packages_to_check)} unknown packages against PyPI...")
        results = await check_packages_batch(packages_to_check, pypi_cache, semaphore)

        # Update valid package counts based on results
        for package, exists in results.items():
            if exists:
                count = all_package_counts.get(package, 0)
                if count > 0:
                    valid_package_counts[package] = count
                    logger.debug(f"Added {package} to valid packages ({count} occurrences)")
                else:
                    logger.warning(f"Package {package} validated but has no count")

    for query_idx, query in enumerate(queries):
        page = 1
        effective_max_pages = min(max_pages, GITHUB_CODE_SEARCH_MAX_PAGE)

        # Wait before starting a new query (except the first one)
        if query_idx > 0:
            logger.debug("Waiting before starting new query...")
            await wait_for_rate_limit(current_rate_limit)
            await process_unknown_packages()

        while page <= effective_max_pages:
            try:
                # Rate limiting: wait between page requests (except for the first page)
                if page > 1:
                    logger.debug("Waiting before next page...")
                    await wait_for_rate_limit(current_rate_limit)
                    await process_unknown_packages()

                response = search_github_code(query, token, page=page)

                # Update rate limit state from response
                current_rate_limit = response.rate_limit

                items = response.items
                if not items:
                    logger.info(f"No more results for query: {query}")
                    break

                logger.info(f"Found {len(items)} results on page {page}")

                # Extract package names from this page
                page_packages = extract_packages_from_items(items)

                # Process packages from this page
                for package, count in page_packages.items():
                    all_package_counts[package] += count

                    # Check cache first
                    if package in pypi_cache:
                        if pypi_cache[package]:
                            valid_package_counts[package] = all_package_counts[package]
                            logger.debug(
                                f"Known valid: {package} (total: {all_package_counts[package]})"
                            )
                    else:
                        unknown_packages_queue.append(package)

                # Process unknown packages while we have time before next GitHub request
                if unknown_packages_queue:
                    await process_unknown_packages()

                # Check if there are more pages
                effective_total = min(
                    response.total_count, GITHUB_CODE_SEARCH_MAX_RESULTS
                )

                if len(items) < 100 or page * 100 >= effective_total:
                    logger.info(
                        f"Reached end of results for query: {query} "
                        f"(page {page}, total: {response.total_count})"
                    )
                    break

                page += 1

            except ValueError as e:
                # This is raised when we hit the 1000 result limit
                logger.info(f"Hit GitHub Code Search API limit: {e}")
                break
            except Exception as e:
                logger.error(f"Error processing page {page} of query '{query}': {e}")
                break

        # Process any remaining unknown packages after each query
        await process_unknown_packages()

    # Final processing of any remaining unknown packages
    await process_unknown_packages()

    logger.info(
        f"Found {len(valid_package_counts)} valid PyPI packages "
        f"out of {len(all_package_counts)} total"
    )

    return valid_package_counts, pypi_cache


def write_top_packages(
    package_counts: Counter[str],
    output_path: Path,
    debug_output_path: Path,
    min_count: int = 2,
) -> None:
    """
    Write top packages to files, sorted by frequency.

    Packages are written in buckets by threshold (100+, 25+, 10+, 5+, min_count+).

    Args:
        package_counts: Counter of package names and counts
        output_path: Path to output file (main packages list)
        debug_output_path: Path to debug output file (with counts)
        min_count: Minimum occurrence count to include (default: 2)
    """
    thresholds = [min_count, 5, 10, 25, 100]
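    # With the default min_count of 2, the buckets cover the ranges
    # [2, 5), [5, 10), [10, 25), [25, 100), and [100, inf).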

    # Filter packages into buckets by threshold
    buckets = []
    for i, threshold in enumerate(thresholds):
        next_threshold = thresholds[i + 1] if i + 1 < len(thresholds) else float("inf")
        bucket_packages = {
            pkg: count
            for pkg, count in package_counts.items()
            if threshold <= count < next_threshold
        }
        buckets.append({"threshold": threshold, "packages": bucket_packages})

    with open(output_path, "w") as f, open(debug_output_path, "w") as f_debug:
        for bucket in reversed(buckets):
            threshold = bucket["threshold"]
            packages = bucket["packages"]
            logger.info(
                f"Greater than or equal to {threshold} mentions: {len(packages)} packages"
            )

            # Sort by count descending, then alphabetically
            sorted_packages = sorted(
                packages.items(), key=lambda x: (-x[1], x[0])
            )

            for package, count in sorted_packages:
                f.write(f"{package}\n")
                f_debug.write(f"{package}: {count}\n")

    logger.info(f"Successfully wrote top packages to {output_path}")


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Fetch popular packages from GitHub by searching for uvx usage"
    )
    parser.add_argument(
        "--token",
        type=str,
        help="GitHub Personal Access Token (or set GITHUB_TOKEN env var)",
        default=os.getenv("GITHUB_TOKEN"),
    )
    parser.add_argument(
        "--output",
        type=Path,
        help="Output file path (default: top_packages.txt)",
        default=None,
    )
    parser.add_argument(
        "--debug-output",
        type=Path,
        help="Debug output file path (default: top_packages_debug.txt)",
        default=None,
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=10,
        help="Maximum pages to fetch per query (default: 10)",
    )
    parser.add_argument(
        "--min-count",
        type=int,
        default=2,
        help="Minimum occurrence count to include (default: 2)",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging",
    )

    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    if not args.token:
        logger.error(
            "GitHub token is required. Set GITHUB_TOKEN environment variable "
            "or pass --token. Create a token at: https://github.com/settings/tokens"
        )
        sys.exit(1)

    # Set default output paths
    if args.output is None or args.debug_output is None:
        script_dir = Path(__file__).parent
        project_root = script_dir.parent.parent
        if args.output is None:
            args.output = (
                project_root
                / "crates"
                / "uv"
                / "src"
                / "commands"
                / "tool"
                / "top_packages.txt"
            )
        if args.debug_output is None:
            args.debug_output = (
                project_root
                / "crates"
                / "uv"
                / "src"
                / "commands"
                / "tool"
                / "top_packages_debug.txt"
            )

    logger.info("Starting GitHub search for uvx usage...")
    logger.info(f"Output will be written to: {args.output}")
    logger.info(f"Debug output will be written to: {args.debug_output}")

    valid_packages, pypi_cache = asyncio.run(
        search_uvx_usage(args.token, max_pages=args.max_pages)
    )

    if not valid_packages:
        logger.warning("No valid PyPI packages found.")
        sys.exit(1)

    logger.info(f"Found {len(valid_packages)} valid PyPI packages")
    logger.info(f"Top 10 valid packages: {valid_packages.most_common(10)}")
    logger.info(f"PyPI cache contains {len(pypi_cache)} entries")

    write_top_packages(
        valid_packages, args.output, args.debug_output, min_count=args.min_count
    )


if __name__ == "__main__":
    main()