mirror of https://github.com/astral-sh/uv

top-packages generation script

Signed-off-by: Mikayla Thompson <mrt@mikayla.codes>

parent 2642acc80b
commit 559d494884

@@ -0,0 +1,167 @@
ruff
agent-starter-pack
pre-commit
marimo
args
cookiecutter
pyright
ngrok
pytest
hatch
llm
markitdown
mcp-server-fetch
ty
copier
command
mcp-server-git
mcpo
uvx
black
streamdown
autoflake
run
nox
files-to-prompt
mypy
tox
yt-dlp
split_markdown4gpt
mcp-server-time
basic-memory
synth-ai
gac
twine
are
codetoprompt
open-webui
browser-use
pip
wpull
build
with
blender-remote
https
migrate-to-uv
bandit
mcp-atlassian
mkdocs
playwright
everything
maturin
cruft
mcp-feedback-enhanced
smithery
aignostics
databricks
jupyterlab
tool
echo
elevenlabs-mcp
ipython
package
pycowsay
support
completions
jupyter
mini-swe-agent
openhands-ai
pyenvsearch
rogue-ai
uv
uv-dynamic-versioning
ansible-core
class
gcovr
handles
mcp
mcp-proxy
mcp-scan
mcp-server-sqlite
streamlit
blender-mcp
create-mcp-server
eyelet
mcp-server-qdrant
mcp-wiki
on
path
pytest-watch
setup
textual-demo
voice-mode
automagik-tools
bump-my-version
kedro
schemathesis
huggingface_hub
integration
jupyter-core
jupytext
llm-discovery
mcp-sse-shim
openhands
reachy-mini
runs
usage
automagik-hive
business-use-core
cookieplone
cowsay
create-dagster
distribution
eval
healthyselfjournal
httpie
markitdown-mcp
speaches-cli
strip-tags
autocompletion
chroma
cmake
compatibility
detect-secrets
dist
evalgate
iowarp-mcps
isort
juv
lean-lsp-mcp
llamactl
nearc
ngiab_data_preprocess
nvitop
prek
requests
runner
samstacks
semgrep-mcp
specifyplus
sqlite-utils
the
uv-publish
wandb
agentprobe
audible-cli
cakemail-api-docs-mcp
datasette
fonttools
git-cliff
goldenverba
honcho
internetarchive
mcp-google-sheets
modal
modern
pdm
poe
pyinstaller
pyrefly
radon
should
sniffly
specify
superclaude
tools
workspace-mcp
yt-mpv
@@ -0,0 +1,753 @@
# /// script
# requires-python = ">=3.13"
# dependencies = [
#     "httpx"
# ]
# ///

"""
Use the GitHub Code Search API to find instances of `uvx <package>` in:
- README files (*.md)
- Shell scripts (*.sh, *.bash, *.zsh)

Requirements:
- A GitHub Personal Access Token (PAT) with `public_repo` scope
- Set the GITHUB_TOKEN environment variable or pass --token

Usage:
    python scripts/uvx_usage_on_gh/fetch_uvx_usage.py --output crates/uv/src/commands/tool/top_packages.txt
"""

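# Because of the inline script metadata block above (PEP 723), this script can likely
# also be run as `uv run scripts/uvx_usage_on_gh/fetch_uvx_usage.py ...`, letting uv
# resolve the httpx dependency automatically; the docstring shows plain `python` usage.
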
import argparse
import asyncio
import logging
import os
import re
import sys
import time
from collections import Counter
from pathlib import Path
from typing import Any, NamedTuple, Optional

import httpx

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# GitHub API configuration
GITHUB_API_BASE = "https://api.github.com"
CODE_SEARCH_ENDPOINT = f"{GITHUB_API_BASE}/search/code"

# Rate limiting configuration
RATE_LIMIT_DELAY = 6.1  # seconds between requests (slightly more than 60/10)
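# The code search endpoint allows roughly 10 authenticated requests per minute, so
# 60s / 10 = 6s between requests; 6.1s adds a small safety buffer.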

# GitHub Code Search API limits
GITHUB_CODE_SEARCH_MAX_RESULTS = 1000  # Hard limit: only first 1000 results accessible
GITHUB_CODE_SEARCH_MAX_PAGE = 10  # Page 10 = results 901-1000, page 11+ returns 422

# Retry configuration
MAX_RETRIES = 5
INITIAL_RETRY_DELAY = 10  # seconds
MAX_RETRY_DELAY = 300  # 5 minutes max delay

# PyPI check concurrency
PYPI_CONCURRENT_CHECKS = 20  # Number of concurrent PyPI checks

# PyPI API endpoint
PYPI_JSON_API_TEMPLATE = "https://pypi.org/pypi/{package}/json"
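# For example, PYPI_JSON_API_TEMPLATE.format(package="ruff") gives
# "https://pypi.org/pypi/ruff/json"; a 200 response is treated as "exists on PyPI".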


class RateLimitInfo(NamedTuple):
    remaining: int | None
    reset_time: int | None


class GitHubSearchResponse(NamedTuple):
    items: list[dict[str, Any]]
    total_count: int
    rate_limit: RateLimitInfo


# Regex patterns for extracting package names
PACKAGE_PATTERN_FROM = re.compile(
    r"\buvx\s+(?:--\w+(?:\s+\S+)?\s+)*--from\s+([a-z0-9](?:[a-z0-9._-]*[a-z0-9])?)(?:@\S+)?",
    re.IGNORECASE,
)
PACKAGE_PATTERN_NORMAL = re.compile(
    r"\buvx\s+(?:--\w+(?:\s+\S+)?\s+)+([a-z0-9](?:[a-z0-9._-]*[a-z0-9])?)(?:@\S+)?",
    re.IGNORECASE,
)
PACKAGE_PATTERN_SIMPLE = re.compile(
    r"\buvx\s+([a-z0-9](?:[a-z0-9._-]*[a-z0-9])?)(?:@\S+)?",
    re.IGNORECASE,
)
URL_PATTERN = re.compile(
    r"\buvx\s+(?:--\w+(?:\s+\S+)?\s+)*--from\s+(git\+[a-z]+://|git://|https?://)",
    re.IGNORECASE,
)
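
# Rough expectations for these patterns (illustrative, derived from the regexes above):
#   "uvx ruff check ."                        -> PACKAGE_PATTERN_SIMPLE captures "ruff"
#   "uvx --from httpie http GET example.org"  -> PACKAGE_PATTERN_FROM captures "httpie"
#   "uvx --python 3.12 textual-demo"          -> PACKAGE_PATTERN_NORMAL captures "textual-demo"
#   "uvx --from git+https://github.com/x/y z" -> URL_PATTERN matches, so the hit is skipped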


def extract_package_name(match_text: str) -> Optional[str]:
    """
    Extract package name from a match.

    Handles patterns like:
    - uvx ruff
    - uvx --from httpie http (extracts "httpie")
    - uvx --python 3.12 textual-demo
    - uvx black@latest
    - uvx pytest --version
    - uvx streamlit run streamlit_app/dashboard.py

    Skips patterns like:
    - uvx --from git+https://... (URLs are not package names)
    - uvx --from http://... (URLs are not package names)
    """
    # Skip URLs after --from
    if URL_PATTERN.search(match_text):
        return None

    # Try patterns in order: --from, flags, simple
    match = (
        PACKAGE_PATTERN_FROM.search(match_text)
        or PACKAGE_PATTERN_NORMAL.search(match_text)
        or PACKAGE_PATTERN_SIMPLE.search(match_text)
    )

    if not match:
        return None

    package = match.group(1).lower()

    # Remove version specifiers (e.g., @latest, @1.0.0)
    if "@" in package:
        package = package.split("@")[0]

    # Validation checks
    if package.startswith("--") or "/" in package or "\\" in package or len(package) < 2:
        return None

    return package


def _calculate_retry_delay(
    status_code: int,
    retry_count: int,
    response_headers: httpx.Headers,
) -> int:
    """Calculate delay for retry based on status code and headers."""
    if status_code in (403, 429):
        # Try Retry-After header first
        retry_after = response_headers.get("Retry-After")
        if retry_after:
            try:
                return int(retry_after) + 2  # Add 2 second buffer
            except ValueError:
                pass

        # Fall back to X-RateLimit-Reset
        reset_time_str = response_headers.get("X-RateLimit-Reset")
        if reset_time_str:
            try:
                reset_time = int(reset_time_str)
                current_time = int(time.time())
                return max(reset_time - current_time + 2, 10)  # At least 10 seconds
            except ValueError:
                pass

    # Default: exponential backoff
    return min(INITIAL_RETRY_DELAY * (2**retry_count), MAX_RETRY_DELAY)
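
# With INITIAL_RETRY_DELAY = 10 and MAX_RETRY_DELAY = 300, the exponential fallback
# works out to 10, 20, 40, 80, 160 seconds across the five allowed retries (capped at 300).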


def search_github_code(
    query: str,
    token: str,
    page: int,
    per_page: int = 100,
    retry_count: int = 0,
) -> GitHubSearchResponse:
    headers = {
        "Accept": "application/vnd.github.text-match+json",
        "Authorization": f"Bearer {token}",
    }

    params = {
        "q": query,
        "page": page,
        "per_page": min(per_page, 100),
    }

    logger.info(f"Searching GitHub: {query} (page {page}, attempt {retry_count + 1})")

    try:
        response = httpx.get(
            CODE_SEARCH_ENDPOINT,
            headers=headers,
            params=params,
            timeout=30.0,
        )
        response.raise_for_status()

        # Extract rate limit info
        remaining_str = response.headers.get("X-RateLimit-Remaining")
        reset_time_str = response.headers.get("X-RateLimit-Reset")
        rate_limit = RateLimitInfo(
            remaining=int(remaining_str) if remaining_str else None,
            reset_time=int(reset_time_str) if reset_time_str else None,
        )

        logger.debug(
            f"Rate limit remaining: {rate_limit.remaining}, reset at: {rate_limit.reset_time}"
        )

        data = response.json()
        total_count = data.get("total_count", 0)
        logger.info(f"Count of total results: {total_count}")

        return GitHubSearchResponse(
            items=data.get("items", []),
            total_count=total_count,
            rate_limit=rate_limit,
        )

    except httpx.HTTPStatusError as e:
        status_code = e.response.status_code

        # 422 on page 11+ is likely the hard 1000 result limit
        if status_code == 422 and page > GITHUB_CODE_SEARCH_MAX_PAGE:
            logger.info(
                f"422 error on page {page} - likely hit GitHub's 1000 result limit. "
                f"Code Search API only returns first {GITHUB_CODE_SEARCH_MAX_RESULTS} results."
            )
            raise ValueError(
                f"Reached GitHub Code Search API limit (page {page} > {GITHUB_CODE_SEARCH_MAX_PAGE})"
            ) from e

        # Retryable errors
        if status_code in (403, 422, 429) and retry_count < MAX_RETRIES:
            delay = _calculate_retry_delay(status_code, retry_count, e.response.headers)

            if status_code == 403:
                logger.warning(
                    f"Rate limit exceeded (403). Retrying in {delay}s "
                    f"(attempt {retry_count + 1}/{MAX_RETRIES})"
                )
            elif status_code == 429:
                logger.warning(
                    f"Rate limit exceeded (429). Retrying in {delay}s "
                    f"(attempt {retry_count + 1}/{MAX_RETRIES})"
                )
            elif status_code == 422:
                logger.warning(
                    f"Validation error (422) - may be transient. Retrying in {delay}s "
                    f"(attempt {retry_count + 1}/{MAX_RETRIES})"
                )

            time.sleep(delay)
            return search_github_code(query, token, page, per_page, retry_count + 1)

        # Non-retryable or max retries reached
        if status_code == 403:
            logger.error(
                "Rate limit exceeded or authentication failed after retries. "
                "Check your token and wait before retrying."
            )
        elif status_code == 422:
            logger.error(f"Invalid query after retries: {query}")
        else:
            logger.error(f"HTTP error {status_code} after retries")
        # Re-raise so the caller sees the failure instead of a silent `None` return
        raise

    except httpx.RequestError as e:
        # Network errors are retryable
        if retry_count < MAX_RETRIES:
            delay = min(INITIAL_RETRY_DELAY * (2**retry_count), MAX_RETRY_DELAY)
            logger.warning(
                f"Request failed: {e}. Retrying in {delay}s "
                f"(attempt {retry_count + 1}/{MAX_RETRIES})"
            )
            time.sleep(delay)
            return search_github_code(query, token, page, per_page, retry_count + 1)

        logger.error(f"Request failed after retries: {e}")
        raise


async def wait_for_rate_limit(rate_limit: RateLimitInfo) -> None:
    """
    Wait if we're approaching rate limit or need to wait until reset.

    Args:
        rate_limit: Rate limit information from previous request
    """
    if rate_limit.remaining is None or rate_limit.reset_time is None:
        await asyncio.sleep(RATE_LIMIT_DELAY)
        return

    # If running low on requests, wait until reset
    if rate_limit.remaining <= 2:
        wait_time = rate_limit.reset_time - int(time.time()) + 2  # Add 2 second buffer
        if wait_time > 0:
            logger.info(
                f"Rate limit low ({rate_limit.remaining} remaining). "
                f"Waiting {wait_time}s until reset at {rate_limit.reset_time}"
            )
            await asyncio.sleep(wait_time)
        else:
            await asyncio.sleep(RATE_LIMIT_DELAY)
    else:
        await asyncio.sleep(RATE_LIMIT_DELAY)
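
# For example, with 2 or fewer code-search requests remaining and a reset 30 seconds
# away, this sleeps roughly 32 seconds (reset + 2s buffer); otherwise it sleeps
# RATE_LIMIT_DELAY (6.1s) between requests.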


def build_size_query(base_query: str, start_bytes: int, end_bytes: Optional[int]) -> str:
    """Build a GitHub Code Search query with size filter."""
    if end_bytes is None:
        return f"{base_query} size:>={start_bytes}"
    return f"{base_query} size:{start_bytes}..{end_bytes}"
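
# For example (derived from the branches above):
#   build_size_query("uvx AND language:Markdown in:file", 0, 1025)
#       -> "uvx AND language:Markdown in:file size:0..1025"
#   build_size_query("uvx AND language:Shell in:file", 32000, None)
#       -> "uvx AND language:Shell in:file size:>=32000"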


async def check_pypi_package_exists(
    package: str,
    cache: dict[str, bool],
    client: httpx.AsyncClient,
) -> tuple[str, bool]:
    """
    Check if a single package exists on PyPI.

    Args:
        package: Package name to check
        cache: Dictionary to cache results (modified in-place)
        client: httpx async client instance

    Returns:
        Tuple of (package_name, exists)
    """
    # Check cache first
    if package in cache:
        return (package, cache[package])

    url = PYPI_JSON_API_TEMPLATE.format(package=package)

    try:
        response = await client.get(url, timeout=10.0, follow_redirects=True)
        exists = response.status_code == 200
        cache[package] = exists

        if exists:
            logger.debug(f"✓ {package} exists on PyPI")
        else:
            logger.debug(f"✗ {package} not found on PyPI")

        return (package, exists)
    except httpx.RequestError as e:
        logger.debug(f"Error checking {package} on PyPI: {e}")
        cache[package] = False
        return (package, False)


async def check_packages_batch(
    packages: list[str],
    cache: dict[str, bool],
    semaphore: asyncio.Semaphore,
) -> dict[str, bool]:
    """
    Check a batch of packages against PyPI concurrently.

    Args:
        packages: List of package names to check
        cache: Dictionary to cache results (modified in-place)
        semaphore: Semaphore to limit concurrent requests

    Returns:
        Dictionary mapping package names to their existence status
    """
    async def check_one(package: str) -> tuple[str, bool]:
        async with semaphore:
            async with httpx.AsyncClient() as client:
                return await check_pypi_package_exists(package, cache, client)

    tasks = [check_one(pkg) for pkg in packages]
    results = await asyncio.gather(*tasks, return_exceptions=False)
    return dict(results)


def extract_packages_from_items(items: list[dict[str, Any]]) -> Counter:
    page_packages = Counter()

    for item in items:
        # Extract from text_matches (code snippets)
        text_matches = item.get("text_matches", [])
        for match in text_matches:
            fragment = match.get("fragment", "")
            package = extract_package_name(fragment)
            if package:
                page_packages[package] += 1
                logger.debug(f"Found package: {package}")

        # Also check file path/name
        path = item.get("path", "")
        if "uvx" in path.lower():
            package = extract_package_name(path)
            if package:
                page_packages[package] += 1

    return page_packages


async def search_uvx_usage(
    token: str, max_pages: int = 10
) -> tuple[Counter[str], dict[str, bool]]:
    """
    Search for uvx usage across GitHub and extract package names.

    Processes packages incrementally and checks PyPI concurrently.

    Args:
        token: GitHub Personal Access Token
        max_pages: Maximum number of pages to fetch per query (default: 10)

    Returns:
        Tuple of (Counter of valid package names with counts, updated PyPI cache)
    """
    pypi_cache: dict[str, bool] = {}
    valid_package_counts: Counter[str] = Counter()
    all_package_counts: Counter[str] = Counter()
    unknown_packages_queue: list[str] = []

    semaphore = asyncio.Semaphore(PYPI_CONCURRENT_CHECKS)
    current_rate_limit = RateLimitInfo(None, None)

    # Size buckets to work around GitHub's 1000 result limit
    # It would be way smarter to do this dynamically (query a given size range and do a
    # binary/proportional split on the number of results) but I already got this far
    # so I'm not going to change it for now.
    markdown_size_buckets = [
        (0, 1025),
        (1025, 1250),
        (1250, 1500),
        (1500, 1750),
        (1750, 2000),
        (2000, 2500),
        (2500, 3500),
        (3500, 4500),
        (4500, 5500),
        (5500, 6250),
        (6250, 7000),
        (7000, 7750),
        (7750, 8500),
        (8500, 9250),
        (9250, 10000),
        (10000, 10750),
        (10750, 11750),
        (11750, 13000),
        (13000, 14000),
        (14000, 15250),
        (15250, 16250),
        (16250, 17500),
        (17500, 18750),
        (18750, 20000),
        (20000, 22000),
        (22000, 24000),
        (24000, 26000),
        (26000, 28000),
        (28000, 30000),
        (30000, 33000),
        (33000, 36000),
        (36000, 39000),
        (39000, 42000),
        (42000, 45000),
        (45000, 50000),
        (50000, 60000),
        (60000, 70000),
        (70000, 80000),
        (80000, 100000),
        (100000, 120000),
        (120000, 140000),
        (140000, 160000),
        (160000, 180000),
        (180000, 200000),
        (200000, 250000),
        (250000, 300000),
        (300000, None),
    ]

    shell_size_buckets = [
        (0, 2800),
        (2800, 6000),
        (6000, 15000),
        (15000, 32000),
        (32000, None),
    ]

    queries = [
        build_size_query("uvx AND language:Markdown in:file", start, end)
        for start, end in markdown_size_buckets
    ]
    queries.extend(
        build_size_query("uvx AND language:Shell in:file", start, end)
        for start, end in shell_size_buckets
    )
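
    # Taken together this builds 47 Markdown-size queries and 5 Shell-size queries
    # (52 in total); each one can surface at most ~1,000 results (10 pages x 100),
    # which is what the size bucketing works around.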

    async def process_unknown_packages() -> None:
        """Process queued unknown packages against PyPI."""
        if not unknown_packages_queue:
            return

        packages_to_check = list(set(unknown_packages_queue))
        unknown_packages_queue.clear()

        logger.info(f"Checking {len(packages_to_check)} unknown packages against PyPI...")
        results = await check_packages_batch(packages_to_check, pypi_cache, semaphore)

        # Update valid package counts based on results
        for package, exists in results.items():
            if exists:
                count = all_package_counts.get(package, 0)
                if count > 0:
                    valid_package_counts[package] = count
                    logger.debug(f"Added {package} to valid packages ({count} occurrences)")
                else:
                    logger.warning(f"Package {package} validated but has no count")

    for query_idx, query in enumerate(queries):
        page = 1
        effective_max_pages = min(max_pages, GITHUB_CODE_SEARCH_MAX_PAGE)

        # Wait before starting a new query (except the first one)
        if query_idx > 0:
            logger.debug("Waiting before starting new query...")
            await wait_for_rate_limit(current_rate_limit)
            await process_unknown_packages()

        while page <= effective_max_pages:
            try:
                # Rate limiting: wait between page requests (except for the first page)
                if page > 1:
                    logger.debug("Waiting before next page...")
                    await wait_for_rate_limit(current_rate_limit)
                    await process_unknown_packages()

                response = search_github_code(query, token, page=page)

                # Update rate limit state from response
                current_rate_limit = response.rate_limit

                items = response.items
                if not items:
                    logger.info(f"No more results for query: {query}")
                    break

                logger.info(f"Found {len(items)} results on page {page}")

                # Extract package names from this page
                page_packages = extract_packages_from_items(items)

                # Process packages from this page
                for package, count in page_packages.items():
                    all_package_counts[package] += count

                    # Check cache first
                    if package in pypi_cache:
                        if pypi_cache[package]:
                            valid_package_counts[package] = all_package_counts[package]
                            logger.debug(
                                f"Known valid: {package} (total: {all_package_counts[package]})"
                            )
                    else:
                        unknown_packages_queue.append(package)

                # Process unknown packages while we have time before next GitHub request
                if unknown_packages_queue:
                    await process_unknown_packages()

                # Check if there are more pages
                effective_total = min(
                    response.total_count, GITHUB_CODE_SEARCH_MAX_RESULTS
                )

                if len(items) < 100 or page * 100 >= effective_total:
                    logger.info(
                        f"Reached end of results for query: {query} "
                        f"(page {page}, total: {response.total_count})"
                    )
                    break

                page += 1

            except ValueError as e:
                # This is raised when we hit the 1000 result limit
                logger.info(f"Hit GitHub Code Search API limit: {e}")
                break
            except Exception as e:
                logger.error(f"Error processing page {page} of query '{query}': {e}")
                break

        # Process any remaining unknown packages after each query
        await process_unknown_packages()

    # Final processing of any remaining unknown packages
    await process_unknown_packages()

    logger.info(
        f"Found {len(valid_package_counts)} valid PyPI packages "
        f"out of {len(all_package_counts)} total"
    )

    return valid_package_counts, pypi_cache


def write_top_packages(
    package_counts: Counter[str],
    output_path: Path,
    debug_output_path: Path,
    min_count: int = 2,
) -> None:
    """
    Write top packages to files, sorted by frequency.

    Packages are written in buckets by threshold (100+, 25+, 10+, 5+, min_count+).

    Args:
        package_counts: Counter of package names and counts
        output_path: Path to output file (main packages list)
        debug_output_path: Path to debug output file (with counts)
        min_count: Minimum occurrence count to include (default: 2)
    """
    thresholds = [min_count, 5, 10, 25, 100]
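    # With the default min_count of 2, the buckets cover the ranges
    # [2, 5), [5, 10), [10, 25), [25, 100), and [100, inf).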

    # Filter packages into buckets by threshold
    buckets = []
    for i, threshold in enumerate(thresholds):
        next_threshold = thresholds[i + 1] if i + 1 < len(thresholds) else float("inf")
        bucket_packages = {
            pkg: count
            for pkg, count in package_counts.items()
            if threshold <= count < next_threshold
        }
        buckets.append({"threshold": threshold, "packages": bucket_packages})

    with open(output_path, "w") as f, open(debug_output_path, "w") as f_debug:
        for bucket in reversed(buckets):
            threshold = bucket["threshold"]
            packages = bucket["packages"]
            logger.info(
                f"Greater than or equal to {threshold} mentions: {len(packages)} packages"
            )

            # Sort by count descending, then alphabetically
            sorted_packages = sorted(
                packages.items(), key=lambda x: (-x[1], x[0])
            )

            for package, count in sorted_packages:
                f.write(f"{package}\n")
                f_debug.write(f"{package}: {count}\n")

    logger.info(f"Successfully wrote top packages to {output_path}")


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Fetch popular packages from GitHub by searching for uvx usage"
    )
    parser.add_argument(
        "--token",
        type=str,
        help="GitHub Personal Access Token (or set GITHUB_TOKEN env var)",
        default=os.getenv("GITHUB_TOKEN"),
    )
    parser.add_argument(
        "--output",
        type=Path,
        help="Output file path (default: top_packages.txt)",
        default=None,
    )
    parser.add_argument(
        "--debug-output",
        type=Path,
        help="Debug output file path (default: top_packages_debug.txt)",
        default=None,
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=10,
        help="Maximum pages to fetch per query (default: 10)",
    )
    parser.add_argument(
        "--min-count",
        type=int,
        default=2,
        help="Minimum occurrence count to include (default: 2)",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging",
    )

    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    if not args.token:
        logger.error(
            "GitHub token is required. Set GITHUB_TOKEN environment variable "
            "or pass --token. Create a token at: https://github.com/settings/tokens"
        )
        sys.exit(1)

    # Set default output paths
    if args.output is None or args.debug_output is None:
        script_dir = Path(__file__).parent
        project_root = script_dir.parent.parent
        if args.output is None:
            args.output = (
                project_root
                / "crates"
                / "uv"
                / "src"
                / "commands"
                / "tool"
                / "top_packages.txt"
            )
        if args.debug_output is None:
            args.debug_output = (
                project_root
                / "crates"
                / "uv"
                / "src"
                / "commands"
                / "tool"
                / "top_packages_debug.txt"
            )

    logger.info("Starting GitHub search for uvx usage...")
    logger.info(f"Output will be written to: {args.output}")
    logger.info(f"Debug output will be written to: {args.debug_output}")

    valid_packages, pypi_cache = asyncio.run(
        search_uvx_usage(args.token, max_pages=args.max_pages)
    )

    if not valid_packages:
        logger.warning("No valid PyPI packages found.")
        sys.exit(1)

    logger.info(f"Found {len(valid_packages)} valid PyPI packages")
    logger.info(f"Top 10 valid packages: {valid_packages.most_common(10)}")
    logger.info(f"PyPI cache contains {len(pypi_cache)} entries")

    write_top_packages(
        valid_packages, args.output, args.debug_output, min_count=args.min_count
    )


if __name__ == "__main__":
    main()