Refactor ecosystem checks into module

2023-10-17 12:48:16 -05:00 · 2023-10-17 12:48:16 -05:00 · 105fb1c682
parent 88c0106421
commit 105fb1c682
11 changed files with 904 additions and 0 deletions
--- a/python/ruff-ecosystem/README.md
+++ b/python/ruff-ecosystem/README.md
@@ -0,0 +1,45 @@
+# ruff-ecosystem
+
+Ruff ecosystem checks.
+
+## Installation
+
+From the Ruff project root, install with `pip`:
+
+```shell
+pip install -e ./python/ruff-ecosystem
+```
+
+## Usage
+
+```
+ruff-ecosystem <check | format> <baseline executable> <comparison executable>
+```
+
+Note that executable paths must be absolute or relative to the current working directory.
+
+Run `ruff check` ecosystem checks comparing your debug build to your system Ruff:
+
+```shell
+ruff-ecosystem check  "$(which ruff)" "./target/debug/ruff"
+```
+
+Run `ruff format` ecosystem checks comparing your debug build to your system Ruff:
+
+```shell
+ruff-ecosystem format  "$(which ruff)" "./target/debug/ruff"
+```
+
+## Development
+
+When developing, it can be useful to set the `--pdb` flag to drop into a debugger on failure:
+
+```shell
+ruff-ecosystem check  "$(which ruff)" "./target/debug/ruff" --pdb
+```
+
+You can also provide a path to cache checkouts to speed up repeated runs:
+
+```shell
+ruff-ecosystem check  "$(which ruff)" "./target/debug/ruff" --cache ./repos
+```
--- a/python/ruff-ecosystem/pyproject.toml
+++ b/python/ruff-ecosystem/pyproject.toml
@@ -0,0 +1,10 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "ruff-ecosystem"
+version = "0.0.0"
+
+[project.scripts]
+ruff-ecosystem = "ruff_ecosystem.cli:entrypoint"
--- a/python/ruff-ecosystem/ruff_ecosystem/__init__.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/__init__.py
@@ -0,0 +1,3 @@
+import logging
+
+# Package-wide logger; the other modules import it with `from ruff_ecosystem import logger`.
+logger = logging.getLogger("ruff-ecosystem")
--- a/python/ruff-ecosystem/ruff_ecosystem/__main__.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/__main__.py
@@ -0,0 +1,8 @@
+"""
+Enables usage with `python -m ruff_ecosystem`
+"""
+
+from ruff_ecosystem.cli import entrypoint
+
+# Only run the CLI when this module is executed directly, not when it is imported.
+if __name__ == "__main__":
+    entrypoint()
--- a/python/ruff-ecosystem/ruff_ecosystem/cli.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/cli.py
@@ -0,0 +1,116 @@
+import argparse
+import asyncio
+import logging
+import tempfile
+from pathlib import Path
+from contextlib import nullcontext
+from ruff_ecosystem.models import RuffCommand
+from ruff_ecosystem.emitters import EmitterType
+from ruff_ecosystem.defaults import DEFAULT_TARGETS
+from ruff_ecosystem.main import main
+from signal import SIGINT, SIGTERM
+
+import sys
+
+
+def excepthook(type, value, tb):
+    """
+    Exception hook that opens a post-mortem debugger on uncaught exceptions.
+
+    Falls back to the default hook when running interactively or when stderr
+    is not attached to a terminal.
+    """
+    interactive = hasattr(sys, "ps1")
+    if interactive or not sys.stderr.isatty():
+        sys.__excepthook__(type, value, tb)
+        return
+
+    # Imported lazily; only needed when we actually drop into the debugger
+    import pdb
+    import traceback
+
+    traceback.print_exception(type, value, tb)
+    print()
+    pdb.post_mortem(tb)
+
+
+def entrypoint():
+    """
+    Command-line entrypoint: parse the arguments and run the ecosystem check.
+
+    Repositories are cloned into `--cache` when it is given, otherwise into a
+    temporary directory that is cleaned up when the run finishes. SIGINT and
+    SIGTERM cancel the main task so the event loop can shut down.
+    """
+    args = parse_args()
+
+    if args.pdb:
+        sys.excepthook = excepthook
+
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
+    # Use a temporary directory for caching if no cache is specified
+    cache_context = (
+        nullcontext(args.cache) if args.cache else tempfile.TemporaryDirectory()
+    )
+
+    with cache_context as cache:
+        # Create the loop explicitly: `asyncio.get_event_loop()` is deprecated
+        # when no event loop is running.
+        loop = asyncio.new_event_loop()
+        try:
+            main_task = loop.create_task(
+                main(
+                    command=RuffCommand(args.ruff_command),
+                    ruff_baseline_executable=args.ruff_baseline,
+                    ruff_comparison_executable=args.ruff_comparison,
+                    targets=DEFAULT_TARGETS,
+                    emitter=EmitterType(args.output_format).to_emitter(),
+                    cache=Path(cache),
+                    raise_on_failure=args.pdb,
+                )
+            )
+            # https://stackoverflow.com/a/58840987/3549270
+            for signal in [SIGINT, SIGTERM]:
+                loop.add_signal_handler(signal, main_task.cancel)
+            loop.run_until_complete(main_task)
+        finally:
+            loop.close()
+
+
+def parse_args() -> argparse.Namespace:
+    """
+    Parse the command-line arguments.
+
+    Positional arguments are the Ruff command to test followed by the baseline
+    and comparison executables.
+    """
+    parser = argparse.ArgumentParser(
+        description="Check two versions of ruff against a corpus of open-source code.",
+    )
+
+    # TODO: Support non-default `--targets`
+    # parser.add_argument(
+    #     "--targets",
+    #     type=Path,
+    #     help=(
+    #         "Optional JSON files to use over the default repositories. "
+    #         "Supports both github_search_*.jsonl and known-github-tomls.jsonl."
+    #     ),
+    # )
+    parser.add_argument(
+        "--cache",
+        type=Path,
+        help="Location for caching cloned repositories",
+    )
+    parser.add_argument(
+        "--output-format",
+        choices=[option.name for option in EmitterType],
+        default="json",
+        help="Format used to report the results",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="Enable debug logging",
+    )
+    parser.add_argument(
+        "--pdb",
+        action="store_true",
+        help="Enable debugging on failure",
+    )
+    parser.add_argument(
+        "ruff_command",
+        choices=[option.name for option in RuffCommand],
+        help="The Ruff command to test",
+    )
+    parser.add_argument(
+        "ruff_baseline",
+        type=Path,
+        help="Path to the baseline Ruff executable",
+    )
+    parser.add_argument(
+        "ruff_comparison",
+        type=Path,
+        help="Path to the Ruff executable to compare against the baseline",
+    )
+
+    return parser.parse_args()
--- a/python/ruff-ecosystem/ruff_ecosystem/defaults.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/defaults.py
@@ -0,0 +1,66 @@
+from .models import Repository, CheckOptions, Target
+
+# TODO: Consider exporting this as JSON instead for consistent setup
+# NOTE(review): every target except zulip/zulip is commented out below, so only
+# that repository is checked by default. Presumably a debugging leftover; confirm.
+DEFAULT_TARGETS = [
+    # Target(repo=Repository(owner="DisnakeDev", name="disnake", branch="master")),
+    # Target(repo=Repository(owner="PostHog", name="HouseWatch", branch="main")),
+    # Target(repo=Repository(owner="RasaHQ", name="rasa", branch="main")),
+    # Target(repo=Repository(owner="Snowflake-Labs", name="snowcli", branch="main")),
+    # Target(repo=Repository(owner="aiven", name="aiven-client", branch="main")),
+    # Target(repo=Repository(owner="alteryx", name="featuretools", branch="main")),
+    # Target(
+    #     repo=Repository(owner="apache", name="airflow", branch="main"),
+    #     check_options=CheckOptions(select="ALL"),
+    # ),
+    # Target(repo=Repository(owner="aws", name="aws-sam-cli", branch="develop")),
+    # Target(repo=Repository(owner="bloomberg", name="pytest-memray", branch="main")),
+    # Target(
+    #     repo=Repository(owner="bokeh", name="bokeh", branch="branch-3.3"),
+    #     check_options=CheckOptions(select="ALL"),
+    # ),
+    # Target(repo=Repository(owner="commaai", name="openpilot", branch="master")),
+    # Target(repo=Repository(owner="demisto", name="content", branch="master")),
+    # Target(repo=Repository(owner="docker", name="docker-py", branch="main")),
+    # Target(
+    #     repo=Repository(owner="freedomofpress", name="securedrop", branch="develop")
+    # ),
+    # Target(repo=Repository(owner="fronzbot", name="blinkpy", branch="dev")),
+    # Target(repo=Repository(owner="ibis-project", name="ibis", branch="master")),
+    # Target(repo=Repository(owner="ing-bank", name="probatus", branch="main")),
+    # Target(repo=Repository(owner="jrnl-org", name="jrnl", branch="develop")),
+    # Target(repo=Repository(owner="latchbio", name="latch", branch="main")),
+    # Target(repo=Repository(owner="lnbits", name="lnbits", branch="main")),
+    # Target(repo=Repository(owner="milvus-io", name="pymilvus", branch="master")),
+    # Target(repo=Repository(owner="mlflow", name="mlflow", branch="master")),
+    # Target(repo=Repository(owner="model-bakers", name="model_bakery", branch="main")),
+    # Target(repo=Repository(owner="pandas-dev", name="pandas", branch="main")),
+    # Target(repo=Repository(owner="prefecthq", name="prefect", branch="main")),
+    # Target(repo=Repository(owner="pypa", name="build", branch="main")),
+    # Target(repo=Repository(owner="pypa", name="cibuildwheel", branch="main")),
+    # Target(repo=Repository(owner="pypa", name="pip", branch="main")),
+    # Target(repo=Repository(owner="pypa", name="setuptools", branch="main")),
+    # Target(repo=Repository(owner="python", name="mypy", branch="master")),
+    # Target(
+    #     repo=Repository(
+    #         owner="python",
+    #         name="typeshed",
+    #         branch="main",
+    #     ),
+    #     check_options=CheckOptions(select="PYI"),
+    # ),
+    # Target(repo=Repository(owner="python-poetry", name="poetry", branch="master")),
+    # Target(repo=Repository(owner="reflex-dev", name="reflex", branch="main")),
+    # Target(repo=Repository(owner="rotki", name="rotki", branch="develop")),
+    # Target(repo=Repository(owner="scikit-build", name="scikit-build", branch="main")),
+    # Target(
+    #     repo=Repository(owner="scikit-build", name="scikit-build-core", branch="main")
+    # ),
+    # Target(repo=Repository(owner="sphinx-doc", name="sphinx", branch="master")),
+    # Target(repo=Repository(owner="spruceid", name="siwe-py", branch="main")),
+    # Target(repo=Repository(owner="tiangolo", name="fastapi", branch="master")),
+    # Target(repo=Repository(owner="yandex", name="ch-backup", branch="main")),
+    Target(
+        repo=Repository(owner="zulip", name="zulip", branch="main"),
+        check_options=CheckOptions(select="ALL"),
+    ),
+]
--- a/python/ruff-ecosystem/ruff_ecosystem/emitters.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/emitters.py
@@ -0,0 +1,99 @@
+from enum import Enum
+import abc
+from ruff_ecosystem.models import Target, Diff, ClonedRepository, Result
+from ruff_ecosystem.ruff import CHECK_DIFF_LINE_RE
+import traceback
+import json
+from pathlib import Path
+import dataclasses
+
+
+class Emitter(abc.ABC):
+    """
+    Interface for reporting the output of an ecosystem check.
+
+    Emitters are instantiated and their methods are called on the instance, so
+    the abstract methods are declared as regular instance methods
+    (`abc.abstractclassmethod` is deprecated).
+    """
+
+    @abc.abstractmethod
+    def emit_error(self, target: Target, exc: Exception):
+        """Report that checking `target` failed with `exc`."""
+
+    @abc.abstractmethod
+    def emit_diff(self, target: Target, diff: Diff, cloned_repo: ClonedRepository):
+        """Report the `diff` found for `target`, checked out as `cloned_repo`."""
+
+    @abc.abstractmethod
+    def emit_result(self, result: Result):
+        """Report the aggregated `result` of the whole run."""
+
+
+class DebugEmitter(Emitter):
+    """Emitter that only prints errors, with their tracebacks."""
+
+    def emit_error(self, target: Target, exc: Exception):
+        """Print the failing repository followed by the exception traceback."""
+        print(f"Error in {target.repo.fullname}")
+        traceback.print_exception(exc)
+
+    def emit_diff(self, target: Target, diff: Diff, cloned_repo: ClonedRepository):
+        """Diffs are intentionally not reported."""
+
+    def emit_result(self, result: Result):
+        """
+        The aggregated result is intentionally not reported.
+
+        Defined because `Emitter.emit_result` is abstract; without it this
+        class could not be instantiated.
+        """
+
+
+class JSONEmitter(Emitter):
+    """Emitter that prints the final result as a single JSON document."""
+
+    class DataclassJSONEncoder(json.JSONEncoder):
+        """JSON encoder for the non-JSON types that can appear in a `Result`."""
+
+        def default(self, o):
+            if dataclasses.is_dataclass(o) and not isinstance(o, type):
+                return dataclasses.asdict(o)
+            if isinstance(o, (set, frozenset)):
+                # Sets are unordered; sort so the output is stable between runs
+                return sorted(o, key=str)
+            if isinstance(o, Path):
+                return str(o)
+            if isinstance(o, BaseException):
+                # Failed targets are stored as exception instances, which would
+                # otherwise make serialization of the whole result fail
+                return {"type": type(o).__name__, "message": str(o)}
+            return super().default(o)
+
+    def emit_error(self, target: Target, exc: Exception):
+        """Errors are reported as part of the result, not individually."""
+
+    def emit_diff(self, target: Target, diff: Diff, cloned_repo: ClonedRepository):
+        """Diffs are reported as part of the result, not individually."""
+
+    def emit_result(self, result: Result):
+        """Print `result` to stdout as indented JSON."""
+        print(json.dumps(result, indent=4, cls=self.DataclassJSONEncoder))
+
+
+class MarkdownEmitter(Emitter):
+    """Emitter that prints Markdown with one `<details>` section per repository."""
+
+    def emit_error(self, target: Target, exc: Exception):
+        """Print a section containing the error message for `target`."""
+        self._print(title="error", content=f"```\n{exc}\n```", target=target)
+
+    def emit_diff(self, target: Target, diff: Diff, cloned_repo: ClonedRepository):
+        """
+        Print a section with the diff for `target`.
+
+        Lines that look like a violation are linked to the corresponding file
+        and line in the cloned repository.
+        """
+        changes = f"+{len(diff.added)}, -{len(diff.removed)}"
+
+        rendered = []
+        # `Diff` exposes its lines through `lines()`
+        for line in diff.lines():
+            match = CHECK_DIFF_LINE_RE.match(line)
+            if match is None:
+                rendered.append(line)
+                continue
+
+            pre, inner, path, lnum, post = match.groups()
+            url = cloned_repo.url_for(path, int(lnum))
+            rendered.append(f"{pre} <a href='{url}'>{inner}</a> {post}")
+
+        content = "".join(f"{line}\n" for line in rendered)
+        self._print(title=changes, content=f"<pre>\n{content}\n</pre>", target=target)
+
+    def emit_result(self, result: Result):
+        """
+        Print a summary line, one section per error and diff, and a table of
+        the changes per rule.
+        """
+        error_count = len(result.errors)
+        if result.total_removed == 0 and result.total_added == 0 and error_count == 0:
+            print("\u2705 ecosystem check detected no changes.")
+            return
+
+        s = "s" if error_count != 1 else ""
+        changes = f"(+{result.total_added}, -{result.total_removed}, {error_count} error{s})"
+
+        print(f"\u2139\ufe0f ecosystem check **detected changes**. {changes}")
+        print()
+
+        for target, exc in result.errors:
+            self.emit_error(target, exc)
+
+        for target, comparison in result.comparisons:
+            self.emit_diff(target, comparison.diff, comparison.repo)
+
+        rule_codes = result.total_rule_changes.rule_codes()
+        if len(rule_codes) > 0:
+            print(f"Rules changed: {len(rule_codes)}")
+            print()
+            print("| Rule | Changes | Additions | Removals |")
+            print("| ---- | ------- | --------- | -------- |")
+            # Rules with the most changes first
+            for rule, (additions, removals) in sorted(
+                result.total_rule_changes.items(),
+                key=lambda x: (x[1][0] + x[1][1]),
+                reverse=True,
+            ):
+                print(f"| {rule} | {additions + removals} | {additions} | {removals} |")
+
+    def _print(self, title: str, content: str, target: Target):
+        """Print `content` inside a collapsible section titled after `target`."""
+        print(f"<details><summary>{target.repo.fullname} ({title})</summary>")
+        print(target.repo.url, target.check_options.summary())
+        print("<p>")
+        print()
+
+        print(content)
+
+        print()
+        print("</p>")
+        print("</details>")
+
+
+class EmitterType(Enum):
+    """The available output formats."""
+
+    markdown = "markdown"
+    json = "json"
+
+    def to_emitter(self) -> Emitter:
+        """
+        Return a new emitter instance for this output format.
+
+        Raises:
+            ValueError: If there is no emitter for this member.
+        """
+        match self:
+            case self.markdown:
+                return MarkdownEmitter()
+            case self.json:
+                return JSONEmitter()
+            case _:
+                # The `f` prefix was missing, so the member was never interpolated
+                raise ValueError(f"Unknown emitter type {self}")
--- a/python/ruff-ecosystem/ruff_ecosystem/git.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/git.py
@@ -0,0 +1,72 @@
+from ruff_ecosystem.models import Repository, ClonedRepository
+from contextlib import asynccontextmanager
+from pathlib import Path
+from typing import AsyncGenerator
+from asyncio import create_subprocess_exec
+from subprocess import PIPE
+from ruff_ecosystem import logger
+
+
+@asynccontextmanager
+async def clone(
+    repo: Repository, checkout_dir: Path
+) -> AsyncGenerator[ClonedRepository, None]:
+    """
+    Shallow clone the repository into `checkout_dir`.
+
+    If `checkout_dir` already exists it is reused as-is and nothing is cloned.
+
+    Raises:
+        RuntimeError: If `git clone` exits with a non-zero status.
+    """
+    # Imported locally so this function does not depend on a file-level import
+    import os
+
+    if checkout_dir.exists():
+        logger.debug(f"Reusing {repo.owner}:{repo.name}")
+        yield await _cloned_repository(repo, checkout_dir)
+        return
+
+    logger.debug(f"Cloning {repo.owner}:{repo.name} to {checkout_dir}")
+    command = [
+        "git",
+        "clone",
+        "--config",
+        "advice.detachedHead=false",
+        "--quiet",
+        "--depth",
+        "1",
+        "--no-tags",
+    ]
+    if repo.branch:
+        command.extend(["--branch", repo.branch])
+
+    command.extend(
+        [
+            f"https://github.com/{repo.owner}/{repo.name}",
+            checkout_dir,
+        ],
+    )
+
+    # `env` replaces the whole environment of the child process, so extend the
+    # current one instead of dropping variables such as `PATH` and `HOME`
+    env = {**os.environ, "GIT_TERMINAL_PROMPT": "0"}
+    process = await create_subprocess_exec(*command, env=env)
+
+    status_code = await process.wait()
+
+    logger.debug(
+        f"Finished cloning {repo.fullname} with status {status_code}",
+    )
+    if status_code != 0:
+        raise RuntimeError(
+            f"Failed to clone {repo.fullname}: git exited with status {status_code}"
+        )
+
+    yield await _cloned_repository(repo, checkout_dir)
+
+async def _cloned_repository(repo: Repository, checkout_dir: Path) -> ClonedRepository:
+    """
+    Describe the checkout of `repo` at `checkout_dir`, including its commit hash.
+    """
+    commit_hash = await _get_commit_hash(checkout_dir)
+    return ClonedRepository(
+        owner=repo.owner,
+        name=repo.name,
+        branch=repo.branch,
+        commit_hash=commit_hash,
+        path=checkout_dir,
+    )
+
+
+async def _get_commit_hash(checkout_dir: Path) -> str:
+    """
+    Return the commit sha for the repository in the checkout directory.
+
+    Raises:
+        RuntimeError: If `git rev-parse HEAD` exits with a non-zero status.
+    """
+    process = await create_subprocess_exec(
+        "git",
+        "rev-parse",
+        "HEAD",
+        cwd=checkout_dir,
+        stdout=PIPE,
+    )
+    stdout, _ = await process.communicate()
+    # Raise explicitly: an `assert` is stripped under `python -O`, which would
+    # let a failed lookup return an empty hash
+    if process.returncode != 0:
+        raise RuntimeError(f"Failed to retrieve commit sha at {checkout_dir}")
+    return stdout.decode().strip()
--- a/python/ruff-ecosystem/ruff_ecosystem/main.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/main.py
@@ -0,0 +1,235 @@
+from ruff_ecosystem.models import (
+    RuffCommand,
+    Target,
+    Diff,
+    ClonedRepository,
+    RuleChanges,
+    CheckComparison,
+    Result,
+)
+from pathlib import Path
+from ruff_ecosystem import logger
+import asyncio
+from ruff_ecosystem.git import clone
+from ruff_ecosystem.ruff import ruff_check, ruff_format
+from ruff_ecosystem.emitters import Emitter
+import difflib
+from typing import TypeVar
+import re
+
+T = TypeVar("T")
+
+
+async def main(
+    command: RuffCommand,
+    ruff_baseline_executable: Path,
+    ruff_comparison_executable: Path,
+    targets: list[Target],
+    cache: Path | None,
+    emitter: Emitter,
+    max_parallelism: int = 50,
+    raise_on_failure: bool = False,
+) -> None:
+    """
+    Compare two Ruff executables on every target and emit the aggregated result.
+
+    Args:
+        command: The Ruff command to run.
+        ruff_baseline_executable: Executable producing the baseline output.
+        ruff_comparison_executable: Executable producing the output to compare.
+        targets: The repositories to check.
+        cache: Directory the repositories are cloned into.
+        emitter: Receives the final `Result` through `emit_result`.
+        max_parallelism: Maximum number of targets processed concurrently.
+        raise_on_failure: Propagate the first failure instead of collecting it
+            as an error in the result.
+    """
+    logger.debug("Using command %s", command.value)
+    logger.debug("Using baseline executable at %s", ruff_baseline_executable)
+    logger.debug("Using comparison executable at %s", ruff_comparison_executable)
+    logger.debug("Using cache directory %s", cache)
+    logger.debug("Checking %s targets", len(targets))
+
+    semaphore = asyncio.Semaphore(max_parallelism)
+
+    async def limited_parallelism(coroutine: T) -> T:
+        async with semaphore:
+            return await coroutine
+
+    outcomes: list[Exception | CheckComparison] = await asyncio.gather(
+        *[
+            limited_parallelism(
+                clone_and_compare(
+                    command,
+                    ruff_baseline_executable,
+                    ruff_comparison_executable,
+                    target,
+                    cache,
+                )
+            )
+            for target in targets
+        ],
+        return_exceptions=not raise_on_failure,
+    )
+
+    # Split the outcomes into errors and comparisons while summing the totals
+    total_removed = total_added = 0
+    total_rule_changes = RuleChanges()
+    errors = []
+    comparisons = []
+    for target, outcome in zip(targets, outcomes, strict=True):
+        if isinstance(outcome, Exception):
+            errors.append((target, outcome))
+            continue
+
+        total_removed += len(outcome.diff.removed)
+        total_added += len(outcome.diff.added)
+        total_rule_changes += outcome.rule_changes
+
+        # Only comparisons with changes are reported
+        if outcome.diff:
+            comparisons.append((target, outcome))
+
+    result = Result(
+        total_added=total_added,
+        total_removed=total_removed,
+        total_rule_changes=total_rule_changes,
+        comparisons=comparisons,
+        errors=errors,
+    )
+
+    emitter.emit_result(result)
+
+
+async def clone_and_compare(
+    command: RuffCommand,
+    ruff_baseline_executable: Path,
+    ruff_comparison_executable: Path,
+    target: Target,
+    cache: Path,
+) -> CheckComparison:
+    """
+    Check a specific repository against two versions of ruff.
+
+    The repository is cloned into `cache` and both executables are run on the
+    checkout concurrently.
+
+    Raises:
+        ValueError: If the repository owner or name contains a colon, or if
+            `command` is not supported.
+    """
+    # The checkout directory is named `<owner>:<name>`, so neither part may
+    # contain the separator. Raised explicitly since `assert` is stripped
+    # under `python -O`.
+    if ":" in target.repo.owner or ":" in target.repo.name:
+        raise ValueError(
+            f"Repository owner and name must not contain ':': {target.repo.fullname}"
+        )
+
+    match command:
+        case RuffCommand.check:
+            ruff_task, create_comparison, options = (
+                ruff_check,
+                create_check_comparison,
+                target.check_options,
+            )
+        case RuffCommand.format:
+            # NOTE(review): `create_format_comparison` is not defined in the
+            # part of this module visible here; confirm it exists, otherwise
+            # this branch raises `NameError`.
+            ruff_task, create_comparison, options = (
+                ruff_format,
+                create_format_comparison,
+                target.format_options,
+            )
+        case _:
+            raise ValueError(f"Unknown target Ruff command {command}")
+
+    checkout_dir = cache.joinpath(f"{target.repo.owner}:{target.repo.name}")
+    async with clone(target.repo, checkout_dir) as cloned_repo:
+        try:
+            async with asyncio.TaskGroup() as tg:
+                baseline_task = tg.create_task(
+                    ruff_task(
+                        executable=ruff_baseline_executable.resolve(),
+                        path=cloned_repo.path,
+                        name=cloned_repo.fullname,
+                        options=options,
+                    ),
+                )
+                comparison_task = tg.create_task(
+                    ruff_task(
+                        executable=ruff_comparison_executable.resolve(),
+                        path=cloned_repo.path,
+                        name=cloned_repo.fullname,
+                        options=options,
+                    ),
+                )
+        except ExceptionGroup as e:
+            # Surface the first failure on its own rather than the whole group
+            raise e.exceptions[0] from e
+
+    return create_comparison(
+        cloned_repo, baseline_task.result(), comparison_task.result()
+    )
+
+
+def create_check_comparison(
+    repo: ClonedRepository, baseline_output: list[str], comparison_output: list[str]
+) -> CheckComparison:
+    """
+    Build the comparison between two `ruff check` outputs for `repo`.
+
+    Both outputs are sequences of lines; `difflib.ndiff` compares its inputs
+    element by element, so passing a single string would diff characters.
+    """
+    removed, added = set(), set()
+
+    for line in difflib.ndiff(baseline_output, comparison_output):
+        # Lines starting with "? " or "  " are hints and unchanged lines
+        if line.startswith("- "):
+            removed.add(line[2:])
+        elif line.startswith("+ "):
+            added.add(line[2:])
+
+    diff = Diff(removed=removed, added=added)
+
+    return CheckComparison(
+        diff=diff, repo=repo, rule_changes=rule_changes_from_diff(diff)
+    )
+
+
+def rule_changes_from_diff(diff: Diff) -> RuleChanges:
+    """
+    Count the additions and removals per rule in a diff from `ruff check`.
+    """
+    # Rule code as it appears in
+    # + <rule>/<path>:<line>:<column>: <rule_code> <message>
+    rule_code_pattern = re.compile(r": ([A-Z]{1,4}[0-9]{3,4})")
+
+    totals = RuleChanges()
+    for line in diff.lines():
+        found = rule_code_pattern.search(line)
+        if found is None:
+            # Lines without a rule code do occur, e.g. a source line ending
+            # in `# noqa: E501, ERA001` was found in local testing
+            continue
+
+        code = found.group(1)
+        additions, removals = totals[code]
+
+        # The first character marks the line as an addition or a removal
+        marker = line[0]
+        if marker == "+":
+            additions += 1
+        elif marker == "-":
+            removals += 1
+
+        totals[code] = (additions, removals)
+
+    return totals
--- a/python/ruff-ecosystem/ruff_ecosystem/models.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/models.py
@@ -0,0 +1,160 @@
+from enum import Enum
+from dataclasses import dataclass, field
+from typing import Self, Iterator
+import heapq
+from pathlib import Path
+
+
+class RuffCommand(Enum):
+    """
+    The Ruff commands that can be compared
+    """
+
+    check = "check"
+    format = "format"
+
+
+@dataclass(frozen=True)
+class Repository:
+    """
+    A repository hosted on GitHub, identified by owner, name and optional branch
+    """
+
+    owner: str
+    name: str
+    branch: str | None
+
+    @property
+    def fullname(self) -> str:
+        """The `<owner>/<name>` identifier of the repository."""
+        return "/".join((self.owner, self.name))
+
+    @property
+    def url(self: Self) -> str:
+        """The GitHub URL of the repository."""
+        return f"https://github.com/{self.fullname}"
+
+
+@dataclass(frozen=True)
+class ClonedRepository(Repository):
+    """
+    A GitHub repository that has been cloned, with the checkout path and the
+    hash of the cloned commit.
+    """
+
+    commit_hash: str
+    path: Path
+
+    def url_for(self: Self, path: str, line_number: int | None = None) -> str:
+        """
+        Return the remote GitHub URL for the given path in this repository,
+        optionally anchored at a line.
+        """
+        # The link is pinned to the cloned commit hash
+        file_url = f"https://github.com/{self.owner}/{self.name}/blob/{self.commit_hash}/{path}"
+        if not line_number:
+            return file_url
+        return file_url + f"#L{line_number}"
+
+    @property
+    def url(self: Self) -> str:
+        """The repository URL suffixed with `@<commit hash>`."""
+        return f"https://github.com/{self.owner}/{self.name}@{self.commit_hash}"
+
+
+@dataclass(frozen=True)
+class Diff:
+    """A diff between two runs of ruff."""
+
+    removed: set[str]
+    added: set[str]
+
+    def __bool__(self: Self) -> bool:
+        """Return true if this diff is non-empty."""
+        return bool(self.removed or self.added)
+
+    def __iter__(self: Self) -> Iterator[str]:
+        """Iterate through the changed lines, same as `lines()`."""
+        return self.lines()
+
+    def lines(self: Self) -> Iterator[str]:
+        """
+        Iterate through the changed lines in diff format, sorted by content.
+
+        A line present in both sets is yielded once as a removal and once as
+        an addition, removal first.
+        """
+        # Carry the marker with each line instead of looking the line up in
+        # `removed`, which would mark a line present in both sets as removed
+        # twice.
+        removed = ((line, "-") for line in sorted(self.removed))
+        added = ((line, "+") for line in sorted(self.added))
+        for line, marker in heapq.merge(removed, added, key=lambda item: item[0]):
+            yield f"{marker} {line}"
+
+
+@dataclass(frozen=True)
+class RuleChanges:
+    """
+    The number of (additions, removals) per rule code
+    """
+
+    changes: dict[str, tuple[int, int]] = field(default_factory=dict)
+
+    def rule_codes(self) -> list[str]:
+        """Return the rule codes that have an entry."""
+        return list(self.changes)
+
+    def items(self) -> Iterator[tuple[str, tuple[int, int]]]:
+        """Return the (rule code, (additions, removals)) pairs."""
+        return self.changes.items()
+
+    def __setitem__(self, key: str, value: tuple[int, int]) -> None:
+        self.changes[key] = value
+
+    def __getitem__(self, key: str) -> tuple[int, int]:
+        # Unknown rule codes count as no changes instead of raising
+        return self.changes.get(key, (0, 0))
+
+    def __add__(self, other: Self) -> Self:
+        """Return a new instance with the counts of both operands summed per rule."""
+        if not isinstance(other, type(self)):
+            return NotImplemented
+
+        merged = dict(self.changes)
+        for rule_code, (added, removed) in other.changes.items():
+            previous_added, previous_removed = merged.get(rule_code, (0, 0))
+            merged[rule_code] = (previous_added + added, previous_removed + removed)
+
+        return RuleChanges(changes=merged)
+
+
+@dataclass(frozen=True)
+class CheckComparison:
+    """
+    The outcome of comparing two Ruff runs on a single cloned repository
+    """
+
+    # Lines that differ between the two runs
+    diff: Diff
+    # The checkout both runs were executed on
+    repo: ClonedRepository
+    # Additions and removals per rule
+    rule_changes: RuleChanges
+
+
+@dataclass(frozen=True)
+class CheckOptions:
+    """
+    Options for running `ruff check` on a target
+    """
+
+    select: str = ""
+    ignore: str = ""
+    exclude: str = ""
+
+    # Off by default: generating fixes is slow and verbose
+    show_fixes: bool = False
+
+    def summary(self) -> str:
+        """Return the select, ignore and exclude values on a single line."""
+        text = f"select {self.select} ignore {self.ignore} exclude {self.exclude}"
+        return text
+
+
+@dataclass(frozen=True)
+class FormatOptions:
+    """
+    Options for running `ruff format` on a target; there are none yet
+    """
+
+
+@dataclass(frozen=True)
+class Target:
+    """
+    A repository to run the ecosystem checks on, with its per-command options
+    """
+
+    repo: Repository
+    # Both option sets fall back to their defaults when not given
+    check_options: CheckOptions = field(default_factory=CheckOptions)
+    format_options: FormatOptions = field(default_factory=FormatOptions)
+
+
+@dataclass(frozen=True)
+class Result:
+    """
+    The aggregated outcome of an ecosystem check over all targets
+    """
+
+    total_added: int
+    total_removed: int
+    total_rule_changes: RuleChanges
+
+    # One (target, comparison) pair per target with changes
+    comparisons: list[tuple[Target, CheckComparison]]
+    # One (target, exception) pair per target that failed
+    errors: list[tuple[Target, Exception]]
--- a/python/ruff-ecosystem/ruff_ecosystem/ruff.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/ruff.py
@@ -0,0 +1,90 @@
+from pathlib import Path
+from ruff_ecosystem import logger
+from ruff_ecosystem.models import CheckOptions, FormatOptions
+import time
+from asyncio import create_subprocess_exec
+from subprocess import PIPE
+from typing import Sequence
+import re
+
+# Matches the summary lines ("Found N error..." and the "potentially fixable
+# with" hint) that `ruff_check` drops from the output.
+CHECK_SUMMARY_LINE_RE = re.compile(
+    r"^(Found \d+ error.*)|(.*potentially fixable with.*)$"
+)
+
+
+# Matches a diff line of the form `<+|-> <path>:<line>:<column>: <rest>`.
+CHECK_DIFF_LINE_RE = re.compile(
+    r"^(?P<pre>[+-]) (?P<inner>(?P<path>[^:]+):(?P<lnum>\d+):\d+:) (?P<post>.*)$",
+)
+
+
+class RuffError(Exception):
+    """Raised with ruff's stderr output when ruff exits with a non-zero status."""
+
+
+async def ruff_check(
+    *, executable: Path, path: Path, name: str, options: CheckOptions
+) -> Sequence[str]:
+    """
+    Run `ruff check` with the given executable in `path`.
+
+    Returns the sorted output lines without the summary lines. Raises
+    `RuffError` with the stderr output if ruff exits with a non-zero status.
+    """
+    logger.debug(f"Checking {name} with {executable}")
+
+    ruff_args = ["check", "--no-cache", "--exit-zero"]
+    # Only pass the options that are set
+    for flag, value in (
+        ("--select", options.select),
+        ("--ignore", options.ignore),
+        ("--exclude", options.exclude),
+    ):
+        if value:
+            ruff_args += [flag, value]
+    if options.show_fixes:
+        ruff_args += ["--show-fixes", "--ecosystem-ci"]
+
+    start = time.time()
+    proc = await create_subprocess_exec(
+        executable.absolute(),
+        *ruff_args,
+        ".",
+        stdout=PIPE,
+        stderr=PIPE,
+        cwd=path,
+    )
+    stdout, stderr = await proc.communicate()
+    end = time.time()
+
+    logger.debug(f"Finished checking {name} with {executable} in {end - start:.2f}")
+
+    if proc.returncode != 0:
+        raise RuffError(stderr.decode("utf8"))
+
+    return sorted(
+        line
+        for line in stdout.decode("utf8").splitlines()
+        if not CHECK_SUMMARY_LINE_RE.match(line)
+    )
+
+
+async def ruff_format(
+    *, executable: Path, path: Path, name: str, options: FormatOptions
+) -> Sequence[str]:
+    """
+    Run `ruff format` with the given executable in `path`.
+
+    Returns the output lines in the order ruff printed them. Raises
+    `RuffError` with the stderr output if ruff exits with a non-zero status.
+    `options` is currently unused.
+    """
+    logger.debug(f"Formatting {name} with {executable}")
+    # NOTE(review): confirm that `ruff format` accepts `--exit-zero`; it is
+    # carried over from the `ruff check` invocation.
+    ruff_args = ["format", "--no-cache", "--exit-zero"]
+
+    start = time.time()
+    proc = await create_subprocess_exec(
+        executable.absolute(),
+        *ruff_args,
+        ".",
+        stdout=PIPE,
+        stderr=PIPE,
+        cwd=path,
+    )
+    result, err = await proc.communicate()
+    end = time.time()
+
+    logger.debug(f"Finished formatting {name} with {executable} in {end - start:.2f}")
+
+    if proc.returncode != 0:
+        raise RuffError(err.decode("utf8"))
+
+    return result.decode("utf8").splitlines()