Add formatter ecosystem checks

2026-01-21 05:20:49 -05:00 · 2023-10-24 08:57:45 -05:00
parent 4b79f57872
commit ab49eaae61
15 changed files with 1018 additions and 651 deletions
--- a/python/ruff-ecosystem/pyproject.toml
+++ b/python/ruff-ecosystem/pyproject.toml
@@ -8,3 +8,7 @@ version = "0.0.0"

 [project.scripts]
 ruff-ecosystem = "ruff_ecosystem.cli:entrypoint"
+
+
+[tool.ruff]
+extend-select = ["I"]
--- a/python/ruff-ecosystem/ruff_ecosystem/main.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/main.py
@@ -2,7 +2,7 @@
 Enables usage with `python -m ruff_ecosystem`
 """

-from ruff_ecosystem.cli import entrypoint
+import ruff_ecosystem.cli

 if __name__ == "__main__":
-    entrypoint()
+    ruff_ecosystem.cli.entrypoint()
--- a/python/ruff-ecosystem/ruff_ecosystem/check.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/check.py
@@ -0,0 +1,341 @@
+from __future__ import annotations
+
+import asyncio
+import re
+import time
+from asyncio import create_subprocess_exec
+from collections import Counter
+from dataclasses import dataclass, field
+from pathlib import Path
+from subprocess import PIPE
+from typing import TYPE_CHECKING, Any, Iterator, Self, Sequence
+
+from ruff_ecosystem import logger
+from ruff_ecosystem.types import (
+    Comparison,
+    Diff,
+    Result,
+    RuffError,
+    Serializable,
+)
+from ruff_ecosystem.markdown import project_section
+
+if TYPE_CHECKING:
+    from ruff_ecosystem.projects import ClonedRepository, Project
+
+
+# Matches lines that are summaries rather than diagnostics
+CHECK_SUMMARY_LINE_RE = re.compile(r"^(Found \d+ error.*)|(.* fixable with .*)$")
+
+# Parses a diagnostic line (in a diff) to retrieve path and line number
+CHECK_DIFF_LINE_RE = re.compile(
+    r"^(?P<pre>[+-]) (?P<inner>(?P<path>[^:]+):(?P<lnum>\d+):\d+:) (?P<post>.*)$",
+)
+
+
+async def compare_check(
+    ruff_baseline_executable: Path,
+    ruff_comparison_executable: Path,
+    options: CheckOptions,
+    cloned_repo: ClonedRepository,
+) -> Comparison:
+    async with asyncio.TaskGroup() as tg:
+        baseline_task = tg.create_task(
+            ruff_check(
+                executable=ruff_baseline_executable.resolve(),
+                path=cloned_repo.path,
+                name=cloned_repo.fullname,
+                options=options,
+            ),
+        )
+        comparison_task = tg.create_task(
+            ruff_check(
+                executable=ruff_comparison_executable.resolve(),
+                path=cloned_repo.path,
+                name=cloned_repo.fullname,
+                options=options,
+            ),
+        )
+
+    baseline_output, comparison_output = (
+        baseline_task.result(),
+        comparison_task.result(),
+    )
+    diff = Diff.new(baseline_output, comparison_output)
+
+    return Comparison(diff=diff, repo=cloned_repo)
+
+
+def summarize_check_result(result: Result) -> str:
+    # Calculate the total number of rule changes
+    all_rule_changes = RuleChanges()
+    for _, comparison in result.completed:
+        all_rule_changes.update(RuleChanges.from_diff(comparison.diff))
+
+    lines = []
+    total_removed = all_rule_changes.total_removed()
+    total_added = all_rule_changes.total_added()
+    error_count = len(result.errored)
+
+    if total_removed == 0 and total_added == 0 and error_count == 0:
+        return "\u2705 ecosystem check detected no `ruff check` changes."
+
+    # Summarize the total changes
+    s = "s" if error_count != 1 else ""
+    changes = f"(+{total_added}, -{total_removed}, {error_count} error{s})"
+
+    lines.append(
+        f"\u2139\ufe0f ecosystem check **detected `ruff check` changes**. {changes}"
+    )
+    lines.append("")
+
+    # Then per-project changes
+    for project, comparison in result.completed:
+        if not comparison.diff:
+            continue  # Skip empty diffs
+
+        diff = deduplicate_and_sort_diff(comparison.diff)
+
+        # Display the output diff
+        diff_lines = []
+        for line in limit_rule_lines(diff, project.check_options.max_lines_per_rule):
+            diff_lines.append(add_permalink_to_diagnostic_line(comparison.repo, line))
+
+        lines.extend(
+            project_section(
+                title=f"+{diff.lines_added}, -{diff.lines_removed}",
+                content="\n".join(diff_lines),
+                options=project.check_options.markdown(),
+                project=project,
+            )
+        )
+
+    for project, error in result.errored:
+        lines.extend(
+            project_section(
+                title="error",
+                content=str(error),
+                options="",
+                project=project,
+            )
+        )
+
+    # Display a summary table of changed rules
+    if all_rule_changes:
+        lines.append(f"Rules changed: {len(all_rule_changes.rule_codes())}")
+        lines.append("")
+        lines.append("| Rule | Changes | Additions | Removals |")
+        lines.append("| ---- | ------- | --------- | -------- |")
+        for rule, total in sorted(
+            all_rule_changes.total_changes_by_rule(),
+            key=lambda item: item[1],  # Sort by the total changes
+            reverse=True,
+        ):
+            additions, removals = (
+                all_rule_changes.added[rule],
+                all_rule_changes.removed[rule],
+            )
+            lines.append(f"| {rule} | {total} | {additions} | {removals} |")
+
+    return "\n".join(lines)
+
+
+def add_permalink_to_diagnostic_line(repo: ClonedRepository, line: str) -> str:
+    match = CHECK_DIFF_LINE_RE.match(line)
+    if match is None:
+        return line
+
+    pre, inner, path, lnum, post = match.groups()
+    url = repo.url_for(path, int(lnum))
+    return f"{pre} <a href='{url}'>{inner}</a> {post}"
+
+
+async def ruff_check(
+    *, executable: Path, path: Path, name: str, options: CheckOptions
+) -> Sequence[str]:
+    """Run the given ruff binary against the specified path."""
+    logger.debug(f"Checking {name} with {executable}")
+    ruff_args = ["check", "--no-cache", "--exit-zero"]
+    if options.select:
+        ruff_args.extend(["--select", options.select])
+    if options.ignore:
+        ruff_args.extend(["--ignore", options.ignore])
+    if options.exclude:
+        ruff_args.extend(["--exclude", options.exclude])
+    if options.show_fixes:
+        ruff_args.extend(["--show-fixes", "--ecosystem-ci"])
+
+    start = time.time()
+    proc = await create_subprocess_exec(
+        executable.absolute(),
+        *ruff_args,
+        ".",
+        stdout=PIPE,
+        stderr=PIPE,
+        cwd=path,
+    )
+    result, err = await proc.communicate()
+    end = time.time()
+
+    logger.debug(f"Finished checking {name} with {executable} in {end - start:.2f}s")
+
+    if proc.returncode != 0:
+        raise RuffError(err.decode("utf8"))
+
+    # Strip summary lines so the diff is only diagnostic lines
+    lines = [
+        line
+        for line in result.decode("utf8").splitlines()
+        if not CHECK_SUMMARY_LINE_RE.match(line)
+    ]
+
+    return lines
+
+
+@dataclass(frozen=True)
+class CheckOptions(Serializable):
+    """
+    Ruff check options
+    """
+
+    select: str = ""
+    ignore: str = ""
+    exclude: str = ""
+
+    # Generating fixes is slow and verbose
+    show_fixes: bool = False
+
+    # Limit the number of reported lines per rule
+    max_lines_per_rule: int | None = 50
+
+    def markdown(self) -> str:
+        return f"select {self.select} ignore {self.ignore} exclude {self.exclude}"
+
+
+@dataclass(frozen=True)
+class RuleChanges:
+    """
+    The number of additions and removals by rule code
+    """
+
+    added: Counter = field(default_factory=Counter)
+    removed: Counter = field(default_factory=Counter)
+
+    def rule_codes(self) -> set[str]:
+        return set(self.added.keys()).union(self.removed.keys())
+
+    def __add__(self, other: Self) -> Self:
+        if not isinstance(other, type(self)):
+            return NotImplemented
+
+        new = RuleChanges()
+        new.update(self)
+        new.update(other)
+        return new
+
+    def update(self, other: Self) -> Self:
+        self.added.update(other.added)
+        self.removed.update(other.removed)
+        return self
+
+    def total_added(self) -> int:
+        return sum(self.added.values())
+
+    def total_removed(self) -> int:
+        return sum(self.removed.values())
+
+    def total_changes_by_rule(self) -> Iterator[str, int]:
+        """
+        Yields the sum of changes for each rule
+        """
+        totals = Counter()
+        totals.update(self.added)
+        totals.update(self.removed)
+        yield from totals.items()
+
+    @classmethod
+    def from_diff(cls: type[Self], diff: Diff) -> Self:
+        """
+        Parse a diff from `ruff check` to determine the additions and removals for each rule
+        """
+        rule_changes = cls()
+
+        for line in sorted(diff.added):
+            code = parse_rule_code(line)
+            if code is not None:
+                rule_changes.added[code] += 1
+
+        for line in sorted(diff.removed):
+            code = parse_rule_code(line)
+            if code is not None:
+                rule_changes.removed[code] += 1
+
+        return rule_changes
+
+    def __bool__(self):
+        return bool(self.added or self.removed)
+
+
+def parse_rule_code(line: str) -> str | None:
+    """
+    Parse the rule code from a diagnostic line
+
+    + <rule>/<path>:<line>:<column>: <rule_code> <message>
+    """
+    matches = re.search(r": ([A-Z]{1,4}[0-9]{3,4})", line)
+
+    if matches is None:
+        # Handle case where there are no regex matches e.g.
+        # +                 "?application=AIRFLOW&authenticator=TEST_AUTH&role=TEST_ROLE&warehouse=TEST_WAREHOUSE" # noqa: E501, ERA001
+        # Which was found in local testing
+        return None
+
+    return matches.group(1)
+
+
+def deduplicate_and_sort_diff(diff: Diff) -> Diff:
+    """
+    Removes any duplicate lines and any unchanged lines from the diff.
+    """
+    lines = set()
+    for line in diff:
+        if line.startswith("+ "):
+            lines.add(line)
+        elif line.startswith("- "):
+            lines.add(line)
+
+    # Sort without the leading + or -
+    return Diff(list(sorted(lines, key=lambda line: line[2:])))
+
+
+def limit_rule_lines(diff: Diff, max_per_rule: int | None = 100) -> list[str]:
+    """
+    Reduce the diff to include a maximum number of lines for each rule.
+    """
+    if max_per_rule is None:
+        return diff
+
+    counts = Counter()
+    reduced = []
+
+    for line in diff:
+        code = parse_rule_code(line)
+
+        # Do not omit any unparsable lines
+        if not code:
+            reduced.append(line)
+            continue
+
+        counts[code] += 1
+        if counts[code] > max_per_rule:
+            continue
+
+        reduced.append(line)
+
+    # Add lines summarizing the omitted changes
+    for code, count in counts.items():
+        hidden_count = count - max_per_rule
+        if hidden_count > 0:
+            reduced.append(f"{hidden_count} changes omitted for rule {code}")
+
+    return reduced
--- a/python/ruff-ecosystem/ruff_ecosystem/cli.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/cli.py
@@ -1,16 +1,18 @@
 import argparse
 import asyncio
 import logging
+import os
+import shutil
+import sys
+import sysconfig
 import tempfile
-from pathlib import Path
 from contextlib import nullcontext
-from ruff_ecosystem.models import RuffCommand
-from ruff_ecosystem.emitters import EmitterType
-from ruff_ecosystem.defaults import DEFAULT_TARGETS
-from ruff_ecosystem.main import main
+from pathlib import Path
 from signal import SIGINT, SIGTERM

-import sys
+from ruff_ecosystem.defaults import DEFAULT_TARGETS
+from ruff_ecosystem.main import OutputFormat, main
+from ruff_ecosystem.projects import RuffCommand


 def excepthook(type, value, tb):
@@ -18,7 +20,8 @@ def excepthook(type, value, tb):
        # we are in interactive mode or we don't have a tty so call the default
        sys.__excepthook__(type, value, tb)
    else:
-        import traceback, pdb
+        import pdb
+        import traceback

        traceback.print_exception(type, value, tb)
        print()
@@ -49,7 +52,7 @@ def entrypoint():
                ruff_baseline_executable=args.ruff_baseline,
                ruff_comparison_executable=args.ruff_comparison,
                targets=DEFAULT_TARGETS,
-                emitter=EmitterType(args.output_format).to_emitter(),
+                format=OutputFormat(args.output_format),
                cache=Path(cache),
                raise_on_failure=args.pdb,
            )
@@ -84,7 +87,7 @@ def parse_args() -> argparse.Namespace:
    )
    parser.add_argument(
        "--output-format",
-        choices=[option.name for option in EmitterType],
+        choices=[option.name for option in OutputFormat],
        default="json",
        help="Location for caching cloned repositories",
    )
@@ -114,3 +117,21 @@ def parse_args() -> argparse.Namespace:
    )

    return parser.parse_args()
+
+
+def get_executable_path(name: str) -> str | None:
+    # Add suffix for Windows executables
+    name += ".exe" if sys.platform == "win32" else ""
+
+    path = os.path.join(sysconfig.get_path("scripts"), name)
+
+    # The executable in the current interpreter's scripts directory.
+    if os.path.exists(path):
+        return path
+
+    # The executable in the global environment.
+    environment_path = shutil.which("ruff")
+    if environment_path:
+        return environment_path
+
+    return None
--- a/python/ruff-ecosystem/ruff_ecosystem/defaults.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/defaults.py
@@ -1,66 +1,64 @@
-from .models import Repository, CheckOptions, Target
+from ruff_ecosystem.projects import CheckOptions, Project, Repository

-# TODO: Consider exporting this as JSON instead for consistent setup
+# TODO: Consider exporting this as JSON
 DEFAULT_TARGETS = [
-    # Target(repo=Repository(owner="DisnakeDev", name="disnake", branch="master")),
-    # Target(repo=Repository(owner="PostHog", name="HouseWatch", branch="main")),
-    # Target(repo=Repository(owner="RasaHQ", name="rasa", branch="main")),
-    # Target(repo=Repository(owner="Snowflake-Labs", name="snowcli", branch="main")),
-    # Target(repo=Repository(owner="aiven", name="aiven-client", branch="main")),
-    # Target(repo=Repository(owner="alteryx", name="featuretools", branch="main")),
-    # Target(
-    #     repo=Repository(owner="apache", name="airflow", branch="main"),
+    # Project(repo=Repository(owner="DisnakeDev", name="disnake", ref="master")),
+    # Project(repo=Repository(owner="PostHog", name="HouseWatch", ref="main")),
+    # Project(repo=Repository(owner="RasaHQ", name="rasa", ref="main")),
+    # Project(repo=Repository(owner="Snowflake-Labs", name="snowcli", ref="main")),
+    # Project(repo=Repository(owner="aiven", name="aiven-client", ref="main")),
+    # Project(repo=Repository(owner="alteryx", name="featuretools", ref="main")),
+    # Project(
+    #     repo=Repository(owner="apache", name="airflow", ref="main"),
    #     check_options=CheckOptions(select="ALL"),
    # ),
-    # Target(repo=Repository(owner="aws", name="aws-sam-cli", branch="develop")),
-    # Target(repo=Repository(owner="bloomberg", name="pytest-memray", branch="main")),
-    # Target(
-    #     repo=Repository(owner="bokeh", name="bokeh", branch="branch-3.3"),
-    #     check_options=CheckOptions(select="ALL"),
-    # ),
-    # Target(repo=Repository(owner="commaai", name="openpilot", branch="master")),
-    # Target(repo=Repository(owner="demisto", name="content", branch="master")),
-    # Target(repo=Repository(owner="docker", name="docker-py", branch="main")),
-    # Target(
-    #     repo=Repository(owner="freedomofpress", name="securedrop", branch="develop")
-    # ),
-    # Target(repo=Repository(owner="fronzbot", name="blinkpy", branch="dev")),
-    # Target(repo=Repository(owner="ibis-project", name="ibis", branch="master")),
-    # Target(repo=Repository(owner="ing-bank", name="probatus", branch="main")),
-    # Target(repo=Repository(owner="jrnl-org", name="jrnl", branch="develop")),
-    # Target(repo=Repository(owner="latchbio", name="latch", branch="main")),
-    # Target(repo=Repository(owner="lnbits", name="lnbits", branch="main")),
-    # Target(repo=Repository(owner="milvus-io", name="pymilvus", branch="master")),
-    # Target(repo=Repository(owner="mlflow", name="mlflow", branch="master")),
-    # Target(repo=Repository(owner="model-bakers", name="model_bakery", branch="main")),
-    # Target(repo=Repository(owner="pandas-dev", name="pandas", branch="main")),
-    # Target(repo=Repository(owner="prefecthq", name="prefect", branch="main")),
-    # Target(repo=Repository(owner="pypa", name="build", branch="main")),
-    # Target(repo=Repository(owner="pypa", name="cibuildwheel", branch="main")),
-    # Target(repo=Repository(owner="pypa", name="pip", branch="main")),
-    # Target(repo=Repository(owner="pypa", name="setuptools", branch="main")),
-    # Target(repo=Repository(owner="python", name="mypy", branch="master")),
-    # Target(
+    # Project(repo=Repository(owner="aws", name="aws-sam-cli", ref="develop")),
+    # Project(repo=Repository(owner="bloomberg", name="pytest-memray", ref="main")),
+    Project(
+        repo=Repository(owner="bokeh", name="bokeh", ref="branch-3.3"),
+        check_options=CheckOptions(select="ALL"),
+    ),
+    # Project(repo=Repository(owner="commaai", name="openpilot", ref="master")),
+    # Project(repo=Repository(owner="demisto", name="content", ref="master")),
+    # Project(repo=Repository(owner="docker", name="docker-py", ref="main")),
+    # Project(repo=Repository(owner="freedomofpress", name="securedrop", ref="develop")),
+    # Project(repo=Repository(owner="fronzbot", name="blinkpy", ref="dev")),
+    # Project(repo=Repository(owner="ibis-project", name="ibis", ref="master")),
+    # Project(repo=Repository(owner="ing-bank", name="probatus", ref="main")),
+    # Project(repo=Repository(owner="jrnl-org", name="jrnl", ref="develop")),
+    # Project(repo=Repository(owner="latchbio", name="latch", ref="main")),
+    # Project(repo=Repository(owner="lnbits", name="lnbits", ref="main")),
+    # Project(repo=Repository(owner="milvus-io", name="pymilvus", ref="master")),
+    # Project(repo=Repository(owner="mlflow", name="mlflow", ref="master")),
+    # Project(repo=Repository(owner="model-bakers", name="model_bakery", ref="main")),
+    # Project(repo=Repository(owner="pandas-dev", name="pandas", ref="main")),
+    # Project(repo=Repository(owner="prefecthq", name="prefect", ref="main")),
+    # Project(repo=Repository(owner="pypa", name="build", ref="main")),
+    # Project(repo=Repository(owner="pypa", name="cibuildwheel", ref="main")),
+    # Project(repo=Repository(owner="pypa", name="pip", ref="main")),
+    # Project(repo=Repository(owner="pypa", name="setuptools", ref="main")),
+    # Project(repo=Repository(owner="python", name="mypy", ref="master")),
+    # Project(
    #     repo=Repository(
    #         owner="python",
    #         name="typeshed",
-    #         branch="main",
+    #         ref="main",
    #     ),
    #     check_options=CheckOptions(select="PYI"),
    # ),
-    # Target(repo=Repository(owner="python-poetry", name="poetry", branch="master")),
-    # Target(repo=Repository(owner="reflex-dev", name="reflex", branch="main")),
-    # Target(repo=Repository(owner="rotki", name="rotki", branch="develop")),
-    # Target(repo=Repository(owner="scikit-build", name="scikit-build", branch="main")),
-    # Target(
-    #     repo=Repository(owner="scikit-build", name="scikit-build-core", branch="main")
+    # Project(repo=Repository(owner="python-poetry", name="poetry", ref="master")),
+    # Project(repo=Repository(owner="reflex-dev", name="reflex", ref="main")),
+    # Project(repo=Repository(owner="rotki", name="rotki", ref="develop")),
+    # Project(repo=Repository(owner="scikit-build", name="scikit-build", ref="main")),
+    # Project(
+    #     repo=Repository(owner="scikit-build", name="scikit-build-core", ref="main")
+    # ),
+    # Project(repo=Repository(owner="sphinx-doc", name="sphinx", ref="master")),
+    # Project(repo=Repository(owner="spruceid", name="siwe-py", ref="main")),
+    # Project(repo=Repository(owner="tiangolo", name="fastapi", ref="master")),
+    # Project(repo=Repository(owner="yandex", name="ch-backup", ref="main")),
+    # Project(
+    #     repo=Repository(owner="zulip", name="zulip", ref="main"),
+    #     check_options=CheckOptions(select="ALL"),
    # ),
-    # Target(repo=Repository(owner="sphinx-doc", name="sphinx", branch="master")),
-    # Target(repo=Repository(owner="spruceid", name="siwe-py", branch="main")),
-    # Target(repo=Repository(owner="tiangolo", name="fastapi", branch="master")),
-    # Target(repo=Repository(owner="yandex", name="ch-backup", branch="main")),
-    Target(
-        repo=Repository(owner="zulip", name="zulip", branch="main"),
-        check_options=CheckOptions(select="ALL"),
-    ),
 ]
--- a/python/ruff-ecosystem/ruff_ecosystem/emitters.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/emitters.py
@@ -1,99 +0,0 @@
-from enum import Enum
-import abc
-from ruff_ecosystem.models import Target, Diff, ClonedRepository, Result
-from ruff_ecosystem.ruff import CHECK_DIFF_LINE_RE
-import traceback
-import json
-from pathlib import Path
-import dataclasses
-
-
-class Emitter(abc.ABC):
-    @abc.abstractclassmethod
-    def emit_error(cls, target: Target, exc: Exception):
-        pass
-
-    @abc.abstractclassmethod
-    def emit_diff(cls, target: Target, diff: Diff, cloned_repo: ClonedRepository):
-        pass
-
-    @abc.abstractclassmethod
-    def emit_result(cls, result: Result):
-        pass
-
-
-class DebugEmitter(Emitter):
-    def emit_error(cls, target: Target, exc: Exception):
-        print(f"Error in {target.repo.fullname}")
-        traceback.print_exception(exc)
-
-    def emit_diff(cls, target: Target, diff: Diff, cloned_repo: ClonedRepository):
-        pass
-
-
-class JSONEmitter(Emitter):
-    class DataclassJSONEncoder(json.JSONEncoder):
-        def default(self, o):
-            if dataclasses.is_dataclass(o):
-                return dataclasses.asdict(o)
-            if isinstance(o, set):
-                return tuple(o)
-            if isinstance(o, Path):
-                return str(o)
-            return super().default(o)
-
-    def emit_error(cls, target: Target, exc: Exception):
-        pass
-
-    def emit_diff(cls, target: Target, diff: Diff, cloned_repo: ClonedRepository):
-        pass
-
-    def emit_result(cls, result: Result):
-        print(json.dumps(result, indent=4, cls=cls.DataclassJSONEncoder))
-
-
-class MarkdownEmitter(Emitter):
-    def emit_error(cls, target: Target, exc: Exception):
-        cls._print(title="error", content=f"```\n{exc}\n```", target=target)
-
-    def emit_diff(cls, target: Target, diff: Diff, cloned_repo: ClonedRepository):
-        changes = f"+{len(diff.added)}, -{len(diff.removed)}"
-
-        content = ""
-        for line in list(diff):
-            match = CHECK_DIFF_LINE_RE.match(line)
-            if match is None:
-                content += line + "\n"
-                continue
-
-            pre, inner, path, lnum, post = match.groups()
-            url = cloned_repo.url_for(path, int(lnum))
-            content += f"{pre} <a href='{url}'>{inner}</a> {post}" + "\n"
-
-        cls._print(title=changes, content=f"<pre>\n{content}\n</pre>", target=target)
-
-    def _print(cls, title: str, content: str, target: Target):
-        print(f"<details><summary>{target.repo.fullname} ({title})</summary>")
-        print(target.repo.url, target.check_options.summary())
-        print("<p>")
-        print()
-
-        print(content)
-
-        print()
-        print("</p>")
-        print("</details>")
-
-
-class EmitterType(Enum):
-    markdown = "markdown"
-    json = "json"
-
-    def to_emitter(self) -> Emitter:
-        match self:
-            case self.markdown:
-                return MarkdownEmitter()
-            case self.json:
-                return JSONEmitter()
-            case _:
-                raise ValueError("Unknown emitter type {self}")
--- a/python/ruff-ecosystem/ruff_ecosystem/format.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/format.py
@@ -0,0 +1,238 @@
+from __future__ import annotations
+
+import time
+from asyncio import create_subprocess_exec
+from dataclasses import dataclass
+from pathlib import Path
+from subprocess import PIPE
+from unidiff import PatchSet
+from typing import TYPE_CHECKING, Self, Sequence
+import re
+from ruff_ecosystem import logger
+from ruff_ecosystem.markdown import project_section
+from ruff_ecosystem.types import Comparison, Diff, Result, RuffError
+
+if TYPE_CHECKING:
+    from ruff_ecosystem.projects import ClonedRepository, Project
+
+
+FORMAT_IGNORE_LINES = re.compile("^warning: `ruff format` is a work-in-progress.*")
+
+
+def summarize_format_result(result: Result) -> str:
+    lines = []
+    total_lines_removed = total_lines_added = 0
+    total_files_modified = 0
+    error_count = len(result.errored)
+    patch_sets = []
+
+    for project, comparison in result.completed:
+        total_lines_added += comparison.diff.lines_added
+        total_lines_removed += comparison.diff.lines_removed
+
+        patch_set = PatchSet("\n".join(comparison.diff.lines))
+        patch_sets.append(patch_set)
+        total_files_modified += len(patch_set.modified_files)
+
+    if total_lines_removed == 0 and total_lines_added == 0 and error_count == 0:
+        return "\u2705 ecosystem check detected no format changes."
+
+    # Summarize the total changes
+    es = "s" if error_count != 1 else ""
+    fs = "s" if total_files_modified != 1 else ""
+    changes = f"(+{total_lines_added}, -{total_lines_removed} lines in {total_files_modified} file{fs}, {error_count} error{es})"
+    lines.append(f"\u2139\ufe0f ecosystem check **detected format changes**. {changes}")
+    lines.append("")
+
+    # Then per-project changes
+    for (project, comparison), patch_set in zip(result.completed, patch_sets):
+        if not comparison.diff:
+            continue  # Skip empty diffs
+
+        files = len(patch_set.modified_files)
+        s = "s" if files != 1 else ""
+        title = f"+{comparison.diff.lines_added}, -{comparison.diff.lines_removed} lines in {files} file{s}"
+
+        lines.extend(
+            project_section(
+                title=title,
+                content=patch_set_with_permalinks(patch_set, comparison.repo),
+                options=project.format_options.markdown(),
+                project=project,
+            )
+        )
+
+    for project, error in result.errored:
+        lines.extend(
+            project_section(
+                title="error",
+                content=str(error),
+                options="",
+                project=project,
+            )
+        )
+
+    return "\n".join(lines)
+
+
+async def ruff_format(
+    *,
+    executable: Path,
+    path: Path,
+    name: str,
+    options: FormatOptions,
+    diff: bool = False,
+) -> Sequence[str]:
+    """Run the given ruff binary against the specified path."""
+    logger.debug(f"Formatting {name} with {executable}")
+    ruff_args = ["format"]
+
+    if diff:
+        ruff_args.append("--diff")
+
+    start = time.time()
+    proc = await create_subprocess_exec(
+        executable.absolute(),
+        *ruff_args,
+        ".",
+        stdout=PIPE,
+        stderr=PIPE,
+        cwd=path,
+    )
+    result, err = await proc.communicate()
+    end = time.time()
+
+    logger.debug(f"Finished formatting {name} with {executable} in {end - start:.2f}s")
+
+    if proc.returncode not in [0, 1]:
+        raise RuffError(err.decode("utf8"))
+
+    lines = result.decode("utf8").splitlines()
+    return lines
+
+
+async def black_format(
+    *,
+    executable: Path,
+    path: Path,
+    name: str,
+) -> Sequence[str]:
+    """Run the given black binary against the specified path."""
+    logger.debug(f"Formatting {name} with {executable}")
+    black_args = []
+
+    start = time.time()
+    proc = await create_subprocess_exec(
+        executable.absolute(),
+        *black_args,
+        ".",
+        stdout=PIPE,
+        stderr=PIPE,
+        cwd=path,
+    )
+    result, err = await proc.communicate()
+    end = time.time()
+
+    logger.debug(f"Finished formatting {name} with {executable} in {end - start:.2f}s")
+
+    if proc.returncode != 0:
+        raise RuffError(err.decode("utf8"))
+
+    lines = result.decode("utf8").splitlines()
+    return [line for line in lines if not FORMAT_IGNORE_LINES.match(line)]
+
+
+async def compare_format(
+    ruff_baseline_executable: Path,
+    ruff_comparison_executable: Path,
+    options: FormatOptions,
+    cloned_repo: ClonedRepository,
+):
+    # Run format without diff to get the baseline
+    await ruff_format(
+        executable=ruff_baseline_executable.resolve(),
+        path=cloned_repo.path,
+        name=cloned_repo.fullname,
+        options=options,
+    )
+    # Then get the diff from stdout
+    diff = await ruff_format(
+        executable=ruff_comparison_executable.resolve(),
+        path=cloned_repo.path,
+        name=cloned_repo.fullname,
+        options=options,
+        diff=True,
+    )
+
+    return create_format_comparison(cloned_repo, FormatDiff(lines=diff))
+
+
+def create_format_comparison(repo: ClonedRepository, diff: str) -> FormatComparison:
+    return FormatComparison(diff=diff, repo=repo)
+
+
+@dataclass(frozen=True)
+class FormatOptions:
+    """
+    Ruff format options
+    """
+
+    pass
+
+    def markdown(self: Self) -> str:
+        return ""
+
+
+@dataclass(frozen=True)
+class FormatDiff(Diff):
+    """A diff from ruff format."""
+
+    lines: list[str]
+
+    def __bool__(self: Self) -> bool:
+        """Return true if this diff is non-empty."""
+        return bool(self.lines)
+
+    @property
+    def added(self) -> set[str]:
+        return set(line for line in self.lines if line.startswith("+"))
+
+    @property
+    def removed(self) -> set[str]:
+        return set(line for line in self.lines if line.startswith("-"))
+
+
+@dataclass(frozen=True)
+class FormatComparison(Comparison):
+    diff: FormatDiff
+    repo: ClonedRepository
+
+
+@dataclass(frozen=True)
+class FormatResult(Result):
+    comparisons: tuple[Project, FormatComparison]
+
+
+def patch_set_with_permalinks(patch_set: PatchSet, repo: ClonedRepository) -> str:
+    lines = []
+    for file_patch in patch_set:
+        file_link = repo.url_for(file_patch.path)
+        lines.append(f"<a href='{file_link}'>{file_patch.path}</a>")
+        for hunk in file_patch:
+            hunk_link = repo.url_for(
+                file_patch.path,
+                hunk.source_start,
+                hunk.source_start + hunk.source_length,
+            )
+            hunk_lines = str(hunk).splitlines()
+
+            # Add a link before the hunk
+            link_title = file_patch.path + hunk_link.split(file_patch.path)[-1]
+            lines.append(f"<a href='{hunk_link}'>{link_title}</a>")
+
+            # Wrap the contents of the hunk in a diff code block
+            lines.append("```diff")
+            lines.extend(hunk_lines[1:])
+            lines.append("```")
+
+    return "\n".join(lines)
--- a/python/ruff-ecosystem/ruff_ecosystem/git.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/git.py
@@ -1,72 +0,0 @@
-from ruff_ecosystem.models import Repository, ClonedRepository
-from contextlib import asynccontextmanager
-from pathlib import Path
-from typing import AsyncGenerator
-from asyncio import create_subprocess_exec
-from subprocess import PIPE
-from ruff_ecosystem import logger
-
-
-@asynccontextmanager
-async def clone(
-    repo: Repository, checkout_dir: Path
-) -> AsyncGenerator[ClonedRepository, None]:
-    """Shallow clone this repository to a temporary directory."""
-    if checkout_dir.exists():
-        logger.debug(f"Reusing {repo.owner}:{repo.name}")
-        yield await _cloned_repository(repo, checkout_dir)
-        return
-
-    logger.debug(f"Cloning {repo.owner}:{repo.name} to {checkout_dir}")
-    command = [
-        "git",
-        "clone",
-        "--config",
-        "advice.detachedHead=false",
-        "--quiet",
-        "--depth",
-        "1",
-        "--no-tags",
-    ]
-    if repo.branch:
-        command.extend(["--branch", repo.branch])
-
-    command.extend(
-        [
-            f"https://github.com/{repo.owner}/{repo.name}",
-            checkout_dir,
-        ],
-    )
-
-    process = await create_subprocess_exec(*command, env={"GIT_TERMINAL_PROMPT": "0"})
-
-    status_code = await process.wait()
-
-    logger.debug(
-        f"Finished cloning {repo.fullname} with status {status_code}",
-    )
-    yield await _cloned_repository(repo, checkout_dir)
-
-
-async def _cloned_repository(repo: Repository, checkout_dir: Path) -> ClonedRepository:
-    return ClonedRepository(
-        name=repo.name,
-        owner=repo.owner,
-        branch=repo.branch,
-        path=checkout_dir,
-        commit_hash=await _get_commit_hash(checkout_dir),
-    )
-
-
-async def _get_commit_hash(checkout_dir: Path) -> str:
-    """
-    Return the commit sha for the repository in the checkout directory.
-    """
-    process = await create_subprocess_exec(
-        *["git", "rev-parse", "HEAD"],
-        cwd=checkout_dir,
-        stdout=PIPE,
-    )
-    stdout, _ = await process.communicate()
-    assert await process.wait() == 0, f"Failed to retrieve commit sha at {checkout_dir}"
-    return stdout.decode().strip()
--- a/python/ruff-ecosystem/ruff_ecosystem/main.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/main.py
@@ -1,32 +1,34 @@
-from ruff_ecosystem.models import (
-    RuffCommand,
-    Target,
-    Diff,
-    ClonedRepository,
-    RuleChanges,
-    CheckComparison,
-    Result,
-)
-from pathlib import Path
-from ruff_ecosystem import logger
 import asyncio
-from ruff_ecosystem.git import clone
-from ruff_ecosystem.ruff import ruff_check, ruff_format
-from ruff_ecosystem.emitters import Emitter
-import difflib
+import dataclasses
+import json
+from enum import Enum
+from pathlib import Path
 from typing import TypeVar
-import re
+
+from ruff_ecosystem import logger
+from ruff_ecosystem.check import compare_check, summarize_check_result
+from ruff_ecosystem.format import compare_format, summarize_format_result
+from ruff_ecosystem.projects import (
+    Project,
+    RuffCommand,
+)
+from ruff_ecosystem.types import Comparison, Result, Serializable

 T = TypeVar("T")


+class OutputFormat(Enum):
+    markdown = "markdown"
+    json = "json"
+
+
 async def main(
    command: RuffCommand,
    ruff_baseline_executable: Path,
    ruff_comparison_executable: Path,
-    targets: list[Target],
+    targets: list[Project],
    cache: Path | None,
-    emitter: Emitter,
+    format: OutputFormat,
    max_parallelism: int = 50,
    raise_on_failure: bool = False,
 ) -> None:
@@ -42,7 +44,7 @@ async def main(
        async with semaphore:
            return await coroutine

-    comparisons: list[Exception | CheckComparison] = await asyncio.gather(
+    comparisons: list[Exception | Comparison] = await asyncio.gather(
        *[
            limited_parallelism(
                clone_and_compare(
@@ -59,177 +61,82 @@ async def main(
    )
    comparisons_by_target = dict(zip(targets, comparisons, strict=True))

-    # Calculate totals
-    total_removed = total_added = errors = 0
-    total_rule_changes = RuleChanges()
-    for comparison in comparisons_by_target.values():
-        if isinstance(comparison, Exception):
-            errors += 1
-        else:
-            total_removed += len(comparison.diff.removed)
-            total_added += len(comparison.diff.added)
-            total_rule_changes += comparison.rule_changes
-
-    errors = []
-    comparisons = []
+    errors, successes = [], []
    for target, comparison in comparisons_by_target.items():
        if isinstance(comparison, Exception):
            errors.append((target, comparison))
            continue

-        if comparison.diff:
-            comparisons.append((target, comparison))
+        successes.append((target, comparison))

-        else:
-            continue
+    result = Result(completed=successes, errored=errors)

-    result = Result(
-        total_added=total_added,
-        total_removed=total_removed,
-        total_rule_changes=total_rule_changes,
-        comparisons=comparisons,
-        errors=errors,
-    )
+    match format:
+        case OutputFormat.json:
+            print(json.dumps(result, indent=4, cls=JSONEncoder))
+        case OutputFormat.markdown:
+            match command:
+                case RuffCommand.check:
+                    print(summarize_check_result(result))
+                case RuffCommand.format:
+                    print(summarize_format_result(result))
+                case _:
+                    raise ValueError(f"Unknown target Ruff command {command}")
+        case _:
+            raise ValueError(f"Unknown output format {format}")

-    emitter.emit_result(result)
-    return
-
-    if total_removed == 0 and total_added == 0 and errors == 0:
-        print("\u2705 ecosystem check detected no changes.")
-        return
-
-    s = "s" if errors != 1 else ""
-    changes = f"(+{total_added}, -{total_removed}, {errors} error{s})"
-
-    print(f"\u2139\ufe0f ecosystem check **detected changes**. {changes}")
-    print()
-
-    for target, comparison in comparisons_by_target.items():
-        if isinstance(comparison, Exception):
-            emitter.emit_error(target, comparison)
-            continue
-
-        if comparison.diff:
-            emitter.emit_diff(target, comparison.diff, comparison.repo)
-
-        else:
-            continue
-
-    if len(total_rule_changes.rule_codes()) > 0:
-        print(f"Rules changed: {len(total_rule_changes.rule_codes())}")
-        print()
-        print("| Rule | Changes | Additions | Removals |")
-        print("| ---- | ------- | --------- | -------- |")
-        for rule, (additions, removals) in sorted(
-            total_rule_changes.items(),
-            key=lambda x: (x[1][0] + x[1][1]),
-            reverse=True,
-        ):
-            print(f"| {rule} | {additions + removals} | {additions} | {removals} |")
+    return None


 async def clone_and_compare(
    command: RuffCommand,
    ruff_baseline_executable: Path,
    ruff_comparison_executable: Path,
-    target: Target,
+    target: Project,
    cache: Path,
-) -> CheckComparison:
+) -> Comparison:
    """Check a specific repository against two versions of ruff."""
    assert ":" not in target.repo.owner
    assert ":" not in target.repo.name

    match command:
        case RuffCommand.check:
-            ruff_task, create_comparison, options = (
-                ruff_check,
-                create_check_comparison,
+            compare, options = (
+                compare_check,
                target.check_options,
            )
        case RuffCommand.format:
-            ruff_task, create_comparison, options = (
-                ruff_format,
-                create_format_comparison,
+            compare, options = (
+                compare_format,
                target.format_options,
            )
        case _:
-            raise ValueError(f"Unknowm target Ruff command {command}")
+            raise ValueError(f"Unknown target Ruff command {command}")

    checkout_dir = cache.joinpath(f"{target.repo.owner}:{target.repo.name}")
-    async with clone(target.repo, checkout_dir) as cloned_repo:
-        try:
-            async with asyncio.TaskGroup() as tg:
-                baseline_task = tg.create_task(
-                    ruff_task(
-                        executable=ruff_baseline_executable.resolve(),
-                        path=cloned_repo.path,
-                        name=cloned_repo.fullname,
-                        options=options,
-                    ),
-                )
-                comparison_task = tg.create_task(
-                    ruff_task(
-                        executable=ruff_comparison_executable.resolve(),
-                        path=cloned_repo.path,
-                        name=cloned_repo.fullname,
-                        options=options,
-                    ),
-                )
-        except ExceptionGroup as e:
-            raise e.exceptions[0] from e
+    cloned_repo = await target.repo.clone(checkout_dir)

-    return create_comparison(
-        cloned_repo, baseline_task.result(), comparison_task.result()
-    )
+    try:
+        return await compare(
+            ruff_baseline_executable,
+            ruff_comparison_executable,
+            options,
+            cloned_repo,
+        )
+    except ExceptionGroup as e:
+        raise e.exceptions[0] from e


-def create_check_comparison(
-    repo: ClonedRepository, baseline_output: str, comparison_output: str
-) -> CheckComparison:
-    removed, added = set(), set()
-
-    for line in difflib.ndiff(baseline_output, comparison_output):
-        if line.startswith("- "):
-            removed.add(line[2:])
-        elif line.startswith("+ "):
-            added.add(line[2:])
-
-    diff = Diff(removed=removed, added=added)
-
-    return CheckComparison(
-        diff=diff, repo=repo, rule_changes=rule_changes_from_diff(diff)
-    )
-
-
-def rule_changes_from_diff(diff: Diff) -> RuleChanges:
-    """
-    Parse a diff from `ruff check` to determine the additions and removals for each rule.
-    """
-    rule_changes = RuleChanges()
-
-    # Count rule changes
-    for line in diff.lines():
-        # Find rule change for current line or construction
-        # + <rule>/<path>:<line>:<column>: <rule_code> <message>
-        matches = re.search(r": ([A-Z]{1,4}[0-9]{3,4})", line)
-
-        if matches is None:
-            # Handle case where there are no regex matches e.g.
-            # +                 "?application=AIRFLOW&authenticator=TEST_AUTH&role=TEST_ROLE&warehouse=TEST_WAREHOUSE" # noqa: E501, ERA001
-            # Which was found in local testing
-            continue
-
-        rule_code = matches.group(1)
-
-        # Get current additions and removals for this rule
-        current_changes = rule_changes[rule_code]
-
-        # Check if addition or removal depending on the first character
-        if line[0] == "+":
-            current_changes = (current_changes[0] + 1, current_changes[1])
-        elif line[0] == "-":
-            current_changes = (current_changes[0], current_changes[1] + 1)
-
-        rule_changes[rule_code] = current_changes
-
-    return rule_changes
+class JSONEncoder(json.JSONEncoder):
+    def default(self, o):
+        if isinstance(o, Serializable):
+            return o.jsonable()
+        if dataclasses.is_dataclass(o):
+            return dataclasses.asdict(o)
+        if isinstance(o, set):
+            return tuple(o)
+        if isinstance(o, Path):
+            return str(o)
+        if isinstance(o, Exception):
+            return str(o)
+        return super().default(o)
--- a/python/ruff-ecosystem/ruff_ecosystem/markdown.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/markdown.py
@@ -0,0 +1,25 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from ruff_ecosystem.projects import Project
+
+
+def project_section(
+    title: str, content: str, options: str, project: Project
+) -> list[str]:
+    lines = []
+    lines.append(
+        f'<details><summary><a href="{project.repo.url}">{project.repo.fullname}</a> ({title})</summary>'
+    )
+    lines.append(options)
+    lines.append("<p>")
+    lines.append("")
+
+    lines.append(content)
+
+    lines.append("")
+    lines.append("</p>")
+    lines.append("</details>")
+    return lines
--- a/python/ruff-ecosystem/ruff_ecosystem/models.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/models.py
@@ -1,160 +0,0 @@
-from enum import Enum
-from dataclasses import dataclass, field
-from typing import Self, Iterator
-import heapq
-from pathlib import Path
-
-
-class RuffCommand(Enum):
-    check = "check"
-    format = "format"
-
-
-@dataclass(frozen=True)
-class Repository:
-    """
-    A remote GitHub repository
-    """
-
-    owner: str
-    name: str
-    branch: str | None
-
-    @property
-    def fullname(self) -> str:
-        return f"{self.owner}/{self.name}"
-
-    @property
-    def url(self: Self) -> str:
-        return f"https://github.com/{self.owner}/{self.name}"
-
-
-@dataclass(frozen=True)
-class ClonedRepository(Repository):
-    """
-    A cloned GitHub repository, which includes the hash of the cloned commit.
-    """
-
-    commit_hash: str
-    path: Path
-
-    def url_for(self: Self, path: str, line_number: int | None = None) -> str:
-        """
-        Return the remote GitHub URL for the given path in this repository.
-        """
-        # Default to main branch
-        url = f"https://github.com/{self.owner}/{self.name}/blob/{self.commit_hash}/{path}"
-        if line_number:
-            url += f"#L{line_number}"
-        return url
-
-    @property
-    def url(self: Self) -> str:
-        return f"https://github.com/{self.owner}/{self.name}@{self.commit_hash}"
-
-
-@dataclass(frozen=True)
-class Diff:
-    """A diff between two runs of ruff."""
-
-    removed: set[str]
-    added: set[str]
-
-    def __bool__(self: Self) -> bool:
-        """Return true if this diff is non-empty."""
-        return bool(self.removed or self.added)
-
-    def lines(self: Self) -> Iterator[str]:
-        """Iterate through the changed lines in diff format."""
-        for line in heapq.merge(sorted(self.removed), sorted(self.added)):
-            if line in self.removed:
-                yield f"- {line}"
-            else:
-                yield f"+ {line}"
-
-
-@dataclass(frozen=True)
-class RuleChanges:
-    changes: dict[str, tuple[int, int]] = field(default_factory=dict)
-
-    def rule_codes(self) -> list[str]:
-        return list(self.changes.keys())
-
-    def items(self) -> Iterator[tuple[str, tuple[int, int]]]:
-        return self.changes.items()
-
-    def __setitem__(self, key: str, value: tuple[int, int]) -> None:
-        self.changes[key] = value
-
-    def __getitem__(self, key: str) -> tuple[int, int]:
-        return self.changes.get(key, (0, 0))
-
-    def __add__(self, other: Self) -> Self:
-        if not isinstance(other, type(self)):
-            return NotImplemented
-
-        result = self.changes.copy()
-        for rule_code, (added, removed) in other.changes.items():
-            if rule_code in result:
-                result[rule_code] = (
-                    result[rule_code][0] + added,
-                    result[rule_code][1] + removed,
-                )
-            else:
-                result[rule_code] = (added, removed)
-
-        return RuleChanges(changes=result)
-
-
-@dataclass(frozen=True)
-class CheckComparison:
-    diff: Diff
-    repo: ClonedRepository
-    rule_changes: RuleChanges
-
-
-@dataclass(frozen=True)
-class CheckOptions:
-    """
-    Ruff check options
-    """
-
-    select: str = ""
-    ignore: str = ""
-    exclude: str = ""
-
-    # Generating fixes is slow and verbose
-    show_fixes: bool = False
-
-    def summary(self) -> str:
-        return f"select {self.select} ignore {self.ignore} exclude {self.exclude}"
-
-
-@dataclass(frozen=True)
-class FormatOptions:
-    """
-    Ruff format options
-    """
-
-    pass
-
-
-@dataclass(frozen=True)
-class Target:
-    """
-    An ecosystem target
-    """
-
-    repo: Repository
-    check_options: CheckOptions = field(default_factory=CheckOptions)
-    format_options: FormatOptions = field(default_factory=FormatOptions)
-
-
-@dataclass(frozen=True)
-class Result:
-    total_added: int
-    total_removed: int
-    total_rule_changes: RuleChanges
-
-    comparisons: tuple[Target, CheckComparison]
-    errors: tuple[Target, Exception]
--- a/python/ruff-ecosystem/ruff_ecosystem/projects.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/projects.py
@@ -0,0 +1,168 @@
+from __future__ import annotations
+
+from asyncio import create_subprocess_exec
+from dataclasses import dataclass, field
+from enum import Enum
+from pathlib import Path
+from subprocess import PIPE
+from typing import AsyncGenerator, Self
+
+from ruff_ecosystem import logger
+from ruff_ecosystem.check import CheckOptions
+from ruff_ecosystem.format import FormatOptions
+from ruff_ecosystem.types import Serializable
+
+
+@dataclass(frozen=True)
+class Project(Serializable):
+    """
+    An ecosystem target
+    """
+
+    repo: Repository
+    check_options: CheckOptions = field(default_factory=lambda: CheckOptions())
+    format_options: FormatOptions = field(default_factory=lambda: FormatOptions())
+
+
+class RuffCommand(Enum):
+    check = "check"
+    format = "format"
+
+
+class ProjectSetupError(Exception):
+    """An error setting up a project."""
+
+
+@dataclass(frozen=True)
+class Repository(Serializable):
+    """
+    A remote GitHub repository
+    """
+
+    owner: str
+    name: str
+    ref: str | None
+
+    @property
+    def fullname(self) -> str:
+        return f"{self.owner}/{self.name}"
+
+    @property
+    def url(self: Self) -> str:
+        return f"https://github.com/{self.owner}/{self.name}"
+
+    async def clone(
+        self: Self, checkout_dir: Path
+    ) -> AsyncGenerator[ClonedRepository, None]:
+        """
+        Shallow clone this repository
+        """
+        if checkout_dir.exists():
+            logger.debug(f"Reusing {self.owner}:{self.name}")
+
+            if self.ref:
+                logger.debug(f"Checking out ref {self.ref}")
+                process = await create_subprocess_exec(
+                    *["git", "checkout", "-f", self.ref],
+                    cwd=checkout_dir,
+                    env={"GIT_TERMINAL_PROMPT": "0"},
+                    stdout=PIPE,
+                    stderr=PIPE,
+                )
+                if await process.wait() != 0:
+                    _, stderr = await process.communicate()
+                    raise ProjectSetupError(
+                        f"Failed to checkout {self.ref}: {stderr.decode()}"
+                    )
+
+            return ClonedRepository(
+                name=self.name,
+                owner=self.owner,
+                ref=self.ref,
+                path=checkout_dir,
+                commit_hash=await self._get_head_commit(checkout_dir),
+            )
+
+        logger.debug(f"Cloning {self.owner}:{self.name} to {checkout_dir}")
+        command = [
+            "git",
+            "clone",
+            "--config",
+            "advice.detachedHead=false",
+            "--quiet",
+            "--depth",
+            "1",
+            "--no-tags",
+        ]
+        if self.ref:
+            command.extend(["--branch", self.ref])
+
+        command.extend(
+            [
+                f"https://github.com/{self.owner}/{self.name}",
+                checkout_dir,
+            ],
+        )
+
+        process = await create_subprocess_exec(
+            *command, env={"GIT_TERMINAL_PROMPT": "0"}
+        )
+
+        status_code = await process.wait()
+
+        logger.debug(
+            f"Finished cloning {self.fullname} with status {status_code}",
+        )
+        return ClonedRepository(
+            name=self.name,
+            owner=self.owner,
+            ref=self.ref,
+            path=checkout_dir,
+            commit_hash=await self._get_head_commit(checkout_dir),
+        )
+
+    @staticmethod
+    async def _get_head_commit(checkout_dir: Path) -> str:
+        """
+        Return the commit sha for the repository in the checkout directory.
+        """
+        process = await create_subprocess_exec(
+            *["git", "rev-parse", "HEAD"],
+            cwd=checkout_dir,
+            stdout=PIPE,
+        )
+        stdout, _ = await process.communicate()
+        if await process.wait() != 0:
+            raise ProjectSetupError(f"Failed to retrieve commit sha at {checkout_dir}")
+
+        return stdout.decode().strip()
+
+
+@dataclass(frozen=True)
+class ClonedRepository(Repository, Serializable):
+    """
+    A cloned GitHub repository, which includes the hash of the cloned commit.
+    """
+
+    commit_hash: str
+    path: Path
+
+    def url_for(
+        self: Self,
+        path: str,
+        line_number: int | None = None,
+        end_line_number: int | None = None,
+    ) -> str:
+        """
+        Return the remote GitHub URL for the given path in this repository.
+        """
+        url = f"https://github.com/{self.owner}/{self.name}/blob/{self.commit_hash}/{path}"
+        if line_number:
+            url += f"#L{line_number}"
+        if end_line_number:
+            url += f"-L{end_line_number}"
+        return url
+
+    @property
+    def url(self: Self) -> str:
+        return f"https://github.com/{self.owner}/{self.name}@{self.commit_hash}"
--- a/python/ruff-ecosystem/ruff_ecosystem/ruff.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/ruff.py
@@ -1,90 +0,0 @@
-from pathlib import Path
-from ruff_ecosystem import logger
-from ruff_ecosystem.models import CheckOptions, FormatOptions
-import time
-from asyncio import create_subprocess_exec
-from subprocess import PIPE
-from typing import Sequence
-import re
-
-CHECK_SUMMARY_LINE_RE = re.compile(
-    r"^(Found \d+ error.*)|(.*potentially fixable with.*)$"
-)
-
-
-CHECK_DIFF_LINE_RE = re.compile(
-    r"^(?P<pre>[+-]) (?P<inner>(?P<path>[^:]+):(?P<lnum>\d+):\d+:) (?P<post>.*)$",
-)
-
-
-class RuffError(Exception):
-    """An error reported by ruff."""
-
-
-async def ruff_check(
-    *, executable: Path, path: Path, name: str, options: CheckOptions
-) -> Sequence[str]:
-    """Run the given ruff binary against the specified path."""
-    logger.debug(f"Checking {name} with {executable}")
-    ruff_args = ["check", "--no-cache", "--exit-zero"]
-    if options.select:
-        ruff_args.extend(["--select", options.select])
-    if options.ignore:
-        ruff_args.extend(["--ignore", options.ignore])
-    if options.exclude:
-        ruff_args.extend(["--exclude", options.exclude])
-    if options.show_fixes:
-        ruff_args.extend(["--show-fixes", "--ecosystem-ci"])
-
-    start = time.time()
-    proc = await create_subprocess_exec(
-        executable.absolute(),
-        *ruff_args,
-        ".",
-        stdout=PIPE,
-        stderr=PIPE,
-        cwd=path,
-    )
-    result, err = await proc.communicate()
-    end = time.time()
-
-    logger.debug(f"Finished checking {name} with {executable} in {end - start:.2f}")
-
-    if proc.returncode != 0:
-        raise RuffError(err.decode("utf8"))
-
-    lines = [
-        line
-        for line in result.decode("utf8").splitlines()
-        if not CHECK_SUMMARY_LINE_RE.match(line)
-    ]
-
-    return sorted(lines)
-
-
-async def ruff_format(
-    *, executable: Path, path: Path, name: str, options: FormatOptions
-) -> Sequence[str]:
-    """Run the given ruff binary against the specified path."""
-    logger.debug(f"Checking {name} with {executable}")
-    ruff_args = ["format", "--no-cache", "--exit-zero"]
-
-    start = time.time()
-    proc = await create_subprocess_exec(
-        executable.absolute(),
-        *ruff_args,
-        ".",
-        stdout=PIPE,
-        stderr=PIPE,
-        cwd=path,
-    )
-    result, err = await proc.communicate()
-    end = time.time()
-
-    logger.debug(f"Finished formatting {name} with {executable} in {end - start:.2f}")
-
-    if proc.returncode != 0:
-        raise RuffError(err.decode("utf8"))
-
-    lines = result.decode("utf8").splitlines()
-    return lines
--- a/python/ruff-ecosystem/ruff_ecosystem/types.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/types.py
@@ -0,0 +1,70 @@
+from __future__ import annotations
+
+import abc
+import dataclasses
+import difflib
+from dataclasses import dataclass, is_dataclass
+from typing import TYPE_CHECKING, Any, Iterable, Sequence, Generator
+import heapq
+
+if TYPE_CHECKING:
+    from ruff_ecosystem.projects import ClonedRepository, Project
+
+
+class Serializable(abc.ABC):
+    """
+    Allows serialization of content by casting to a JSON-compatible type.
+    """
+
+    def jsonable(self) -> Any:
+        # Default implementation for dataclasses
+        if is_dataclass(self):
+            return dataclasses.asdict(self)
+
+        raise NotImplementedError()
+
+
+class Diff(Serializable):
+    def __init__(self, lines: Iterable[str]) -> None:
+        self.lines = list(lines)
+
+        # Compute added and removed lines once
+        self.added = list(line[2:] for line in self.lines if line.startswith("+ "))
+        self.removed = list(line[2:] for line in self.lines if line.startswith("- "))
+
+    def __bool__(self) -> bool:
+        return bool(self.added or self.removed)
+
+    def __iter__(self) -> Generator[str, None, None]:
+        yield from self.lines
+
+    @property
+    def lines_added(self):
+        return len(self.added)
+
+    @property
+    def lines_removed(self):
+        return len(self.removed)
+
+    @classmethod
+    def new(cls, baseline: Sequence[str], comparison: Sequence[str]):
+        return cls(difflib.ndiff(baseline, comparison))
+
+    def jsonable(self) -> Any:
+        return self.lines
+
+
+@dataclass(frozen=True)
+class Result(Serializable):
+    errored: list[tuple[Project, Exception]]
+    completed: list[tuple[Project, Comparison]]
+
+
+@dataclass(frozen=True)
+class Comparison(Serializable):
+    diff: Diff
+    repo: ClonedRepository
+
+
+class RuffError(Exception):
+    """An error reported by ruff."""