diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 48fd480201..800d815cb5 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -175,7 +175,7 @@ jobs: - cargo-test - determine_changes # Only runs on pull requests, since that is the only we way we can find the base version for comparison. - if: github.event_name == 'pull_request' && needs.determine_changes.outputs.linter == 'true' + if: github.event_name == 'pull_request' steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v4 @@ -196,13 +196,29 @@ jobs: branch: ${{ github.event.pull_request.base.ref }} check_artifacts: true - - name: Run ecosystem check + - name: Install ruff-ecosystem + run: | + pip install ./python/ruff-ecosystem + + - name: Run `ruff check` ecosystem check + if: ${{ needs.determine_changes.outputs.linter == 'true' }} run: | # Make executable, since artifact download doesn't preserve this chmod +x ruff ${{ steps.ruff-target.outputs.download-path }}/ruff - scripts/check_ecosystem.py ruff ${{ steps.ruff-target.outputs.download-path }}/ruff | tee ecosystem-result - cat ecosystem-result > $GITHUB_STEP_SUMMARY + ruff-ecosystem check ruff ${{ steps.ruff-target.outputs.download-path }}/ruff --cache ./checkouts --output-format markdown | tee ecosystem-result + cat ecosystem-result >> $GITHUB_STEP_SUMMARY + + echo ${{ github.event.number }} > pr-number + + - name: Run `ruff format` ecosystem check + if: ${{ needs.determine_changes.outputs.linter == 'true' }} + run: | + # Make executable, since artifact download doesn't preserve this + chmod +x ruff ${{ steps.ruff-target.outputs.download-path }}/ruff + + ruff-ecosystem format ruff ${{ steps.ruff-target.outputs.download-path }}/ruff --cache ./checkouts --output-format markdown | tee ecosystem-result + cat ecosystem-result >> $GITHUB_STEP_SUMMARY echo ${{ github.event.number }} > pr-number diff --git a/python/ruff-ecosystem/pyproject.toml b/python/ruff-ecosystem/pyproject.toml index 919bbe4ab5..1183bcc8f8 
100644 --- a/python/ruff-ecosystem/pyproject.toml +++ b/python/ruff-ecosystem/pyproject.toml @@ -8,3 +8,7 @@ version = "0.0.0" [project.scripts] ruff-ecosystem = "ruff_ecosystem.cli:entrypoint" + + +[tool.ruff] +extend-select = ["I"] \ No newline at end of file diff --git a/python/ruff-ecosystem/ruff_ecosystem/__main__.py b/python/ruff-ecosystem/ruff_ecosystem/__main__.py index 8eab9c1c75..bc1e7d2c79 100644 --- a/python/ruff-ecosystem/ruff_ecosystem/__main__.py +++ b/python/ruff-ecosystem/ruff_ecosystem/__main__.py @@ -2,7 +2,7 @@ Enables usage with `python -m ruff_ecosystem` """ -from ruff_ecosystem.cli import entrypoint +import ruff_ecosystem.cli if __name__ == "__main__": - entrypoint() + ruff_ecosystem.cli.entrypoint() diff --git a/python/ruff-ecosystem/ruff_ecosystem/check.py b/python/ruff-ecosystem/ruff_ecosystem/check.py new file mode 100644 index 0000000000..ad048c0e82 --- /dev/null +++ b/python/ruff-ecosystem/ruff_ecosystem/check.py @@ -0,0 +1,341 @@ +from __future__ import annotations + +import asyncio +import re +import time +from asyncio import create_subprocess_exec +from collections import Counter +from dataclasses import dataclass, field +from pathlib import Path +from subprocess import PIPE +from typing import TYPE_CHECKING, Any, Iterator, Self, Sequence + +from ruff_ecosystem import logger +from ruff_ecosystem.types import ( + Comparison, + Diff, + Result, + RuffError, + Serializable, +) +from ruff_ecosystem.markdown import project_section + +if TYPE_CHECKING: + from ruff_ecosystem.projects import ClonedRepository, Project + + +# Matches lines that are summaries rather than diagnostics +CHECK_SUMMARY_LINE_RE = re.compile(r"^(Found \d+ error.*)|(.* fixable with .*)$") + +# Parses a diagnostic line (in a diff) to retrieve path and line number +CHECK_DIFF_LINE_RE = re.compile( + r"^(?P
[+-]) (?P(?P[^:]+):(?P\d+):\d+:) (?P.*)$",
+)
+
+
async def compare_check(
    ruff_baseline_executable: Path,
    ruff_comparison_executable: Path,
    options: CheckOptions,
    cloned_repo: ClonedRepository,
) -> Comparison:
    """Run `ruff check` with both executables over the cloned repository and
    return a comparison of their outputs.

    Both checks run concurrently; a failure in either cancels the other.
    """

    async def run(executable: Path) -> Sequence[str]:
        # Resolve the executable so the subprocess is unaffected by `cwd=`.
        return await ruff_check(
            executable=executable.resolve(),
            path=cloned_repo.path,
            name=cloned_repo.fullname,
            options=options,
        )

    async with asyncio.TaskGroup() as tg:
        baseline = tg.create_task(run(ruff_baseline_executable))
        comparison = tg.create_task(run(ruff_comparison_executable))

    return Comparison(
        diff=Diff.new(baseline.result(), comparison.result()),
        repo=cloned_repo,
    )
+
+
def summarize_check_result(result: Result) -> str:
    """Render a markdown summary of a `ruff check` ecosystem comparison."""
    # Aggregate rule-level changes across all projects
    combined = RuleChanges()
    for _, comparison in result.completed:
        combined.update(RuleChanges.from_diff(comparison.diff))

    added = combined.total_added()
    removed = combined.total_removed()
    errors = len(result.errored)

    if not (added or removed or errors):
        return "\u2705 ecosystem check detected no `ruff check` changes."

    output = []

    # Headline summary of the totals
    plural = "" if errors == 1 else "s"
    changes = f"(+{added}, -{removed}, {errors} error{plural})"
    output.append(
        f"\u2139\ufe0f ecosystem check **detected `ruff check` changes**. {changes}"
    )
    output.append("")

    # One collapsible section per project with a non-empty diff
    for project, comparison in result.completed:
        if not comparison.diff:
            continue

        diff = deduplicate_and_sort_diff(comparison.diff)

        # Truncate per-rule output and link each diagnostic to its source
        diff_lines = [
            add_permalink_to_diagnostic_line(comparison.repo, line)
            for line in limit_rule_lines(
                diff, project.check_options.max_lines_per_rule
            )
        ]

        output.extend(
            project_section(
                title=f"+{diff.lines_added}, -{diff.lines_removed}",
                content="\n".join(diff_lines),
                options=project.check_options.markdown(),
                project=project,
            )
        )

    # One section per project that errored
    for project, error in result.errored:
        output.extend(
            project_section(
                title="error",
                content=str(error),
                options="",
                project=project,
            )
        )

    # Summary table of changed rules, most-changed first
    if combined:
        output.append(f"Rules changed: {len(combined.rule_codes())}")
        output.append("")
        output.append("| Rule | Changes | Additions | Removals |")
        output.append("| ---- | ------- | --------- | -------- |")
        for code, total in sorted(
            combined.total_changes_by_rule(),
            key=lambda entry: entry[1],  # Sort by the total changes
            reverse=True,
        ):
            output.append(
                f"| {code} | {total} | {combined.added[code]} | {combined.removed[code]} |"
            )

    return "\n".join(output)
+
+
def add_permalink_to_diagnostic_line(repo: ClonedRepository, line: str) -> str:
    """Wrap the ``path:row:col:`` prefix of a diagnostic diff line in a link
    to the corresponding source location.

    Lines that do not match the diagnostic pattern are returned unchanged.
    """
    match = CHECK_DIFF_LINE_RE.match(line)
    if match is None:
        return line

    pre, inner, path, lnum, post = match.groups()
    url = repo.url_for(path, int(lnum))
    # Bug fix: `url` was computed but never interpolated, so no permalink was
    # ever added despite this function's name. Link the location prefix.
    return f"{pre} <a href='{url}'>{inner}</a> {post}"
+
+
async def ruff_check(
    *, executable: Path, path: Path, name: str, options: CheckOptions
) -> Sequence[str]:
    """Run the given ruff binary against the specified path."""
    logger.debug(f"Checking {name} with {executable}")

    args = ["check", "--no-cache", "--exit-zero"]
    # Forward non-empty option values as CLI flags
    for flag, value in (
        ("--select", options.select),
        ("--ignore", options.ignore),
        ("--exclude", options.exclude),
    ):
        if value:
            args.extend([flag, value])
    if options.show_fixes:
        args.extend(["--show-fixes", "--ecosystem-ci"])

    start = time.time()
    process = await create_subprocess_exec(
        executable.absolute(),
        *args,
        ".",
        stdout=PIPE,
        stderr=PIPE,
        cwd=path,
    )
    stdout, stderr = await process.communicate()
    elapsed = time.time() - start

    logger.debug(f"Finished checking {name} with {executable} in {elapsed:.2f}s")

    if process.returncode != 0:
        raise RuffError(stderr.decode("utf8"))

    # Drop summary lines so the diff contains only diagnostics
    return [
        line
        for line in stdout.decode("utf8").splitlines()
        if not CHECK_SUMMARY_LINE_RE.match(line)
    ]
+
+
@dataclass(frozen=True)
class CheckOptions(Serializable):
    """Options forwarded to `ruff check` invocations."""

    # Rule selectors passed via `--select`
    select: str = ""
    # Rule selectors passed via `--ignore`
    ignore: str = ""
    # Paths passed via `--exclude`
    exclude: str = ""

    # Whether to pass `--show-fixes`; generating fixes is slow and verbose
    show_fixes: bool = False

    # Cap on reported diff lines per rule (None disables the cap)
    max_lines_per_rule: int | None = 50

    def markdown(self) -> str:
        """One-line rendering of these options for markdown summaries."""
        return "select {} ignore {} exclude {}".format(
            self.select, self.ignore, self.exclude
        )
+
+
+@dataclass(frozen=True)
+class RuleChanges:
+    """
+    The number of additions and removals by rule code
+    """
+
+    added: Counter = field(default_factory=Counter)
+    removed: Counter = field(default_factory=Counter)
+
+    def rule_codes(self) -> set[str]:
+        return set(self.added.keys()).union(self.removed.keys())
+
+    def __add__(self, other: Self) -> Self:
+        if not isinstance(other, type(self)):
+            return NotImplemented
+
+        new = RuleChanges()
+        new.update(self)
+        new.update(other)
+        return new
+
+    def update(self, other: Self) -> Self:
+        self.added.update(other.added)
+        self.removed.update(other.removed)
+        return self
+
+    def total_added(self) -> int:
+        return sum(self.added.values())
+
+    def total_removed(self) -> int:
+        return sum(self.removed.values())
+
+    def total_changes_by_rule(self) -> Iterator[str, int]:
+        """
+        Yields the sum of changes for each rule
+        """
+        totals = Counter()
+        totals.update(self.added)
+        totals.update(self.removed)
+        yield from totals.items()
+
+    @classmethod
+    def from_diff(cls: type[Self], diff: Diff) -> Self:
+        """
+        Parse a diff from `ruff check` to determine the additions and removals for each rule
+        """
+        rule_changes = cls()
+
+        for line in sorted(diff.added):
+            code = parse_rule_code(line)
+            if code is not None:
+                rule_changes.added[code] += 1
+
+        for line in sorted(diff.removed):
+            code = parse_rule_code(line)
+            if code is not None:
+                rule_changes.removed[code] += 1
+
+        return rule_changes
+
+    def __bool__(self):
+        return bool(self.added or self.removed)
+
+
def parse_rule_code(line: str) -> str | None:
    """
    Extract a rule code such as ``E501`` from a diagnostic diff line.

    Returns ``None`` when the line contains no recognizable rule code.
    """
    if (found := re.search(r": ([A-Z]{1,4}[0-9]{3,4})", line)) is None:
        return None
    return found.group(1)


def deduplicate_and_sort_diff(diff: Diff) -> Diff:
    """
    Return a new diff with duplicate and unchanged lines dropped, sorted by
    content (ignoring the leading ``+ `` / ``- `` marker).
    """
    changed = {line for line in diff if line.startswith(("+ ", "- "))}
    return Diff(sorted(changed, key=lambda entry: entry[2:]))


def limit_rule_lines(diff: Diff, max_per_rule: int | None = 100) -> list[str]:
    """
    Reduce the diff to at most `max_per_rule` lines per rule code.

    Lines without a parsable rule code are always kept. For each rule that
    exceeds the cap, a trailing summary line reports the omitted count.
    """
    if max_per_rule is None:
        return diff

    seen: Counter = Counter()
    kept = []

    for line in diff:
        code = parse_rule_code(line)
        if code is None:
            # Never drop lines we cannot attribute to a rule
            kept.append(line)
        else:
            seen[code] += 1
            if seen[code] <= max_per_rule:
                kept.append(line)

    # Summarize what was omitted, per rule
    kept.extend(
        f"{count - max_per_rule} changes omitted for rule {code}"
        for code, count in seen.items()
        if count > max_per_rule
    )

    return kept
diff --git a/python/ruff-ecosystem/ruff_ecosystem/cli.py b/python/ruff-ecosystem/ruff_ecosystem/cli.py
index 9f576f58df..010bb09b00 100644
--- a/python/ruff-ecosystem/ruff_ecosystem/cli.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/cli.py
@@ -1,16 +1,18 @@
 import argparse
 import asyncio
 import logging
+import os
+import shutil
+import sys
+import sysconfig
 import tempfile
-from pathlib import Path
 from contextlib import nullcontext
-from ruff_ecosystem.models import RuffCommand
-from ruff_ecosystem.emitters import EmitterType
-from ruff_ecosystem.defaults import DEFAULT_TARGETS
-from ruff_ecosystem.main import main
+from pathlib import Path
 from signal import SIGINT, SIGTERM
 
-import sys
+from ruff_ecosystem.defaults import DEFAULT_TARGETS
+from ruff_ecosystem.main import OutputFormat, main
+from ruff_ecosystem.projects import RuffCommand
 
 
 def excepthook(type, value, tb):
@@ -18,7 +20,8 @@ def excepthook(type, value, tb):
         # we are in interactive mode or we don't have a tty so call the default
         sys.__excepthook__(type, value, tb)
     else:
-        import traceback, pdb
+        import pdb
+        import traceback
 
         traceback.print_exception(type, value, tb)
         print()
@@ -49,7 +52,7 @@ def entrypoint():
                 ruff_baseline_executable=args.ruff_baseline,
                 ruff_comparison_executable=args.ruff_comparison,
                 targets=DEFAULT_TARGETS,
-                emitter=EmitterType(args.output_format).to_emitter(),
+                format=OutputFormat(args.output_format),
                 cache=Path(cache),
                 raise_on_failure=args.pdb,
             )
@@ -84,7 +87,7 @@ def parse_args() -> argparse.Namespace:
     )
     parser.add_argument(
         "--output-format",
-        choices=[option.name for option in EmitterType],
+        choices=[option.name for option in OutputFormat],
         default="json",
         help="Location for caching cloned repositories",
     )
@@ -114,3 +117,21 @@ def parse_args() -> argparse.Namespace:
     )
 
     return parser.parse_args()
+
+
def get_executable_path(name: str) -> str | None:
    """Return the path to the named executable, or None if it cannot be found.

    Prefers the current interpreter's scripts directory, then falls back to a
    `PATH` lookup.
    """
    # Add suffix for Windows executables
    name += ".exe" if sys.platform == "win32" else ""

    # The executable in the current interpreter's scripts directory.
    path = os.path.join(sysconfig.get_path("scripts"), name)
    if os.path.exists(path):
        return path

    # The executable in the global environment.
    # Bug fix: previously searched for the literal "ruff" regardless of the
    # `name` argument, so lookups for any other executable were wrong.
    environment_path = shutil.which(name)
    if environment_path:
        return environment_path

    return None
diff --git a/python/ruff-ecosystem/ruff_ecosystem/defaults.py b/python/ruff-ecosystem/ruff_ecosystem/defaults.py
index b203f5f93f..3c9c6bfc89 100644
--- a/python/ruff-ecosystem/ruff_ecosystem/defaults.py
+++ b/python/ruff-ecosystem/ruff_ecosystem/defaults.py
@@ -1,66 +1,64 @@
-from .models import Repository, CheckOptions, Target
+from ruff_ecosystem.projects import CheckOptions, Project, Repository
 
-# TODO: Consider exporting this as JSON instead for consistent setup
+# TODO: Consider exporting this as JSON
 DEFAULT_TARGETS = [
-    # Target(repo=Repository(owner="DisnakeDev", name="disnake", branch="master")),
-    # Target(repo=Repository(owner="PostHog", name="HouseWatch", branch="main")),
-    # Target(repo=Repository(owner="RasaHQ", name="rasa", branch="main")),
-    # Target(repo=Repository(owner="Snowflake-Labs", name="snowcli", branch="main")),
-    # Target(repo=Repository(owner="aiven", name="aiven-client", branch="main")),
-    # Target(repo=Repository(owner="alteryx", name="featuretools", branch="main")),
-    # Target(
-    #     repo=Repository(owner="apache", name="airflow", branch="main"),
+    # Project(repo=Repository(owner="DisnakeDev", name="disnake", ref="master")),
+    # Project(repo=Repository(owner="PostHog", name="HouseWatch", ref="main")),
+    # Project(repo=Repository(owner="RasaHQ", name="rasa", ref="main")),
+    # Project(repo=Repository(owner="Snowflake-Labs", name="snowcli", ref="main")),
+    # Project(repo=Repository(owner="aiven", name="aiven-client", ref="main")),
+    # Project(repo=Repository(owner="alteryx", name="featuretools", ref="main")),
+    # Project(
+    #     repo=Repository(owner="apache", name="airflow", ref="main"),
     #     check_options=CheckOptions(select="ALL"),
     # ),
-    # Target(repo=Repository(owner="aws", name="aws-sam-cli", branch="develop")),
-    # Target(repo=Repository(owner="bloomberg", name="pytest-memray", branch="main")),
-    # Target(
-    #     repo=Repository(owner="bokeh", name="bokeh", branch="branch-3.3"),
-    #     check_options=CheckOptions(select="ALL"),
-    # ),
-    # Target(repo=Repository(owner="commaai", name="openpilot", branch="master")),
-    # Target(repo=Repository(owner="demisto", name="content", branch="master")),
-    # Target(repo=Repository(owner="docker", name="docker-py", branch="main")),
-    # Target(
-    #     repo=Repository(owner="freedomofpress", name="securedrop", branch="develop")
-    # ),
-    # Target(repo=Repository(owner="fronzbot", name="blinkpy", branch="dev")),
-    # Target(repo=Repository(owner="ibis-project", name="ibis", branch="master")),
-    # Target(repo=Repository(owner="ing-bank", name="probatus", branch="main")),
-    # Target(repo=Repository(owner="jrnl-org", name="jrnl", branch="develop")),
-    # Target(repo=Repository(owner="latchbio", name="latch", branch="main")),
-    # Target(repo=Repository(owner="lnbits", name="lnbits", branch="main")),
-    # Target(repo=Repository(owner="milvus-io", name="pymilvus", branch="master")),
-    # Target(repo=Repository(owner="mlflow", name="mlflow", branch="master")),
-    # Target(repo=Repository(owner="model-bakers", name="model_bakery", branch="main")),
-    # Target(repo=Repository(owner="pandas-dev", name="pandas", branch="main")),
-    # Target(repo=Repository(owner="prefecthq", name="prefect", branch="main")),
-    # Target(repo=Repository(owner="pypa", name="build", branch="main")),
-    # Target(repo=Repository(owner="pypa", name="cibuildwheel", branch="main")),
-    # Target(repo=Repository(owner="pypa", name="pip", branch="main")),
-    # Target(repo=Repository(owner="pypa", name="setuptools", branch="main")),
-    # Target(repo=Repository(owner="python", name="mypy", branch="master")),
-    # Target(
+    # Project(repo=Repository(owner="aws", name="aws-sam-cli", ref="develop")),
+    # Project(repo=Repository(owner="bloomberg", name="pytest-memray", ref="main")),
+    Project(
+        repo=Repository(owner="bokeh", name="bokeh", ref="branch-3.3"),
+        check_options=CheckOptions(select="ALL"),
+    ),
+    # Project(repo=Repository(owner="commaai", name="openpilot", ref="master")),
+    # Project(repo=Repository(owner="demisto", name="content", ref="master")),
+    # Project(repo=Repository(owner="docker", name="docker-py", ref="main")),
+    # Project(repo=Repository(owner="freedomofpress", name="securedrop", ref="develop")),
+    # Project(repo=Repository(owner="fronzbot", name="blinkpy", ref="dev")),
+    # Project(repo=Repository(owner="ibis-project", name="ibis", ref="master")),
+    # Project(repo=Repository(owner="ing-bank", name="probatus", ref="main")),
+    # Project(repo=Repository(owner="jrnl-org", name="jrnl", ref="develop")),
+    # Project(repo=Repository(owner="latchbio", name="latch", ref="main")),
+    # Project(repo=Repository(owner="lnbits", name="lnbits", ref="main")),
+    # Project(repo=Repository(owner="milvus-io", name="pymilvus", ref="master")),
+    # Project(repo=Repository(owner="mlflow", name="mlflow", ref="master")),
+    # Project(repo=Repository(owner="model-bakers", name="model_bakery", ref="main")),
+    # Project(repo=Repository(owner="pandas-dev", name="pandas", ref="main")),
+    # Project(repo=Repository(owner="prefecthq", name="prefect", ref="main")),
+    # Project(repo=Repository(owner="pypa", name="build", ref="main")),
+    # Project(repo=Repository(owner="pypa", name="cibuildwheel", ref="main")),
+    # Project(repo=Repository(owner="pypa", name="pip", ref="main")),
+    # Project(repo=Repository(owner="pypa", name="setuptools", ref="main")),
+    # Project(repo=Repository(owner="python", name="mypy", ref="master")),
+    # Project(
     #     repo=Repository(
     #         owner="python",
     #         name="typeshed",
-    #         branch="main",
+    #         ref="main",
     #     ),
     #     check_options=CheckOptions(select="PYI"),
     # ),
-    # Target(repo=Repository(owner="python-poetry", name="poetry", branch="master")),
-    # Target(repo=Repository(owner="reflex-dev", name="reflex", branch="main")),
-    # Target(repo=Repository(owner="rotki", name="rotki", branch="develop")),
-    # Target(repo=Repository(owner="scikit-build", name="scikit-build", branch="main")),
-    # Target(
-    #     repo=Repository(owner="scikit-build", name="scikit-build-core", branch="main")
+    # Project(repo=Repository(owner="python-poetry", name="poetry", ref="master")),
+    # Project(repo=Repository(owner="reflex-dev", name="reflex", ref="main")),
+    # Project(repo=Repository(owner="rotki", name="rotki", ref="develop")),
+    # Project(repo=Repository(owner="scikit-build", name="scikit-build", ref="main")),
+    # Project(
+    #     repo=Repository(owner="scikit-build", name="scikit-build-core", ref="main")
+    # ),
+    # Project(repo=Repository(owner="sphinx-doc", name="sphinx", ref="master")),
+    # Project(repo=Repository(owner="spruceid", name="siwe-py", ref="main")),
+    # Project(repo=Repository(owner="tiangolo", name="fastapi", ref="master")),
+    # Project(repo=Repository(owner="yandex", name="ch-backup", ref="main")),
+    # Project(
+    #     repo=Repository(owner="zulip", name="zulip", ref="main"),
+    #     check_options=CheckOptions(select="ALL"),
     # ),
-    # Target(repo=Repository(owner="sphinx-doc", name="sphinx", branch="master")),
-    # Target(repo=Repository(owner="spruceid", name="siwe-py", branch="main")),
-    # Target(repo=Repository(owner="tiangolo", name="fastapi", branch="master")),
-    # Target(repo=Repository(owner="yandex", name="ch-backup", branch="main")),
-    Target(
-        repo=Repository(owner="zulip", name="zulip", branch="main"),
-        check_options=CheckOptions(select="ALL"),
-    ),
 ]
diff --git a/python/ruff-ecosystem/ruff_ecosystem/emitters.py b/python/ruff-ecosystem/ruff_ecosystem/emitters.py
deleted file mode 100644
index a0c64df84f..0000000000
--- a/python/ruff-ecosystem/ruff_ecosystem/emitters.py
+++ /dev/null
@@ -1,99 +0,0 @@
-from enum import Enum
-import abc
-from ruff_ecosystem.models import Target, Diff, ClonedRepository, Result
-from ruff_ecosystem.ruff import CHECK_DIFF_LINE_RE
-import traceback
-import json
-from pathlib import Path
-import dataclasses
-
-
-class Emitter(abc.ABC):
-    @abc.abstractclassmethod
-    def emit_error(cls, target: Target, exc: Exception):
-        pass
-
-    @abc.abstractclassmethod
-    def emit_diff(cls, target: Target, diff: Diff, cloned_repo: ClonedRepository):
-        pass
-
-    @abc.abstractclassmethod
-    def emit_result(cls, result: Result):
-        pass
-
-
-class DebugEmitter(Emitter):
-    def emit_error(cls, target: Target, exc: Exception):
-        print(f"Error in {target.repo.fullname}")
-        traceback.print_exception(exc)
-
-    def emit_diff(cls, target: Target, diff: Diff, cloned_repo: ClonedRepository):
-        pass
-
-
-class JSONEmitter(Emitter):
-    class DataclassJSONEncoder(json.JSONEncoder):
-        def default(self, o):
-            if dataclasses.is_dataclass(o):
-                return dataclasses.asdict(o)
-            if isinstance(o, set):
-                return tuple(o)
-            if isinstance(o, Path):
-                return str(o)
-            return super().default(o)
-
-    def emit_error(cls, target: Target, exc: Exception):
-        pass
-
-    def emit_diff(cls, target: Target, diff: Diff, cloned_repo: ClonedRepository):
-        pass
-
-    def emit_result(cls, result: Result):
-        print(json.dumps(result, indent=4, cls=cls.DataclassJSONEncoder))
-
-
-class MarkdownEmitter(Emitter):
-    def emit_error(cls, target: Target, exc: Exception):
-        cls._print(title="error", content=f"```\n{exc}\n```", target=target)
-
-    def emit_diff(cls, target: Target, diff: Diff, cloned_repo: ClonedRepository):
-        changes = f"+{len(diff.added)}, -{len(diff.removed)}"
-
-        content = ""
-        for line in list(diff):
-            match = CHECK_DIFF_LINE_RE.match(line)
-            if match is None:
-                content += line + "\n"
-                continue
-
-            pre, inner, path, lnum, post = match.groups()
-            url = cloned_repo.url_for(path, int(lnum))
-            content += f"{pre} {inner} {post}" + "\n"
-
-        cls._print(title=changes, content=f"
\n{content}\n
", target=target) - - def _print(cls, title: str, content: str, target: Target): - print(f"
{target.repo.fullname} ({title})") - print(target.repo.url, target.check_options.summary()) - print("

") - print() - - print(content) - - print() - print("

") - print("
") - - -class EmitterType(Enum): - markdown = "markdown" - json = "json" - - def to_emitter(self) -> Emitter: - match self: - case self.markdown: - return MarkdownEmitter() - case self.json: - return JSONEmitter() - case _: - raise ValueError("Unknown emitter type {self}") diff --git a/python/ruff-ecosystem/ruff_ecosystem/format.py b/python/ruff-ecosystem/ruff_ecosystem/format.py new file mode 100644 index 0000000000..027a744ff1 --- /dev/null +++ b/python/ruff-ecosystem/ruff_ecosystem/format.py @@ -0,0 +1,238 @@ +from __future__ import annotations + +import time +from asyncio import create_subprocess_exec +from dataclasses import dataclass +from pathlib import Path +from subprocess import PIPE +from unidiff import PatchSet +from typing import TYPE_CHECKING, Self, Sequence +import re +from ruff_ecosystem import logger +from ruff_ecosystem.markdown import project_section +from ruff_ecosystem.types import Comparison, Diff, Result, RuffError + +if TYPE_CHECKING: + from ruff_ecosystem.projects import ClonedRepository, Project + + +FORMAT_IGNORE_LINES = re.compile("^warning: `ruff format` is a work-in-progress.*") + + +def summarize_format_result(result: Result) -> str: + lines = [] + total_lines_removed = total_lines_added = 0 + total_files_modified = 0 + error_count = len(result.errored) + patch_sets = [] + + for project, comparison in result.completed: + total_lines_added += comparison.diff.lines_added + total_lines_removed += comparison.diff.lines_removed + + patch_set = PatchSet("\n".join(comparison.diff.lines)) + patch_sets.append(patch_set) + total_files_modified += len(patch_set.modified_files) + + if total_lines_removed == 0 and total_lines_added == 0 and error_count == 0: + return "\u2705 ecosystem check detected no format changes." 
+ + # Summarize the total changes + es = "s" if error_count != 1 else "" + fs = "s" if total_files_modified != 1 else "" + changes = f"(+{total_lines_added}, -{total_lines_removed} lines in {total_files_modified} file{fs}, {error_count} error{es})" + lines.append(f"\u2139\ufe0f ecosystem check **detected format changes**. {changes}") + lines.append("") + + # Then per-project changes + for (project, comparison), patch_set in zip(result.completed, patch_sets): + if not comparison.diff: + continue # Skip empty diffs + + files = len(patch_set.modified_files) + s = "s" if files != 1 else "" + title = f"+{comparison.diff.lines_added}, -{comparison.diff.lines_removed} lines in {files} file{s}" + + lines.extend( + project_section( + title=title, + content=patch_set_with_permalinks(patch_set, comparison.repo), + options=project.format_options.markdown(), + project=project, + ) + ) + + for project, error in result.errored: + lines.extend( + project_section( + title="error", + content=str(error), + options="", + project=project, + ) + ) + + return "\n".join(lines) + + +async def ruff_format( + *, + executable: Path, + path: Path, + name: str, + options: FormatOptions, + diff: bool = False, +) -> Sequence[str]: + """Run the given ruff binary against the specified path.""" + logger.debug(f"Formatting {name} with {executable}") + ruff_args = ["format"] + + if diff: + ruff_args.append("--diff") + + start = time.time() + proc = await create_subprocess_exec( + executable.absolute(), + *ruff_args, + ".", + stdout=PIPE, + stderr=PIPE, + cwd=path, + ) + result, err = await proc.communicate() + end = time.time() + + logger.debug(f"Finished formatting {name} with {executable} in {end - start:.2f}s") + + if proc.returncode not in [0, 1]: + raise RuffError(err.decode("utf8")) + + lines = result.decode("utf8").splitlines() + return lines + + +async def black_format( + *, + executable: Path, + path: Path, + name: str, +) -> Sequence[str]: + """Run the given black binary against the 
specified path.""" + logger.debug(f"Formatting {name} with {executable}") + black_args = [] + + start = time.time() + proc = await create_subprocess_exec( + executable.absolute(), + *black_args, + ".", + stdout=PIPE, + stderr=PIPE, + cwd=path, + ) + result, err = await proc.communicate() + end = time.time() + + logger.debug(f"Finished formatting {name} with {executable} in {end - start:.2f}s") + + if proc.returncode != 0: + raise RuffError(err.decode("utf8")) + + lines = result.decode("utf8").splitlines() + return [line for line in lines if not FORMAT_IGNORE_LINES.match(line)] + + +async def compare_format( + ruff_baseline_executable: Path, + ruff_comparison_executable: Path, + options: FormatOptions, + cloned_repo: ClonedRepository, +): + # Run format without diff to get the baseline + await ruff_format( + executable=ruff_baseline_executable.resolve(), + path=cloned_repo.path, + name=cloned_repo.fullname, + options=options, + ) + # Then get the diff from stdout + diff = await ruff_format( + executable=ruff_comparison_executable.resolve(), + path=cloned_repo.path, + name=cloned_repo.fullname, + options=options, + diff=True, + ) + + return create_format_comparison(cloned_repo, FormatDiff(lines=diff)) + + +def create_format_comparison(repo: ClonedRepository, diff: str) -> FormatComparison: + return FormatComparison(diff=diff, repo=repo) + + +@dataclass(frozen=True) +class FormatOptions: + """ + Ruff format options + """ + + pass + + def markdown(self: Self) -> str: + return "" + + +@dataclass(frozen=True) +class FormatDiff(Diff): + """A diff from ruff format.""" + + lines: list[str] + + def __bool__(self: Self) -> bool: + """Return true if this diff is non-empty.""" + return bool(self.lines) + + @property + def added(self) -> set[str]: + return set(line for line in self.lines if line.startswith("+")) + + @property + def removed(self) -> set[str]: + return set(line for line in self.lines if line.startswith("-")) + + +@dataclass(frozen=True) +class 
FormatComparison(Comparison): + diff: FormatDiff + repo: ClonedRepository + + +@dataclass(frozen=True) +class FormatResult(Result): + comparisons: tuple[Project, FormatComparison] + + +def patch_set_with_permalinks(patch_set: PatchSet, repo: ClonedRepository) -> str: + lines = [] + for file_patch in patch_set: + file_link = repo.url_for(file_patch.path) + lines.append(f"{file_patch.path}") + for hunk in file_patch: + hunk_link = repo.url_for( + file_patch.path, + hunk.source_start, + hunk.source_start + hunk.source_length, + ) + hunk_lines = str(hunk).splitlines() + + # Add a link before the hunk + link_title = file_patch.path + hunk_link.split(file_patch.path)[-1] + lines.append(f"{link_title}") + + # Wrap the contents of the hunk in a diff code block + lines.append("```diff") + lines.extend(hunk_lines[1:]) + lines.append("```") + + return "\n".join(lines) diff --git a/python/ruff-ecosystem/ruff_ecosystem/git.py b/python/ruff-ecosystem/ruff_ecosystem/git.py deleted file mode 100644 index 16cb6ca1c5..0000000000 --- a/python/ruff-ecosystem/ruff_ecosystem/git.py +++ /dev/null @@ -1,72 +0,0 @@ -from ruff_ecosystem.models import Repository, ClonedRepository -from contextlib import asynccontextmanager -from pathlib import Path -from typing import AsyncGenerator -from asyncio import create_subprocess_exec -from subprocess import PIPE -from ruff_ecosystem import logger - - -@asynccontextmanager -async def clone( - repo: Repository, checkout_dir: Path -) -> AsyncGenerator[ClonedRepository, None]: - """Shallow clone this repository to a temporary directory.""" - if checkout_dir.exists(): - logger.debug(f"Reusing {repo.owner}:{repo.name}") - yield await _cloned_repository(repo, checkout_dir) - return - - logger.debug(f"Cloning {repo.owner}:{repo.name} to {checkout_dir}") - command = [ - "git", - "clone", - "--config", - "advice.detachedHead=false", - "--quiet", - "--depth", - "1", - "--no-tags", - ] - if repo.branch: - command.extend(["--branch", repo.branch]) - - 
command.extend( - [ - f"https://github.com/{repo.owner}/{repo.name}", - checkout_dir, - ], - ) - - process = await create_subprocess_exec(*command, env={"GIT_TERMINAL_PROMPT": "0"}) - - status_code = await process.wait() - - logger.debug( - f"Finished cloning {repo.fullname} with status {status_code}", - ) - yield await _cloned_repository(repo, checkout_dir) - - -async def _cloned_repository(repo: Repository, checkout_dir: Path) -> ClonedRepository: - return ClonedRepository( - name=repo.name, - owner=repo.owner, - branch=repo.branch, - path=checkout_dir, - commit_hash=await _get_commit_hash(checkout_dir), - ) - - -async def _get_commit_hash(checkout_dir: Path) -> str: - """ - Return the commit sha for the repository in the checkout directory. - """ - process = await create_subprocess_exec( - *["git", "rev-parse", "HEAD"], - cwd=checkout_dir, - stdout=PIPE, - ) - stdout, _ = await process.communicate() - assert await process.wait() == 0, f"Failed to retrieve commit sha at {checkout_dir}" - return stdout.decode().strip() diff --git a/python/ruff-ecosystem/ruff_ecosystem/main.py b/python/ruff-ecosystem/ruff_ecosystem/main.py index bc08e32df1..e271957650 100644 --- a/python/ruff-ecosystem/ruff_ecosystem/main.py +++ b/python/ruff-ecosystem/ruff_ecosystem/main.py @@ -1,32 +1,34 @@ -from ruff_ecosystem.models import ( - RuffCommand, - Target, - Diff, - ClonedRepository, - RuleChanges, - CheckComparison, - Result, -) -from pathlib import Path -from ruff_ecosystem import logger import asyncio -from ruff_ecosystem.git import clone -from ruff_ecosystem.ruff import ruff_check, ruff_format -from ruff_ecosystem.emitters import Emitter -import difflib +import dataclasses +import json +from enum import Enum +from pathlib import Path from typing import TypeVar -import re + +from ruff_ecosystem import logger +from ruff_ecosystem.check import compare_check, summarize_check_result +from ruff_ecosystem.format import compare_format, summarize_format_result +from ruff_ecosystem.projects 
import ( + Project, + RuffCommand, +) +from ruff_ecosystem.types import Comparison, Result, Serializable T = TypeVar("T") +class OutputFormat(Enum): + markdown = "markdown" + json = "json" + + async def main( command: RuffCommand, ruff_baseline_executable: Path, ruff_comparison_executable: Path, - targets: list[Target], + targets: list[Project], cache: Path | None, - emitter: Emitter, + format: OutputFormat, max_parallelism: int = 50, raise_on_failure: bool = False, ) -> None: @@ -42,7 +44,7 @@ async def main( async with semaphore: return await coroutine - comparisons: list[Exception | CheckComparison] = await asyncio.gather( + comparisons: list[Exception | Comparison] = await asyncio.gather( *[ limited_parallelism( clone_and_compare( @@ -59,177 +61,82 @@ async def main( ) comparisons_by_target = dict(zip(targets, comparisons, strict=True)) - # Calculate totals - total_removed = total_added = errors = 0 - total_rule_changes = RuleChanges() - for comparison in comparisons_by_target.values(): - if isinstance(comparison, Exception): - errors += 1 - else: - total_removed += len(comparison.diff.removed) - total_added += len(comparison.diff.added) - total_rule_changes += comparison.rule_changes - - errors = [] - comparisons = [] + errors, successes = [], [] for target, comparison in comparisons_by_target.items(): if isinstance(comparison, Exception): errors.append((target, comparison)) continue - if comparison.diff: - comparisons.append((target, comparison)) + successes.append((target, comparison)) - else: - continue + result = Result(completed=successes, errored=errors) - result = Result( - total_added=total_added, - total_removed=total_removed, - total_rule_changes=total_rule_changes, - comparisons=comparisons, - errors=errors, - ) + match format: + case OutputFormat.json: + print(json.dumps(result, indent=4, cls=JSONEncoder)) + case OutputFormat.markdown: + match command: + case RuffCommand.check: + print(summarize_check_result(result)) + case RuffCommand.format: + 
print(summarize_format_result(result)) + case _: + raise ValueError(f"Unknown target Ruff command {command}") + case _: + raise ValueError(f"Unknown output format {format}") - emitter.emit_result(result) - return - - if total_removed == 0 and total_added == 0 and errors == 0: - print("\u2705 ecosystem check detected no changes.") - return - - s = "s" if errors != 1 else "" - changes = f"(+{total_added}, -{total_removed}, {errors} error{s})" - - print(f"\u2139\ufe0f ecosystem check **detected changes**. {changes}") - print() - - for target, comparison in comparisons_by_target.items(): - if isinstance(comparison, Exception): - emitter.emit_error(target, comparison) - continue - - if comparison.diff: - emitter.emit_diff(target, comparison.diff, comparison.repo) - - else: - continue - - if len(total_rule_changes.rule_codes()) > 0: - print(f"Rules changed: {len(total_rule_changes.rule_codes())}") - print() - print("| Rule | Changes | Additions | Removals |") - print("| ---- | ------- | --------- | -------- |") - for rule, (additions, removals) in sorted( - total_rule_changes.items(), - key=lambda x: (x[1][0] + x[1][1]), - reverse=True, - ): - print(f"| {rule} | {additions + removals} | {additions} | {removals} |") + return None async def clone_and_compare( command: RuffCommand, ruff_baseline_executable: Path, ruff_comparison_executable: Path, - target: Target, + target: Project, cache: Path, -) -> CheckComparison: +) -> Comparison: """Check a specific repository against two versions of ruff.""" assert ":" not in target.repo.owner assert ":" not in target.repo.name match command: case RuffCommand.check: - ruff_task, create_comparison, options = ( - ruff_check, - create_check_comparison, + compare, options = ( + compare_check, target.check_options, ) case RuffCommand.format: - ruff_task, create_comparison, options = ( - ruff_format, - create_format_comparison, + compare, options = ( + compare_format, target.format_options, ) case _: - raise ValueError(f"Unknowm target 
Ruff command {command}") + raise ValueError(f"Unknown target Ruff command {command}") checkout_dir = cache.joinpath(f"{target.repo.owner}:{target.repo.name}") - async with clone(target.repo, checkout_dir) as cloned_repo: - try: - async with asyncio.TaskGroup() as tg: - baseline_task = tg.create_task( - ruff_task( - executable=ruff_baseline_executable.resolve(), - path=cloned_repo.path, - name=cloned_repo.fullname, - options=options, - ), - ) - comparison_task = tg.create_task( - ruff_task( - executable=ruff_comparison_executable.resolve(), - path=cloned_repo.path, - name=cloned_repo.fullname, - options=options, - ), - ) - except ExceptionGroup as e: - raise e.exceptions[0] from e + cloned_repo = await target.repo.clone(checkout_dir) - return create_comparison( - cloned_repo, baseline_task.result(), comparison_task.result() - ) + try: + return await compare( + ruff_baseline_executable, + ruff_comparison_executable, + options, + cloned_repo, + ) + except ExceptionGroup as e: + raise e.exceptions[0] from e -def create_check_comparison( - repo: ClonedRepository, baseline_output: str, comparison_output: str -) -> CheckComparison: - removed, added = set(), set() - - for line in difflib.ndiff(baseline_output, comparison_output): - if line.startswith("- "): - removed.add(line[2:]) - elif line.startswith("+ "): - added.add(line[2:]) - - diff = Diff(removed=removed, added=added) - - return CheckComparison( - diff=diff, repo=repo, rule_changes=rule_changes_from_diff(diff) - ) - - -def rule_changes_from_diff(diff: Diff) -> RuleChanges: - """ - Parse a diff from `ruff check` to determine the additions and removals for each rule. - """ - rule_changes = RuleChanges() - - # Count rule changes - for line in diff.lines(): - # Find rule change for current line or construction - # + /::: - matches = re.search(r": ([A-Z]{1,4}[0-9]{3,4})", line) - - if matches is None: - # Handle case where there are no regex matches e.g. 
- # + "?application=AIRFLOW&authenticator=TEST_AUTH&role=TEST_ROLE&warehouse=TEST_WAREHOUSE" # noqa: E501, ERA001 - # Which was found in local testing - continue - - rule_code = matches.group(1) - - # Get current additions and removals for this rule - current_changes = rule_changes[rule_code] - - # Check if addition or removal depending on the first character - if line[0] == "+": - current_changes = (current_changes[0] + 1, current_changes[1]) - elif line[0] == "-": - current_changes = (current_changes[0], current_changes[1] + 1) - - rule_changes[rule_code] = current_changes - - return rule_changes +class JSONEncoder(json.JSONEncoder): + def default(self, o): + if isinstance(o, Serializable): + return o.jsonable() + if dataclasses.is_dataclass(o): + return dataclasses.asdict(o) + if isinstance(o, set): + return tuple(o) + if isinstance(o, Path): + return str(o) + if isinstance(o, Exception): + return str(o) + return super().default(o) diff --git a/python/ruff-ecosystem/ruff_ecosystem/markdown.py b/python/ruff-ecosystem/ruff_ecosystem/markdown.py new file mode 100644 index 0000000000..f30ab64b75 --- /dev/null +++ b/python/ruff-ecosystem/ruff_ecosystem/markdown.py @@ -0,0 +1,25 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from ruff_ecosystem.projects import Project + + +def project_section( + title: str, content: str, options: str, project: Project +) -> list[str]: + lines = [] + lines.append( + f'
<details><summary><a href="{project.repo.url}">{project.repo.fullname}</a> ({title})</summary>' + ) + lines.append(options) + lines.append("<p>") + lines.append("") + + lines.append(content) + + lines.append("") + lines.append("</p>") + lines.append("</details>
") + return lines diff --git a/python/ruff-ecosystem/ruff_ecosystem/models.py b/python/ruff-ecosystem/ruff_ecosystem/models.py deleted file mode 100644 index be5c6cc60d..0000000000 --- a/python/ruff-ecosystem/ruff_ecosystem/models.py +++ /dev/null @@ -1,160 +0,0 @@ -from enum import Enum -from dataclasses import dataclass, field -from typing import Self, Iterator -import heapq -from pathlib import Path - - -class RuffCommand(Enum): - check = "check" - format = "format" - - -@dataclass(frozen=True) -class Repository: - """ - A remote GitHub repository - """ - - owner: str - name: str - branch: str | None - - @property - def fullname(self) -> str: - return f"{self.owner}/{self.name}" - - @property - def url(self: Self) -> str: - return f"https://github.com/{self.owner}/{self.name}" - - -@dataclass(frozen=True) -class ClonedRepository(Repository): - """ - A cloned GitHub repository, which includes the hash of the cloned commit. - """ - - commit_hash: str - path: Path - - def url_for(self: Self, path: str, line_number: int | None = None) -> str: - """ - Return the remote GitHub URL for the given path in this repository. 
- """ - # Default to main branch - url = f"https://github.com/{self.owner}/{self.name}/blob/{self.commit_hash}/{path}" - if line_number: - url += f"#L{line_number}" - return url - - @property - def url(self: Self) -> str: - return f"https://github.com/{self.owner}/{self.name}@{self.commit_hash}" - - -@dataclass(frozen=True) -class Diff: - """A diff between two runs of ruff.""" - - removed: set[str] - added: set[str] - - def __bool__(self: Self) -> bool: - """Return true if this diff is non-empty.""" - return bool(self.removed or self.added) - - def lines(self: Self) -> Iterator[str]: - """Iterate through the changed lines in diff format.""" - for line in heapq.merge(sorted(self.removed), sorted(self.added)): - if line in self.removed: - yield f"- {line}" - else: - yield f"+ {line}" - - -@dataclass(frozen=True) -class RuleChanges: - changes: dict[str, tuple[int, int]] = field(default_factory=dict) - - def rule_codes(self) -> list[str]: - return list(self.changes.keys()) - - def items(self) -> Iterator[tuple[str, tuple[int, int]]]: - return self.changes.items() - - def __setitem__(self, key: str, value: tuple[int, int]) -> None: - self.changes[key] = value - - def __getitem__(self, key: str) -> tuple[int, int]: - return self.changes.get(key, (0, 0)) - - def __add__(self, other: Self) -> Self: - if not isinstance(other, type(self)): - return NotImplemented - - result = self.changes.copy() - for rule_code, (added, removed) in other.changes.items(): - if rule_code in result: - result[rule_code] = ( - result[rule_code][0] + added, - result[rule_code][1] + removed, - ) - else: - result[rule_code] = (added, removed) - - return RuleChanges(changes=result) - - -@dataclass(frozen=True) -class CheckComparison: - diff: Diff - repo: ClonedRepository - rule_changes: RuleChanges - - -@dataclass(frozen=True) -class CheckOptions: - """ - Ruff check options - """ - - select: str = "" - ignore: str = "" - exclude: str = "" - - # Generating fixes is slow and verbose - show_fixes: bool 
= False - - def summary(self) -> str: - return f"select {self.select} ignore {self.ignore} exclude {self.exclude}" - - -@dataclass(frozen=True) -class FormatOptions: - """ - Ruff format options - """ - - pass - - -@dataclass(frozen=True) -class Target: - """ - An ecosystem target - """ - - repo: Repository - check_options: CheckOptions = field(default_factory=CheckOptions) - format_options: FormatOptions = field(default_factory=FormatOptions) - - -@dataclass(frozen=True) -class Result: - total_added: int - total_removed: int - total_rule_changes: RuleChanges - - comparisons: tuple[Target, CheckComparison] - errors: tuple[Target, Exception] diff --git a/python/ruff-ecosystem/ruff_ecosystem/projects.py b/python/ruff-ecosystem/ruff_ecosystem/projects.py new file mode 100644 index 0000000000..069bb7ae23 --- /dev/null +++ b/python/ruff-ecosystem/ruff_ecosystem/projects.py @@ -0,0 +1,168 @@ +from __future__ import annotations + +from asyncio import create_subprocess_exec +from dataclasses import dataclass, field +from enum import Enum +from pathlib import Path +from subprocess import PIPE +from typing import AsyncGenerator, Self + +from ruff_ecosystem import logger +from ruff_ecosystem.check import CheckOptions +from ruff_ecosystem.format import FormatOptions +from ruff_ecosystem.types import Serializable + + +@dataclass(frozen=True) +class Project(Serializable): + """ + An ecosystem target + """ + + repo: Repository + check_options: CheckOptions = field(default_factory=lambda: CheckOptions()) + format_options: FormatOptions = field(default_factory=lambda: FormatOptions()) + + +class RuffCommand(Enum): + check = "check" + format = "format" + + +class ProjectSetupError(Exception): + """An error setting up a project.""" + + +@dataclass(frozen=True) +class Repository(Serializable): + """ + A remote GitHub repository + """ + + owner: str + name: str + ref: str | None + + @property + def fullname(self) -> str: + return f"{self.owner}/{self.name}" + + @property + def url(self: 
Self) -> str: + return f"https://github.com/{self.owner}/{self.name}" + + async def clone( + self: Self, checkout_dir: Path + ) -> AsyncGenerator[ClonedRepository, None]: + """ + Shallow clone this repository + """ + if checkout_dir.exists(): + logger.debug(f"Reusing {self.owner}:{self.name}") + + if self.ref: + logger.debug(f"Checking out ref {self.ref}") + process = await create_subprocess_exec( + *["git", "checkout", "-f", self.ref], + cwd=checkout_dir, + env={"GIT_TERMINAL_PROMPT": "0"}, + stdout=PIPE, + stderr=PIPE, + ) + if await process.wait() != 0: + _, stderr = await process.communicate() + raise ProjectSetupError( + f"Failed to checkout {self.ref}: {stderr.decode()}" + ) + + return ClonedRepository( + name=self.name, + owner=self.owner, + ref=self.ref, + path=checkout_dir, + commit_hash=await self._get_head_commit(checkout_dir), + ) + + logger.debug(f"Cloning {self.owner}:{self.name} to {checkout_dir}") + command = [ + "git", + "clone", + "--config", + "advice.detachedHead=false", + "--quiet", + "--depth", + "1", + "--no-tags", + ] + if self.ref: + command.extend(["--branch", self.ref]) + + command.extend( + [ + f"https://github.com/{self.owner}/{self.name}", + checkout_dir, + ], + ) + + process = await create_subprocess_exec( + *command, env={"GIT_TERMINAL_PROMPT": "0"} + ) + + status_code = await process.wait() + + logger.debug( + f"Finished cloning {self.fullname} with status {status_code}", + ) + return ClonedRepository( + name=self.name, + owner=self.owner, + ref=self.ref, + path=checkout_dir, + commit_hash=await self._get_head_commit(checkout_dir), + ) + + @staticmethod + async def _get_head_commit(checkout_dir: Path) -> str: + """ + Return the commit sha for the repository in the checkout directory. 
+ """ + process = await create_subprocess_exec( + *["git", "rev-parse", "HEAD"], + cwd=checkout_dir, + stdout=PIPE, + ) + stdout, _ = await process.communicate() + if await process.wait() != 0: + raise ProjectSetupError(f"Failed to retrieve commit sha at {checkout_dir}") + + return stdout.decode().strip() + + +@dataclass(frozen=True) +class ClonedRepository(Repository, Serializable): + """ + A cloned GitHub repository, which includes the hash of the cloned commit. + """ + + commit_hash: str + path: Path + + def url_for( + self: Self, + path: str, + line_number: int | None = None, + end_line_number: int | None = None, + ) -> str: + """ + Return the remote GitHub URL for the given path in this repository. + """ + url = f"https://github.com/{self.owner}/{self.name}/blob/{self.commit_hash}/{path}" + if line_number: + url += f"#L{line_number}" + if end_line_number: + url += f"-L{end_line_number}" + return url + + @property + def url(self: Self) -> str: + return f"https://github.com/{self.owner}/{self.name}@{self.commit_hash}" diff --git a/python/ruff-ecosystem/ruff_ecosystem/ruff.py b/python/ruff-ecosystem/ruff_ecosystem/ruff.py deleted file mode 100644 index 74fd624bb4..0000000000 --- a/python/ruff-ecosystem/ruff_ecosystem/ruff.py +++ /dev/null @@ -1,90 +0,0 @@ -from pathlib import Path -from ruff_ecosystem import logger -from ruff_ecosystem.models import CheckOptions, FormatOptions -import time -from asyncio import create_subprocess_exec -from subprocess import PIPE -from typing import Sequence -import re - -CHECK_SUMMARY_LINE_RE = re.compile( - r"^(Found \d+ error.*)|(.*potentially fixable with.*)$" -) - - -CHECK_DIFF_LINE_RE = re.compile( - r"^(?P
<pre>[+-]) (?P<inner>(?P<path>[^:]+):(?P<lnum>\d+):\d+:) (?P<post>.*)$",
-)
-
-
-class RuffError(Exception):
-    """An error reported by ruff."""
-
-
-async def ruff_check(
-    *, executable: Path, path: Path, name: str, options: CheckOptions
-) -> Sequence[str]:
-    """Run the given ruff binary against the specified path."""
-    logger.debug(f"Checking {name} with {executable}")
-    ruff_args = ["check", "--no-cache", "--exit-zero"]
-    if options.select:
-        ruff_args.extend(["--select", options.select])
-    if options.ignore:
-        ruff_args.extend(["--ignore", options.ignore])
-    if options.exclude:
-        ruff_args.extend(["--exclude", options.exclude])
-    if options.show_fixes:
-        ruff_args.extend(["--show-fixes", "--ecosystem-ci"])
-
-    start = time.time()
-    proc = await create_subprocess_exec(
-        executable.absolute(),
-        *ruff_args,
-        ".",
-        stdout=PIPE,
-        stderr=PIPE,
-        cwd=path,
-    )
-    result, err = await proc.communicate()
-    end = time.time()
-
-    logger.debug(f"Finished checking {name} with {executable} in {end - start:.2f}")
-
-    if proc.returncode != 0:
-        raise RuffError(err.decode("utf8"))
-
-    lines = [
-        line
-        for line in result.decode("utf8").splitlines()
-        if not CHECK_SUMMARY_LINE_RE.match(line)
-    ]
-
-    return sorted(lines)
-
-
-async def ruff_format(
-    *, executable: Path, path: Path, name: str, options: FormatOptions
-) -> Sequence[str]:
-    """Run the given ruff binary against the specified path."""
-    logger.debug(f"Checking {name} with {executable}")
-    ruff_args = ["format", "--no-cache", "--exit-zero"]
-
-    start = time.time()
-    proc = await create_subprocess_exec(
-        executable.absolute(),
-        *ruff_args,
-        ".",
-        stdout=PIPE,
-        stderr=PIPE,
-        cwd=path,
-    )
-    result, err = await proc.communicate()
-    end = time.time()
-
-    logger.debug(f"Finished formatting {name} with {executable} in {end - start:.2f}")
-
-    if proc.returncode != 0:
-        raise RuffError(err.decode("utf8"))
-
-    lines = result.decode("utf8").splitlines()
-    return lines
diff --git a/python/ruff-ecosystem/ruff_ecosystem/types.py b/python/ruff-ecosystem/ruff_ecosystem/types.py
new file mode 100644
index 0000000000..ae8d3de217
--- /dev/null
+++ b/python/ruff-ecosystem/ruff_ecosystem/types.py
@@ -0,0 +1,70 @@
+from __future__ import annotations
+
+import abc
+import dataclasses
+import difflib
+from dataclasses import dataclass, is_dataclass
+from typing import TYPE_CHECKING, Any, Iterable, Sequence, Generator
+import heapq
+
+if TYPE_CHECKING:
+    from ruff_ecosystem.projects import ClonedRepository, Project
+
+
+class Serializable(abc.ABC):
+    """
+    Allows serialization of content by casting to a JSON-compatible type.
+    """
+
+    def jsonable(self) -> Any:
+        # Default implementation for dataclasses
+        if is_dataclass(self):
+            return dataclasses.asdict(self)
+
+        raise NotImplementedError()
+
+
+class Diff(Serializable):
+    def __init__(self, lines: Iterable[str]) -> None:
+        self.lines = list(lines)
+
+        # Compute added and removed lines once
+        self.added = list(line[2:] for line in self.lines if line.startswith("+ "))
+        self.removed = list(line[2:] for line in self.lines if line.startswith("- "))
+
+    def __bool__(self) -> bool:
+        return bool(self.added or self.removed)
+
+    def __iter__(self) -> Generator[str, None, None]:
+        yield from self.lines
+
+    @property
+    def lines_added(self):
+        return len(self.added)
+
+    @property
+    def lines_removed(self):
+        return len(self.removed)
+
+    @classmethod
+    def new(cls, baseline: Sequence[str], comparison: Sequence[str]):
+        return cls(difflib.ndiff(baseline, comparison))
+
+    def jsonable(self) -> Any:
+        return self.lines
+
+
+@dataclass(frozen=True)
+class Result(Serializable):
+    errored: list[tuple[Project, Exception]]
+    completed: list[tuple[Project, Comparison]]
+
+
+@dataclass(frozen=True)
+class Comparison(Serializable):
+    diff: Diff
+    repo: ClonedRepository
+
+
+class RuffError(Exception):
+    """An error reported by ruff."""