From 9d6444138b2351801e52a53ecc23cecb53905d2a Mon Sep 17 00:00:00 2001
From: Charlie Marsh
Date: Sat, 23 Dec 2023 16:43:11 -0500
Subject: [PATCH] Remove lexing and parsing from the linter benchmark (#9264)

## Summary

This PR adds some helper structs to the linter paths so that pre-computed
tokens and a pre-parsed AST can be passed in during benchmarking, removing
lexing and parsing from the overall linter benchmark measurement. We already
remove parsing for the formatter, and we have separate benchmarks for the
lexer and the parser, so this should make it much easier to measure linter
performance changes. (A usage sketch of the new `ParseSource` parameter
appears after the diff.)

---
 crates/ruff_benchmark/benches/linter.rs      |  17 ++-
 crates/ruff_cli/src/diagnostics.rs           |  24 +++-
 crates/ruff_linter/src/linter.rs             | 111 ++++++++++++++++---
 crates/ruff_linter/src/rules/pyflakes/mod.rs |   4 +-
 crates/ruff_linter/src/test.rs               |   6 +-
 crates/ruff_python_parser/src/lexer.rs       |   4 +-
 crates/ruff_python_parser/src/string.rs      |   4 +-
 crates/ruff_wasm/src/lib.rs                  |   4 +-
 8 files changed, 144 insertions(+), 30 deletions(-)

diff --git a/crates/ruff_benchmark/benches/linter.rs b/crates/ruff_benchmark/benches/linter.rs
index a98061fc82..b8f6bbd00d 100644
--- a/crates/ruff_benchmark/benches/linter.rs
+++ b/crates/ruff_benchmark/benches/linter.rs
@@ -2,7 +2,7 @@ use ruff_benchmark::criterion::{
     criterion_group, criterion_main, BenchmarkGroup, BenchmarkId, Criterion, Throughput,
 };
 use ruff_benchmark::{TestCase, TestFile, TestFileDownloadError};
-use ruff_linter::linter::lint_only;
+use ruff_linter::linter::{lint_only, ParseSource};
 use ruff_linter::rule_selector::PreviewOptions;
 use ruff_linter::settings::rule_table::RuleTable;
 use ruff_linter::settings::types::PreviewMode;
@@ -10,6 +10,7 @@ use ruff_linter::settings::{flags, LinterSettings};
 use ruff_linter::source_kind::SourceKind;
 use ruff_linter::{registry::Rule, RuleSelector};
 use ruff_python_ast::PySourceType;
+use ruff_python_parser::{lexer, parse_program_tokens, Mode};
 
 #[cfg(target_os = "windows")]
 #[global_allocator]
@@ -53,7 +54,13 @@ fn benchmark_linter(mut group: BenchmarkGroup, settings: &LinterSettings) {
             BenchmarkId::from_parameter(case.name()),
             &case,
             |b, case| {
-                let kind = SourceKind::Python(case.code().to_string());
+                // Tokenize the source.
+                let tokens = lexer::lex(case.code(), Mode::Module).collect::<Vec<_>>();
+
+                // Parse the source.
+                let ast =
+                    parse_program_tokens(tokens.clone(), case.code(), case.name(), false).unwrap();
+
                 b.iter(|| {
                     let path = case.path();
                     let result = lint_only(
@@ -61,8 +68,12 @@
                         None,
                         settings,
                         flags::Noqa::Enabled,
-                        &kind,
+                        &SourceKind::Python(case.code().to_string()),
                         PySourceType::from(path.as_path()),
+                        ParseSource::Precomputed {
+                            tokens: &tokens,
+                            ast: &ast,
+                        },
                     );
 
                     // Assert that file contains no parse errors
diff --git a/crates/ruff_cli/src/diagnostics.rs b/crates/ruff_cli/src/diagnostics.rs
index 076963fee5..b29354b12e 100644
--- a/crates/ruff_cli/src/diagnostics.rs
+++ b/crates/ruff_cli/src/diagnostics.rs
@@ -12,7 +12,7 @@ use rustc_hash::FxHashMap;
 
 use crate::cache::{Cache, FileCacheKey, LintCacheData};
 use ruff_diagnostics::Diagnostic;
-use ruff_linter::linter::{lint_fix, lint_only, FixTable, FixerResult, LinterResult};
+use ruff_linter::linter::{lint_fix, lint_only, FixTable, FixerResult, LinterResult, ParseSource};
 use ruff_linter::logging::DisplayParseError;
 use ruff_linter::message::Message;
 use ruff_linter::pyproject_toml::lint_pyproject_toml;
@@ -303,12 +303,28 @@ pub(crate) fn lint_path(
             (result, fixed)
         } else {
             // If we fail to fix, lint the original source code.
-            let result = lint_only(path, package, settings, noqa, &source_kind, source_type);
+            let result = lint_only(
+                path,
+                package,
+                settings,
+                noqa,
+                &source_kind,
+                source_type,
+                ParseSource::None,
+            );
             let fixed = FxHashMap::default();
             (result, fixed)
         }
     } else {
-        let result = lint_only(path, package, settings, noqa, &source_kind, source_type);
+        let result = lint_only(
+            path,
+            package,
+            settings,
+            noqa,
+            &source_kind,
+            source_type,
+            ParseSource::None,
+        );
         let fixed = FxHashMap::default();
         (result, fixed)
     };
@@ -444,6 +460,7 @@ pub(crate) fn lint_stdin(
                 noqa,
                 &source_kind,
                 source_type,
+                ParseSource::None,
             );
             let fixed = FxHashMap::default();
 
@@ -462,6 +479,7 @@
             noqa,
             &source_kind,
             source_type,
+            ParseSource::None,
         );
         let fixed = FxHashMap::default();
         (result, fixed)
diff --git a/crates/ruff_linter/src/linter.rs b/crates/ruff_linter/src/linter.rs
index f6382e8b30..2d81cc961c 100644
--- a/crates/ruff_linter/src/linter.rs
+++ b/crates/ruff_linter/src/linter.rs
@@ -11,7 +11,7 @@ use rustc_hash::FxHashMap;
 use ruff_diagnostics::Diagnostic;
 use ruff_notebook::Notebook;
 use ruff_python_ast::imports::ImportMap;
-use ruff_python_ast::PySourceType;
+use ruff_python_ast::{PySourceType, Suite};
 use ruff_python_codegen::Stylist;
 use ruff_python_index::Indexer;
 use ruff_python_parser::lexer::LexResult;
@@ -73,7 +73,6 @@ pub struct FixerResult<'a> {
 pub fn check_path(
     path: &Path,
     package: Option<&Path>,
-    tokens: Vec<LexResult>,
     locator: &Locator,
     stylist: &Stylist,
     indexer: &Indexer,
@@ -82,6 +81,7 @@
     noqa: flags::Noqa,
     source_kind: &SourceKind,
     source_type: PySourceType,
+    tokens: TokenSource,
 ) -> LinterResult<(Vec<Diagnostic>, Option<ImportMap>)> {
     // Aggregate all diagnostics.
     let mut diagnostics = vec![];
@@ -144,12 +144,8 @@
         .iter_enabled()
         .any(|rule_code| rule_code.lint_source().is_imports());
     if use_ast || use_imports || use_doc_lines {
-        match ruff_python_parser::parse_program_tokens(
-            tokens,
-            source_kind.source_code(),
-            &path.to_string_lossy(),
-            source_type.is_ipynb(),
-        ) {
+        // Parse, if the AST wasn't pre-provided.
+        match tokens.into_ast_source(source_kind, source_type, path) {
             Ok(python_ast) => {
                 let cell_offsets = source_kind.as_ipy_notebook().map(Notebook::cell_offsets);
                 if use_ast {
@@ -325,7 +321,6 @@
     } = check_path(
         path,
         package,
-        tokens,
         &locator,
         &stylist,
         &indexer,
@@ -334,6 +329,7 @@
         flags::Noqa::Disabled,
         source_kind,
         source_type,
+        TokenSource::Tokens(tokens),
     );
 
     // Log any parse errors.
@@ -365,10 +361,10 @@ pub fn lint_only(
     noqa: flags::Noqa,
     source_kind: &SourceKind,
     source_type: PySourceType,
+    data: ParseSource,
 ) -> LinterResult<(Vec<Diagnostic>, Option<ImportMap>)> {
     // Tokenize once.
-    let tokens: Vec<LexResult> =
-        ruff_python_parser::tokenize(source_kind.source_code(), source_type.as_mode());
+    let tokens = data.into_token_source(source_kind, source_type);
 
     // Map row and column locations to byte slices (lazily).
     let locator = Locator::new(source_kind.source_code());
@@ -391,7 +387,6 @@
     let result = check_path(
         path,
         package,
-        tokens,
         &locator,
         &stylist,
         &indexer,
@@ -400,6 +395,7 @@
         noqa,
         source_kind,
         source_type,
+        tokens,
     );
 
     result.map(|(diagnostics, imports)| {
@@ -487,7 +483,6 @@
         let result = check_path(
             path,
             package,
-            tokens,
             &locator,
             &stylist,
             &indexer,
@@ -496,6 +491,7 @@
             noqa,
             &transformed,
             source_type,
+            TokenSource::Tokens(tokens),
         );
 
         if iterations == 0 {
@@ -632,6 +628,95 @@ This indicates a bug in Ruff. If you could open an issue at:
     }
 }
 
+#[derive(Debug, Clone)]
+pub enum ParseSource<'a> {
+    /// Extract the tokens and AST from the given source code.
+    None,
+    /// Use the precomputed tokens and AST.
+    Precomputed {
+        tokens: &'a [LexResult],
+        ast: &'a Suite,
+    },
+}
+
+impl<'a> ParseSource<'a> {
+    /// Convert to a [`TokenSource`], tokenizing if necessary.
+    fn into_token_source(
+        self,
+        source_kind: &SourceKind,
+        source_type: PySourceType,
+    ) -> TokenSource<'a> {
+        match self {
+            Self::None => TokenSource::Tokens(ruff_python_parser::tokenize(
+                source_kind.source_code(),
+                source_type.as_mode(),
+            )),
+            Self::Precomputed { tokens, ast } => TokenSource::Precomputed { tokens, ast },
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+pub enum TokenSource<'a> {
+    /// Use the precomputed tokens to generate the AST.
+    Tokens(Vec<LexResult>),
+    /// Use the precomputed tokens and AST.
+    Precomputed {
+        tokens: &'a [LexResult],
+        ast: &'a Suite,
+    },
+}
+
+impl Deref for TokenSource<'_> {
+    type Target = [LexResult];
+
+    fn deref(&self) -> &Self::Target {
+        match self {
+            Self::Tokens(tokens) => tokens,
+            Self::Precomputed { tokens, .. } => tokens,
+        }
+    }
+}
+
+impl<'a> TokenSource<'a> {
+    /// Convert to an [`AstSource`], parsing if necessary.
+    fn into_ast_source(
+        self,
+        source_kind: &SourceKind,
+        source_type: PySourceType,
+        path: &Path,
+    ) -> Result<AstSource<'a>, ParseError> {
+        match self {
+            Self::Tokens(tokens) => Ok(AstSource::Ast(ruff_python_parser::parse_program_tokens(
+                tokens,
+                source_kind.source_code(),
+                &path.to_string_lossy(),
+                source_type.is_ipynb(),
+            )?)),
+            Self::Precomputed { ast, .. } => Ok(AstSource::Precomputed(ast)),
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+pub enum AstSource<'a> {
+    /// Extract the AST from the given source code.
+    Ast(Suite),
+    /// Use the precomputed AST.
+    Precomputed(&'a Suite),
+}
+
+impl Deref for AstSource<'_> {
+    type Target = Suite;
+
+    fn deref(&self) -> &Self::Target {
+        match self {
+            Self::Ast(ast) => ast,
+            Self::Precomputed(ast) => ast,
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use std::path::Path;
diff --git a/crates/ruff_linter/src/rules/pyflakes/mod.rs b/crates/ruff_linter/src/rules/pyflakes/mod.rs
index e4a313f54b..84b54f9126 100644
--- a/crates/ruff_linter/src/rules/pyflakes/mod.rs
+++ b/crates/ruff_linter/src/rules/pyflakes/mod.rs
@@ -23,7 +23,7 @@ mod tests {
     use ruff_source_file::Locator;
    use ruff_text_size::Ranged;
 
-    use crate::linter::{check_path, LinterResult};
+    use crate::linter::{check_path, LinterResult, TokenSource};
     use crate::registry::{AsRule, Linter, Rule};
     use crate::rules::pyflakes;
     use crate::settings::types::PreviewMode;
@@ -560,7 +560,6 @@
         } = check_path(
             Path::new("<filename>"),
             None,
-            tokens,
             &locator,
             &stylist,
             &indexer,
@@ -569,6 +568,7 @@
             flags::Noqa::Enabled,
             &source_kind,
             source_type,
+            TokenSource::Tokens(tokens),
         );
         diagnostics.sort_by_key(Ranged::start);
         let actual = diagnostics
diff --git a/crates/ruff_linter/src/test.rs b/crates/ruff_linter/src/test.rs
index d193e49cc9..9f7ec4d802 100644
--- a/crates/ruff_linter/src/test.rs
+++ b/crates/ruff_linter/src/test.rs
@@ -21,7 +21,7 @@ use ruff_text_size::Ranged;
 
 use crate::directives;
 use crate::fix::{fix_file, FixResult};
-use crate::linter::{check_path, LinterResult};
+use crate::linter::{check_path, LinterResult, TokenSource};
 use crate::message::{Emitter, EmitterContext, Message, TextEmitter};
 use crate::packaging::detect_package_root;
 use crate::registry::AsRule;
@@ -129,7 +129,6 @@ pub(crate) fn test_contents<'a>(
         path,
         path.parent()
             .and_then(|parent| detect_package_root(parent, &settings.namespace_packages)),
-        tokens,
         &locator,
         &stylist,
         &indexer,
@@ -138,6 +137,7 @@ pub(crate) fn test_contents<'a>(
         flags::Noqa::Enabled,
         source_kind,
         source_type,
+        TokenSource::Tokens(tokens),
     );
 
     let source_has_errors = error.is_some();
@@ -195,7 +195,6 @@ pub(crate) fn test_contents<'a>(
         } = check_path(
             path,
             None,
-            tokens,
             &locator,
             &stylist,
             &indexer,
@@ -204,6 +203,7 @@
             flags::Noqa::Enabled,
             &transformed,
             source_type,
+            TokenSource::Tokens(tokens),
         );
 
         if let Some(fixed_error) = fixed_error {
diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs
index 3831bc28d0..180c2839fe 100644
--- a/crates/ruff_python_parser/src/lexer.rs
+++ b/crates/ruff_python_parser/src/lexer.rs
@@ -1293,7 +1293,7 @@ impl FusedIterator for Lexer<'_> {}
 /// [lexer] implementation.
 ///
 /// [lexer]: crate::lexer
-#[derive(Debug, PartialEq)]
+#[derive(Debug, Clone, PartialEq)]
 pub struct LexicalError {
     /// The type of error that occurred.
     pub error: LexicalErrorType,
@@ -1309,7 +1309,7 @@ impl LexicalError {
 }
 
 /// Represents the different types of errors that can occur during lexing.
-#[derive(Debug, PartialEq)]
+#[derive(Debug, Clone, PartialEq)]
 pub enum LexicalErrorType {
     // TODO: Can probably be removed, the places it is used seem to be able
     // to use the `UnicodeError` variant instead.
diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs
index ab9106ff31..3abef4c8a4 100644
--- a/crates/ruff_python_parser/src/string.rs
+++ b/crates/ruff_python_parser/src/string.rs
@@ -409,7 +409,7 @@ pub(crate) fn concatenated_strings(
 
 // TODO: consolidate these with ParseError
 /// An error that occurred during parsing of an f-string.
-#[derive(Debug, PartialEq)]
+#[derive(Debug, Clone, PartialEq)]
 struct FStringError {
     /// The type of error that occurred.
     pub(crate) error: FStringErrorType,
@@ -427,7 +427,7 @@ impl From<FStringError> for LexicalError {
 }
 
 /// Represents the different types of errors that can occur during parsing of an f-string.
-#[derive(Debug, PartialEq)]
+#[derive(Debug, Clone, PartialEq)]
 pub enum FStringErrorType {
     /// Expected a right brace after an opened left brace.
     UnclosedLbrace,
diff --git a/crates/ruff_wasm/src/lib.rs b/crates/ruff_wasm/src/lib.rs
index a067cde03e..5316fcf7dd 100644
--- a/crates/ruff_wasm/src/lib.rs
+++ b/crates/ruff_wasm/src/lib.rs
@@ -7,7 +7,7 @@ use wasm_bindgen::prelude::*;
 use ruff_formatter::{FormatResult, Formatted, IndentStyle};
 use ruff_linter::directives;
 use ruff_linter::line_width::{IndentWidth, LineLength};
-use ruff_linter::linter::{check_path, LinterResult};
+use ruff_linter::linter::{check_path, LinterResult, TokenSource};
 use ruff_linter::registry::AsRule;
 use ruff_linter::settings::types::PythonVersion;
 use ruff_linter::settings::{flags, DEFAULT_SELECTORS, DUMMY_VARIABLE_RGX};
@@ -182,7 +182,6 @@ impl Workspace {
         } = check_path(
             Path::new("<filename>"),
             None,
-            tokens,
             &locator,
             &stylist,
             &indexer,
@@ -191,6 +190,7 @@
             flags::Noqa::Enabled,
             &source_kind,
             source_type,
+            TokenSource::Tokens(tokens),
         );
 
         let source_code = locator.to_source_code();
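
## Usage sketch

For illustration, here is a minimal sketch (not part of the patch) of how a caller can now hand pre-computed tokens and a pre-parsed AST to `lint_only`, mirroring the benchmark change above. The source string and file name are hypothetical, and it assumes the crate-internal APIs at this revision (`lexer::lex`, `parse_program_tokens`, `ParseSource`) as well as a `Default` implementation for `LinterSettings`.

```rust
use std::path::Path;

use ruff_linter::linter::{lint_only, ParseSource};
use ruff_linter::settings::{flags, LinterSettings};
use ruff_linter::source_kind::SourceKind;
use ruff_python_ast::PySourceType;
use ruff_python_parser::{lexer, parse_program_tokens, Mode};

fn main() {
    // Hypothetical input; any Python source would do.
    let source = "import os\nprint(os.name)\n";

    // Lex and parse once, up front.
    let tokens = lexer::lex(source, Mode::Module).collect::<Vec<_>>();
    let ast = parse_program_tokens(tokens.clone(), source, "example.py", false)
        .expect("source should be valid Python");

    // Lint without re-lexing or re-parsing: `ParseSource::Precomputed`
    // borrows the tokens and AST instead of recomputing them, while
    // `ParseSource::None` preserves the old behavior.
    let result = lint_only(
        Path::new("example.py"),
        None,
        &LinterSettings::default(),
        flags::Noqa::Enabled,
        &SourceKind::Python(source.to_string()),
        PySourceType::Python,
        ParseSource::Precomputed {
            tokens: &tokens,
            ast: &ast,
        },
    );

    // Assert that the file contains no parse errors.
    assert!(result.error.is_none());
}
```

Because `TokenSource` and `AstSource` both implement `Deref` (to `[LexResult]` and `Suite`, respectively), `check_path` consumes pre-computed and freshly computed inputs through the same code path.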