perf(pycodestyle): Remove regex captures (#3735)

2023-03-28 09:50:34 +02:00 · 2023-03-28 09:50:34 +02:00 · 1d724b1495
parent 113a8b8fda
commit 1d724b1495
7 changed files with 154 additions and 55 deletions
--- a/crates/ruff/src/checkers/logical_lines.rs
+++ b/crates/ruff/src/checkers/logical_lines.rs
@ -43,6 +43,20 @@ pub fn check_logical_lines(
 ) -> Vec<Diagnostic> {
    let mut diagnostics = vec![];

+    #[cfg(feature = "logical_lines")]
+    let should_fix_missing_whitespace =
+        autofix.into() && settings.rules.should_fix(Rule::MissingWhitespace);
+
+    #[cfg(not(feature = "logical_lines"))]
+    let should_fix_missing_whitespace = false;
+
+    #[cfg(feature = "logical_lines")]
+    let should_fix_whitespace_before_parameters =
+        autofix.into() && settings.rules.should_fix(Rule::WhitespaceBeforeParameters);
+
+    #[cfg(not(feature = "logical_lines"))]
+    let should_fix_whitespace_before_parameters = false;
+
    let indent_char = stylist.indentation().as_char();
    let mut prev_line = None;
    let mut prev_indent_level = None;
@ -152,15 +166,12 @@ pub fn check_logical_lines(
                }
            }

-            #[cfg(feature = "logical_lines")]
-            let should_fix = autofix.into() && settings.rules.should_fix(Rule::MissingWhitespace);
-
-            #[cfg(not(feature = "logical_lines"))]
-            let should_fix = false;
-
-            for diagnostic in
-                missing_whitespace(line.text(), start_loc.row(), should_fix, indent_level)
-            {
+            for diagnostic in missing_whitespace(
+                line.text(),
+                start_loc.row(),
+                should_fix_missing_whitespace,
+                indent_level,
+            ) {
                if settings.rules.enabled(diagnostic.kind.rule()) {
                    diagnostics.push(diagnostic);
                }
@ -168,14 +179,9 @@ pub fn check_logical_lines(
        }

        if line.flags().contains(TokenFlags::BRACKET) {
-            #[cfg(feature = "logical_lines")]
-            let should_fix =
-                autofix.into() && settings.rules.should_fix(Rule::WhitespaceBeforeParameters);
-
-            #[cfg(not(feature = "logical_lines"))]
-            let should_fix = false;
-
-            for diagnostic in whitespace_before_parameters(line.tokens(), should_fix) {
+            for diagnostic in
+                whitespace_before_parameters(line.tokens(), should_fix_whitespace_before_parameters)
+            {
                if settings.rules.enabled(diagnostic.kind.rule()) {
                    diagnostics.push(diagnostic);
                }
--- a/crates/ruff/src/rules/pycodestyle/logical_lines.rs
+++ b/crates/ruff/src/rules/pycodestyle/logical_lines.rs
@ -45,7 +45,8 @@ impl<'a> LogicalLines<'a> {
        assert!(u32::try_from(tokens.len()).is_ok());

        let single_token = tokens.len() == 1;
-        let mut builder = LogicalLinesBuilder::with_token_capacity(tokens.len());
+        let mut builder =
+            LogicalLinesBuilder::with_capacity(tokens.len(), locator.contents().len());
        let mut parens: u32 = 0;

        for (start, token, end) in tokens.iter().flatten() {
@ -280,10 +281,11 @@ pub struct LogicalLinesBuilder<'a> {
 }

 impl<'a> LogicalLinesBuilder<'a> {
-    fn with_token_capacity(capacity: usize) -> Self {
+    fn with_capacity(tokens: usize, string: usize) -> Self { // `tokens`: number of tokens to reserve for; `string`: bytes to reserve for the line text
        Self {
-            tokens: Vec::with_capacity(capacity),
-            mappings: Mappings::with_capacity(capacity + 1),
+            tokens: Vec::with_capacity(tokens),
+            mappings: Mappings::with_capacity(tokens + 1), // one slot more than the token count; NOTE(review): the reason for the extra slot is not visible here
+            text: String::with_capacity(string), // reserve the text buffer up front; the caller in this change passes the full source length
            ..Self::default() // every remaining field keeps its default value
        }
    }
@ -340,6 +342,9 @@ impl<'a> LogicalLinesBuilder<'a> {

        // TODO(charlie): "Mute" strings.
        let text = if let Tok::String { value, .. } = token {
+            // Replace the content of strings with a non-whitespace sequence because some lints
+            // search for whitespace in the document and whitespace inside of the string
+            // would complicate the search.
            Cow::Owned(format!("\"{}\"", "x".repeat(value.width())))
        } else {
            Cow::Borrowed(locator.slice(Range {
--- a/crates/ruff/src/rules/pycodestyle/rules/extraneous_whitespace.rs
+++ b/crates/ruff/src/rules/pycodestyle/rules/extraneous_whitespace.rs
@ -103,17 +103,16 @@ impl Violation for WhitespaceBeforePunctuation {

 // TODO(charlie): Pycodestyle has a negative lookahead on the end.
 static EXTRANEOUS_WHITESPACE_REGEX: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"([\[({][ \t]|[ \t][]}),;:])").unwrap());
+    Lazy::new(|| Regex::new(r"[\[({][ \t]|[ \t][]}),;:]").unwrap());

 /// E201, E202, E203
 #[cfg(feature = "logical_lines")]
 pub fn extraneous_whitespace(line: &str) -> Vec<(usize, DiagnosticKind)> {
    let mut diagnostics = vec![];
-    for line_match in EXTRANEOUS_WHITESPACE_REGEX.captures_iter(line) {
-        let match_ = line_match.get(1).unwrap();
-        let text = match_.as_str();
+    for line_match in EXTRANEOUS_WHITESPACE_REGEX.find_iter(line) {
+        let text = &line[line_match.range()];
        let char = text.trim();
-        let found = match_.start();
+        let found = line_match.start();
        if text.chars().last().unwrap().is_ascii_whitespace() {
            diagnostics.push((found + 1, WhitespaceAfterOpenBracket.into()));
        } else if line.chars().nth(found - 1).map_or(false, |c| c != ',') {
--- a/crates/ruff/src/rules/pycodestyle/rules/mod.rs
+++ b/crates/ruff/src/rules/pycodestyle/rules/mod.rs
@ -86,3 +86,60 @@ mod whitespace_around_keywords;
 mod whitespace_around_named_parameter_equals;
 mod whitespace_before_comment;
 mod whitespace_before_parameters;
+
+#[allow(unused)] // the users visible in this change are compiled only with the `logical_lines` feature
+enum Whitespace { // kind of whitespace run found directly next to a token
+    None, // no whitespace at all
+    Single, // exactly one non-tab whitespace character
+    Many, // two or more whitespace characters, none of them a tab
+    Tab, // the run contains a tab
+}
+
+impl Whitespace {
+    #[allow(dead_code)]
+    fn leading(content: &str) -> (usize, Self) { // classifies the whitespace run at the START of `content`; returns (run length in bytes, kind)
+        let mut offset = 0;
+        let mut kind = Self::None;
+
+        for c in content.chars() {
+            if c == '\t' {
+                kind = Self::Tab; // a tab overrides whatever was seen so far
+                offset += 1; // '\t' is a single byte
+            } else if c.is_whitespace() {
+                kind = match kind {
+                    Whitespace::None => Whitespace::Single,
+                    Whitespace::Single | Whitespace::Many => Whitespace::Many,
+                    Whitespace::Tab => Whitespace::Tab, // sticky: later spaces never downgrade a tab
+                };
+                offset += c.len_utf8(); // byte length, so the result can be added to `&str` byte indices
+            } else {
+                break; // the first non-whitespace character ends the run
+            }
+        }
+
+        (offset, kind)
+    }
+
+    #[allow(dead_code)]
+    fn trailing(content: &str) -> (Self, usize) { // classifies the whitespace run at the END of `content`; returns (kind, bytes to step back from the end)
+        let mut count = 0u32; // non-tab whitespace characters seen so far
+        let mut offset = 0; // their total length in bytes
+
+        for c in content.chars().rev() {
+            if c == '\t' {
+                return (Self::Tab, offset + 1); // stops at the first tab met from the end: whitespace further left is NOT included in the offset
+            } else if c.is_whitespace() {
+                count += 1;
+                offset += c.len_utf8();
+            } else {
+                break;
+            }
+        }
+
+        match count { // reached only when the run contains no tab
+            0 => (Self::None, 0),
+            1 => (Self::Single, offset),
+            _ => (Self::Many, offset),
+        }
+    }
+}
--- a/crates/ruff/src/rules/pycodestyle/rules/space_around_operator.rs
+++ b/crates/ruff/src/rules/pycodestyle/rules/space_around_operator.rs
@ -2,10 +2,15 @@

 use once_cell::sync::Lazy;
 use regex::Regex;
+use rustpython_parser::ast::Location;
+use rustpython_parser::Tok;

+use crate::rules::pycodestyle::helpers::is_op_token;
+use crate::rules::pycodestyle::rules::Whitespace;
 use ruff_diagnostics::DiagnosticKind;
 use ruff_diagnostics::Violation;
 use ruff_macros::{derive_message_formats, violation};
+use ruff_python_ast::source_code::Locator;

 /// ## What it does
 /// Checks for extraneous tabs before an operator.
@ -123,28 +128,41 @@ impl Violation for MultipleSpacesAfterOperator {
    }
 }

-static OPERATOR_REGEX: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"[^,\s](\s*)(?:[-+*/|!<=>%&^]+|:=)(\s*)").unwrap());
+static OPERATOR_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"[-+*/|!<=>%&^]+|:=").unwrap());

 /// E221, E222, E223, E224
 #[cfg(feature = "logical_lines")]
 pub fn space_around_operator(line: &str) -> Vec<(usize, DiagnosticKind)> { // returns (byte offset in `line`, violation) pairs for tabs / multiple spaces around operators
    let mut diagnostics = vec![];
-    for line_match in OPERATOR_REGEX.captures_iter(line) {
-        let before = line_match.get(1).unwrap();
-        let after = line_match.get(2).unwrap();
+    let mut last_end = None; // byte offset just past the previous operator plus the whitespace that followed it

-        if before.as_str().contains('\t') {
-            diagnostics.push((before.start(), TabBeforeOperator.into()));
-        } else if before.as_str().len() > 1 {
-            diagnostics.push((before.start(), MultipleSpacesBeforeOperator.into()));
+    for line_match in OPERATOR_REGEX.find_iter(line) { // NOTE(review): unlike the removed pattern, the new one does not require a preceding character other than `,`/whitespace — confirm that is intended
+        if last_end != Some(line_match.start()) { // skip when the whitespace on the left was already examined as the "after" run of the previous operator
+            let before = &line[..line_match.start()]; // everything to the left of the operator
+
+            match Whitespace::trailing(before) {
+                (Whitespace::Tab, offset) => {
+                    diagnostics.push((line_match.start() - offset, TabBeforeOperator.into())); // reported at the position `offset` bytes left of the operator
+                }
+                (Whitespace::Many, offset) => diagnostics.push((
+                    line_match.start() - offset,
+                    MultipleSpacesBeforeOperator.into(),
+                )),
+                _ => {} // no whitespace or a single one: nothing to report
+            }
        }

-        if after.as_str().contains('\t') {
-            diagnostics.push((after.start(), TabAfterOperator.into()));
-        } else if after.as_str().len() > 1 {
-            diagnostics.push((after.start(), MultipleSpacesAfterOperator.into()));
+        let after = &line[line_match.end()..]; // everything to the right of the operator
+        let (leading_offset, leading_kind) = Whitespace::leading(after);
+        match leading_kind {
+            Whitespace::Tab => diagnostics.push((line_match.end(), TabAfterOperator.into())), // reported at the first byte after the operator
+            Whitespace::Many => {
+                diagnostics.push((line_match.end(), MultipleSpacesAfterOperator.into()));
+            }
+            _ => {}
        }
+
+        last_end = Some(line_match.end() + leading_offset); // remember where this operator's trailing whitespace ends
    }
    diagnostics
 }
--- a/crates/ruff/src/rules/pycodestyle/rules/whitespace_around_keywords.rs
+++ b/crates/ruff/src/rules/pycodestyle/rules/whitespace_around_keywords.rs
@ -3,6 +3,7 @@
 use once_cell::sync::Lazy;
 use regex::Regex;

+use crate::rules::pycodestyle::rules::Whitespace;
 use ruff_diagnostics::DiagnosticKind;
 use ruff_diagnostics::Violation;
 use ruff_macros::{derive_message_formats, violation};
@ -111,28 +112,41 @@ impl Violation for TabBeforeKeyword {
 }

 static KEYWORD_REGEX: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r"(\s*)\b(?:False|None|True|and|as|assert|async|await|break|class|continue|def|del|elif|else|except|finally|for|from|global|if|import|in|is|lambda|nonlocal|not|or|pass|raise|return|try|while|with|yield)\b(\s*)").unwrap()
+    Regex::new(r"\b(False|None|True|and|as|assert|async|await|break|class|continue|def|del|elif|else|except|finally|for|from|global|if|import|in|is|lambda|nonlocal|not|or|pass|raise|return|try|while|with|yield)\b").unwrap()
 });

 /// E271, E272, E273, E274
 #[cfg(feature = "logical_lines")]
 pub fn whitespace_around_keywords(line: &str) -> Vec<(usize, DiagnosticKind)> { // returns (byte offset in `line`, violation) pairs for tabs / multiple spaces around keywords
    let mut diagnostics = vec![];
-    for line_match in KEYWORD_REGEX.captures_iter(line) {
-        let before = line_match.get(1).unwrap();
-        let after = line_match.get(2).unwrap();
+    let mut last_end = None; // byte offset just past the previous keyword plus the whitespace that followed it

-        if before.as_str().contains('\t') {
-            diagnostics.push((before.start(), TabBeforeKeyword.into()));
-        } else if before.as_str().len() > 1 {
-            diagnostics.push((before.start(), MultipleSpacesBeforeKeyword.into()));
+    for line_match in KEYWORD_REGEX.find_iter(line) { // NOTE(review): the keyword alternation is still a capturing group although only whole-match positions are used
+        if last_end != Some(line_match.start()) { // skip when the whitespace on the left was already examined as the "after" run of the previous keyword
+            let before = &line[..line_match.start()]; // everything to the left of the keyword
+            match Whitespace::trailing(before) {
+                (Whitespace::Tab, offset) => {
+                    diagnostics.push((line_match.start() - offset, TabBeforeKeyword.into())); // position comes from `Whitespace::trailing`, which stops at the first tab met from the right
+                }
+                (Whitespace::Many, offset) => diagnostics.push((
+                    line_match.start() - offset,
+                    MultipleSpacesBeforeKeyword.into(),
+                )),
+                _ => {} // no whitespace or a single one: nothing to report
+            }
        }

-        if after.as_str().contains('\t') {
-            diagnostics.push((after.start(), TabAfterKeyword.into()));
-        } else if after.as_str().len() > 1 {
-            diagnostics.push((after.start(), MultipleSpacesAfterKeyword.into()));
+        let after = &line[line_match.end()..]; // everything to the right of the keyword
+        let (leading_offset, leading_kind) = Whitespace::leading(after);
+        match leading_kind {
+            Whitespace::Tab => diagnostics.push((line_match.end(), TabAfterKeyword.into())), // reported at the first byte after the keyword
+            Whitespace::Many => {
+                diagnostics.push((line_match.end(), MultipleSpacesAfterKeyword.into()));
+            }
+            _ => {}
        }
+
+        last_end = Some(line_match.end() + leading_offset); // remember where this keyword's trailing whitespace ends
    }
    diagnostics
 }
--- a/crates/ruff/src/rules/pycodestyle/snapshots/ruffrulespycodestyletestsE274_E27.py.snap
+++ b/crates/ruff/src/rules/pycodestyle/snapshots/ruffrulespycodestyletestsE274_E27.py.snap
@ -9,10 +9,10 @@ expression: diagnostics
    fixable: false
  location:
    row: 28
-    column: 1
+    column: 2
  end_location:
    row: 28
-    column: 1
+    column: 2
  fix:
    edits: []
  parent: ~
@ -23,10 +23,10 @@ expression: diagnostics
    fixable: false
  location:
    row: 30
-    column: 4
+    column: 5
  end_location:
    row: 30
-    column: 4
+    column: 5
  fix:
    edits: []
  parent: ~