diff --git a/Cargo.lock b/Cargo.lock index 50796dd688..bc11ece0d6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -28,9 +28,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "1.0.5" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c378d78423fdad8089616f827526ee33c19f2fddbd5de1629152c9593ba4783" +checksum = "ea5d730647d4fadd988536d06fecce94b7b4f2a7efdae548f1cf4b63205518ab" dependencies = [ "memchr", ] @@ -2168,6 +2168,7 @@ dependencies = [ name = "ruff_linter" version = "0.0.291" dependencies = [ + "aho-corasick", "annotate-snippets 0.9.1", "anyhow", "bitflags 2.4.0", diff --git a/crates/ruff_linter/Cargo.toml b/crates/ruff_linter/Cargo.toml index bea3b26f6b..80ffae6dc0 100644 --- a/crates/ruff_linter/Cargo.toml +++ b/crates/ruff_linter/Cargo.toml @@ -29,6 +29,7 @@ ruff_python_parser = { path = "../ruff_python_parser" } ruff_source_file = { path = "../ruff_source_file", features = ["serde"] } ruff_text_size = { path = "../ruff_text_size" } +aho-corasick = { version = "1.1.1" } annotate-snippets = { version = "0.9.1", features = ["color"] } anyhow = { workspace = true } bitflags = { workspace = true } diff --git a/crates/ruff_linter/src/rules/eradicate/detection.rs b/crates/ruff_linter/src/rules/eradicate/detection.rs index ac9c6e0e4c..9d4eef33ef 100644 --- a/crates/ruff_linter/src/rules/eradicate/detection.rs +++ b/crates/ruff_linter/src/rules/eradicate/detection.rs @@ -1,43 +1,48 @@ /// See: [eradicate.py](https://github.com/myint/eradicate/blob/98f199940979c94447a461d50d27862b118b282d/eradicate.py) +use aho_corasick::AhoCorasick; use once_cell::sync::Lazy; -use regex::Regex; +use regex::{Regex, RegexSet}; use ruff_python_parser::parse_suite; +static CODE_INDICATORS: Lazy = Lazy::new(|| { + AhoCorasick::new([ + "(", ")", "[", "]", "{", "}", ":", "=", "%", "print", "return", "break", "continue", + "import", + ]) + .unwrap() +}); + static ALLOWLIST_REGEX: Lazy = Lazy::new(|| { Regex::new( - r"^(?i)(?:pylint|pyright|noqa|nosec|region|endregion|type:\s*ignore|fmt:\s*(on|off)|isort:\s*(on|off|skip|skip_file|split|dont-add-imports(:\s*\[.*?])?)|mypy:|SPDX-License-Identifier:)", + r"^(?i)(?:pylint|pyright|noqa|nosec|region|endregion|type:\s*ignore|fmt:\s*(on|off)|isort:\s*(on|off|skip|skip_file|split|dont-add-imports(:\s*\[.*?])?)|mypy:|SPDX-License-Identifier:|(?:en)?coding[:=][ \t]*([-_.a-zA-Z0-9]+))", ).unwrap() }); -static BRACKET_REGEX: Lazy = Lazy::new(|| Regex::new(r"^[()\[\]{}\s]+$").unwrap()); -static CODE_INDICATORS: &[&str] = &[ - "(", ")", "[", "]", "{", "}", ":", "=", "%", "print", "return", "break", "continue", "import", -]; -static CODE_KEYWORDS: Lazy> = Lazy::new(|| { - vec![ - Regex::new(r"^\s*elif\s+.*\s*:\s*$").unwrap(), - Regex::new(r"^\s*else\s*:\s*$").unwrap(), - Regex::new(r"^\s*try\s*:\s*$").unwrap(), - Regex::new(r"^\s*finally\s*:\s*$").unwrap(), - Regex::new(r"^\s*except\s+.*\s*:\s*$").unwrap(), - ] -}); -static CODING_COMMENT_REGEX: Lazy = - Lazy::new(|| Regex::new(r"^.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)").unwrap()); + static HASH_NUMBER: Lazy = Lazy::new(|| Regex::new(r"#\d").unwrap()); -static MULTILINE_ASSIGNMENT_REGEX: Lazy = - Lazy::new(|| Regex::new(r"^\s*([(\[]\s*)?(\w+\s*,\s*)*\w+\s*([)\]]\s*)?=.*[(\[{]$").unwrap()); -static PARTIAL_DICTIONARY_REGEX: Lazy = - Lazy::new(|| Regex::new(r#"^\s*['"]\w+['"]\s*:.+[,{]\s*(#.*)?$"#).unwrap()); -static PRINT_RETURN_REGEX: Lazy = Lazy::new(|| Regex::new(r"^(print|return)\b\s*").unwrap()); + +static POSITIVE_CASES: Lazy = Lazy::new(|| { + RegexSet::new([ + // Keywords + r"^(?:elif\s+.*\s*:|else\s*:|try\s*:|finally\s*:|except\s+.*\s*:)$", + // Partial dictionary + r#"^['"]\w+['"]\s*:.+[,{]\s*(#.*)?$"#, + // Multiline assignment + r"^(?:[(\[]\s*)?(?:\w+\s*,\s*)*\w+\s*([)\]]\s*)?=.*[(\[{]$", + // Brackets, + r"^[()\[\]{}\s]+$", + ]) + .unwrap() +}); /// Returns `true` if a comment contains Python code. pub(crate) fn comment_contains_code(line: &str, task_tags: &[String]) -> bool { - let line = if let Some(line) = line.trim().strip_prefix('#') { - line.trim_start_matches([' ', '#']) - } else { + let line = line.trim_start_matches([' ', '#']).trim_end(); + + // Fast path: if none of the indicators are present, the line is not code. + if !CODE_INDICATORS.is_match(line) { return false; - }; + } // Ignore task tag comments (e.g., "# TODO(tom): Refactor"). if line @@ -48,65 +53,36 @@ pub(crate) fn comment_contains_code(line: &str, task_tags: &[String]) -> bool { return false; } - // Ignore non-comment related hashes (e.g., "# Issue #999"). - if HASH_NUMBER.is_match(line) { - return false; - } - // Ignore whitelisted comments. if ALLOWLIST_REGEX.is_match(line) { return false; } - if CODING_COMMENT_REGEX.is_match(line) { + // Ignore non-comment related hashes (e.g., "# Issue #999"). + if HASH_NUMBER.is_match(line) { return false; } - // Check that this is possibly code. - if CODE_INDICATORS.iter().all(|symbol| !line.contains(symbol)) { - return false; - } - - if multiline_case(line) { - return true; - } - - if CODE_KEYWORDS.iter().any(|symbol| symbol.is_match(line)) { - return true; - } - - let line = PRINT_RETURN_REGEX.replace_all(line, ""); - - if PARTIAL_DICTIONARY_REGEX.is_match(&line) { - return true; - } - - // Finally, compile the source code. - parse_suite(&line, "").is_ok() -} - -/// Returns `true` if a line is probably part of some multiline code. -fn multiline_case(line: &str) -> bool { + // If the comment made it this far, and ends in a continuation, assume it's code. if line.ends_with('\\') { return true; } - if MULTILINE_ASSIGNMENT_REGEX.is_match(line) { + // If the comment matches any of the specified positive cases, assume it's code. + if POSITIVE_CASES.is_match(line) { return true; } - if BRACKET_REGEX.is_match(line) { - return true; - } - - false + // Finally, compile the source code. + parse_suite(line, "").is_ok() } #[cfg(test)] mod tests { - use super::comment_contains_code; use crate::settings::TASK_TAGS; + use super::comment_contains_code; + #[test] fn comment_contains_code_basic() { assert!(comment_contains_code("# x = 1", &[])); @@ -127,7 +103,6 @@ mod tests { assert!(!comment_contains_code("# 123", &[])); assert!(!comment_contains_code("# 123.1", &[])); assert!(!comment_contains_code("# 1, 2, 3", &[])); - assert!(!comment_contains_code("x = 1 # x = 1", &[])); assert!(!comment_contains_code( "# pylint: disable=redefined-outer-name", &[] @@ -150,7 +125,6 @@ mod tests { fn comment_contains_code_with_print() { assert!(comment_contains_code("#print", &[])); assert!(comment_contains_code("#print(1)", &[])); - assert!(comment_contains_code("#print 1", &[])); assert!(!comment_contains_code("#to print", &[])); }