mirror of https://github.com/astral-sh/ruff
Improve performance of `commented-out-code` (~50-80%) (#7706)
## Summary
This PR implements a variety of optimizations to improve performance of
the Eradicate rule, which always shows up in all-rules benchmarks and
bothers me. (These improvements are not hugely important, but it was
kind of a fun Friday thing to spent a bit of time on.)
The improvements include:
- Doing cheaper work first (checking for some explicit substrings
upfront).
- Using `aho-corasick` to speed an exact substring search.
- Merging multiple regular expressions using a `RegexSet`.
- Removing some unnecessary `\s*` and other pieces from the regular
expressions (since we already trim strings before matching on them).
## Test Plan
I benchmarked this function in a standalone crate using a variety of
cases. Criterion reports that this version is up to 80% faster, and
almost every case is at least 50% faster:
```
Eradicate/Detection/# Warn if we are installing over top of an existing installation. This can
time: [101.84 ns 102.32 ns 102.82 ns]
change: [-77.166% -77.062% -76.943%] (p = 0.00 < 0.05)
Performance has improved.
Found 3 outliers among 100 measurements (3.00%)
3 (3.00%) high mild
Eradicate/Detection/#from foo import eradicate
time: [74.872 ns 75.096 ns 75.314 ns]
change: [-84.180% -84.131% -84.079%] (p = 0.00 < 0.05)
Performance has improved.
Found 1 outliers among 100 measurements (1.00%)
1 (1.00%) high mild
Eradicate/Detection/# encoding: utf8
time: [46.522 ns 46.862 ns 47.237 ns]
change: [-29.408% -28.918% -28.471%] (p = 0.00 < 0.05)
Performance has improved.
Found 7 outliers among 100 measurements (7.00%)
6 (6.00%) high mild
1 (1.00%) high severe
Eradicate/Detection/# Issue #999
time: [16.942 ns 16.994 ns 17.058 ns]
change: [-57.243% -57.064% -56.815%] (p = 0.00 < 0.05)
Performance has improved.
Found 3 outliers among 100 measurements (3.00%)
2 (2.00%) high mild
1 (1.00%) high severe
Eradicate/Detection/# type: ignore
time: [43.074 ns 43.163 ns 43.262 ns]
change: [-17.614% -17.390% -17.152%] (p = 0.00 < 0.05)
Performance has improved.
Found 5 outliers among 100 measurements (5.00%)
3 (3.00%) high mild
2 (2.00%) high severe
Eradicate/Detection/# user_content_type, _ = TimelineEvent.objects.using(db_alias).get_or_create(
time: [209.40 ns 209.81 ns 210.23 ns]
change: [-32.806% -32.630% -32.470%] (p = 0.00 < 0.05)
Performance has improved.
Eradicate/Detection/# this is = to that :(
time: [72.659 ns 73.068 ns 73.473 ns]
change: [-68.884% -68.775% -68.655%] (p = 0.00 < 0.05)
Performance has improved.
Found 9 outliers among 100 measurements (9.00%)
7 (7.00%) high mild
2 (2.00%) high severe
Eradicate/Detection/#except Exception:
time: [92.063 ns 92.366 ns 92.691 ns]
change: [-64.204% -64.052% -63.909%] (p = 0.00 < 0.05)
Performance has improved.
Found 4 outliers among 100 measurements (4.00%)
2 (2.00%) high mild
2 (2.00%) high severe
Eradicate/Detection/#print(1)
time: [68.359 ns 68.537 ns 68.725 ns]
change: [-72.424% -72.356% -72.278%] (p = 0.00 < 0.05)
Performance has improved.
Found 2 outliers among 100 measurements (2.00%)
1 (1.00%) low mild
1 (1.00%) high mild
Eradicate/Detection/#'key': 1 + 1,
time: [79.604 ns 79.865 ns 80.135 ns]
change: [-69.787% -69.667% -69.549%] (p = 0.00 < 0.05)
Performance has improved.
```
This commit is contained in:
parent
e9f8b91eb5
commit
8c8988ea40
|
|
@ -28,9 +28,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.0.5"
|
||||
version = "1.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c378d78423fdad8089616f827526ee33c19f2fddbd5de1629152c9593ba4783"
|
||||
checksum = "ea5d730647d4fadd988536d06fecce94b7b4f2a7efdae548f1cf4b63205518ab"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
|
@ -2168,6 +2168,7 @@ dependencies = [
|
|||
name = "ruff_linter"
|
||||
version = "0.0.291"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"annotate-snippets 0.9.1",
|
||||
"anyhow",
|
||||
"bitflags 2.4.0",
|
||||
|
|
|
|||
|
|
@ -29,6 +29,7 @@ ruff_python_parser = { path = "../ruff_python_parser" }
|
|||
ruff_source_file = { path = "../ruff_source_file", features = ["serde"] }
|
||||
ruff_text_size = { path = "../ruff_text_size" }
|
||||
|
||||
aho-corasick = { version = "1.1.1" }
|
||||
annotate-snippets = { version = "0.9.1", features = ["color"] }
|
||||
anyhow = { workspace = true }
|
||||
bitflags = { workspace = true }
|
||||
|
|
|
|||
|
|
@ -1,43 +1,48 @@
|
|||
/// See: [eradicate.py](https://github.com/myint/eradicate/blob/98f199940979c94447a461d50d27862b118b282d/eradicate.py)
|
||||
use aho_corasick::AhoCorasick;
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
use regex::{Regex, RegexSet};
|
||||
|
||||
use ruff_python_parser::parse_suite;
|
||||
|
||||
static CODE_INDICATORS: Lazy<AhoCorasick> = Lazy::new(|| {
|
||||
AhoCorasick::new([
|
||||
"(", ")", "[", "]", "{", "}", ":", "=", "%", "print", "return", "break", "continue",
|
||||
"import",
|
||||
])
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
static ALLOWLIST_REGEX: Lazy<Regex> = Lazy::new(|| {
|
||||
Regex::new(
|
||||
r"^(?i)(?:pylint|pyright|noqa|nosec|region|endregion|type:\s*ignore|fmt:\s*(on|off)|isort:\s*(on|off|skip|skip_file|split|dont-add-imports(:\s*\[.*?])?)|mypy:|SPDX-License-Identifier:)",
|
||||
r"^(?i)(?:pylint|pyright|noqa|nosec|region|endregion|type:\s*ignore|fmt:\s*(on|off)|isort:\s*(on|off|skip|skip_file|split|dont-add-imports(:\s*\[.*?])?)|mypy:|SPDX-License-Identifier:|(?:en)?coding[:=][ \t]*([-_.a-zA-Z0-9]+))",
|
||||
).unwrap()
|
||||
});
|
||||
static BRACKET_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[()\[\]{}\s]+$").unwrap());
|
||||
static CODE_INDICATORS: &[&str] = &[
|
||||
"(", ")", "[", "]", "{", "}", ":", "=", "%", "print", "return", "break", "continue", "import",
|
||||
];
|
||||
static CODE_KEYWORDS: Lazy<Vec<Regex>> = Lazy::new(|| {
|
||||
vec![
|
||||
Regex::new(r"^\s*elif\s+.*\s*:\s*$").unwrap(),
|
||||
Regex::new(r"^\s*else\s*:\s*$").unwrap(),
|
||||
Regex::new(r"^\s*try\s*:\s*$").unwrap(),
|
||||
Regex::new(r"^\s*finally\s*:\s*$").unwrap(),
|
||||
Regex::new(r"^\s*except\s+.*\s*:\s*$").unwrap(),
|
||||
]
|
||||
});
|
||||
static CODING_COMMENT_REGEX: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r"^.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)").unwrap());
|
||||
|
||||
static HASH_NUMBER: Lazy<Regex> = Lazy::new(|| Regex::new(r"#\d").unwrap());
|
||||
static MULTILINE_ASSIGNMENT_REGEX: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r"^\s*([(\[]\s*)?(\w+\s*,\s*)*\w+\s*([)\]]\s*)?=.*[(\[{]$").unwrap());
|
||||
static PARTIAL_DICTIONARY_REGEX: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r#"^\s*['"]\w+['"]\s*:.+[,{]\s*(#.*)?$"#).unwrap());
|
||||
static PRINT_RETURN_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"^(print|return)\b\s*").unwrap());
|
||||
|
||||
static POSITIVE_CASES: Lazy<RegexSet> = Lazy::new(|| {
|
||||
RegexSet::new([
|
||||
// Keywords
|
||||
r"^(?:elif\s+.*\s*:|else\s*:|try\s*:|finally\s*:|except\s+.*\s*:)$",
|
||||
// Partial dictionary
|
||||
r#"^['"]\w+['"]\s*:.+[,{]\s*(#.*)?$"#,
|
||||
// Multiline assignment
|
||||
r"^(?:[(\[]\s*)?(?:\w+\s*,\s*)*\w+\s*([)\]]\s*)?=.*[(\[{]$",
|
||||
// Brackets,
|
||||
r"^[()\[\]{}\s]+$",
|
||||
])
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
/// Returns `true` if a comment contains Python code.
|
||||
pub(crate) fn comment_contains_code(line: &str, task_tags: &[String]) -> bool {
|
||||
let line = if let Some(line) = line.trim().strip_prefix('#') {
|
||||
line.trim_start_matches([' ', '#'])
|
||||
} else {
|
||||
let line = line.trim_start_matches([' ', '#']).trim_end();
|
||||
|
||||
// Fast path: if none of the indicators are present, the line is not code.
|
||||
if !CODE_INDICATORS.is_match(line) {
|
||||
return false;
|
||||
};
|
||||
}
|
||||
|
||||
// Ignore task tag comments (e.g., "# TODO(tom): Refactor").
|
||||
if line
|
||||
|
|
@ -48,65 +53,36 @@ pub(crate) fn comment_contains_code(line: &str, task_tags: &[String]) -> bool {
|
|||
return false;
|
||||
}
|
||||
|
||||
// Ignore non-comment related hashes (e.g., "# Issue #999").
|
||||
if HASH_NUMBER.is_match(line) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Ignore whitelisted comments.
|
||||
if ALLOWLIST_REGEX.is_match(line) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if CODING_COMMENT_REGEX.is_match(line) {
|
||||
// Ignore non-comment related hashes (e.g., "# Issue #999").
|
||||
if HASH_NUMBER.is_match(line) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check that this is possibly code.
|
||||
if CODE_INDICATORS.iter().all(|symbol| !line.contains(symbol)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if multiline_case(line) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if CODE_KEYWORDS.iter().any(|symbol| symbol.is_match(line)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
let line = PRINT_RETURN_REGEX.replace_all(line, "");
|
||||
|
||||
if PARTIAL_DICTIONARY_REGEX.is_match(&line) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Finally, compile the source code.
|
||||
parse_suite(&line, "<filename>").is_ok()
|
||||
}
|
||||
|
||||
/// Returns `true` if a line is probably part of some multiline code.
|
||||
fn multiline_case(line: &str) -> bool {
|
||||
// If the comment made it this far, and ends in a continuation, assume it's code.
|
||||
if line.ends_with('\\') {
|
||||
return true;
|
||||
}
|
||||
|
||||
if MULTILINE_ASSIGNMENT_REGEX.is_match(line) {
|
||||
// If the comment matches any of the specified positive cases, assume it's code.
|
||||
if POSITIVE_CASES.is_match(line) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if BRACKET_REGEX.is_match(line) {
|
||||
return true;
|
||||
}
|
||||
|
||||
false
|
||||
// Finally, compile the source code.
|
||||
parse_suite(line, "<filename>").is_ok()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::comment_contains_code;
|
||||
use crate::settings::TASK_TAGS;
|
||||
|
||||
use super::comment_contains_code;
|
||||
|
||||
#[test]
|
||||
fn comment_contains_code_basic() {
|
||||
assert!(comment_contains_code("# x = 1", &[]));
|
||||
|
|
@ -127,7 +103,6 @@ mod tests {
|
|||
assert!(!comment_contains_code("# 123", &[]));
|
||||
assert!(!comment_contains_code("# 123.1", &[]));
|
||||
assert!(!comment_contains_code("# 1, 2, 3", &[]));
|
||||
assert!(!comment_contains_code("x = 1 # x = 1", &[]));
|
||||
assert!(!comment_contains_code(
|
||||
"# pylint: disable=redefined-outer-name",
|
||||
&[]
|
||||
|
|
@ -150,7 +125,6 @@ mod tests {
|
|||
fn comment_contains_code_with_print() {
|
||||
assert!(comment_contains_code("#print", &[]));
|
||||
assert!(comment_contains_code("#print(1)", &[]));
|
||||
assert!(comment_contains_code("#print 1", &[]));
|
||||
|
||||
assert!(!comment_contains_code("#to print", &[]));
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue