Use `memchr` for tab-indentation detection (#9853)

## Summary

The benchmarks show a fairly consistent 1% speedup on the all-rules benchmark,
though that is, of course, not enough to trigger our threshold:

![Screenshot 2024-02-05 at 11 55 59 PM](https://github.com/astral-sh/ruff/assets/1309177/317dca3f-f25f-46f5-8ea8-894a1747d006)
This commit is contained in:
Charlie Marsh 2024-02-06 06:44:56 -08:00 committed by GitHub
parent a662c2447c
commit c34908f5ad
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 44 additions and 39 deletions

View File

@ -95,7 +95,7 @@ pub(crate) fn check_tokens(
} }
if settings.rules.enabled(Rule::TabIndentation) { if settings.rules.enabled(Rule::TabIndentation) {
pycodestyle::rules::tab_indentation(&mut diagnostics, tokens, locator, indexer); pycodestyle::rules::tab_indentation(&mut diagnostics, locator, indexer);
} }
if settings.rules.any_enabled(&[ if settings.rules.any_enabled(&[

View File

@ -1,11 +1,8 @@
use ruff_diagnostics::{Diagnostic, Violation}; use ruff_diagnostics::{Diagnostic, Violation};
use ruff_macros::{derive_message_formats, violation}; use ruff_macros::{derive_message_formats, violation};
use ruff_python_index::Indexer; use ruff_python_index::Indexer;
use ruff_python_parser::lexer::LexResult;
use ruff_python_parser::Tok;
use ruff_python_trivia::leading_indentation;
use ruff_source_file::Locator; use ruff_source_file::Locator;
use ruff_text_size::{TextLen, TextRange, TextSize}; use ruff_text_size::{TextRange, TextSize};
/// ## What it does /// ## What it does
/// Checks for indentation that uses tabs. /// Checks for indentation that uses tabs.
@ -48,44 +45,52 @@ impl Violation for TabIndentation {
/// W191 /// W191
pub(crate) fn tab_indentation( pub(crate) fn tab_indentation(
diagnostics: &mut Vec<Diagnostic>, diagnostics: &mut Vec<Diagnostic>,
tokens: &[LexResult],
locator: &Locator, locator: &Locator,
indexer: &Indexer, indexer: &Indexer,
) { ) {
// Always check the first line for tab indentation as there's no newline let contents = locator.contents().as_bytes();
// token before it. let mut offset = 0;
tab_indentation_at_line_start(diagnostics, locator, TextSize::default()); while let Some(index) = memchr::memchr(b'\t', &contents[offset..]) {
// If we find a tab in the file, grab the entire line.
let range = locator.full_line_range(TextSize::try_from(offset + index).unwrap());
for (tok, range) in tokens.iter().flatten() { // Determine whether the tab is part of the line's indentation.
if matches!(tok, Tok::Newline | Tok::NonLogicalNewline) { if let Some(indent) = tab_indentation_at_line_start(range.start(), locator, indexer) {
tab_indentation_at_line_start(diagnostics, locator, range.end()); diagnostics.push(Diagnostic::new(TabIndentation, indent));
}
} }
// The lexer doesn't emit `Newline` / `NonLogicalNewline` for a line // Advance to the next line.
// continuation character (`\`), so we need to manually check for tab offset = range.end().to_usize();
// indentation for lines that follow a line continuation character.
for continuation_line in indexer.continuation_line_starts() {
tab_indentation_at_line_start(
diagnostics,
locator,
locator.full_line_end(*continuation_line),
);
} }
} }
/// Checks for indentation that uses tabs for a line starting at /// If a line includes tabs in its indentation, returns the range of the
/// the given [`TextSize`]. /// indent.
fn tab_indentation_at_line_start( fn tab_indentation_at_line_start(
diagnostics: &mut Vec<Diagnostic>,
locator: &Locator,
line_start: TextSize, line_start: TextSize,
) { locator: &Locator,
let indent = leading_indentation(locator.after(line_start)); indexer: &Indexer,
if indent.find('\t').is_some() { ) -> Option<TextRange> {
diagnostics.push(Diagnostic::new( let mut contains_tab = false;
TabIndentation, for (i, char) in locator.after(line_start).as_bytes().iter().enumerate() {
TextRange::at(line_start, indent.text_len()), match char {
)); // If we find a tab character, report it as a violation.
b'\t' => {
contains_tab = true;
} }
// If we find a space, continue.
b' ' | b'\x0C' => {}
// If we find a non-whitespace character, stop.
_ => {
if contains_tab {
let range = TextRange::at(line_start, TextSize::try_from(i).unwrap());
if !indexer.multiline_ranges().contains_range(range) {
return Some(range);
}
}
break;
}
}
}
None
} }