From e04ef423344837c916e75a7b09ea674711a104e0 Mon Sep 17 00:00:00 2001 From: Micha Reiser Date: Wed, 26 Apr 2023 21:15:47 +0200 Subject: [PATCH] Use `memchr` to speedup newline search on x86 (#3985) --- Cargo.lock | 1 + crates/ruff/src/importer.rs | 3 +- crates/ruff/src/message/gitlab.rs | 19 ++- crates/ruff/src/noqa.rs | 6 +- crates/ruff_python_ast/Cargo.toml | 1 + crates/ruff_python_ast/src/newlines.rs | 117 ++++++++++++++---- .../src/source_code/generator.rs | 6 +- .../src/source_code/locator.rs | 22 ++-- crates/ruff_python_ast/src/source_code/mod.rs | 3 +- .../src/source_code/stylist.rs | 83 ++++--------- 10 files changed, 147 insertions(+), 114 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b51a3c2fa6..43631701f4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2204,6 +2204,7 @@ dependencies = [ "is-macro", "itertools", "log", + "memchr", "num-bigint", "num-traits", "once_cell", diff --git a/crates/ruff/src/importer.rs b/crates/ruff/src/importer.rs index 3af7d1bbe9..db929307a1 100644 --- a/crates/ruff/src/importer.rs +++ b/crates/ruff/src/importer.rs @@ -234,11 +234,12 @@ fn top_of_file_insertion(body: &[Stmt], locator: &Locator, stylist: &Stylist) -> #[cfg(test)] mod tests { use anyhow::Result; + use ruff_python_ast::newlines::LineEnding; use ruff_text_size::TextSize; use rustpython_parser as parser; use rustpython_parser::lexer::LexResult; - use ruff_python_ast::source_code::{LineEnding, Locator, Stylist}; + use ruff_python_ast::source_code::{Locator, Stylist}; use crate::importer::{top_of_file_insertion, Insertion}; diff --git a/crates/ruff/src/message/gitlab.rs b/crates/ruff/src/message/gitlab.rs index b699884b4b..355bdf0dbd 100644 --- a/crates/ruff/src/message/gitlab.rs +++ b/crates/ruff/src/message/gitlab.rs @@ -1,6 +1,7 @@ use crate::fs::{relativize_path, relativize_path_to}; use crate::message::{Emitter, EmitterContext, Message}; use crate::registry::AsRule; +use ruff_python_ast::source_code::SourceLocation; use serde::ser::SerializeSeq; use serde::{Serialize, Serializer}; use serde_json::json; @@ -56,6 +57,9 @@ impl Serialize for SerializedMessages<'_> { let mut s = serializer.serialize_seq(Some(self.messages.len()))?; for message in self.messages { + let start_location = message.compute_start_location(); + let end_location = message.compute_end_location(); + let lines = if self.context.is_jupyter_notebook(message.filename()) { // We can't give a reasonable location for the structured formats, // so we show one that's clearly a fallback @@ -64,8 +68,6 @@ impl Serialize for SerializedMessages<'_> { "end": 1 }) } else { - let start_location = message.compute_start_location(); - let end_location = message.compute_end_location(); json!({ "begin": start_location.row, "end": end_location.row @@ -80,7 +82,7 @@ impl Serialize for SerializedMessages<'_> { let value = json!({ "description": format!("({}) {}", message.kind.rule().noqa_code(), message.kind.body), "severity": "major", - "fingerprint": fingerprint(message), + "fingerprint": fingerprint(message, &start_location, &end_location), "location": { "path": path, "lines": lines @@ -95,10 +97,14 @@ impl Serialize for SerializedMessages<'_> { } /// Generate a unique fingerprint to identify a violation. -fn fingerprint(message: &Message) -> String { +fn fingerprint( + message: &Message, + start_location: &SourceLocation, + end_location: &SourceLocation, +) -> String { let Message { kind, - range, + range: _, fix: _fix, file, noqa_offset: _, @@ -107,7 +113,8 @@ fn fingerprint(message: &Message) -> String { let mut hasher = DefaultHasher::new(); kind.rule().hash(&mut hasher); - range.hash(&mut hasher); + start_location.hash(&mut hasher); + end_location.hash(&mut hasher); file.name().hash(&mut hasher); format!("{:x}", hasher.finish()) diff --git a/crates/ruff/src/noqa.rs b/crates/ruff/src/noqa.rs index 9bab41bf9d..4434837f01 100644 --- a/crates/ruff/src/noqa.rs +++ b/crates/ruff/src/noqa.rs @@ -11,7 +11,8 @@ use regex::Regex; use ruff_text_size::{TextLen, TextRange, TextSize}; use ruff_diagnostics::Diagnostic; -use ruff_python_ast::source_code::{LineEnding, Locator}; +use ruff_python_ast::newlines::LineEnding; +use ruff_python_ast::source_code::Locator; use crate::codes::NoqaCode; use crate::registry::{AsRule, Rule, RuleSet}; @@ -511,7 +512,8 @@ mod tests { use ruff_text_size::{TextRange, TextSize}; use ruff_diagnostics::Diagnostic; - use ruff_python_ast::source_code::{LineEnding, Locator}; + use ruff_python_ast::newlines::LineEnding; + use ruff_python_ast::source_code::Locator; use crate::noqa::{add_noqa_inner, NoqaMapping, NOQA_LINE_REGEX}; use crate::rules::pycodestyle::rules::AmbiguousVariableName; diff --git a/crates/ruff_python_ast/Cargo.toml b/crates/ruff_python_ast/Cargo.toml index d20db48eed..1c38445bed 100644 --- a/crates/ruff_python_ast/Cargo.toml +++ b/crates/ruff_python_ast/Cargo.toml @@ -16,6 +16,7 @@ bitflags = { workspace = true } is-macro = { workspace = true } itertools = { workspace = true } log = { workspace = true } +memchr = "2.5.0" num-bigint = { version = "0.4.3" } num-traits = { version = "0.2.15" } once_cell = { workspace = true } diff --git a/crates/ruff_python_ast/src/newlines.rs b/crates/ruff_python_ast/src/newlines.rs index 54edacaaa7..5e3b97f85a 100644 --- a/crates/ruff_python_ast/src/newlines.rs +++ b/crates/ruff_python_ast/src/newlines.rs @@ -1,3 +1,4 @@ +use memchr::{memchr2, memrchr2}; use ruff_text_size::{TextLen, TextRange, TextSize}; use std::iter::FusedIterator; use std::ops::Deref; @@ -50,6 +51,30 @@ impl<'a> UniversalNewlineIterator<'a> { } } +/// Finds the next newline character. Returns its position and the [`LineEnding`]. +#[inline] +pub fn find_newline(text: &str) -> Option<(usize, LineEnding)> { + let bytes = text.as_bytes(); + if let Some(position) = memchr2(b'\n', b'\r', bytes) { + // SAFETY: memchr guarantees to return valid positions + #[allow(unsafe_code)] + let newline_character = unsafe { *bytes.get_unchecked(position) }; + + let line_ending = match newline_character { + // Explicit branch for `\n` as this is the most likely path + b'\n' => LineEnding::Lf, + // '\r\n' + b'\r' if bytes.get(position.saturating_add(1)) == Some(&b'\n') => LineEnding::CrLf, + // '\r' + _ => LineEnding::Cr, + }; + + Some((position, line_ending)) + } else { + None + } +} + impl<'a> Iterator for UniversalNewlineIterator<'a> { type Item = Line<'a>; @@ -59,35 +84,25 @@ impl<'a> Iterator for UniversalNewlineIterator<'a> { return None; } - let line = match self.text.find(['\n', '\r']) { - // Non-last line - Some(line_end) => { - let offset: usize = match self.text.as_bytes()[line_end] { - // Explicit branch for `\n` as this is the most likely path - b'\n' => 1, - // '\r\n' - b'\r' if self.text.as_bytes().get(line_end + 1) == Some(&b'\n') => 2, - // '\r' - _ => 1, - }; + let line = if let Some((newline_position, line_ending)) = find_newline(self.text) { + let (text, remainder) = self.text.split_at(newline_position + line_ending.len()); - let (text, remainder) = self.text.split_at(line_end + offset); + let line = Line { + offset: self.offset, + text, + }; - let line = Line { - offset: self.offset, - text, - }; + self.text = remainder; + self.offset += text.text_len(); - self.text = remainder; - self.offset += text.text_len(); - - line - } - // Last line - None => Line { + line + } + // Last line + else { + Line { offset: self.offset, text: std::mem::take(&mut self.text), - }, + } }; Some(line) @@ -116,7 +131,7 @@ impl DoubleEndedIterator for UniversalNewlineIterator<'_> { // Find the end of the previous line. The previous line is the text up to, but not including // the newline character. - let line = if let Some(line_end) = haystack.rfind(['\n', '\r']) { + let line = if let Some(line_end) = memrchr2(b'\n', b'\r', haystack.as_bytes()) { // '\n' or '\r' or '\r\n' let (remainder, line) = self.text.split_at(line_end + 1); self.text = remainder; @@ -268,6 +283,58 @@ impl PartialEq> for &str { } } +/// The line ending style used in Python source code. +/// See +#[derive(Debug, PartialEq, Eq, Copy, Clone)] +pub enum LineEnding { + Lf, + Cr, + CrLf, +} + +impl Default for LineEnding { + fn default() -> Self { + if cfg!(windows) { + LineEnding::CrLf + } else { + LineEnding::Lf + } + } +} + +impl LineEnding { + pub const fn as_str(&self) -> &'static str { + match self { + LineEnding::Lf => "\n", + LineEnding::CrLf => "\r\n", + LineEnding::Cr => "\r", + } + } + + #[allow(clippy::len_without_is_empty)] + pub const fn len(&self) -> usize { + match self { + LineEnding::Lf | LineEnding::Cr => 1, + LineEnding::CrLf => 2, + } + } + + pub const fn text_len(&self) -> TextSize { + match self { + LineEnding::Lf | LineEnding::Cr => TextSize::new(1), + LineEnding::CrLf => TextSize::new(2), + } + } +} + +impl Deref for LineEnding { + type Target = str; + + fn deref(&self) -> &Self::Target { + self.as_str() + } +} + #[cfg(test)] mod tests { use super::UniversalNewlineIterator; diff --git a/crates/ruff_python_ast/src/source_code/generator.rs b/crates/ruff_python_ast/src/source_code/generator.rs index 600d2f41c9..f1e4bf7f84 100644 --- a/crates/ruff_python_ast/src/source_code/generator.rs +++ b/crates/ruff_python_ast/src/source_code/generator.rs @@ -9,9 +9,10 @@ use rustpython_parser::ast::{ }; use rustpython_parser::ConversionFlag; +use crate::newlines::LineEnding; use ruff_rustpython::vendor::{bytes, str}; -use crate::source_code::stylist::{Indentation, LineEnding, Quote, Stylist}; +use crate::source_code::stylist::{Indentation, Quote, Stylist}; mod precedence { pub const ASSIGN: u8 = 3; @@ -1256,9 +1257,10 @@ impl<'a> Generator<'a> { #[cfg(test)] mod tests { + use crate::newlines::LineEnding; use rustpython_parser as parser; - use crate::source_code::stylist::{Indentation, LineEnding, Quote}; + use crate::source_code::stylist::{Indentation, Quote}; use crate::source_code::Generator; fn round_trip(contents: &str) -> String { diff --git a/crates/ruff_python_ast/src/source_code/locator.rs b/crates/ruff_python_ast/src/source_code/locator.rs index 24932d6e4e..6a56e75584 100644 --- a/crates/ruff_python_ast/src/source_code/locator.rs +++ b/crates/ruff_python_ast/src/source_code/locator.rs @@ -1,6 +1,8 @@ //! Struct used to efficiently slice source code at (row, column) Locations. +use crate::newlines::find_newline; use crate::source_code::{LineIndex, OneIndexed, SourceCode, SourceLocation}; +use memchr::{memchr2, memrchr2}; use once_cell::unsync::OnceCell; use ruff_text_size::{TextLen, TextRange, TextSize}; use std::ops::Add; @@ -68,7 +70,8 @@ impl<'a> Locator<'a> { /// ## Panics /// If `offset` is out of bounds. pub fn line_start(&self, offset: TextSize) -> TextSize { - if let Some(index) = self.contents[TextRange::up_to(offset)].rfind(['\n', '\r']) { + let bytes = self.contents[TextRange::up_to(offset)].as_bytes(); + if let Some(index) = memrchr2(b'\n', b'\r', bytes) { // SAFETY: Safe because `index < offset` TextSize::try_from(index).unwrap().add(TextSize::from(1)) } else { @@ -101,19 +104,8 @@ impl<'a> Locator<'a> { /// If `offset` is passed the end of the content. pub fn full_line_end(&self, offset: TextSize) -> TextSize { let slice = &self.contents[usize::from(offset)..]; - if let Some(index) = slice.find(['\n', '\r']) { - let bytes = slice.as_bytes(); - - // `\r\n` - let relative_offset = if bytes[index] == b'\r' && bytes.get(index + 1) == Some(&b'\n') { - TextSize::try_from(index + 2).unwrap() - } - // `\r` or `\n` - else { - TextSize::try_from(index + 1).unwrap() - }; - - offset.add(relative_offset) + if let Some((index, line_ending)) = find_newline(slice) { + offset + TextSize::try_from(index).unwrap() + line_ending.text_len() } else { self.contents.text_len() } @@ -139,7 +131,7 @@ impl<'a> Locator<'a> { /// If `offset` is passed the end of the content. pub fn line_end(&self, offset: TextSize) -> TextSize { let slice = &self.contents[usize::from(offset)..]; - if let Some(index) = slice.find(['\n', '\r']) { + if let Some(index) = memchr2(b'\n', b'\r', slice.as_bytes()) { offset + TextSize::try_from(index).unwrap() } else { self.contents.text_len() diff --git a/crates/ruff_python_ast/src/source_code/mod.rs b/crates/ruff_python_ast/src/source_code/mod.rs index 5c0b3769ed..591162f491 100644 --- a/crates/ruff_python_ast/src/source_code/mod.rs +++ b/crates/ruff_python_ast/src/source_code/mod.rs @@ -15,8 +15,7 @@ use rustpython_parser::{lexer, Mode, ParseError}; use serde::{Deserialize, Serialize}; use std::fmt::{Debug, Formatter}; use std::sync::Arc; - -pub use stylist::{LineEnding, Stylist}; +pub use stylist::Stylist; /// Run round-trip source code generation on a given Python code. pub fn round_trip(code: &str, source_path: &str) -> Result { diff --git a/crates/ruff_python_ast/src/source_code/stylist.rs b/crates/ruff_python_ast/src/source_code/stylist.rs index dcd25b8b8d..0c45351618 100644 --- a/crates/ruff_python_ast/src/source_code/stylist.rs +++ b/crates/ruff_python_ast/src/source_code/stylist.rs @@ -7,6 +7,7 @@ use once_cell::unsync::OnceCell; use rustpython_parser::lexer::LexResult; use rustpython_parser::Tok; +use crate::newlines::{find_newline, LineEnding}; use ruff_rustpython::vendor; use crate::source_code::Locator; @@ -29,9 +30,12 @@ impl<'a> Stylist<'a> { } pub fn line_ending(&'a self) -> LineEnding { - *self - .line_ending - .get_or_init(|| detect_line_ending(self.locator.contents()).unwrap_or_default()) + *self.line_ending.get_or_init(|| { + let contents = self.locator.contents(); + find_newline(contents) + .map(|(_, ending)| ending) + .unwrap_or_default() + }) } pub fn from_tokens(tokens: &[LexResult], locator: &'a Locator<'a>) -> Self { @@ -158,65 +162,13 @@ impl Deref for Indentation { } } -/// The line ending style used in Python source code. -/// See -#[derive(Debug, PartialEq, Eq, Copy, Clone)] -pub enum LineEnding { - Lf, - Cr, - CrLf, -} - -impl Default for LineEnding { - fn default() -> Self { - if cfg!(windows) { - LineEnding::CrLf - } else { - LineEnding::Lf - } - } -} - -impl LineEnding { - pub const fn as_str(&self) -> &'static str { - match self { - LineEnding::CrLf => "\r\n", - LineEnding::Lf => "\n", - LineEnding::Cr => "\r", - } - } -} - -impl Deref for LineEnding { - type Target = str; - - fn deref(&self) -> &Self::Target { - self.as_str() - } -} - -/// Detect the line ending style of the given contents. -fn detect_line_ending(contents: &str) -> Option { - if let Some(position) = contents.find(['\n', '\r']) { - let bytes = contents.as_bytes(); - if bytes[position] == b'\n' { - Some(LineEnding::Lf) - } else if bytes.get(position.saturating_add(1)) == Some(&b'\n') { - Some(LineEnding::CrLf) - } else { - Some(LineEnding::Cr) - } - } else { - None - } -} - #[cfg(test)] mod tests { + use crate::newlines::{find_newline, LineEnding}; use rustpython_parser::lexer::lex; use rustpython_parser::Mode; - use crate::source_code::stylist::{detect_line_ending, Indentation, LineEnding, Quote}; + use crate::source_code::stylist::{Indentation, Quote}; use crate::source_code::{Locator, Stylist}; #[test] @@ -354,15 +306,24 @@ a = "v" #[test] fn line_ending() { let contents = "x = 1"; - assert_eq!(detect_line_ending(contents), None); + assert_eq!(find_newline(contents).map(|(_, ending)| ending), None); let contents = "x = 1\n"; - assert_eq!(detect_line_ending(contents), Some(LineEnding::Lf)); + assert_eq!( + find_newline(contents).map(|(_, ending)| ending), + Some(LineEnding::Lf) + ); let contents = "x = 1\r"; - assert_eq!(detect_line_ending(contents), Some(LineEnding::Cr)); + assert_eq!( + find_newline(contents).map(|(_, ending)| ending), + Some(LineEnding::Cr) + ); let contents = "x = 1\r\n"; - assert_eq!(detect_line_ending(contents), Some(LineEnding::CrLf)); + assert_eq!( + find_newline(contents).map(|(_, ending)| ending), + Some(LineEnding::CrLf) + ); } }