From e04ef423344837c916e75a7b09ea674711a104e0 Mon Sep 17 00:00:00 2001
From: Micha Reiser <micha@reiser.io>
Date: Wed, 26 Apr 2023 21:15:47 +0200
Subject: [PATCH] Use `memchr` to speedup newline search on x86 (#3985)

---
 Cargo.lock                                    |   1 +
 crates/ruff/src/importer.rs                   |   3 +-
 crates/ruff/src/message/gitlab.rs             |  19 ++-
 crates/ruff/src/noqa.rs                       |   6 +-
 crates/ruff_python_ast/Cargo.toml             |   1 +
 crates/ruff_python_ast/src/newlines.rs        | 117 ++++++++++++++----
 .../src/source_code/generator.rs              |   6 +-
 .../src/source_code/locator.rs                |  22 ++--
 crates/ruff_python_ast/src/source_code/mod.rs |   3 +-
 .../src/source_code/stylist.rs                |  83 ++++---------
 10 files changed, 147 insertions(+), 114 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index b51a3c2fa6..43631701f4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2204,6 +2204,7 @@ dependencies = [
  "is-macro",
  "itertools",
  "log",
+ "memchr",
  "num-bigint",
  "num-traits",
  "once_cell",
diff --git a/crates/ruff/src/importer.rs b/crates/ruff/src/importer.rs
index 3af7d1bbe9..db929307a1 100644
--- a/crates/ruff/src/importer.rs
+++ b/crates/ruff/src/importer.rs
@@ -234,11 +234,12 @@ fn top_of_file_insertion(body: &[Stmt], locator: &Locator, stylist: &Stylist) ->
 #[cfg(test)]
 mod tests {
     use anyhow::Result;
+    use ruff_python_ast::newlines::LineEnding;
     use ruff_text_size::TextSize;
     use rustpython_parser as parser;
     use rustpython_parser::lexer::LexResult;
 
-    use ruff_python_ast::source_code::{LineEnding, Locator, Stylist};
+    use ruff_python_ast::source_code::{Locator, Stylist};
 
     use crate::importer::{top_of_file_insertion, Insertion};
 
diff --git a/crates/ruff/src/message/gitlab.rs b/crates/ruff/src/message/gitlab.rs
index b699884b4b..355bdf0dbd 100644
--- a/crates/ruff/src/message/gitlab.rs
+++ b/crates/ruff/src/message/gitlab.rs
@@ -1,6 +1,7 @@
 use crate::fs::{relativize_path, relativize_path_to};
 use crate::message::{Emitter, EmitterContext, Message};
 use crate::registry::AsRule;
+use ruff_python_ast::source_code::SourceLocation;
 use serde::ser::SerializeSeq;
 use serde::{Serialize, Serializer};
 use serde_json::json;
@@ -56,6 +57,9 @@ impl Serialize for SerializedMessages<'_> {
         let mut s = serializer.serialize_seq(Some(self.messages.len()))?;
 
         for message in self.messages {
+            let start_location = message.compute_start_location();
+            let end_location = message.compute_end_location();
+
             let lines = if self.context.is_jupyter_notebook(message.filename()) {
                 // We can't give a reasonable location for the structured formats,
                 // so we show one that's clearly a fallback
@@ -64,8 +68,6 @@ impl Serialize for SerializedMessages<'_> {
                     "end": 1
                 })
             } else {
-                let start_location = message.compute_start_location();
-                let end_location = message.compute_end_location();
                 json!({
                     "begin": start_location.row,
                     "end": end_location.row
@@ -80,7 +82,7 @@ impl Serialize for SerializedMessages<'_> {
             let value = json!({
                 "description": format!("({}) {}", message.kind.rule().noqa_code(), message.kind.body),
                 "severity": "major",
-                "fingerprint": fingerprint(message),
+                "fingerprint": fingerprint(message, &start_location, &end_location),
                 "location": {
                     "path": path,
                     "lines": lines
@@ -95,10 +97,14 @@ impl Serialize for SerializedMessages<'_> {
 }
 
 /// Generate a unique fingerprint to identify a violation.
-fn fingerprint(message: &Message) -> String {
+fn fingerprint(
+    message: &Message,
+    start_location: &SourceLocation,
+    end_location: &SourceLocation,
+) -> String {
     let Message {
         kind,
-        range,
+        range: _,
         fix: _fix,
         file,
         noqa_offset: _,
@@ -107,7 +113,8 @@ fn fingerprint(message: &Message) -> String {
     let mut hasher = DefaultHasher::new();
 
     kind.rule().hash(&mut hasher);
-    range.hash(&mut hasher);
+    start_location.hash(&mut hasher);
+    end_location.hash(&mut hasher);
     file.name().hash(&mut hasher);
 
     format!("{:x}", hasher.finish())
diff --git a/crates/ruff/src/noqa.rs b/crates/ruff/src/noqa.rs
index 9bab41bf9d..4434837f01 100644
--- a/crates/ruff/src/noqa.rs
+++ b/crates/ruff/src/noqa.rs
@@ -11,7 +11,8 @@ use regex::Regex;
 use ruff_text_size::{TextLen, TextRange, TextSize};
 
 use ruff_diagnostics::Diagnostic;
-use ruff_python_ast::source_code::{LineEnding, Locator};
+use ruff_python_ast::newlines::LineEnding;
+use ruff_python_ast::source_code::Locator;
 
 use crate::codes::NoqaCode;
 use crate::registry::{AsRule, Rule, RuleSet};
@@ -511,7 +512,8 @@ mod tests {
     use ruff_text_size::{TextRange, TextSize};
 
     use ruff_diagnostics::Diagnostic;
-    use ruff_python_ast::source_code::{LineEnding, Locator};
+    use ruff_python_ast::newlines::LineEnding;
+    use ruff_python_ast::source_code::Locator;
 
     use crate::noqa::{add_noqa_inner, NoqaMapping, NOQA_LINE_REGEX};
     use crate::rules::pycodestyle::rules::AmbiguousVariableName;
diff --git a/crates/ruff_python_ast/Cargo.toml b/crates/ruff_python_ast/Cargo.toml
index d20db48eed..1c38445bed 100644
--- a/crates/ruff_python_ast/Cargo.toml
+++ b/crates/ruff_python_ast/Cargo.toml
@@ -16,6 +16,7 @@ bitflags = { workspace = true }
 is-macro = { workspace = true }
 itertools = { workspace = true }
 log = { workspace = true }
+memchr = "2.5.0"
 num-bigint = { version = "0.4.3" }
 num-traits = { version = "0.2.15" }
 once_cell = { workspace = true }
diff --git a/crates/ruff_python_ast/src/newlines.rs b/crates/ruff_python_ast/src/newlines.rs
index 54edacaaa7..5e3b97f85a 100644
--- a/crates/ruff_python_ast/src/newlines.rs
+++ b/crates/ruff_python_ast/src/newlines.rs
@@ -1,3 +1,4 @@
+use memchr::{memchr2, memrchr2};
 use ruff_text_size::{TextLen, TextRange, TextSize};
 use std::iter::FusedIterator;
 use std::ops::Deref;
@@ -50,6 +51,30 @@ impl<'a> UniversalNewlineIterator<'a> {
     }
 }
 
+/// Finds the next newline character. Returns its position and the [`LineEnding`].
+#[inline]
+pub fn find_newline(text: &str) -> Option<(usize, LineEnding)> {
+    let bytes = text.as_bytes();
+    if let Some(position) = memchr2(b'\n', b'\r', bytes) {
+        // SAFETY: memchr guarantees to return valid positions
+        #[allow(unsafe_code)]
+        let newline_character = unsafe { *bytes.get_unchecked(position) };
+
+        let line_ending = match newline_character {
+            // Explicit branch for `\n` as this is the most likely path
+            b'\n' => LineEnding::Lf,
+            // '\r\n'
+            b'\r' if bytes.get(position.saturating_add(1)) == Some(&b'\n') => LineEnding::CrLf,
+            // '\r'
+            _ => LineEnding::Cr,
+        };
+
+        Some((position, line_ending))
+    } else {
+        None
+    }
+}
+
 impl<'a> Iterator for UniversalNewlineIterator<'a> {
     type Item = Line<'a>;
 
@@ -59,35 +84,25 @@ impl<'a> Iterator for UniversalNewlineIterator<'a> {
             return None;
         }
 
-        let line = match self.text.find(['\n', '\r']) {
-            // Non-last line
-            Some(line_end) => {
-                let offset: usize = match self.text.as_bytes()[line_end] {
-                    // Explicit branch for `\n` as this is the most likely path
-                    b'\n' => 1,
-                    // '\r\n'
-                    b'\r' if self.text.as_bytes().get(line_end + 1) == Some(&b'\n') => 2,
-                    // '\r'
-                    _ => 1,
-                };
+        let line = if let Some((newline_position, line_ending)) = find_newline(self.text) {
+            let (text, remainder) = self.text.split_at(newline_position + line_ending.len());
 
-                let (text, remainder) = self.text.split_at(line_end + offset);
+            let line = Line {
+                offset: self.offset,
+                text,
+            };
 
-                let line = Line {
-                    offset: self.offset,
-                    text,
-                };
+            self.text = remainder;
+            self.offset += text.text_len();
 
-                self.text = remainder;
-                self.offset += text.text_len();
-
-                line
-            }
-            // Last line
-            None => Line {
+            line
+        }
+        // Last line
+        else {
+            Line {
                 offset: self.offset,
                 text: std::mem::take(&mut self.text),
-            },
+            }
         };
 
         Some(line)
@@ -116,7 +131,7 @@ impl DoubleEndedIterator for UniversalNewlineIterator<'_> {
 
         // Find the end of the previous line. The previous line is the text up to, but not including
         // the newline character.
-        let line = if let Some(line_end) = haystack.rfind(['\n', '\r']) {
+        let line = if let Some(line_end) = memrchr2(b'\n', b'\r', haystack.as_bytes()) {
             // '\n' or '\r' or '\r\n'
             let (remainder, line) = self.text.split_at(line_end + 1);
             self.text = remainder;
@@ -268,6 +283,58 @@ impl PartialEq<Line<'_>> for &str {
     }
 }
 
+/// The line ending style used in Python source code.
+/// See <https://docs.python.org/3/reference/lexical_analysis.html#physical-lines>
+#[derive(Debug, PartialEq, Eq, Copy, Clone)]
+pub enum LineEnding {
+    Lf,
+    Cr,
+    CrLf,
+}
+
+impl Default for LineEnding {
+    fn default() -> Self {
+        if cfg!(windows) {
+            LineEnding::CrLf
+        } else {
+            LineEnding::Lf
+        }
+    }
+}
+
+impl LineEnding {
+    pub const fn as_str(&self) -> &'static str {
+        match self {
+            LineEnding::Lf => "\n",
+            LineEnding::CrLf => "\r\n",
+            LineEnding::Cr => "\r",
+        }
+    }
+
+    #[allow(clippy::len_without_is_empty)]
+    pub const fn len(&self) -> usize {
+        match self {
+            LineEnding::Lf | LineEnding::Cr => 1,
+            LineEnding::CrLf => 2,
+        }
+    }
+
+    pub const fn text_len(&self) -> TextSize {
+        match self {
+            LineEnding::Lf | LineEnding::Cr => TextSize::new(1),
+            LineEnding::CrLf => TextSize::new(2),
+        }
+    }
+}
+
+impl Deref for LineEnding {
+    type Target = str;
+
+    fn deref(&self) -> &Self::Target {
+        self.as_str()
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::UniversalNewlineIterator;
diff --git a/crates/ruff_python_ast/src/source_code/generator.rs b/crates/ruff_python_ast/src/source_code/generator.rs
index 600d2f41c9..f1e4bf7f84 100644
--- a/crates/ruff_python_ast/src/source_code/generator.rs
+++ b/crates/ruff_python_ast/src/source_code/generator.rs
@@ -9,9 +9,10 @@ use rustpython_parser::ast::{
 };
 use rustpython_parser::ConversionFlag;
 
+use crate::newlines::LineEnding;
 use ruff_rustpython::vendor::{bytes, str};
 
-use crate::source_code::stylist::{Indentation, LineEnding, Quote, Stylist};
+use crate::source_code::stylist::{Indentation, Quote, Stylist};
 
 mod precedence {
     pub const ASSIGN: u8 = 3;
@@ -1256,9 +1257,10 @@ impl<'a> Generator<'a> {
 
 #[cfg(test)]
 mod tests {
+    use crate::newlines::LineEnding;
     use rustpython_parser as parser;
 
-    use crate::source_code::stylist::{Indentation, LineEnding, Quote};
+    use crate::source_code::stylist::{Indentation, Quote};
     use crate::source_code::Generator;
 
     fn round_trip(contents: &str) -> String {
diff --git a/crates/ruff_python_ast/src/source_code/locator.rs b/crates/ruff_python_ast/src/source_code/locator.rs
index 24932d6e4e..6a56e75584 100644
--- a/crates/ruff_python_ast/src/source_code/locator.rs
+++ b/crates/ruff_python_ast/src/source_code/locator.rs
@@ -1,6 +1,8 @@
 //! Struct used to efficiently slice source code at (row, column) Locations.
 
+use crate::newlines::find_newline;
 use crate::source_code::{LineIndex, OneIndexed, SourceCode, SourceLocation};
+use memchr::{memchr2, memrchr2};
 use once_cell::unsync::OnceCell;
 use ruff_text_size::{TextLen, TextRange, TextSize};
 use std::ops::Add;
@@ -68,7 +70,8 @@ impl<'a> Locator<'a> {
     /// ## Panics
     /// If `offset` is out of bounds.
     pub fn line_start(&self, offset: TextSize) -> TextSize {
-        if let Some(index) = self.contents[TextRange::up_to(offset)].rfind(['\n', '\r']) {
+        let bytes = self.contents[TextRange::up_to(offset)].as_bytes();
+        if let Some(index) = memrchr2(b'\n', b'\r', bytes) {
             // SAFETY: Safe because `index < offset`
             TextSize::try_from(index).unwrap().add(TextSize::from(1))
         } else {
@@ -101,19 +104,8 @@ impl<'a> Locator<'a> {
     /// If `offset` is passed the end of the content.
     pub fn full_line_end(&self, offset: TextSize) -> TextSize {
         let slice = &self.contents[usize::from(offset)..];
-        if let Some(index) = slice.find(['\n', '\r']) {
-            let bytes = slice.as_bytes();
-
-            // `\r\n`
-            let relative_offset = if bytes[index] == b'\r' && bytes.get(index + 1) == Some(&b'\n') {
-                TextSize::try_from(index + 2).unwrap()
-            }
-            // `\r` or `\n`
-            else {
-                TextSize::try_from(index + 1).unwrap()
-            };
-
-            offset.add(relative_offset)
+        if let Some((index, line_ending)) = find_newline(slice) {
+            offset + TextSize::try_from(index).unwrap() + line_ending.text_len()
         } else {
             self.contents.text_len()
         }
@@ -139,7 +131,7 @@ impl<'a> Locator<'a> {
     /// If `offset` is passed the end of the content.
     pub fn line_end(&self, offset: TextSize) -> TextSize {
         let slice = &self.contents[usize::from(offset)..];
-        if let Some(index) = slice.find(['\n', '\r']) {
+        if let Some(index) = memchr2(b'\n', b'\r', slice.as_bytes()) {
             offset + TextSize::try_from(index).unwrap()
         } else {
             self.contents.text_len()
diff --git a/crates/ruff_python_ast/src/source_code/mod.rs b/crates/ruff_python_ast/src/source_code/mod.rs
index 5c0b3769ed..591162f491 100644
--- a/crates/ruff_python_ast/src/source_code/mod.rs
+++ b/crates/ruff_python_ast/src/source_code/mod.rs
@@ -15,8 +15,7 @@ use rustpython_parser::{lexer, Mode, ParseError};
 use serde::{Deserialize, Serialize};
 use std::fmt::{Debug, Formatter};
 use std::sync::Arc;
-
-pub use stylist::{LineEnding, Stylist};
+pub use stylist::Stylist;
 
 /// Run round-trip source code generation on a given Python code.
 pub fn round_trip(code: &str, source_path: &str) -> Result<String, ParseError> {
diff --git a/crates/ruff_python_ast/src/source_code/stylist.rs b/crates/ruff_python_ast/src/source_code/stylist.rs
index dcd25b8b8d..0c45351618 100644
--- a/crates/ruff_python_ast/src/source_code/stylist.rs
+++ b/crates/ruff_python_ast/src/source_code/stylist.rs
@@ -7,6 +7,7 @@ use once_cell::unsync::OnceCell;
 use rustpython_parser::lexer::LexResult;
 use rustpython_parser::Tok;
 
+use crate::newlines::{find_newline, LineEnding};
 use ruff_rustpython::vendor;
 
 use crate::source_code::Locator;
@@ -29,9 +30,12 @@ impl<'a> Stylist<'a> {
     }
 
     pub fn line_ending(&'a self) -> LineEnding {
-        *self
-            .line_ending
-            .get_or_init(|| detect_line_ending(self.locator.contents()).unwrap_or_default())
+        *self.line_ending.get_or_init(|| {
+            let contents = self.locator.contents();
+            find_newline(contents)
+                .map(|(_, ending)| ending)
+                .unwrap_or_default()
+        })
     }
 
     pub fn from_tokens(tokens: &[LexResult], locator: &'a Locator<'a>) -> Self {
@@ -158,65 +162,13 @@ impl Deref for Indentation {
     }
 }
 
-/// The line ending style used in Python source code.
-/// See <https://docs.python.org/3/reference/lexical_analysis.html#physical-lines>
-#[derive(Debug, PartialEq, Eq, Copy, Clone)]
-pub enum LineEnding {
-    Lf,
-    Cr,
-    CrLf,
-}
-
-impl Default for LineEnding {
-    fn default() -> Self {
-        if cfg!(windows) {
-            LineEnding::CrLf
-        } else {
-            LineEnding::Lf
-        }
-    }
-}
-
-impl LineEnding {
-    pub const fn as_str(&self) -> &'static str {
-        match self {
-            LineEnding::CrLf => "\r\n",
-            LineEnding::Lf => "\n",
-            LineEnding::Cr => "\r",
-        }
-    }
-}
-
-impl Deref for LineEnding {
-    type Target = str;
-
-    fn deref(&self) -> &Self::Target {
-        self.as_str()
-    }
-}
-
-/// Detect the line ending style of the given contents.
-fn detect_line_ending(contents: &str) -> Option<LineEnding> {
-    if let Some(position) = contents.find(['\n', '\r']) {
-        let bytes = contents.as_bytes();
-        if bytes[position] == b'\n' {
-            Some(LineEnding::Lf)
-        } else if bytes.get(position.saturating_add(1)) == Some(&b'\n') {
-            Some(LineEnding::CrLf)
-        } else {
-            Some(LineEnding::Cr)
-        }
-    } else {
-        None
-    }
-}
-
 #[cfg(test)]
 mod tests {
+    use crate::newlines::{find_newline, LineEnding};
     use rustpython_parser::lexer::lex;
     use rustpython_parser::Mode;
 
-    use crate::source_code::stylist::{detect_line_ending, Indentation, LineEnding, Quote};
+    use crate::source_code::stylist::{Indentation, Quote};
     use crate::source_code::{Locator, Stylist};
 
     #[test]
@@ -354,15 +306,24 @@ a = "v"
     #[test]
     fn line_ending() {
         let contents = "x = 1";
-        assert_eq!(detect_line_ending(contents), None);
+        assert_eq!(find_newline(contents).map(|(_, ending)| ending), None);
 
         let contents = "x = 1\n";
-        assert_eq!(detect_line_ending(contents), Some(LineEnding::Lf));
+        assert_eq!(
+            find_newline(contents).map(|(_, ending)| ending),
+            Some(LineEnding::Lf)
+        );
 
         let contents = "x = 1\r";
-        assert_eq!(detect_line_ending(contents), Some(LineEnding::Cr));
+        assert_eq!(
+            find_newline(contents).map(|(_, ending)| ending),
+            Some(LineEnding::Cr)
+        );
 
         let contents = "x = 1\r\n";
-        assert_eq!(detect_line_ending(contents), Some(LineEnding::CrLf));
+        assert_eq!(
+            find_newline(contents).map(|(_, ending)| ending),
+            Some(LineEnding::CrLf)
+        );
     }
 }