From b845e81c4a651386e0fa10571b85e8dfd914cd79 Mon Sep 17 00:00:00 2001
From: Charlie Marsh <charlie.r.marsh@gmail.com>
Date: Mon, 8 Dec 2025 08:50:51 -0500
Subject: [PATCH] Use `memchr` for computing line indexes (#21838)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

Some benchmarks with Claude's help:

| File | Size | Baseline | Optimized | Speedup |
|---------------------|-------|----------------------|----------------------|---------|
| numpy/globals.py | 3 KB | 1.48 µs (1.95 GiB/s) | 740 ns (3.89 GiB/s) | 2.0x |
| unicode/pypinyin.py | 4 KB | 2.04 µs (2.01 GiB/s) | 1.18 µs (3.49 GiB/s) | 1.7x |
| pydantic/types.py | 26 KB | 13.1 µs (1.90 GiB/s) | 5.88 µs (4.23 GiB/s) | 2.2x |
| numpy/ctypeslib.py | 17 KB | 8.45 µs (1.92 GiB/s) | 3.94 µs (4.13 GiB/s) | 2.1x |
| large/dataset.py | 41 KB | 21.6 µs (1.84 GiB/s) | 11.2 µs (3.55 GiB/s) | 1.9x |

I believe I originally assumed we _had_ to iterate character-by-character
here because we needed to do the ASCII check in the same loop, but the
ASCII check can be done in a separate pass that LLVM vectorizes (and the
"search for newlines" can be done with `memchr`).
---
 crates/ruff_source_file/src/line_index.rs | 31 +++++++++++++----------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/crates/ruff_source_file/src/line_index.rs b/crates/ruff_source_file/src/line_index.rs
index 4adb9d17b7..c1d0769e83 100644
--- a/crates/ruff_source_file/src/line_index.rs
+++ b/crates/ruff_source_file/src/line_index.rs
@@ -33,26 +33,29 @@ impl LineIndex {
         line_starts.push(TextSize::default());
 
         let bytes = text.as_bytes();
-        let mut utf8 = false;
 
         assert!(u32::try_from(bytes.len()).is_ok());
 
-        for (i, byte) in bytes.iter().enumerate() {
-            utf8 |= !byte.is_ascii();
-
-            match byte {
-                // Only track one line break for `\r\n`.
-                b'\r' if bytes.get(i + 1) == Some(&b'\n') => continue,
-                b'\n' | b'\r' => {
-                    // SAFETY: Assertion above guarantees `i <= u32::MAX`
-                    #[expect(clippy::cast_possible_truncation)]
-                    line_starts.push(TextSize::from(i as u32) + TextSize::from(1));
-                }
-                _ => {}
+        for i in memchr::memchr2_iter(b'\n', b'\r', bytes) {
+            // Skip `\r` in `\r\n` sequences (only count the `\n`).
+            if bytes[i] == b'\r' && bytes.get(i + 1) == Some(&b'\n') {
+                continue;
             }
+            // SAFETY: Assertion above guarantees `i <= u32::MAX`
+            #[expect(clippy::cast_possible_truncation)]
+            line_starts.push(TextSize::from(i as u32) + TextSize::from(1));
         }
 
-        let kind = if utf8 {
+        // Determine whether the source text is ASCII.
+        //
+        // Empirically, this simple loop is auto-vectorized by LLVM and benchmarks faster than both
+        // `str::is_ascii()` and hand-written SIMD.
+        let mut has_non_ascii = false;
+        for byte in bytes {
+            has_non_ascii |= !byte.is_ascii();
+        }
+
+        let kind = if has_non_ascii {
             IndexKind::Utf8
         } else {
             IndexKind::Ascii