From b845e81c4a651386e0fa10571b85e8dfd914cd79 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Mon, 8 Dec 2025 08:50:51 -0500 Subject: [PATCH] Use `memchr` for computing line indexes (#21838) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Some benchmarks with Claude's help: | File | Size | Baseline | Optimized | Speedup | |---------------------|-------|----------------------|----------------------|---------| | numpy/globals.py | 3 KB | 1.48 µs (1.95 GiB/s) | 740 ns (3.89 GiB/s) | 2.0x | | unicode/pypinyin.py | 4 KB | 2.04 µs (2.01 GiB/s) | 1.18 µs (3.49 GiB/s) | 1.7x | | pydantic/types.py | 26 KB | 13.1 µs (1.90 GiB/s) | 5.88 µs (4.23 GiB/s) | 2.2x | | numpy/ctypeslib.py | 17 KB | 8.45 µs (1.92 GiB/s) | 3.94 µs (4.13 GiB/s) | 2.1x | | large/dataset.py | 41 KB | 21.6 µs (1.84 GiB/s) | 11.2 µs (3.55 GiB/s) | 1.9x | I believe I originally assumed we _had_ to iterate byte-by-byte here because we needed to do the ASCII check, but the ASCII check can be vectorized by LLVM (and the "search for newlines" can be done with `memchr`). --- crates/ruff_source_file/src/line_index.rs | 31 +++++++++++++---------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/crates/ruff_source_file/src/line_index.rs b/crates/ruff_source_file/src/line_index.rs index 4adb9d17b7..c1d0769e83 100644 --- a/crates/ruff_source_file/src/line_index.rs +++ b/crates/ruff_source_file/src/line_index.rs @@ -33,26 +33,29 @@ impl LineIndex { line_starts.push(TextSize::default()); let bytes = text.as_bytes(); - let mut utf8 = false; assert!(u32::try_from(bytes.len()).is_ok()); - for (i, byte) in bytes.iter().enumerate() { - utf8 |= !byte.is_ascii(); - - match byte { - // Only track one line break for `\r\n`. 
- b'\r' if bytes.get(i + 1) == Some(&b'\n') => continue, - b'\n' | b'\r' => { - // SAFETY: Assertion above guarantees `i <= u32::MAX` - #[expect(clippy::cast_possible_truncation)] - line_starts.push(TextSize::from(i as u32) + TextSize::from(1)); - } - _ => {} + for i in memchr::memchr2_iter(b'\n', b'\r', bytes) { + // Skip `\r` in `\r\n` sequences (only count the `\n`). + if bytes[i] == b'\r' && bytes.get(i + 1) == Some(&b'\n') { + continue; } + // SAFETY: Assertion above guarantees `i <= u32::MAX` + #[expect(clippy::cast_possible_truncation)] + line_starts.push(TextSize::from(i as u32) + TextSize::from(1)); } - let kind = if utf8 { + // Determine whether the source text is ASCII. + // + // Empirically, this simple loop is auto-vectorized by LLVM and benchmarks faster than both + // `str::is_ascii()` and hand-written SIMD. + let mut has_non_ascii = false; + for byte in bytes { + has_non_ascii |= !byte.is_ascii(); + } + + let kind = if has_non_ascii { IndexKind::Utf8 } else { IndexKind::Ascii