Use `memchr` for computing line indexes (#21838)

## Summary

Some benchmarks with Claude's help:

| File | Size | Baseline | Optimized | Speedup |
|---------------------|-------|----------------------|----------------------|---------|
| numpy/globals.py | 3 KB | 1.48 µs (1.95 GiB/s) | 740 ns (3.89 GiB/s) | 2.0x |
| unicode/pypinyin.py | 4 KB | 2.04 µs (2.01 GiB/s) | 1.18 µs (3.49 GiB/s) | 1.7x |
| pydantic/types.py | 26 KB | 13.1 µs (1.90 GiB/s) | 5.88 µs (4.23 GiB/s) | 2.2x |
| numpy/ctypeslib.py | 17 KB | 8.45 µs (1.92 GiB/s) | 3.94 µs (4.13 GiB/s) | 2.1x |
| large/dataset.py | 41 KB | 21.6 µs (1.84 GiB/s) | 11.2 µs (3.55 GiB/s) | 1.9x |

I think I originally assumed we _had_ to iterate
character-by-character here because we needed to do the ASCII check, but
the ASCII check can be vectorized by LLVM (and the "search for newlines"
can be done with `memchr`).
This commit is contained in:
Charlie Marsh 2025-12-08 08:50:51 -05:00 committed by GitHub
parent c99e10eedc
commit b845e81c4a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed files with 17 additions and 14 deletions

View File

@ -33,26 +33,29 @@ impl LineIndex {
line_starts.push(TextSize::default()); line_starts.push(TextSize::default());
let bytes = text.as_bytes(); let bytes = text.as_bytes();
let mut utf8 = false;
assert!(u32::try_from(bytes.len()).is_ok()); assert!(u32::try_from(bytes.len()).is_ok());
for (i, byte) in bytes.iter().enumerate() { for i in memchr::memchr2_iter(b'\n', b'\r', bytes) {
utf8 |= !byte.is_ascii(); // Skip `\r` in `\r\n` sequences (only count the `\n`).
if bytes[i] == b'\r' && bytes.get(i + 1) == Some(&b'\n') {
match byte { continue;
// Only track one line break for `\r\n`. }
b'\r' if bytes.get(i + 1) == Some(&b'\n') => continue,
b'\n' | b'\r' => {
// SAFETY: Assertion above guarantees `i <= u32::MAX` // SAFETY: Assertion above guarantees `i <= u32::MAX`
#[expect(clippy::cast_possible_truncation)] #[expect(clippy::cast_possible_truncation)]
line_starts.push(TextSize::from(i as u32) + TextSize::from(1)); line_starts.push(TextSize::from(i as u32) + TextSize::from(1));
} }
_ => {}
} // Determine whether the source text is ASCII.
//
// Empirically, this simple loop is auto-vectorized by LLVM and benchmarks faster than both
// `str::is_ascii()` and hand-written SIMD.
let mut has_non_ascii = false;
for byte in bytes {
has_non_ascii |= !byte.is_ascii();
} }
let kind = if utf8 { let kind = if has_non_ascii {
IndexKind::Utf8 IndexKind::Utf8
} else { } else {
IndexKind::Ascii IndexKind::Ascii