mirror of https://github.com/astral-sh/ruff
Use `memchr` for computing line indexes (#21838)
## Summary

Some benchmarks with Claude's help:

| File | Size | Baseline | Optimized | Speedup |
|---------------------|-------|----------------------|----------------------|---------|
| numpy/globals.py | 3 KB | 1.48 µs (1.95 GiB/s) | 740 ns (3.89 GiB/s) | 2.0x |
| unicode/pypinyin.py | 4 KB | 2.04 µs (2.01 GiB/s) | 1.18 µs (3.49 GiB/s) | 1.7x |
| pydantic/types.py | 26 KB | 13.1 µs (1.90 GiB/s) | 5.88 µs (4.23 GiB/s) | 2.2x |
| numpy/ctypeslib.py | 17 KB | 8.45 µs (1.92 GiB/s) | 3.94 µs (4.13 GiB/s) | 2.1x |
| large/dataset.py | 41 KB | 21.6 µs (1.84 GiB/s) | 11.2 µs (3.55 GiB/s) | 1.9x |

I think I originally assumed that we _had_ to iterate character-by-character here because we needed to do the ASCII check, but the ASCII check can be vectorized by LLVM (and the "search for newlines" can be done with `memchr`).
This commit is contained in:
parent
c99e10eedc
commit
b845e81c4a
|
|
@ -33,26 +33,29 @@ impl LineIndex {
|
||||||
line_starts.push(TextSize::default());
|
line_starts.push(TextSize::default());
|
||||||
|
|
||||||
let bytes = text.as_bytes();
|
let bytes = text.as_bytes();
|
||||||
let mut utf8 = false;
|
|
||||||
|
|
||||||
assert!(u32::try_from(bytes.len()).is_ok());
|
assert!(u32::try_from(bytes.len()).is_ok());
|
||||||
|
|
||||||
for (i, byte) in bytes.iter().enumerate() {
|
for i in memchr::memchr2_iter(b'\n', b'\r', bytes) {
|
||||||
utf8 |= !byte.is_ascii();
|
// Skip `\r` in `\r\n` sequences (only count the `\n`).
|
||||||
|
if bytes[i] == b'\r' && bytes.get(i + 1) == Some(&b'\n') {
|
||||||
match byte {
|
continue;
|
||||||
// Only track one line break for `\r\n`.
|
}
|
||||||
b'\r' if bytes.get(i + 1) == Some(&b'\n') => continue,
|
|
||||||
b'\n' | b'\r' => {
|
|
||||||
// SAFETY: Assertion above guarantees `i <= u32::MAX`
|
// SAFETY: Assertion above guarantees `i <= u32::MAX`
|
||||||
#[expect(clippy::cast_possible_truncation)]
|
#[expect(clippy::cast_possible_truncation)]
|
||||||
line_starts.push(TextSize::from(i as u32) + TextSize::from(1));
|
line_starts.push(TextSize::from(i as u32) + TextSize::from(1));
|
||||||
}
|
}
|
||||||
_ => {}
|
|
||||||
}
|
// Determine whether the source text is ASCII.
|
||||||
|
//
|
||||||
|
// Empirically, this simple loop is auto-vectorized by LLVM and benchmarks faster than both
|
||||||
|
// `str::is_ascii()` and hand-written SIMD.
|
||||||
|
let mut has_non_ascii = false;
|
||||||
|
for byte in bytes {
|
||||||
|
has_non_ascii |= !byte.is_ascii();
|
||||||
}
|
}
|
||||||
|
|
||||||
let kind = if utf8 {
|
let kind = if has_non_ascii {
|
||||||
IndexKind::Utf8
|
IndexKind::Utf8
|
||||||
} else {
|
} else {
|
||||||
IndexKind::Ascii
|
IndexKind::Ascii
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue