mirror of
https://github.com/astral-sh/ruff
synced 2026-01-21 13:30:49 -05:00
## Summary
On `main`, string lexing consists of walking through the string
character-by-character to search for the closing quote (with some
nuance: we also need to skip escaped characters, and error if we see
newlines in non-triple-quoted strings). This PR rewrites `lex_string` to
instead use `memchr` to search for the closing quote, which is
significantly faster. On my machine, at least, the `globals.py`
benchmark (which contains a lot of docstrings) gets 40% faster...
```text
lexer/numpy/globals.py time: [3.6410 µs 3.6496 µs 3.6585 µs]
thrpt: [806.53 MiB/s 808.49 MiB/s 810.41 MiB/s]
change:
time: [-40.413% -40.185% -39.984%] (p = 0.00 < 0.05)
thrpt: [+66.623% +67.181% +67.822%]
Performance has improved.
Found 2 outliers among 100 measurements (2.00%)
2 (2.00%) high mild
lexer/unicode/pypinyin.py
time: [12.422 µs 12.445 µs 12.467 µs]
thrpt: [337.03 MiB/s 337.65 MiB/s 338.27 MiB/s]
change:
time: [-9.4213% -9.1930% -8.9586%] (p = 0.00 < 0.05)
thrpt: [+9.8401% +10.124% +10.401%]
Performance has improved.
Found 3 outliers among 100 measurements (3.00%)
1 (1.00%) high mild
2 (2.00%) high severe
lexer/pydantic/types.py time: [107.45 µs 107.50 µs 107.56 µs]
thrpt: [237.11 MiB/s 237.24 MiB/s 237.35 MiB/s]
change:
time: [-4.0108% -3.7005% -3.3787%] (p = 0.00 < 0.05)
thrpt: [+3.4968% +3.8427% +4.1784%]
Performance has improved.
Found 7 outliers among 100 measurements (7.00%)
2 (2.00%) high mild
5 (5.00%) high severe
lexer/numpy/ctypeslib.py
time: [46.123 µs 46.165 µs 46.208 µs]
thrpt: [360.36 MiB/s 360.69 MiB/s 361.01 MiB/s]
change:
time: [-19.313% -18.996% -18.710%] (p = 0.00 < 0.05)
thrpt: [+23.016% +23.451% +23.935%]
Performance has improved.
Found 8 outliers among 100 measurements (8.00%)
3 (3.00%) low mild
1 (1.00%) high mild
4 (4.00%) high severe
lexer/large/dataset.py time: [231.07 µs 231.19 µs 231.33 µs]
thrpt: [175.87 MiB/s 175.97 MiB/s 176.06 MiB/s]
change:
time: [-2.0437% -1.7663% -1.4922%] (p = 0.00 < 0.05)
thrpt: [+1.5148% +1.7981% +2.0864%]
Performance has improved.
Found 10 outliers among 100 measurements (10.00%)
5 (5.00%) high mild
5 (5.00%) high severe
```
154 lines
4.3 KiB
Rust
154 lines
4.3 KiB
Rust
use ruff_text_size::{TextLen, TextSize};
|
|
use std::str::Chars;
|
|
|
|
/// Sentinel character handed back by the cursor's peeking methods when there is no
/// character left to look at.
pub(crate) const EOF_CHAR: char = '\0';
|
|
|
|
#[derive(Clone, Debug)]
|
|
pub(super) struct Cursor<'a> {
|
|
chars: Chars<'a>,
|
|
source_length: TextSize,
|
|
#[cfg(debug_assertions)]
|
|
prev_char: char,
|
|
}
|
|
|
|
impl<'a> Cursor<'a> {
|
|
pub(crate) fn new(source: &'a str) -> Self {
|
|
Self {
|
|
source_length: source.text_len(),
|
|
chars: source.chars(),
|
|
#[cfg(debug_assertions)]
|
|
prev_char: EOF_CHAR,
|
|
}
|
|
}
|
|
|
|
/// Returns the previous token. Useful for debug assertions.
|
|
#[cfg(debug_assertions)]
|
|
pub(super) const fn previous(&self) -> char {
|
|
self.prev_char
|
|
}
|
|
|
|
/// Peeks the next character from the input stream without consuming it.
|
|
/// Returns [`EOF_CHAR`] if the file is at the end of the file.
|
|
pub(super) fn first(&self) -> char {
|
|
self.chars.clone().next().unwrap_or(EOF_CHAR)
|
|
}
|
|
|
|
/// Peeks the second character from the input stream without consuming it.
|
|
/// Returns [`EOF_CHAR`] if the position is past the end of the file.
|
|
pub(super) fn second(&self) -> char {
|
|
let mut chars = self.chars.clone();
|
|
chars.next();
|
|
chars.next().unwrap_or(EOF_CHAR)
|
|
}
|
|
|
|
/// Returns the remaining text to lex.
|
|
pub(super) fn rest(&self) -> &'a str {
|
|
self.chars.as_str()
|
|
}
|
|
|
|
// SAFETY: The `source.text_len` call in `new` would panic if the string length is larger than a `u32`.
|
|
#[allow(clippy::cast_possible_truncation)]
|
|
pub(super) fn text_len(&self) -> TextSize {
|
|
TextSize::new(self.chars.as_str().len() as u32)
|
|
}
|
|
|
|
pub(super) fn token_len(&self) -> TextSize {
|
|
self.source_length - self.text_len()
|
|
}
|
|
|
|
pub(super) fn start_token(&mut self) {
|
|
self.source_length = self.text_len();
|
|
}
|
|
|
|
pub(super) fn is_eof(&self) -> bool {
|
|
self.chars.as_str().is_empty()
|
|
}
|
|
|
|
/// Consumes the next character
|
|
pub(super) fn bump(&mut self) -> Option<char> {
|
|
let prev = self.chars.next()?;
|
|
|
|
#[cfg(debug_assertions)]
|
|
{
|
|
self.prev_char = prev;
|
|
}
|
|
|
|
Some(prev)
|
|
}
|
|
|
|
pub(super) fn eat_char(&mut self, c: char) -> bool {
|
|
if self.first() == c {
|
|
self.bump();
|
|
true
|
|
} else {
|
|
false
|
|
}
|
|
}
|
|
|
|
pub(super) fn eat_char2(&mut self, c1: char, c2: char) -> bool {
|
|
let mut chars = self.chars.clone();
|
|
if chars.next() == Some(c1) && chars.next() == Some(c2) {
|
|
self.bump();
|
|
self.bump();
|
|
true
|
|
} else {
|
|
false
|
|
}
|
|
}
|
|
|
|
pub(super) fn eat_char3(&mut self, c1: char, c2: char, c3: char) -> bool {
|
|
let mut chars = self.chars.clone();
|
|
if chars.next() == Some(c1) && chars.next() == Some(c2) && chars.next() == Some(c3) {
|
|
self.bump();
|
|
self.bump();
|
|
self.bump();
|
|
true
|
|
} else {
|
|
false
|
|
}
|
|
}
|
|
|
|
pub(super) fn eat_if<F>(&mut self, mut predicate: F) -> Option<char>
|
|
where
|
|
F: FnMut(char) -> bool,
|
|
{
|
|
if predicate(self.first()) && !self.is_eof() {
|
|
self.bump()
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
|
|
/// Eats symbols while predicate returns true or until the end of file is reached.
|
|
#[inline]
|
|
pub(super) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
|
|
// It was tried making optimized version of this for eg. line comments, but
|
|
// LLVM can inline all of this and compile it down to fast iteration over bytes.
|
|
while predicate(self.first()) && !self.is_eof() {
|
|
self.bump();
|
|
}
|
|
}
|
|
|
|
/// Skips the next `count` bytes.
|
|
///
|
|
/// ## Panics
|
|
/// - If `count` is larger than the remaining bytes in the input stream.
|
|
/// - If `count` indexes into a multi-byte character.
|
|
pub(super) fn skip_bytes(&mut self, count: usize) {
|
|
#[cfg(debug_assertions)]
|
|
{
|
|
self.prev_char = self.chars.as_str()[..count]
|
|
.chars()
|
|
.next_back()
|
|
.unwrap_or('\0');
|
|
}
|
|
|
|
self.chars = self.chars.as_str()[count..].chars();
|
|
}
|
|
|
|
/// Skips to the end of the input stream.
|
|
pub(super) fn skip_to_end(&mut self) {
|
|
self.chars = "".chars();
|
|
}
|
|
}
|