From 6fffde72e7859a8efdebb91e5942440d6de2aa18 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Thu, 8 Feb 2024 09:23:06 -0800 Subject: [PATCH] Use `memchr` for string lexing (#9888) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary On `main`, string lexing consists of walking through the string character-by-character to search for the closing quote (with some nuance: we also need to skip escaped characters, and error if we see newlines in non-triple-quoted strings). This PR rewrites `lex_string` to instead use `memchr` to search for the closing quote, which is significantly faster. On my machine, at least, the `globals.py` benchmark (which contains a lot of docstrings) gets 40% faster... ```text lexer/numpy/globals.py time: [3.6410 µs 3.6496 µs 3.6585 µs] thrpt: [806.53 MiB/s 808.49 MiB/s 810.41 MiB/s] change: time: [-40.413% -40.185% -39.984%] (p = 0.00 < 0.05) thrpt: [+66.623% +67.181% +67.822%] Performance has improved. Found 2 outliers among 100 measurements (2.00%) 2 (2.00%) high mild lexer/unicode/pypinyin.py time: [12.422 µs 12.445 µs 12.467 µs] thrpt: [337.03 MiB/s 337.65 MiB/s 338.27 MiB/s] change: time: [-9.4213% -9.1930% -8.9586%] (p = 0.00 < 0.05) thrpt: [+9.8401% +10.124% +10.401%] Performance has improved. Found 3 outliers among 100 measurements (3.00%) 1 (1.00%) high mild 2 (2.00%) high severe lexer/pydantic/types.py time: [107.45 µs 107.50 µs 107.56 µs] thrpt: [237.11 MiB/s 237.24 MiB/s 237.35 MiB/s] change: time: [-4.0108% -3.7005% -3.3787%] (p = 0.00 < 0.05) thrpt: [+3.4968% +3.8427% +4.1784%] Performance has improved. Found 7 outliers among 100 measurements (7.00%) 2 (2.00%) high mild 5 (5.00%) high severe lexer/numpy/ctypeslib.py time: [46.123 µs 46.165 µs 46.208 µs] thrpt: [360.36 MiB/s 360.69 MiB/s 361.01 MiB/s] change: time: [-19.313% -18.996% -18.710%] (p = 0.00 < 0.05) thrpt: [+23.016% +23.451% +23.935%] Performance has improved. 
Found 8 outliers among 100 measurements (8.00%) 3 (3.00%) low mild 1 (1.00%) high mild 4 (4.00%) high severe lexer/large/dataset.py time: [231.07 µs 231.19 µs 231.33 µs] thrpt: [175.87 MiB/s 175.97 MiB/s 176.06 MiB/s] change: time: [-2.0437% -1.7663% -1.4922%] (p = 0.00 < 0.05) thrpt: [+1.5148% +1.7981% +2.0864%] Performance has improved. Found 10 outliers among 100 measurements (10.00%) 5 (5.00%) high mild 5 (5.00%) high severe ``` --- crates/ruff_python_parser/src/lexer.rs | 130 +++++++++++++----- crates/ruff_python_parser/src/lexer/cursor.rs | 5 + 2 files changed, 100 insertions(+), 35 deletions(-) diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index 694d769b90..8d5a20b03a 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -690,48 +690,65 @@ impl<'source> Lexer<'source> { let value_start = self.offset(); - let value_end = loop { - match self.cursor.bump() { - Some('\\') => { - if self.cursor.eat_char('\r') { - self.cursor.eat_char('\n'); - } else { - self.cursor.bump(); - } - } - Some('\r' | '\n') if !triple_quoted => { + let quote_byte = u8::try_from(quote).expect("char that fits in u8"); + let value_end = if triple_quoted { + // For triple-quoted strings, scan until we find the closing quote (ignoring escaped + // quotes) or the end of the file. + loop { + let Some(index) = memchr::memchr(quote_byte, self.cursor.rest().as_bytes()) else { + self.cursor.skip_to_end(); + if let Some(fstring) = self.fstrings.current() { // When we are in an f-string, check whether the initial quote // matches with f-strings quotes and if it is, then this must be a // missing '}' token so raise the proper error. 
- if fstring.quote_char() == quote && !fstring.is_triple_quoted() { + if fstring.quote_char() == quote + && fstring.is_triple_quoted() == triple_quoted + { return Err(LexicalError { error: LexicalErrorType::FStringError( FStringErrorType::UnclosedLbrace, ), - location: self.offset() - TextSize::new(1), + location: self.cursor.text_len(), }); } } return Err(LexicalError { - error: LexicalErrorType::OtherError( - "EOL while scanning string literal".to_owned(), - ), - location: self.offset() - TextSize::new(1), + error: LexicalErrorType::Eof, + location: self.cursor.text_len(), }); - } - Some(c) if c == quote => { - if triple_quoted { - if self.cursor.eat_char2(quote, quote) { - break self.offset() - TextSize::new(3); - } - } else { - break self.offset() - TextSize::new(1); - } + }; + + // Rare case: if there are an odd number of backslashes before the quote, then + // the quote is escaped and we should continue scanning. + let num_backslashes = self.cursor.rest().as_bytes()[..index] + .iter() + .rev() + .take_while(|&&c| c == b'\\') + .count(); + + // Advance the cursor past the quote and continue scanning. + self.cursor.skip_bytes(index + 1); + + // If the character is escaped, continue scanning. + if num_backslashes % 2 == 1 { + continue; } - Some(_) => {} - None => { + // Otherwise, if it's followed by two more quotes, then we're done. + if self.cursor.eat_char2(quote, quote) { + break self.offset() - TextSize::new(3); + } + } + } else { + // For non-triple-quoted strings, scan until we find the closing quote, but end early + // if we encounter a newline or the end of the file. 
+ loop { + let Some(index) = + memchr::memchr3(quote_byte, b'\r', b'\n', self.cursor.rest().as_bytes()) + else { + self.cursor.skip_to_end(); + if let Some(fstring) = self.fstrings.current() { // When we are in an f-string, check whether the initial quote // matches with f-strings quotes and if it is, then this must be a @@ -748,23 +765,66 @@ impl<'source> Lexer<'source> { } } return Err(LexicalError { - error: if triple_quoted { - LexicalErrorType::Eof - } else { - LexicalErrorType::StringError - }, + error: LexicalErrorType::StringError, location: self.offset(), }); + }; + + // Rare case: if there is an odd number of backslashes before the quote or newline, + // then that character is escaped and we should continue scanning. + let num_backslashes = self.cursor.rest().as_bytes()[..index] + .iter() + .rev() + .take_while(|&&c| c == b'\\') + .count(); + + // Skip up to the current character. + self.cursor.skip_bytes(index); + let ch = self.cursor.bump(); + + // If the character is escaped, continue scanning. + if num_backslashes % 2 == 1 { + if ch == Some('\r') { + self.cursor.eat_char('\n'); + } + continue; + } + + match ch { + Some('\r' | '\n') => { + if let Some(fstring) = self.fstrings.current() { + // When we are in an f-string, check whether the initial quote + // matches with f-strings quotes and if it is, then this must be a + // missing '}' token so raise the proper error.
+ if fstring.quote_char() == quote && !fstring.is_triple_quoted() { + return Err(LexicalError { + error: LexicalErrorType::FStringError( + FStringErrorType::UnclosedLbrace, + ), + location: self.offset() - TextSize::new(1), + }); + } + } + return Err(LexicalError { + error: LexicalErrorType::OtherError( + "EOL while scanning string literal".to_owned(), + ), + location: self.offset() - TextSize::new(1), + }); + } + Some(ch) if ch == quote => { + break self.offset() - TextSize::new(1); + } + _ => unreachable!("memchr3 returned an index that is not a quote or a newline"), } } }; - let tok = Tok::String { + Ok(Tok::String { value: self.source[TextRange::new(value_start, value_end)].to_string(), kind, triple_quoted, - }; - Ok(tok) + }) } // This is the main entry point. Call this function to retrieve the next token. diff --git a/crates/ruff_python_parser/src/lexer/cursor.rs b/crates/ruff_python_parser/src/lexer/cursor.rs index 26f3bb8a5b..6dd8e63d70 100644 --- a/crates/ruff_python_parser/src/lexer/cursor.rs +++ b/crates/ruff_python_parser/src/lexer/cursor.rs @@ -145,4 +145,9 @@ impl<'a> Cursor<'a> { self.chars = self.chars.as_str()[count..].chars(); } + + /// Skips to the end of the input stream. + pub(super) fn skip_to_end(&mut self) { + self.chars = "".chars(); + } }