From e137c824c34cdebe328c8f67e1c2e2d39d3514c3 Mon Sep 17 00:00:00 2001 From: Dhruv Manilawala Date: Thu, 27 Jun 2024 17:02:48 +0530 Subject: [PATCH] Avoid consuming newline for unterminated string (#12067) ## Summary This PR fixes the lexer logic to **not** consume the newline character for an unterminated string literal. Currently, the lexer consumes the newline as part of the string itself, which is bad for error recovery because the lexer then never emits the newline token. This PR avoids consuming the newline character in that case. This was discovered during https://github.com/astral-sh/ruff/pull/12060. ## Test Plan Update the snapshots and validate them. --- crates/ruff_python_parser/src/lexer.rs | 19 ++++++++++++------- ...y_concatenated_unterminated_string.py.snap | 9 +++++++++ ...x@re_lexing__fstring_format_spec_1.py.snap | 2 +- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index 0640bd8349..8228da57a2 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -962,25 +962,30 @@ impl<'src> Lexer<'src> { // Skip up to the current character. self.cursor.skip_bytes(index); - let ch = self.cursor.bump(); + + // Lookahead because we want to bump only if it's a quote or being escaped. + let quote_or_newline = self.cursor.first(); // If the character is escaped, continue scanning. 
if num_backslashes % 2 == 1 { - if ch == Some('\r') { + self.cursor.bump(); + if quote_or_newline == '\r' { self.cursor.eat_char('\n'); } continue; } - match ch { - Some(newline @ ('\r' | '\n')) => { + match quote_or_newline { + '\r' | '\n' => { return self.push_error(LexicalError::new( LexicalErrorType::UnclosedStringError, - self.token_range().sub_end(newline.text_len()), + self.token_range(), )); } - Some(ch) if ch == quote => { - break self.offset() - TextSize::new(1); + ch if ch == quote => { + let value_end = self.offset(); + self.cursor.bump(); + break value_end; } _ => unreachable!("memchr2 returned an index that is not a quote or a newline"), } diff --git a/crates/ruff_python_parser/tests/snapshots/invalid_syntax@implicitly_concatenated_unterminated_string.py.snap b/crates/ruff_python_parser/tests/snapshots/invalid_syntax@implicitly_concatenated_unterminated_string.py.snap index 67ef835321..7eada587cd 100644 --- a/crates/ruff_python_parser/tests/snapshots/invalid_syntax@implicitly_concatenated_unterminated_string.py.snap +++ b/crates/ruff_python_parser/tests/snapshots/invalid_syntax@implicitly_concatenated_unterminated_string.py.snap @@ -160,6 +160,15 @@ Module( | + | +1 | 'hello' 'world + | ^ Syntax Error: Expected a statement +2 | 1 + 1 +3 | 'hello' f'world {x} +4 | 2 + 2 + | + + | 1 | 'hello' 'world 2 | 1 + 1 diff --git a/crates/ruff_python_parser/tests/snapshots/invalid_syntax@re_lexing__fstring_format_spec_1.py.snap b/crates/ruff_python_parser/tests/snapshots/invalid_syntax@re_lexing__fstring_format_spec_1.py.snap index 7251180b0c..a4c68ae7d3 100644 --- a/crates/ruff_python_parser/tests/snapshots/invalid_syntax@re_lexing__fstring_format_spec_1.py.snap +++ b/crates/ruff_python_parser/tests/snapshots/invalid_syntax@re_lexing__fstring_format_spec_1.py.snap @@ -335,8 +335,8 @@ Module( | 5 | f'middle {'string':\ 6 | 'format spec'} + | ^ Syntax Error: Expected a statement 7 | - | ^ Syntax Error: Expected a statement 8 | f'middle {'string':\\ 9 | 'format 
spec'} |