From cdc7c7144997ca9aa99a6bfae9dd73b4fa53de49 Mon Sep 17 00:00:00 2001
From: Dhruv Manilawala <dhruvmanila@gmail.com>
Date: Wed, 19 Jun 2024 12:14:18 +0530
Subject: [PATCH] Avoid consuming trailing whitespace during re-lexing (#11933)

## Summary

This PR updates the re-lexing logic to avoid consuming the trailing
whitespace and move the lexer explicitly to the last newline character
encountered while moving backwards.

Consider the following code snippet as taken from the test case
highlighted with whitespace (`.`) and newline (`\n`) characters:
```py
# There are trailing whitespace before the newline character but those whitespaces are
# part of the comment token
f"""hello {x # comment....\n
#                     ^
y = 1\n
```

The parser is at `y` when it's trying to recover from an unclosed `{`,
so it calls into the re-lexing logic which tries to move the lexer back
to the end of the previous line. But, as it consumed all whitespaces it
moved the lexer to the location marked by `^` in the above code snippet.
But, those whitespaces are part of the comment token. This means that
the range for the two tokens were overlapping which introduced the
panic.

Note that this is only a bug when there's a comment with a trailing
whitespace otherwise it's fine to move the lexer to the whitespace
character. This is because the lexer would just skip the whitespace
otherwise. Nevertheless, this PR updates the logic to move it explicitly
to the newline character in all cases.

fixes: #11929

## Test Plan

Add test cases and update the snapshot. Make sure that it doesn't panic
on the code snippet in the linked issue.
---
 .../resources/invalid/re_lex_logical_token.py |  8 +-
 crates/ruff_python_parser/src/lexer.rs        | 15 ++-
 ...nvalid_syntax@re_lex_logical_token.py.snap | 94 ++++++++++++++++++-
 3 files changed, 103 insertions(+), 14 deletions(-)

diff --git a/crates/ruff_python_parser/resources/invalid/re_lex_logical_token.py b/crates/ruff_python_parser/resources/invalid/re_lex_logical_token.py
index 86e2b1435b..9a2b9f4fb3 100644
--- a/crates/ruff_python_parser/resources/invalid/re_lex_logical_token.py
+++ b/crates/ruff_python_parser/resources/invalid/re_lex_logical_token.py
@@ -54,4 +54,10 @@ if call(f"hello {x
 
 if call(f"hello
     def bar():
-        pass
\ No newline at end of file
+        pass
+
+
+# There are trailing whitespace before the newline character but those whitespaces are
+# part of the comment token
+f"""hello {x # comment    
+y = 1
\ No newline at end of file
diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs
index 0decf4cb80..c4a44b106d 100644
--- a/crates/ruff_python_parser/src/lexer.rs
+++ b/crates/ruff_python_parser/src/lexer.rs
@@ -1370,17 +1370,16 @@ impl<'src> Lexer<'src> {
         // i.e., it recovered from an unclosed parenthesis (`(`, `[`, or `{`).
         self.nesting -= 1;
 
-        let current_position = self.current_range().start();
+        let mut current_position = self.current_range().start();
         let reverse_chars = self.source[..current_position.to_usize()].chars().rev();
-        let mut new_position = current_position;
-        let mut has_newline = false;
+        let mut newline_position = None;
 
         for ch in reverse_chars {
             if is_python_whitespace(ch) {
-                new_position -= ch.text_len();
+                current_position -= ch.text_len();
             } else if matches!(ch, '\n' | '\r') {
-                has_newline |= true;
-                new_position -= ch.text_len();
+                current_position -= ch.text_len();
+                newline_position = Some(current_position);
             } else {
                 break;
             }
@@ -1388,7 +1387,7 @@ impl<'src> Lexer<'src> {
 
         // The lexer should only be moved if there's a newline character which needs to be
         // re-lexed.
-        if new_position != current_position && has_newline {
+        if let Some(newline_position) = newline_position {
             // Earlier we reduced the nesting level unconditionally. Now that we know the lexer's
             // position is going to be moved back, the lexer needs to be put back into a
             // parenthesized context if the current token is a closing parenthesis.
@@ -1410,7 +1409,7 @@ impl<'src> Lexer<'src> {
             }
 
             self.cursor = Cursor::new(self.source);
-            self.cursor.skip_bytes(new_position.to_usize());
+            self.cursor.skip_bytes(newline_position.to_usize());
             self.state = State::Other;
             self.next_token();
             true
diff --git a/crates/ruff_python_parser/tests/snapshots/invalid_syntax@re_lex_logical_token.py.snap b/crates/ruff_python_parser/tests/snapshots/invalid_syntax@re_lex_logical_token.py.snap
index 2a3c1866e7..7604d4bede 100644
--- a/crates/ruff_python_parser/tests/snapshots/invalid_syntax@re_lex_logical_token.py.snap
+++ b/crates/ruff_python_parser/tests/snapshots/invalid_syntax@re_lex_logical_token.py.snap
@@ -7,7 +7,7 @@ input_file: crates/ruff_python_parser/resources/invalid/re_lex_logical_token.py
 ```
 Module(
     ModModule {
-        range: 0..979,
+        range: 0..1129,
         body: [
             If(
                 StmtIf {
@@ -670,6 +670,53 @@ Module(
                     ],
                 },
             ),
+            Expr(
+                StmtExpr {
+                    range: 1097..1109,
+                    value: FString(
+                        ExprFString {
+                            range: 1097..1109,
+                            value: FStringValue {
+                                inner: Single(
+                                    FString(
+                                        FString {
+                                            range: 1097..1109,
+                                            elements: [
+                                                Literal(
+                                                    FStringLiteralElement {
+                                                        range: 1101..1107,
+                                                        value: "hello ",
+                                                    },
+                                                ),
+                                                Expression(
+                                                    FStringExpressionElement {
+                                                        range: 1107..1109,
+                                                        expression: Name(
+                                                            ExprName {
+                                                                range: 1108..1109,
+                                                                id: "x",
+                                                                ctx: Load,
+                                                            },
+                                                        ),
+                                                        debug_text: None,
+                                                        conversion: None,
+                                                        format_spec: None,
+                                                    },
+                                                ),
+                                            ],
+                                            flags: FStringFlags {
+                                                quote_style: Double,
+                                                prefix: Regular,
+                                                triple_quoted: true,
+                                            },
+                                        },
+                                    ),
+                                ),
+                            },
+                        },
+                    ),
+                },
+            ),
         ],
     },
 )
@@ -831,8 +878,45 @@ Module(
 
 
    |
-55 | if call(f"hello
-56 |     def bar():
-57 |         pass
-   |              Syntax Error: Expected a statement
+60 | # There are trailing whitespace before the newline character but those whitespaces are
+61 | # part of the comment token
+62 | f"""hello {x # comment    
+   |  Syntax Error: Expected a statement
+63 | y = 1
+   |
+
+
+   |
+60 |   # There are trailing whitespace before the newline character but those whitespaces are
+61 |   # part of the comment token
+62 |   f"""hello {x # comment    
+   |  ___________________________^
+63 | | y = 1
+   | |_____^ Syntax Error: f-string: unterminated triple-quoted string
+   |
+
+
+   |
+61 | # part of the comment token
+62 | f"""hello {x # comment    
+63 | y = 1
+   | ^ Syntax Error: f-string: expecting '}'
+   |
+
+
+   |
+60 |   # There are trailing whitespace before the newline character but those whitespaces are
+61 |   # part of the comment token
+62 |   f"""hello {x # comment    
+   |  ___________________________^
+63 | | y = 1
+   | |_____^ Syntax Error: Expected FStringEnd, found Unknown
+   |
+
+
+   |
+61 | # part of the comment token
+62 | f"""hello {x # comment    
+63 | y = 1
+   |       Syntax Error: Expected a statement
    |