Skip over trivia tokens after re-lexing (#21895)

2025-12-11 11:45:18 +01:00 · 2025-12-11 11:45:18 +01:00 · aa27925e87
parent 5c320990f7
commit aa27925e87
9 changed files with 373 additions and 33 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -3349,6 +3349,7 @@ dependencies = [
 "compact_str",
 "get-size2",
 "insta",
+ "itertools 0.14.0",
 "memchr",
 "ruff_annotate_snippets",
 "ruff_python_ast",
--- a/crates/ruff_python_parser/Cargo.toml
+++ b/crates/ruff_python_parser/Cargo.toml
@ -35,6 +35,7 @@ ruff_source_file = { workspace = true }

 anyhow = { workspace = true }
 insta = { workspace = true, features = ["glob"] }
+itertools = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
 walkdir = { workspace = true }
--- a/crates/ruff_python_parser/resources/invalid/re_lexing/ty_1828.py
+++ b/crates/ruff_python_parser/resources/invalid/re_lexing/ty_1828.py
@ -0,0 +1,5 @@
+# Regression test for https://github.com/astral-sh/ty/issues/1828
+(c: int = 1,f"""{d=[
+def a(
+class A:
+    pass
--- a/crates/ruff_python_parser/src/token_source.rs
+++ b/crates/ruff_python_parser/src/token_source.rs
@ -67,26 +67,59 @@ impl<'src> TokenSource<'src> {
    ///
    /// [`re_lex_logical_token`]: Lexer::re_lex_logical_token
    pub(crate) fn re_lex_logical_token(&mut self) {
-        let mut non_logical_newline_start = None;
-        for token in self.tokens.iter().rev() {
+        let mut non_logical_newline = None;
+
+        #[cfg(debug_assertions)]
+        let last_non_trivia_end_before = {
+            self.tokens
+                .iter()
+                .rev()
+                .find(|tok| !tok.kind().is_trivia())
+                .map(ruff_text_size::Ranged::end)
+        };
+
+        for (index, token) in self.tokens.iter().enumerate().rev() {
            match token.kind() {
                TokenKind::NonLogicalNewline => {
-                    non_logical_newline_start = Some(token.start());
+                    non_logical_newline = Some((index, token.start()));
                }
                TokenKind::Comment => continue,
                _ => break,
            }
        }

-        if self.lexer.re_lex_logical_token(non_logical_newline_start) {
-            let current_start = self.current_range().start();
-            while self
-                .tokens
-                .last()
-                .is_some_and(|last| last.start() >= current_start)
-            {
-                self.tokens.pop();
-            }
+        if !self
+            .lexer
+            .re_lex_logical_token(non_logical_newline.map(|(_, start)| start))
+        {
+            return;
+        }
+
+        let non_logical_line_index = non_logical_newline
+            .expect(
+                "`re_lex_logical_token` should only return `true` if `non_logical_line` is `Some`",
+            )
+            .0;
+
+        // Trim the already bumped logical line token (and comments coming after it) as it might now have become a logical line token
+        self.tokens.truncate(non_logical_line_index);
+
+        #[cfg(debug_assertions)]
+        {
+            let last_non_trivia_end_now = {
+                self.tokens
+                    .iter()
+                    .rev()
+                    .find(|tok| !tok.kind().is_trivia())
+                    .map(ruff_text_size::Ranged::end)
+            };
+
+            assert_eq!(last_non_trivia_end_before, last_non_trivia_end_now);
+        }
+
+        // Ensure `current` is positioned at a non-trivia token.
+        if self.current_kind().is_trivia() {
+            self.bump(self.current_kind());
        }
    }

--- a/crates/ruff_python_parser/tests/fixtures.rs
+++ b/crates/ruff_python_parser/tests/fixtures.rs
@ -4,15 +4,16 @@ use std::fmt::{Formatter, Write};
 use std::fs;
 use std::path::Path;

+use itertools::Itertools;
 use ruff_annotate_snippets::{Level, Renderer, Snippet};
-use ruff_python_ast::token::Token;
+use ruff_python_ast::token::{Token, Tokens};
 use ruff_python_ast::visitor::Visitor;
 use ruff_python_ast::visitor::source_order::{SourceOrderVisitor, TraversalSignal, walk_module};
 use ruff_python_ast::{self as ast, AnyNodeRef, Mod, PythonVersion};
 use ruff_python_parser::semantic_errors::{
    SemanticSyntaxChecker, SemanticSyntaxContext, SemanticSyntaxError,
 };
-use ruff_python_parser::{Mode, ParseErrorType, ParseOptions, parse_unchecked};
+use ruff_python_parser::{Mode, ParseErrorType, ParseOptions, Parsed, parse_unchecked};
 use ruff_source_file::{LineIndex, OneIndexed, SourceCode};
 use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};

@ -81,7 +82,7 @@ fn test_valid_syntax(input_path: &Path) {
    }

    validate_tokens(parsed.tokens(), source.text_len(), input_path);
-    validate_ast(parsed.syntax(), source.text_len(), input_path);
+    validate_ast(&parsed, source.text_len(), input_path);

    let mut output = String::new();
    writeln!(&mut output, "## AST").unwrap();
@ -139,7 +140,7 @@ fn test_invalid_syntax(input_path: &Path) {
    let parsed = parse_unchecked(&source, options.clone());

    validate_tokens(parsed.tokens(), source.text_len(), input_path);
-    validate_ast(parsed.syntax(), source.text_len(), input_path);
+    validate_ast(&parsed, source.text_len(), input_path);

    let mut output = String::new();
    writeln!(&mut output, "## AST").unwrap();
@ -402,12 +403,16 @@ Tokens: {tokens:#?}
 /// * the range of the parent node fully encloses all its child nodes
 /// * the ranges are strictly increasing when traversing the nodes in pre-order.
 /// * all ranges are within the length of the source code.
-fn validate_ast(root: &Mod, source_len: TextSize, test_path: &Path) {
-    walk_module(&mut ValidateAstVisitor::new(source_len, test_path), root);
+fn validate_ast(parsed: &Parsed<Mod>, source_len: TextSize, test_path: &Path) {
+    walk_module(
+        &mut ValidateAstVisitor::new(parsed.tokens(), source_len, test_path),
+        parsed.syntax(),
+    );
 }

 #[derive(Debug)]
 struct ValidateAstVisitor<'a> {
+    tokens: std::iter::Peekable<std::slice::Iter<'a, Token>>,
    parents: Vec<AnyNodeRef<'a>>,
    previous: Option<AnyNodeRef<'a>>,
    source_length: TextSize,
@ -415,8 +420,9 @@ struct ValidateAstVisitor<'a> {
 }

 impl<'a> ValidateAstVisitor<'a> {
-    fn new(source_length: TextSize, test_path: &'a Path) -> Self {
+    fn new(tokens: &'a Tokens, source_length: TextSize, test_path: &'a Path) -> Self {
        Self {
+            tokens: tokens.iter().peekable(),
            parents: Vec::new(),
            previous: None,
            source_length,
@ -425,6 +431,47 @@ impl<'a> ValidateAstVisitor<'a> {
    }
 }

+impl ValidateAstVisitor<'_> {
+    /// Check that the node's start doesn't fall within a token.
+    /// Called in `enter_node` before visiting children.
+    fn assert_start_boundary(&mut self, node: AnyNodeRef<'_>) {
+        // Skip tokens that end at or before the node starts.
+        self.tokens
+            .peeking_take_while(|t| t.end() <= node.start())
+            .last();
+
+        if let Some(next) = self.tokens.peek() {
+            // At this point, next_token.end() > node.start()
+            assert!(
+                next.start() >= node.start(),
+                "{path}: The start of the node falls within a token.\nNode: {node:#?}\n\nToken: {next:#?}\n\nRoot: {root:#?}",
+                path = self.test_path.display(),
+                root = self.parents.first()
+            );
+        }
+    }
+
+    /// Check that the node's end doesn't fall within a token.
+    /// Called in `leave_node` after visiting children, so all tokens
+    /// within the node have been consumed.
+    fn assert_end_boundary(&mut self, node: AnyNodeRef<'_>) {
+        // Skip tokens that end at or before the node ends.
+        self.tokens
+            .peeking_take_while(|t| t.end() <= node.end())
+            .last();
+
+        if let Some(next) = self.tokens.peek() {
+            // At this point, `next_token.end() > node.end()`
+            assert!(
+                next.start() >= node.end(),
+                "{path}: The end of the node falls within a token.\nNode: {node:#?}\n\nToken: {next:#?}\n\nRoot: {root:#?}",
+                path = self.test_path.display(),
+                root = self.parents.first()
+            );
+        }
+    }
+}
+
 impl<'ast> SourceOrderVisitor<'ast> for ValidateAstVisitor<'ast> {
    fn enter_node(&mut self, node: AnyNodeRef<'ast>) -> TraversalSignal {
        assert!(
@ -452,12 +499,16 @@ impl<'ast> SourceOrderVisitor<'ast> for ValidateAstVisitor<'ast> {
            );
        }

+        self.assert_start_boundary(node);
+
        self.parents.push(node);

        TraversalSignal::Traverse
    }

    fn leave_node(&mut self, node: AnyNodeRef<'ast>) {
+        self.assert_end_boundary(node);
+
        self.parents.pop().expect("Expected tree to be balanced");

        self.previous = Some(node);
--- a/crates/ruff_python_parser/tests/snapshots/invalid_syntax@re_lex_logical_token.py.snap
+++ b/crates/ruff_python_parser/tests/snapshots/invalid_syntax@re_lex_logical_token.py.snap
@ -296,7 +296,7 @@ Module(
                    test: Call(
                        ExprCall {
                            node_index: NodeIndex(None),
-                            range: 456..472,
+                            range: 456..471,
                            func: Name(
                                ExprName {
                                    node_index: NodeIndex(None),
@ -306,7 +306,7 @@ Module(
                                },
                            ),
                            arguments: Arguments {
-                                range: 460..472,
+                                range: 460..471,
                                node_index: NodeIndex(None),
                                args: [
                                    Name(
@ -581,7 +581,7 @@ Module(
                    test: Call(
                        ExprCall {
                            node_index: NodeIndex(None),
-                            range: 890..906,
+                            range: 890..905,
                            func: Name(
                                ExprName {
                                    node_index: NodeIndex(None),
@ -591,7 +591,7 @@ Module(
                                },
                            ),
                            arguments: Arguments {
-                                range: 894..906,
+                                range: 894..905,
                                node_index: NodeIndex(None),
                                args: [
                                    FString(
@ -832,7 +832,16 @@ Module(
   |
 28 | # The lexer is nested with multiple levels of parentheses
 29 | if call(foo, [a, b
-   |                   ^ Syntax Error: Expected `]`, found NonLogicalNewline
+30 |     def bar():
+   |     ^^^ Syntax Error: Expected `]`, found `def`
+31 |         pass
+   |
+
+
+   |
+28 | # The lexer is nested with multiple levels of parentheses
+29 | if call(foo, [a, b
+   |                   ^ Syntax Error: Expected `)`, found newline
 30 |     def bar():
 31 |         pass
   |
@ -857,11 +866,10 @@ Module(


   |
-41 | # test is to make sure it emits a `NonLogicalNewline` token after `b`.
 42 | if call(foo, [a,
 43 |     b
-   |      ^ Syntax Error: Expected `]`, found NonLogicalNewline
 44 | )
+   | ^ Syntax Error: Expected `]`, found `)`
 45 |     def bar():
 46 |         pass
   |
@ -898,7 +906,7 @@ Module(
   |
 49 | # F-strings uses normal list parsing, so test those as well
 50 | if call(f"hello {x
-   |                   ^ Syntax Error: Expected FStringEnd, found NonLogicalNewline
+   |                   ^ Syntax Error: Expected `)`, found newline
 51 |     def bar():
 52 |         pass
   |
--- a/crates/ruff_python_parser/tests/snapshots/invalid_syntax@re_lex_logical_token_mac_eol.py.snap
+++ b/crates/ruff_python_parser/tests/snapshots/invalid_syntax@re_lex_logical_token_mac_eol.py.snap
@ -17,7 +17,7 @@ Module(
                    test: Call(
                        ExprCall {
                            node_index: NodeIndex(None),
-                            range: 3..19,
+                            range: 3..18,
                            func: Name(
                                ExprName {
                                    node_index: NodeIndex(None),
@ -27,7 +27,7 @@ Module(
                                },
                            ),
                            arguments: Arguments {
-                                range: 7..19,
+                                range: 7..18,
                                node_index: NodeIndex(None),
                                args: [
                                    Name(
@ -113,5 +113,11 @@ Module(

  |
 1 | if call(foo, [a, b
    def bar():
        pass
-  |                   ^ Syntax Error: Expected `]`, found NonLogicalNewline
+  |                       ^^^ Syntax Error: Expected `]`, found `def`
+  |
+
+
+  |
+1 | if call(foo, [a, b
    def bar():
        pass
+  |                   ^ Syntax Error: Expected `)`, found newline
  |
--- a/crates/ruff_python_parser/tests/snapshots/invalid_syntax@re_lex_logical_token_windows_eol.py.snap
+++ b/crates/ruff_python_parser/tests/snapshots/invalid_syntax@re_lex_logical_token_windows_eol.py.snap
@ -17,7 +17,7 @@ Module(
                    test: Call(
                        ExprCall {
                            node_index: NodeIndex(None),
-                            range: 3..20,
+                            range: 3..18,
                            func: Name(
                                ExprName {
                                    node_index: NodeIndex(None),
@ -27,7 +27,7 @@ Module(
                                },
                            ),
                            arguments: Arguments {
-                                range: 7..20,
+                                range: 7..18,
                                node_index: NodeIndex(None),
                                args: [
                                    Name(
@ -113,7 +113,15 @@ Module(

  |
 1 | if call(foo, [a, b
-  |                   ^ Syntax Error: Expected `]`, found NonLogicalNewline
+2 |     def bar():
+  |     ^^^ Syntax Error: Expected `]`, found `def`
+3 |         pass
+  |
+
+
+  |
+1 | if call(foo, [a, b
+  |                   ^ Syntax Error: Expected `)`, found newline
 2 |     def bar():
 3 |         pass
  |
--- a/crates/ruff_python_parser/tests/snapshots/invalid_syntax@re_lexing__ty_1828.py.snap
+++ b/crates/ruff_python_parser/tests/snapshots/invalid_syntax@re_lexing__ty_1828.py.snap
@ -0,0 +1,227 @@
+---
+source: crates/ruff_python_parser/tests/fixtures.rs
+input_file: crates/ruff_python_parser/resources/invalid/re_lexing/ty_1828.py
+---
+## AST
+
+```
+Module(
+    ModModule {
+        node_index: NodeIndex(None),
+        range: 0..112,
+        body: [
+            AnnAssign(
+                StmtAnnAssign {
+                    node_index: NodeIndex(None),
+                    range: 66..93,
+                    target: Name(
+                        ExprName {
+                            node_index: NodeIndex(None),
+                            range: 67..68,
+                            id: Name("c"),
+                            ctx: Store,
+                        },
+                    ),
+                    annotation: Name(
+                        ExprName {
+                            node_index: NodeIndex(None),
+                            range: 70..73,
+                            id: Name("int"),
+                            ctx: Load,
+                        },
+                    ),
+                    value: Some(
+                        Tuple(
+                            ExprTuple {
+                                node_index: NodeIndex(None),
+                                range: 76..93,
+                                elts: [
+                                    NumberLiteral(
+                                        ExprNumberLiteral {
+                                            node_index: NodeIndex(None),
+                                            range: 76..77,
+                                            value: Int(
+                                                1,
+                                            ),
+                                        },
+                                    ),
+                                    Subscript(
+                                        ExprSubscript {
+                                            node_index: NodeIndex(None),
+                                            range: 78..90,
+                                            value: FString(
+                                                ExprFString {
+                                                    node_index: NodeIndex(None),
+                                                    range: 78..85,
+                                                    value: FStringValue {
+                                                        inner: Single(
+                                                            FString(
+                                                                FString {
+                                                                    range: 78..85,
+                                                                    node_index: NodeIndex(None),
+                                                                    elements: [
+                                                                        Interpolation(
+                                                                            InterpolatedElement {
+                                                                                range: 82..85,
+                                                                                node_index: NodeIndex(None),
+                                                                                expression: Name(
+                                                                                    ExprName {
+                                                                                        node_index: NodeIndex(None),
+                                                                                        range: 83..84,
+                                                                                        id: Name("d"),
+                                                                                        ctx: Load,
+                                                                                    },
+                                                                                ),
+                                                                                debug_text: Some(
+                                                                                    DebugText {
+                                                                                        leading: "",
+                                                                                        trailing: "=",
+                                                                                    },
+                                                                                ),
+                                                                                conversion: None,
+                                                                                format_spec: None,
+                                                                            },
+                                                                        ),
+                                                                    ],
+                                                                    flags: FStringFlags {
+                                                                        quote_style: Double,
+                                                                        prefix: Regular,
+                                                                        triple_quoted: true,
+                                                                        unclosed: true,
+                                                                    },
+                                                                },
+                                                            ),
+                                                        ),
+                                                    },
+                                                },
+                                            ),
+                                            slice: Slice(
+                                                ExprSlice {
+                                                    node_index: NodeIndex(None),
+                                                    range: 87..90,
+                                                    lower: None,
+                                                    upper: Some(
+                                                        Name(
+                                                            ExprName {
+                                                                node_index: NodeIndex(None),
+                                                                range: 87..90,
+                                                                id: Name("def"),
+                                                                ctx: Load,
+                                                            },
+                                                        ),
+                                                    ),
+                                                    step: None,
+                                                },
+                                            ),
+                                            ctx: Load,
+                                        },
+                                    ),
+                                    Call(
+                                        ExprCall {
+                                            node_index: NodeIndex(None),
+                                            range: 91..93,
+                                            func: Name(
+                                                ExprName {
+                                                    node_index: NodeIndex(None),
+                                                    range: 91..92,
+                                                    id: Name("a"),
+                                                    ctx: Load,
+                                                },
+                                            ),
+                                            arguments: Arguments {
+                                                range: 92..93,
+                                                node_index: NodeIndex(None),
+                                                args: [],
+                                                keywords: [],
+                                            },
+                                        },
+                                    ),
+                                ],
+                                ctx: Load,
+                                parenthesized: false,
+                            },
+                        ),
+                    ),
+                    simple: false,
+                },
+            ),
+        ],
+    },
+)
+```
+## Errors
+
+  |
+1 | # Regression test for https://github.com/astral-sh/ty/issues/1828
+2 | (c: int = 1,f"""{d=[
+  |   ^ Syntax Error: Expected `)`, found `:`
+3 | def a(
+4 | class A:
+  |
+
+
+  |
+1 | # Regression test for https://github.com/astral-sh/ty/issues/1828
+2 | (c: int = 1,f"""{d=[
+  |                    ^ Syntax Error: f-string: expecting `}`
+3 | def a(
+4 | class A:
+  |
+
+
+  |
+1 | # Regression test for https://github.com/astral-sh/ty/issues/1828
+2 | (c: int = 1,f"""{d=[
+3 | def a(
+  | ^^^ Syntax Error: Expected `:`, found `def`
+4 | class A:
+5 |     pass
+  |
+
+
+  |
+1 | # Regression test for https://github.com/astral-sh/ty/issues/1828
+2 | (c: int = 1,f"""{d=[
+3 | def a(
+  |     ^ Syntax Error: Expected `]`, found name
+4 | class A:
+5 |     pass
+  |
+
+
+  |
+1 |   # Regression test for https://github.com/astral-sh/ty/issues/1828
+2 |   (c: int = 1,f"""{d=[
+3 |   def a(
+  |  _______^
+4 | | class A:
+5 | |     pass
+  | |_________^ Syntax Error: f-string: unterminated triple-quoted string
+  |
+
+
+  |
+2 | (c: int = 1,f"""{d=[
+3 | def a(
+4 | class A:
+  | ^^^^^ Syntax Error: Expected `)`, found `class`
+5 |     pass
+  |
+
+
+  |
+1 |   # Regression test for https://github.com/astral-sh/ty/issues/1828
+2 |   (c: int = 1,f"""{d=[
+3 |   def a(
+  |  _______^
+4 | | class A:
+5 | |     pass
+  | |_________^ Syntax Error: Expected a statement
+  |
+
+
+  |
+4 | class A:
+5 |     pass
+  |         ^ Syntax Error: unexpected EOF while parsing
+  |