Simple lexer for formatter (#4922)

2023-06-08 17:37:39 +02:00 · 2023-06-08 17:37:39 +02:00 · 9c3fb23ace
parent 467df23e65
commit 9c3fb23ace
11 changed files with 1152 additions and 189 deletions
--- a/crates/ruff_python_formatter/src/comments/placement.rs
+++ b/crates/ruff_python_formatter/src/comments/placement.rs
@ -1,11 +1,11 @@
 use crate::comments::visitor::{CommentPlacement, DecoratedComment};
 use crate::comments::CommentTextPosition;
-use crate::trivia::find_first_non_trivia_character_in_range;
+use crate::trivia::{SimpleTokenizer, TokenKind};
 use ruff_newlines::StrExt;
 use ruff_python_ast::node::AnyNodeRef;
 use ruff_python_ast::source_code::Locator;
 use ruff_python_ast::whitespace;
-use ruff_text_size::{TextLen, TextRange, TextSize};
+use ruff_text_size::{TextRange, TextSize};
 use rustpython_parser::ast::Ranged;
 use std::cmp::Ordering;
@ -521,14 +521,16 @@ fn handle_trailing_end_of_line_condition_comment<'a>(
    // If the preceding is the node before the `colon`
    // `while true:` The node before the `colon` is the `true` constant.
    if preceding.ptr_eq(last_before_colon) {
-        let mut start = preceding.end();
+        let tokens = SimpleTokenizer::new(
        while let Some((offset, c)) = find_first_non_trivia_character_in_range(
            TextRange::new(start, following.start()),
            locator.contents(),
-        ) {
+            TextRange::new(preceding.end(), following.start()),
-            match c {
+        )
-                ':' => {
+        .skip_trivia();
-                    if comment.slice().start() > offset {
+
        for token in tokens {
            match token.kind() {
                TokenKind::Colon => {
                    if comment.slice().start() > token.start() {
                        // Comment comes after the colon
                        // ```python
                        // while a: # comment
@ -546,9 +548,8 @@ fn handle_trailing_end_of_line_condition_comment<'a>(
                    // ```
                    break;
                }
-                ')' => {
+                TokenKind::RParen => {
                    // Skip over any closing parentheses
                    start = offset + ')'.text_len();
                }
                _ => {
                    unreachable!("Only ')' or ':' should follow the condition")
@ -652,21 +653,17 @@ fn handle_trailing_binary_expression_left_or_operator_comment<'a>(
        return CommentPlacement::Default(comment);
    }
-    let mut between_operands_range = TextRange::new(
+    let between_operands_range = TextRange::new(
        binary_expression.left.end(),
        binary_expression.right.start(),
    );
-    let operator_offset = loop {
+    let mut tokens = SimpleTokenizer::new(locator.contents(), between_operands_range).skip_trivia();
-        match find_first_non_trivia_character_in_range(between_operands_range, locator.contents()) {
+    let operator_offset = if let Some(non_r_paren) = tokens.find(|t| t.kind() != TokenKind::RParen)
-            // Skip over closing parens
+    {
-            Some((offset, ')')) => {
+        non_r_paren.start()
-                between_operands_range =
+    } else {
-                    TextRange::new(offset + TextSize::new(1), between_operands_range.end());
+        return CommentPlacement::Default(comment);
            }
            Some((offset, _)) => break offset,
            None => return CommentPlacement::Default(comment),
        }
    };
    let comment_range = comment.slice().range();
@ -805,29 +802,22 @@ fn find_pos_only_slash_offset(
    between_arguments_range: TextRange,
    locator: &Locator,
 ) -> Option<TextSize> {
-    // First find the comma separating the two arguments
+    let mut tokens =
-    find_first_non_trivia_character_in_range(between_arguments_range, locator.contents()).and_then(
+        SimpleTokenizer::new(locator.contents(), between_arguments_range).skip_trivia();
        |(comma_offset, comma)| {
            debug_assert_eq!(comma, ',');
-            // Then find the position of the `/` operator
+    if let Some(comma) = tokens.next() {
-            find_first_non_trivia_character_in_range(
+        debug_assert_eq!(comma.kind(), TokenKind::Comma);
-                TextRange::new(
+
-                    comma_offset + TextSize::new(1),
+        if let Some(maybe_slash) = tokens.next() {
-                    between_arguments_range.end(),
+            if maybe_slash.kind() == TokenKind::Slash {
-                ),
+                return Some(maybe_slash.start());
                locator.contents(),
            )
            .and_then(|(offset, c)| {
                if c == '/' {
                    Some(offset)
                } else {
                    debug_assert_eq!(c, ')');
                    None
            }
-            })
+
-        },
+            debug_assert_eq!(maybe_slash.kind(), TokenKind::RParen);
-    )
+        }
    }
    None
 }
 /// Returns `true` if `right` is `Some` and `left` and `right` are referentially equal.
--- a/crates/ruff_python_formatter/src/expression/parentheses.rs
+++ b/crates/ruff_python_formatter/src/expression/parentheses.rs
@ -1,7 +1,6 @@
-use crate::trivia::{
+use crate::trivia::{first_non_trivia_token, first_non_trivia_token_rev, Token, TokenKind};
    find_first_non_trivia_character_after, find_first_non_trivia_character_before,
 };
 use ruff_python_ast::node::AnyNodeRef;
 use rustpython_parser::ast::Ranged;
 pub(crate) trait NeedsParentheses {
    fn needs_parentheses(&self, parenthesize: Parenthesize, source: &str) -> Parentheses;
@ -73,21 +72,17 @@ pub enum Parentheses {
 }
 fn is_expression_parenthesized(expr: AnyNodeRef, contents: &str) -> bool {
    use rustpython_parser::ast::Ranged;
    debug_assert!(
        expr.is_expression(),
        "Should only be called for expressions"
    );
    // Search backwards to avoid ambiguity with `(a, )` and because it's faster
    matches!(
-        find_first_non_trivia_character_after(expr.end(), contents),
+        first_non_trivia_token(expr.end(), contents),
-        Some((_, ')'))
+        Some(Token {
-    )
+            kind: TokenKind::RParen,
-        // Search forwards to confirm that this is not a nested expression `(5 + d * 3)`
+            ..
-        && matches!(
+        })
-        find_first_non_trivia_character_before(expr.start(), contents),
+    ) && matches!(
-        Some((_, '('))
+        first_non_trivia_token_rev(expr.start(), contents),
        Some(Token {
            kind: TokenKind::LParen,
            ..
        })
    )
 }
--- a/crates/ruff_python_formatter/src/snapshots/ruff_python_formattertriviatests__Reverse.snap
+++ b/crates/ruff_python_formatter/src/snapshots/ruff_python_formattertriviatests__Reverse.snap
@ -0,0 +1,218 @@
 ---
 source: crates/ruff_python_formatter/src/trivia.rs
 expression: test_case.tokenize_reverse()
 ---
 [
    Token {
        kind: RParen,
        range: 52..53,
    },
    Token {
        kind: Other,
        range: 51..52,
    },
    Token {
        kind: Bogus,
        range: 50..51,
    },
    Token {
        kind: Bogus,
        range: 49..50,
    },
    Token {
        kind: Bogus,
        range: 48..49,
    },
    Token {
        kind: Bogus,
        range: 47..48,
    },
    Token {
        kind: Bogus,
        range: 46..47,
    },
    Token {
        kind: Bogus,
        range: 45..46,
    },
    Token {
        kind: Bogus,
        range: 44..45,
    },
    Token {
        kind: Bogus,
        range: 43..44,
    },
    Token {
        kind: Bogus,
        range: 42..43,
    },
    Token {
        kind: Bogus,
        range: 41..42,
    },
    Token {
        kind: Bogus,
        range: 40..41,
    },
    Token {
        kind: Bogus,
        range: 39..40,
    },
    Token {
        kind: Bogus,
        range: 38..39,
    },
    Token {
        kind: Bogus,
        range: 37..38,
    },
    Token {
        kind: Bogus,
        range: 36..37,
    },
    Token {
        kind: Bogus,
        range: 35..36,
    },
    Token {
        kind: Bogus,
        range: 34..35,
    },
    Token {
        kind: Bogus,
        range: 33..34,
    },
    Token {
        kind: Bogus,
        range: 32..33,
    },
    Token {
        kind: Bogus,
        range: 31..32,
    },
    Token {
        kind: Bogus,
        range: 30..31,
    },
    Token {
        kind: Bogus,
        range: 29..30,
    },
    Token {
        kind: Bogus,
        range: 28..29,
    },
    Token {
        kind: Bogus,
        range: 27..28,
    },
    Token {
        kind: Bogus,
        range: 26..27,
    },
    Token {
        kind: Bogus,
        range: 25..26,
    },
    Token {
        kind: Bogus,
        range: 24..25,
    },
    Token {
        kind: Bogus,
        range: 23..24,
    },
    Token {
        kind: Bogus,
        range: 22..23,
    },
    Token {
        kind: Bogus,
        range: 21..22,
    },
    Token {
        kind: Bogus,
        range: 20..21,
    },
    Token {
        kind: Bogus,
        range: 19..20,
    },
    Token {
        kind: Bogus,
        range: 18..19,
    },
    Token {
        kind: Bogus,
        range: 17..18,
    },
    Token {
        kind: Bogus,
        range: 16..17,
    },
    Token {
        kind: Bogus,
        range: 15..16,
    },
    Token {
        kind: Bogus,
        range: 14..15,
    },
    Token {
        kind: Bogus,
        range: 13..14,
    },
    Token {
        kind: Bogus,
        range: 12..13,
    },
    Token {
        kind: Bogus,
        range: 11..12,
    },
    Token {
        kind: Bogus,
        range: 10..11,
    },
    Token {
        kind: Bogus,
        range: 9..10,
    },
    Token {
        kind: Bogus,
        range: 8..9,
    },
    Token {
        kind: Bogus,
        range: 7..8,
    },
    Token {
        kind: Bogus,
        range: 6..7,
    },
    Token {
        kind: Bogus,
        range: 5..6,
    },
    Token {
        kind: Bogus,
        range: 4..5,
    },
    Token {
        kind: Bogus,
        range: 3..4,
    },
    Token {
        kind: Bogus,
        range: 2..3,
    },
    Token {
        kind: Bogus,
        range: 1..2,
    },
    Token {
        kind: Bogus,
        range: 0..1,
    },
 ]
--- a/crates/ruff_python_formatter/src/snapshots/ruff_python_formattertriviatests__tokenize_bogus.snap
+++ b/crates/ruff_python_formatter/src/snapshots/ruff_python_formattertriviatests__tokenize_bogus.snap
@ -0,0 +1,126 @@
 ---
 source: crates/ruff_python_formatter/src/trivia.rs
 expression: test_case.tokens()
 ---
 [
    Token {
        kind: Comment,
        range: 0..17,
    },
    Token {
        kind: Newline,
        range: 17..18,
    },
    Token {
        kind: Whitespace,
        range: 18..26,
    },
    Token {
        kind: Other,
        range: 26..27,
    },
    Token {
        kind: Bogus,
        range: 27..28,
    },
    Token {
        kind: Bogus,
        range: 28..29,
    },
    Token {
        kind: Bogus,
        range: 29..30,
    },
    Token {
        kind: Bogus,
        range: 30..31,
    },
    Token {
        kind: Bogus,
        range: 31..32,
    },
    Token {
        kind: Bogus,
        range: 32..33,
    },
    Token {
        kind: Bogus,
        range: 33..34,
    },
    Token {
        kind: Bogus,
        range: 34..35,
    },
    Token {
        kind: Bogus,
        range: 35..36,
    },
    Token {
        kind: Bogus,
        range: 36..37,
    },
    Token {
        kind: Bogus,
        range: 37..38,
    },
    Token {
        kind: Bogus,
        range: 38..39,
    },
    Token {
        kind: Bogus,
        range: 39..40,
    },
    Token {
        kind: Bogus,
        range: 40..41,
    },
    Token {
        kind: Bogus,
        range: 41..42,
    },
    Token {
        kind: Bogus,
        range: 42..43,
    },
    Token {
        kind: Bogus,
        range: 43..44,
    },
    Token {
        kind: Bogus,
        range: 44..45,
    },
    Token {
        kind: Bogus,
        range: 45..46,
    },
    Token {
        kind: Bogus,
        range: 46..47,
    },
    Token {
        kind: Bogus,
        range: 47..48,
    },
    Token {
        kind: Bogus,
        range: 48..49,
    },
    Token {
        kind: Bogus,
        range: 49..50,
    },
    Token {
        kind: Bogus,
        range: 50..51,
    },
    Token {
        kind: Bogus,
        range: 51..52,
    },
    Token {
        kind: Bogus,
        range: 52..53,
    },
 ]
--- a/crates/ruff_python_formatter/src/snapshots/ruff_python_formattertriviatests__tokenize_comma.snap
+++ b/crates/ruff_python_formatter/src/snapshots/ruff_python_formattertriviatests__tokenize_comma.snap
@ -0,0 +1,22 @@
 ---
 source: crates/ruff_python_formatter/src/trivia.rs
 expression: tokens
 ---
 [
    Token {
        kind: Comma,
        range: 0..1,
    },
    Token {
        kind: Comma,
        range: 1..2,
    },
    Token {
        kind: Comma,
        range: 2..3,
    },
    Token {
        kind: Comma,
        range: 3..4,
    },
 ]
--- a/crates/ruff_python_formatter/src/snapshots/ruff_python_formattertriviatests__tokenize_continuation.snap
+++ b/crates/ruff_python_formatter/src/snapshots/ruff_python_formattertriviatests__tokenize_continuation.snap
@ -0,0 +1,30 @@
 ---
 source: crates/ruff_python_formatter/src/trivia.rs
 expression: tokens
 ---
 [
    Token {
        kind: LParen,
        range: 0..1,
    },
    Token {
        kind: Whitespace,
        range: 1..2,
    },
    Token {
        kind: Continuation,
        range: 2..3,
    },
    Token {
        kind: Newline,
        range: 3..4,
    },
    Token {
        kind: Whitespace,
        range: 4..5,
    },
    Token {
        kind: RParen,
        range: 5..6,
    },
 ]
--- a/crates/ruff_python_formatter/src/snapshots/ruff_python_formattertriviatests__tokenize_parentheses.snap
+++ b/crates/ruff_python_formatter/src/snapshots/ruff_python_formattertriviatests__tokenize_parentheses.snap
@ -0,0 +1,30 @@
 ---
 source: crates/ruff_python_formatter/src/trivia.rs
 expression: tokens
 ---
 [
    Token {
        kind: LParen,
        range: 0..1,
    },
    Token {
        kind: LBracket,
        range: 1..2,
    },
    Token {
        kind: LBrace,
        range: 2..3,
    },
    Token {
        kind: RBrace,
        range: 3..4,
    },
    Token {
        kind: RBracket,
        range: 4..5,
    },
    Token {
        kind: RParen,
        range: 5..6,
    },
 ]
--- a/crates/ruff_python_formatter/src/snapshots/ruff_python_formattertriviatests__tokenize_slash.snap
+++ b/crates/ruff_python_formatter/src/snapshots/ruff_python_formattertriviatests__tokenize_slash.snap
@ -0,0 +1,42 @@
 ---
 source: crates/ruff_python_formatter/src/trivia.rs
 expression: test_case.tokens()
 ---
 [
    Token {
        kind: Whitespace,
        range: 0..1,
    },
    Token {
        kind: Comment,
        range: 1..30,
    },
    Token {
        kind: Newline,
        range: 30..31,
    },
    Token {
        kind: Whitespace,
        range: 31..39,
    },
    Token {
        kind: Comment,
        range: 39..77,
    },
    Token {
        kind: Newline,
        range: 77..78,
    },
    Token {
        kind: Whitespace,
        range: 78..86,
    },
    Token {
        kind: Comma,
        range: 86..87,
    },
    Token {
        kind: Slash,
        range: 87..88,
    },
 ]
--- a/crates/ruff_python_formatter/src/snapshots/ruff_python_formattertriviatests__tokenize_substring.snap
+++ b/crates/ruff_python_formatter/src/snapshots/ruff_python_formattertriviatests__tokenize_substring.snap
@ -0,0 +1,18 @@
 ---
 source: crates/ruff_python_formatter/src/trivia.rs
 expression: tokens
 ---
 [
    Token {
        kind: RParen,
        range: 14..15,
    },
    Token {
        kind: Whitespace,
        range: 15..16,
    },
    Token {
        kind: Comment,
        range: 16..25,
    },
 ]
--- a/crates/ruff_python_formatter/src/snapshots/ruff_python_formattertriviatests__tokenize_trivia.snap
+++ b/crates/ruff_python_formatter/src/snapshots/ruff_python_formattertriviatests__tokenize_trivia.snap
@ -0,0 +1,22 @@
 ---
 source: crates/ruff_python_formatter/src/trivia.rs
 expression: tokens
 ---
 [
    Token {
        kind: Comment,
        range: 0..9,
    },
    Token {
        kind: Newline,
        range: 9..10,
    },
    Token {
        kind: Whitespace,
        range: 10..14,
    },
    Token {
        kind: Comment,
        range: 14..23,
    },
 ]
--- a/crates/ruff_python_formatter/src/trivia.rs
+++ b/crates/ruff_python_formatter/src/trivia.rs
@ -1,5 +1,6 @@
 use ruff_python_ast::whitespace::is_python_whitespace;
 use ruff_text_size::{TextLen, TextRange, TextSize};
 use std::str::Chars;
 /// Searches for the first non-trivia character in `range`.
 ///
@ -9,113 +10,40 @@ use ruff_text_size::{TextLen, TextRange, TextSize};
 /// of the character, the second item the non-trivia character.
 ///
 /// Returns `None` if the range is empty or only contains trivia (whitespace or comments).
-pub(crate) fn find_first_non_trivia_character_in_range(
+pub(crate) fn first_non_trivia_token(offset: TextSize, code: &str) -> Option<Token> {
-    range: TextRange,
+    SimpleTokenizer::starts_at(offset, code)
-    code: &str,
+        .skip_trivia()
-) -> Option<(TextSize, char)> {
+        .next()
    let rest = &code[range];
    let mut char_iter = rest.chars();
    while let Some(c) = char_iter.next() {
        match c {
            '#' => {
                // We're now inside of a comment. Skip all content until the end of the line
                for c in char_iter.by_ref() {
                    if matches!(c, '\n' | '\r') {
                        break;
                    }
                }
            }
            c => {
                if !is_python_whitespace(c) {
                    let index = range.start() + rest.text_len()
                        - char_iter.as_str().text_len()
                        - c.text_len();
                    return Some((index, c));
                }
            }
        }
 }
-    None
+/// Returns the first non-trivia token right before `offset` or `None` if at the start of the file
-}
+/// or all preceding tokens are trivia tokens.
-
+///
-pub(crate) fn find_first_non_trivia_character_after(
+/// ## Notes
-    offset: TextSize,
+///
-    code: &str,
+/// Prefer [`first_non_trivia_token`] whenever possible because reverse lookup is expensive because of comments.
-) -> Option<(TextSize, char)> {
+pub(crate) fn first_non_trivia_token_rev(offset: TextSize, code: &str) -> Option<Token> {
-    find_first_non_trivia_character_in_range(TextRange::new(offset, code.text_len()), code)
+    SimpleTokenizer::up_to(offset, code)
-}
+        .skip_trivia()
-
+        .next_back()
 pub(crate) fn find_first_non_trivia_character_before(
    offset: TextSize,
    code: &str,
 ) -> Option<(TextSize, char)> {
    let head = &code[TextRange::up_to(offset)];
    let mut char_iter = head.chars();
    while let Some(c) = char_iter.next_back() {
        match c {
            c if is_python_whitespace(c) => {
                continue;
            }
            // Empty comment
            '#' => continue,
            non_trivia_character => {
                // Non trivia character but we don't know if it is a comment or not. Consume all characters
                // until the start of the line and track if the last non-whitespace character was a `#`.
                let mut is_comment = false;
                let first_non_trivia_offset = char_iter.as_str().text_len();
                while let Some(c) = char_iter.next_back() {
                    match c {
                        '#' => {
                            is_comment = true;
                        }
                        '\n' | '\r' => {
                            if !is_comment {
                                return Some((first_non_trivia_offset, non_trivia_character));
                            }
                        }
                        c => {
                            if !is_python_whitespace(c) {
                                is_comment = false;
                            }
                        }
                    }
                }
            }
        }
    }
    None
 }
 /// Returns the number of newlines between `offset` and the first non whitespace character in the source code.
 pub(crate) fn lines_before(offset: TextSize, code: &str) -> u32 {
-    let head = &code[TextRange::up_to(offset)];
+    let tokens = SimpleTokenizer::up_to(offset, code);
    let mut newlines = 0u32;
-    for (index, c) in head.char_indices().rev() {
+    for token in tokens.rev() {
-        match c {
+        match token.kind() {
-            '\n' => {
+            TokenKind::Newline => {
                if head.as_bytes()[index.saturating_sub(1)] == b'\r' {
                    continue;
                }
                newlines += 1;
            }
-
+            TokenKind::Whitespace => {
-            '\r' => {
+                // ignore
-                newlines += 1;
+            }
            _ => {
                break;
            }
            c if is_python_whitespace(c) => continue,
            _ => break,
        }
    }
@ -124,22 +52,20 @@ pub(crate) fn lines_before(offset: TextSize, code: &str) -> u32 {
 /// Counts the empty lines between `offset` and the first non-whitespace character.
 pub(crate) fn lines_after(offset: TextSize, code: &str) -> u32 {
-    let rest = &code[usize::from(offset)..];
+    let tokens = SimpleTokenizer::starts_at(offset, code);
-    let mut newlines = 0;
+    let mut newlines = 0u32;
-    for (index, c) in rest.char_indices() {
+    for token in tokens {
-        match c {
+        match token.kind() {
-            '\n' => {
+            TokenKind::Newline => {
                newlines += 1;
            }
-            '\r' if rest.as_bytes().get(index + 1).copied() == Some(b'\n') => {
+            TokenKind::Whitespace => {
-                continue;
+                // ignore
            }
-            '\r' => {
+            _ => {
-                newlines += 1;
+                break;
            }
            c if is_python_whitespace(c) => continue,
            _ => break,
        }
    }
@ -148,35 +74,579 @@ pub(crate) fn lines_after(offset: TextSize, code: &str) -> u32 {
 /// Returns the position after skipping any trailing trivia up to, but not including the newline character.
 pub(crate) fn skip_trailing_trivia(offset: TextSize, code: &str) -> TextSize {
-    let rest = &code[usize::from(offset)..];
+    let tokenizer = SimpleTokenizer::starts_at(offset, code);
    let mut iter = rest.char_indices();
-    while let Some((relative_offset, c)) = iter.next() {
+    for token in tokenizer {
        match token.kind() {
            TokenKind::Whitespace | TokenKind::Comment | TokenKind::Continuation => {
                // No op
            }
            _ => {
                return token.start();
            }
        }
    }
    offset
 }
 /// A token produced by [`SimpleTokenizer`]: its [`TokenKind`] together with the range it covers.
 #[derive(Clone, Debug, Eq, PartialEq, Hash)]
 pub(crate) struct Token {
    /// The kind of the token.
    pub(crate) kind: TokenKind,
    /// The range covered by the token. The tokenizer computes it from the start/end of the range
    /// it was created with, so it is expressed in offsets of the full source string.
    pub(crate) range: TextRange,
 }
 impl Token {
    /// Returns the kind of this token.
    pub(crate) const fn kind(&self) -> TokenKind {
        self.kind
    }

    /// Returns the source range covered by this token.
    #[allow(unused)]
    pub(crate) const fn range(&self) -> TextRange {
        self.range
    }

    /// Returns the offset at which this token starts.
    pub(crate) const fn start(&self) -> TextSize {
        self.range.start()
    }

    /// Returns the offset right after this token, i.e. the end of its range.
    #[allow(unused)]
    pub(crate) const fn end(&self) -> TextSize {
        // Must be the range's end; returning the start here would make every token look empty.
        self.range.end()
    }
 }
 /// The kinds of tokens that [`SimpleTokenizer`] can return.
 #[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
 pub(crate) enum TokenKind {
    /// A comment, not including the trailing new line.
    Comment,
    /// Sequence of ' ' or '\t'
    Whitespace,
    /// Returned when there are no characters left to tokenize (start of the range when
    /// tokenizing backwards, end of the range when tokenizing forwards).
    EndOfFile,
    /// A line continuation backslash (`\`).
    Continuation,
    /// `\n` or `\r` or `\r\n`
    Newline,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `/`
    Slash,
    /// Any other non trivia token. Always covers exactly one character.
    Other,
    /// Returned for each character after [`TokenKind::Other`] has been returned once.
    Bogus,
 }
 impl TokenKind {
    const fn from_non_trivia_char(c: char) -> TokenKind {
        match c {
-            '\n' | '\r' => return offset + TextSize::try_from(relative_offset).unwrap(),
+            '(' => TokenKind::LParen,
            ')' => TokenKind::RParen,
            '[' => TokenKind::LBracket,
            ']' => TokenKind::RBracket,
            '{' => TokenKind::LBrace,
            '}' => TokenKind::RBrace,
            ',' => TokenKind::Comma,
            ':' => TokenKind::Colon,
            '/' => TokenKind::Slash,
            _ => TokenKind::Other,
        }
    }
    /// Returns `true` if this kind is trivia: whitespace, a newline, a comment, or a line
    /// continuation. Every other kind is considered non-trivia.
    const fn is_trivia(self) -> bool {
        match self {
            TokenKind::Whitespace
            | TokenKind::Newline
            | TokenKind::Comment
            | TokenKind::Continuation => true,
            TokenKind::EndOfFile
            | TokenKind::LParen
            | TokenKind::RParen
            | TokenKind::LBrace
            | TokenKind::RBrace
            | TokenKind::LBracket
            | TokenKind::RBracket
            | TokenKind::Comma
            | TokenKind::Colon
            | TokenKind::Slash
            | TokenKind::Other
            | TokenKind::Bogus => false,
        }
    }
 }
 /// Simple zero allocation tokenizer for tokenizing trivia (and some tokens).
 ///
 /// The tokenizer must start at an offset that is trivia (e.g. not inside of a multiline string).
 ///
 /// The tokenizer doesn't guarantee any correctness after it returned a [`TokenKind::Other`]. That's why it
 /// will return [`TokenKind::Bogus`] for every character after until it reaches the end of the file.
 pub(crate) struct SimpleTokenizer<'a> {
    /// Offset at which the next token returned when tokenizing forwards starts.
    offset: TextSize,
    /// Offset at which the next token returned when tokenizing backwards ends.
    back_offset: TextSize,
    /// `true` when it is known that the current `back` line has no comment for sure.
    back_line_has_no_comment: bool,
    /// Set once a [`TokenKind::Other`] token has been produced; from then on every character is
    /// returned as a [`TokenKind::Bogus`] token.
    bogus: bool,
    /// Cursor over the part of the tokenized range that has not been consumed yet.
    cursor: Cursor<'a>,
 }
 impl<'a> SimpleTokenizer<'a> {
    /// Creates a tokenizer over the part of `source` covered by `range`.
    ///
    /// Forward iteration begins at the start of `range`, backward iteration at its end.
    pub(crate) fn new(source: &'a str, range: TextRange) -> Self {
        let cursor = Cursor::new(&source[range]);
        let (offset, back_offset) = (range.start(), range.end());

        Self {
            offset,
            back_offset,
            back_line_has_no_comment: false,
            bogus: false,
            cursor,
        }
    }
    /// Creates a tokenizer that covers `source` from `offset` to the end of the string.
    pub(crate) fn starts_at(offset: TextSize, source: &'a str) -> Self {
        Self::new(source, TextRange::new(offset, source.text_len()))
    }
    /// Creates a tokenizer that covers `source` from its start up to `offset`.
    pub(crate) fn up_to(offset: TextSize, source: &'a str) -> Self {
        let range = TextRange::up_to(offset);
        Self::new(source, range)
    }
    fn next_token(&mut self) -> Token {
        self.cursor.start_token();
        let Some(first) = self.cursor.bump() else {
            return Token {
                kind: TokenKind::EndOfFile,
                range: TextRange::empty(self.offset),
            }
        };
        if self.bogus {
            let token = Token {
                kind: TokenKind::Bogus,
                range: TextRange::at(self.offset, first.text_len()),
            };
            self.offset += first.text_len();
            return token;
        }
        let kind = match first {
            ' ' | '\t' => {
                self.cursor.eat_while(|c| matches!(c, ' ' | '\t'));
                TokenKind::Whitespace
            }
            '\n' => TokenKind::Newline,
            '\r' => {
                self.cursor.eat_char('\n');
                TokenKind::Newline
            }
            '#' => {
-                // Skip the comment
+                self.cursor.eat_while(|c| !matches!(c, '\n' | '\r'));
-                let newline_offset = iter
+                TokenKind::Comment
-                    .as_str()
+            }
-                    .find(['\n', '\r'])
+
-                    .unwrap_or(iter.as_str().len());
+            '\\' => TokenKind::Continuation,
-
+
-                return offset
+            c => {
-                    + TextSize::try_from(relative_offset + '#'.len_utf8() + newline_offset)
+                let kind = TokenKind::from_non_trivia_char(c);
-                        .unwrap();
+
                if kind == TokenKind::Other {
                    self.bogus = true;
                }
                kind
            }
        };
        let token_len = self.cursor.token_len();
        let token = Token {
            kind,
            range: TextRange::at(self.offset, token_len),
        };
        self.offset += token_len;
        token
    }
    /// Returns the next token from the back. Prefer iterating forwards. Iterating backwards is significantly more expensive
    /// because it needs to check if the line has any comments when encountering any non-trivia token.
    pub(crate) fn next_token_back(&mut self) -> Token {
        self.cursor.start_token();
        let Some(last) = self.cursor.bump_back() else {
            return Token {
                kind: TokenKind::EndOfFile,
                range: TextRange::empty(self.back_offset),
            }
        };
        if self.bogus {
            let token = Token {
                kind: TokenKind::Bogus,
                range: TextRange::at(self.back_offset - last.text_len(), last.text_len()),
            };
            self.back_offset -= last.text_len();
            return token;
        }
        let kind = match last {
            // This may not be 100% correct because it will lex-out trailing whitespace from a comment
            // as whitespace rather than being part of the token. This shouldn't matter for what we use the lexer for.
            ' ' | '\t' => {
                self.cursor.eat_back_while(|c| matches!(c, ' ' | '\t'));
                TokenKind::Whitespace
            }
            '\r' => {
                self.back_line_has_no_comment = false;
                TokenKind::Newline
            }
            '\n' => {
                self.back_line_has_no_comment = false;
                self.cursor.eat_char_back('\r');
                TokenKind::Newline
            }
            // Empty comment (could also be a comment nested in another comment, but this shouldn't matter for what we use the lexer for)
            '#' => TokenKind::Comment,
            // For all other tokens, test if the character isn't part of a comment.
            c => {
                let mut comment_offset = None;
                // Skip the test whether there's a preceding comment if it has been performed before.
                if !self.back_line_has_no_comment {
                    let rest = self.cursor.chars.as_str();
                    for (back_index, c) in rest.chars().rev().enumerate() {
                        match c {
                            '#' => {
                                // Potentially a comment
                                comment_offset = Some(back_index + 1);
                            }
                            '\r' | '\n' | '\\' => {
                                break;
                            }
                            c => {
                                if !is_python_whitespace(c)
                                    && TokenKind::from_non_trivia_char(c) == TokenKind::Other
                                {
                                    comment_offset = None;
                                }
                            }
                        }
            c if is_python_whitespace(c) => continue,
            _ => return offset + TextSize::try_from(relative_offset).unwrap(),
                    }
                }
-    offset + rest.text_len()
+                // From here on it is guaranteed that this line has no other comment.
                self.back_line_has_no_comment = true;
                if let Some(comment_offset) = comment_offset {
                    // It is a comment, bump all tokens
                    for _ in 0..comment_offset {
                        self.cursor.bump_back().unwrap();
                    }
                    TokenKind::Comment
                } else if c == '\\' {
                    TokenKind::Continuation
                } else {
                    let kind = TokenKind::from_non_trivia_char(c);
                    if kind == TokenKind::Other {
                        self.bogus = true;
                    }
                    kind
                }
            }
        };
        let token_len = self.cursor.token_len();
        let start = self.back_offset - token_len;
        let token = Token {
            kind,
            range: TextRange::at(start, token_len),
        };
        self.back_offset = start;
        token
    }
    /// Consumes the tokenizer and returns an iterator that only yields tokens whose kind is
    /// not trivia. The returned iterator can be driven from the front or from the back.
    pub(crate) fn skip_trivia(self) -> impl Iterator<Item = Token> + DoubleEndedIterator + 'a {
        self.filter(|token| {
            let kind = token.kind();
            !kind.is_trivia()
        })
    }
 }
 impl Iterator for SimpleTokenizer<'_> {
    type Item = Token;
    /// Lexes the next token from the front. An end-of-file token terminates the iteration
    /// and is never yielded itself.
    fn next(&mut self) -> Option<Self::Item> {
        match self.next_token() {
            token if token.kind == TokenKind::EndOfFile => None,
            token => Some(token),
        }
    }
 }
 impl DoubleEndedIterator for SimpleTokenizer<'_> {
    /// Lexes the next token from the back. An end-of-file token terminates the iteration
    /// and is never yielded itself.
    fn next_back(&mut self) -> Option<Self::Item> {
        match self.next_token_back() {
            token if token.kind == TokenKind::EndOfFile => None,
            token => Some(token),
        }
    }
 }
 /// Sentinel character that `Cursor::first` and `Cursor::last` return when no characters remain.
 const EOF_CHAR: char = '\0';
 /// A cursor over the characters of a source text that can be consumed from the front
 /// (`bump`) or from the back (`bump_back`).
 #[derive(Debug, Clone)]
 struct Cursor<'a> {
    /// The characters that have not been consumed yet.
    chars: Chars<'a>,
    /// Length of the remaining text when the cursor was created or when `start_token` was
    /// last called; `token_len` subtracts the current remaining length from this value.
    source_length: TextSize,
 }
 impl<'a> Cursor<'a> {
    /// Creates a cursor over `source` and records the full length of `source` as the
    /// reference point for [`Cursor::token_len`].
    fn new(source: &'a str) -> Self {
        let source_length = source.text_len();
        let chars = source.chars();
        Self {
            chars,
            source_length,
        }
    }
    /// Peeks the first remaining character without consuming it.
    /// Returns [`EOF_CHAR`] if no characters are left.
    fn first(&self) -> char {
        let mut lookahead = self.chars.clone();
        lookahead.next().unwrap_or(EOF_CHAR)
    }
    /// Peeks the last remaining character without consuming it.
    /// Returns [`EOF_CHAR`] if no characters are left.
    fn last(&self) -> char {
        let mut lookbehind = self.chars.clone();
        lookbehind.next_back().unwrap_or(EOF_CHAR)
    }
    /// Returns the byte length of the text that has not been consumed yet.
    // SAFETY: The `source.text_len` call in `new` would panic if the string length is larger than a `u32`.
    #[allow(clippy::cast_possible_truncation)]
    fn text_len(&self) -> TextSize {
        let remaining_bytes = self.chars.as_str().len();
        TextSize::new(remaining_bytes as u32)
    }
    /// Returns how many bytes were consumed (from either end) since the cursor was created
    /// or since `start_token` was last called.
    fn token_len(&self) -> TextSize {
        let remaining = self.text_len();
        self.source_length - remaining
    }
    /// Marks the current state as the start of a new token by resetting the reference
    /// length to the length of the remaining text.
    fn start_token(&mut self) {
        let remaining = self.text_len();
        self.source_length = remaining;
    }
    /// Returns `true` if there are no characters left to consume.
    fn is_eof(&self) -> bool {
        self.chars.as_str().is_empty()
    }
    /// Consumes the first remaining character and returns it, or `None` if nothing is left.
    fn bump(&mut self) -> Option<char> {
        self.chars.next()
    }
    /// Consumes the last remaining character and returns it, or `None` if nothing is left.
    fn bump_back(&mut self) -> Option<char> {
        self.chars.next_back()
    }
    /// Consumes the first remaining character if it equals `c` and reports whether it did.
    fn eat_char(&mut self, c: char) -> bool {
        let matched = self.first() == c;
        if matched {
            self.bump();
        }
        matched
    }
    /// Consumes the last remaining character if it equals `c` and reports whether it did.
    fn eat_char_back(&mut self, c: char) -> bool {
        let matched = self.last() == c;
        if matched {
            self.bump_back();
        }
        matched
    }
    /// Consumes characters from the front for as long as `predicate` accepts them, stopping
    /// at the latest when the text is exhausted.
    fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
        // A specialised version (e.g. for line comments) was tried, but LLVM inlines this
        // and compiles it down to a fast iteration over bytes anyway.
        loop {
            // The predicate is deliberately evaluated before the end-of-text check.
            if !predicate(self.first()) || self.is_eof() {
                break;
            }
            self.bump();
        }
    }
    /// Consumes characters from the back for as long as `predicate` accepts them, stopping
    /// at the latest when the text is exhausted.
    fn eat_back_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
        // A specialised version (e.g. for line comments) was tried, but LLVM inlines this
        // and compiles it down to a fast iteration over bytes anyway.
        loop {
            // The predicate is deliberately evaluated before the end-of-text check.
            if !predicate(self.last()) || self.is_eof() {
                break;
            }
            self.bump_back();
        }
    }
 }
 #[cfg(test)]
 mod tests {
-    use crate::trivia::{lines_after, lines_before};
+    use crate::trivia::{lines_after, lines_before, SimpleTokenizer, Token};
-    use ruff_text_size::TextSize;
+    use insta::assert_debug_snapshot;
    use ruff_text_size::{TextLen, TextRange, TextSize};
    /// Bundles a source text, the range that was tokenized, and the resulting tokens so that
    /// tests can compare forward and reverse tokenization of the same input.
    struct TokenizationTestCase {
        /// The complete source text handed to the tokenizer.
        source: &'static str,
        /// The range of `source` that is tokenized.
        range: TextRange,
        /// The tokens stored for this case; `tokenize_range` fills this with the forward tokenization.
        tokens: Vec<Token>,
    }
    impl TokenizationTestCase {
        /// Asserts that tokenizing from the back produces the same tokens as the stored ones.
        fn assert_reverse_tokenization(&self) {
            // The reverse tokenizer yields tokens back to front; flip them into source order.
            let in_source_order: Vec<Token> = self.tokenize_reverse().into_iter().rev().collect();
            assert_eq!(&in_source_order, &self.tokens);
        }
        /// Tokenizes the test case's range from its end and returns the tokens in the order
        /// in which they were produced (last token first).
        fn tokenize_reverse(&self) -> Vec<Token> {
            let tokenizer = SimpleTokenizer::new(self.source, self.range);
            tokenizer.rev().collect()
        }
        /// Returns the tokens stored for this test case.
        fn tokens(&self) -> &[Token] {
            self.tokens.as_slice()
        }
    }
    /// Tokenizes `range` of `source` front to back and bundles the inputs together with the
    /// resulting tokens.
    fn tokenize_range(source: &'static str, range: TextRange) -> TokenizationTestCase {
        TokenizationTestCase {
            source,
            range,
            tokens: SimpleTokenizer::new(source, range).collect(),
        }
    }
    /// Tokenizes all of `source`, i.e. the range from offset 0 up to the length of `source`.
    fn tokenize(source: &'static str) -> TokenizationTestCase {
        let start = TextSize::new(0);
        let end = source.text_len();
        tokenize_range(source, TextRange::new(start, end))
    }
    /// Two comments separated by a newline and indentation: the forward tokens are compared
    /// against the stored snapshot, and the reverse tokenization must yield the same tokens.
    #[test]
    fn tokenize_trivia() {
        let source = "# comment\n    # comment";
        let test_case = tokenize(source);
        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }
    /// Nested round, square and curly brackets: the forward tokens are compared against the
    /// stored snapshot, and the reverse tokenization must yield the same tokens.
    #[test]
    fn tokenize_parentheses() {
        let source = "([{}])";
        let test_case = tokenize(source);
        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }
    /// A run of four commas: the forward tokens are compared against the stored snapshot,
    /// and the reverse tokenization must yield the same tokens.
    #[test]
    fn tokenize_comma() {
        let source = ",,,,";
        let test_case = tokenize(source);
        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }
    /// A backslash followed by a newline between two parentheses: the forward tokens are
    /// compared against the stored snapshot, and the reverse tokenization must yield the
    /// same tokens.
    #[test]
    fn tokenize_continuation() {
        let source = "( \\\n )";
        let test_case = tokenize(source);
        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }
    /// Tokenizes only part of the source: the range starts at offset 14, which is the closing
    /// parenthesis, so the string literal before it is not part of the tokenized range.
    #[test]
    fn tokenize_substring() {
        let source = "('some string') # comment";
        let test_case =
            tokenize_range(source, TextRange::new(TextSize::new(14), source.text_len()));
        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }
    /// Two comment lines followed by `,/`: the forward tokens are compared against the stored
    /// snapshot, and the reverse tokenization must yield the same tokens.
    #[test]
    fn tokenize_slash() {
        // The leading whitespace inside the raw string is part of the tokenized input.
        let source = r#" # trailing positional comment
        # Positional arguments only after here
        ,/"#;
        let test_case = tokenize(source);
        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }
    /// Input containing a string literal and an assignment. Unlike the other tests, forward
    /// and reverse tokenization are recorded as two separate snapshots instead of being
    /// asserted equal.
    // NOTE(review): presumably the two directions are expected to differ here because the
    // tokenizer gives up at the first character it does not understand — confirm.
    #[test]
    fn tokenize_bogus() {
        let source = r#"# leading comment
        "a string"
        a = (10)"#;
        let test_case = tokenize(source);
        assert_debug_snapshot!(test_case.tokens());
        assert_debug_snapshot!("Reverse", test_case.tokenize_reverse());
    }
    #[test]
    fn lines_before_empty_string() {