diff --git a/Cargo.lock b/Cargo.lock index 33cf811709..ebcf36fbe7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2201,6 +2201,7 @@ dependencies = [ "insta", "memchr", "ruff_text_size", + "smallvec", "unic-ucd-ident", ] diff --git a/crates/ruff/src/rules/pyflakes/fixes.rs b/crates/ruff/src/rules/pyflakes/fixes.rs index e592befab4..2f61d467d6 100644 --- a/crates/ruff/src/rules/pyflakes/fixes.rs +++ b/crates/ruff/src/rules/pyflakes/fixes.rs @@ -95,8 +95,11 @@ pub(crate) fn remove_exception_handler_assignment( locator: &Locator, ) -> Result { // Lex backwards, to the token just before the `as`. - let mut tokenizer = - SimpleTokenizer::up_to(bound_exception.range.start(), locator.contents()).skip_trivia(); + let mut tokenizer = SimpleTokenizer::up_to_without_back_comment( + bound_exception.range.start(), + locator.contents(), + ) + .skip_trivia(); // Eat the `as` token. let preceding = tokenizer diff --git a/crates/ruff_python_formatter/src/comments/placement.rs b/crates/ruff_python_formatter/src/comments/placement.rs index f705333a87..7a03f67ac8 100644 --- a/crates/ruff_python_formatter/src/comments/placement.rs +++ b/crates/ruff_python_formatter/src/comments/placement.rs @@ -8,8 +8,7 @@ use ruff_python_ast::node::{AnyNodeRef, AstNode}; use ruff_python_ast::source_code::Locator; use ruff_python_ast::whitespace; use ruff_python_trivia::{ - first_non_trivia_token_rev, PythonWhitespace, SimpleToken, SimpleTokenKind, SimpleTokenizer, - UniversalNewlines, + PythonWhitespace, SimpleToken, SimpleTokenKind, SimpleTokenizer, UniversalNewlines, }; use crate::comments::visitor::{CommentPlacement, DecoratedComment}; @@ -1059,7 +1058,9 @@ fn handle_slice_comments<'a>( // Check for `foo[ # comment`, but only if they are on the same line let after_lbracket = matches!( - first_non_trivia_token_rev(comment.slice().start(), locator.contents()), + SimpleTokenizer::up_to_without_back_comment(comment.slice().start(), locator.contents()) + .skip_trivia() + .next_back(), Some(SimpleToken { 
kind: SimpleTokenKind::LBracket, .. diff --git a/crates/ruff_python_trivia/Cargo.toml b/crates/ruff_python_trivia/Cargo.toml index 29de141a6f..716248abcf 100644 --- a/crates/ruff_python_trivia/Cargo.toml +++ b/crates/ruff_python_trivia/Cargo.toml @@ -16,6 +16,7 @@ license = { workspace = true } ruff_text_size = { workspace = true } memchr = { workspace = true } +smallvec = { workspace = true } unic-ucd-ident = "0.9.0" [dev-dependencies] diff --git a/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__comment_containing_single_quoted_string.snap b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__comment_containing_single_quoted_string.snap new file mode 100644 index 0000000000..1afba81fcc --- /dev/null +++ b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__comment_containing_single_quoted_string.snap @@ -0,0 +1,78 @@ +--- +source: crates/ruff_python_trivia/src/tokenizer.rs +expression: test_case.tokenize_reverse() +--- +[ + SimpleToken { + kind: Comment, + range: 17..43, + }, + SimpleToken { + kind: Whitespace, + range: 16..17, + }, + SimpleToken { + kind: Other, + range: 15..16, + }, + SimpleToken { + kind: Bogus, + range: 14..15, + }, + SimpleToken { + kind: Bogus, + range: 13..14, + }, + SimpleToken { + kind: Bogus, + range: 12..13, + }, + SimpleToken { + kind: Bogus, + range: 11..12, + }, + SimpleToken { + kind: Bogus, + range: 10..11, + }, + SimpleToken { + kind: Bogus, + range: 9..10, + }, + SimpleToken { + kind: Bogus, + range: 8..9, + }, + SimpleToken { + kind: Bogus, + range: 7..8, + }, + SimpleToken { + kind: Bogus, + range: 6..7, + }, + SimpleToken { + kind: Bogus, + range: 5..6, + }, + SimpleToken { + kind: Bogus, + range: 4..5, + }, + SimpleToken { + kind: Bogus, + range: 3..4, + }, + SimpleToken { + kind: Bogus, + range: 2..3, + }, + SimpleToken { + kind: Bogus, + range: 1..2, + }, + SimpleToken { + kind: Bogus, + range: 0..1, + }, +] diff --git 
a/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__comment_containing_triple_quoted_string.snap b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__comment_containing_triple_quoted_string.snap new file mode 100644 index 0000000000..a3585ac5e4 --- /dev/null +++ b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__comment_containing_triple_quoted_string.snap @@ -0,0 +1,94 @@ +--- +source: crates/ruff_python_trivia/src/tokenizer.rs +expression: test_case.tokenize_reverse() +--- +[ + SimpleToken { + kind: Comment, + range: 21..51, + }, + SimpleToken { + kind: Whitespace, + range: 20..21, + }, + SimpleToken { + kind: Other, + range: 19..20, + }, + SimpleToken { + kind: Bogus, + range: 18..19, + }, + SimpleToken { + kind: Bogus, + range: 17..18, + }, + SimpleToken { + kind: Bogus, + range: 16..17, + }, + SimpleToken { + kind: Bogus, + range: 15..16, + }, + SimpleToken { + kind: Bogus, + range: 14..15, + }, + SimpleToken { + kind: Bogus, + range: 13..14, + }, + SimpleToken { + kind: Bogus, + range: 12..13, + }, + SimpleToken { + kind: Bogus, + range: 11..12, + }, + SimpleToken { + kind: Bogus, + range: 10..11, + }, + SimpleToken { + kind: Bogus, + range: 9..10, + }, + SimpleToken { + kind: Bogus, + range: 8..9, + }, + SimpleToken { + kind: Bogus, + range: 7..8, + }, + SimpleToken { + kind: Bogus, + range: 6..7, + }, + SimpleToken { + kind: Bogus, + range: 5..6, + }, + SimpleToken { + kind: Bogus, + range: 4..5, + }, + SimpleToken { + kind: Bogus, + range: 3..4, + }, + SimpleToken { + kind: Bogus, + range: 2..3, + }, + SimpleToken { + kind: Bogus, + range: 1..2, + }, + SimpleToken { + kind: Bogus, + range: 0..1, + }, +] diff --git a/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__single_quoted_multiline_string_containing_comment.snap 
b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__single_quoted_multiline_string_containing_comment.snap new file mode 100644 index 0000000000..60b5a8da36 --- /dev/null +++ b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__single_quoted_multiline_string_containing_comment.snap @@ -0,0 +1,314 @@ +--- +source: crates/ruff_python_trivia/src/tokenizer.rs +expression: test_case.tokenize_reverse() +--- +[ + SimpleToken { + kind: Other, + range: 76..77, + }, + SimpleToken { + kind: Bogus, + range: 75..76, + }, + SimpleToken { + kind: Bogus, + range: 74..75, + }, + SimpleToken { + kind: Bogus, + range: 73..74, + }, + SimpleToken { + kind: Bogus, + range: 72..73, + }, + SimpleToken { + kind: Bogus, + range: 71..72, + }, + SimpleToken { + kind: Bogus, + range: 70..71, + }, + SimpleToken { + kind: Bogus, + range: 69..70, + }, + SimpleToken { + kind: Bogus, + range: 68..69, + }, + SimpleToken { + kind: Bogus, + range: 67..68, + }, + SimpleToken { + kind: Bogus, + range: 66..67, + }, + SimpleToken { + kind: Bogus, + range: 65..66, + }, + SimpleToken { + kind: Bogus, + range: 64..65, + }, + SimpleToken { + kind: Bogus, + range: 63..64, + }, + SimpleToken { + kind: Bogus, + range: 62..63, + }, + SimpleToken { + kind: Bogus, + range: 61..62, + }, + SimpleToken { + kind: Bogus, + range: 60..61, + }, + SimpleToken { + kind: Bogus, + range: 59..60, + }, + SimpleToken { + kind: Bogus, + range: 58..59, + }, + SimpleToken { + kind: Bogus, + range: 57..58, + }, + SimpleToken { + kind: Bogus, + range: 56..57, + }, + SimpleToken { + kind: Bogus, + range: 55..56, + }, + SimpleToken { + kind: Bogus, + range: 54..55, + }, + SimpleToken { + kind: Bogus, + range: 53..54, + }, + SimpleToken { + kind: Bogus, + range: 52..53, + }, + SimpleToken { + kind: Bogus, + range: 51..52, + }, + SimpleToken { + kind: Bogus, + range: 50..51, + }, + SimpleToken { + kind: Bogus, + range: 49..50, + }, + SimpleToken { + kind: Bogus, + range: 48..49, + }, 
+ SimpleToken { + kind: Bogus, + range: 47..48, + }, + SimpleToken { + kind: Bogus, + range: 46..47, + }, + SimpleToken { + kind: Bogus, + range: 45..46, + }, + SimpleToken { + kind: Bogus, + range: 44..45, + }, + SimpleToken { + kind: Bogus, + range: 43..44, + }, + SimpleToken { + kind: Bogus, + range: 42..43, + }, + SimpleToken { + kind: Bogus, + range: 41..42, + }, + SimpleToken { + kind: Bogus, + range: 40..41, + }, + SimpleToken { + kind: Bogus, + range: 39..40, + }, + SimpleToken { + kind: Bogus, + range: 38..39, + }, + SimpleToken { + kind: Bogus, + range: 37..38, + }, + SimpleToken { + kind: Bogus, + range: 36..37, + }, + SimpleToken { + kind: Bogus, + range: 35..36, + }, + SimpleToken { + kind: Bogus, + range: 34..35, + }, + SimpleToken { + kind: Bogus, + range: 33..34, + }, + SimpleToken { + kind: Bogus, + range: 32..33, + }, + SimpleToken { + kind: Bogus, + range: 31..32, + }, + SimpleToken { + kind: Bogus, + range: 30..31, + }, + SimpleToken { + kind: Bogus, + range: 29..30, + }, + SimpleToken { + kind: Bogus, + range: 28..29, + }, + SimpleToken { + kind: Bogus, + range: 27..28, + }, + SimpleToken { + kind: Bogus, + range: 26..27, + }, + SimpleToken { + kind: Bogus, + range: 25..26, + }, + SimpleToken { + kind: Bogus, + range: 24..25, + }, + SimpleToken { + kind: Bogus, + range: 23..24, + }, + SimpleToken { + kind: Bogus, + range: 22..23, + }, + SimpleToken { + kind: Bogus, + range: 21..22, + }, + SimpleToken { + kind: Bogus, + range: 20..21, + }, + SimpleToken { + kind: Bogus, + range: 19..20, + }, + SimpleToken { + kind: Bogus, + range: 18..19, + }, + SimpleToken { + kind: Bogus, + range: 17..18, + }, + SimpleToken { + kind: Bogus, + range: 16..17, + }, + SimpleToken { + kind: Bogus, + range: 15..16, + }, + SimpleToken { + kind: Bogus, + range: 14..15, + }, + SimpleToken { + kind: Bogus, + range: 13..14, + }, + SimpleToken { + kind: Bogus, + range: 12..13, + }, + SimpleToken { + kind: Bogus, + range: 11..12, + }, + SimpleToken { + kind: Bogus, + 
range: 10..11, + }, + SimpleToken { + kind: Bogus, + range: 9..10, + }, + SimpleToken { + kind: Bogus, + range: 8..9, + }, + SimpleToken { + kind: Bogus, + range: 7..8, + }, + SimpleToken { + kind: Bogus, + range: 6..7, + }, + SimpleToken { + kind: Bogus, + range: 5..6, + }, + SimpleToken { + kind: Bogus, + range: 4..5, + }, + SimpleToken { + kind: Bogus, + range: 3..4, + }, + SimpleToken { + kind: Bogus, + range: 2..3, + }, + SimpleToken { + kind: Bogus, + range: 1..2, + }, + SimpleToken { + kind: Bogus, + range: 0..1, + }, +] diff --git a/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__single_quoted_multiline_string_implicit_concatenation.snap b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__single_quoted_multiline_string_implicit_concatenation.snap new file mode 100644 index 0000000000..ee103aeece --- /dev/null +++ b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__single_quoted_multiline_string_implicit_concatenation.snap @@ -0,0 +1,322 @@ +--- +source: crates/ruff_python_trivia/src/tokenizer.rs +expression: test_case.tokenize_reverse() +--- +[ + SimpleToken { + kind: Other, + range: 78..79, + }, + SimpleToken { + kind: Bogus, + range: 77..78, + }, + SimpleToken { + kind: Bogus, + range: 76..77, + }, + SimpleToken { + kind: Bogus, + range: 75..76, + }, + SimpleToken { + kind: Bogus, + range: 74..75, + }, + SimpleToken { + kind: Bogus, + range: 73..74, + }, + SimpleToken { + kind: Bogus, + range: 72..73, + }, + SimpleToken { + kind: Bogus, + range: 71..72, + }, + SimpleToken { + kind: Bogus, + range: 70..71, + }, + SimpleToken { + kind: Bogus, + range: 69..70, + }, + SimpleToken { + kind: Bogus, + range: 68..69, + }, + SimpleToken { + kind: Bogus, + range: 67..68, + }, + SimpleToken { + kind: Bogus, + range: 66..67, + }, + SimpleToken { + kind: Bogus, + range: 65..66, + }, + SimpleToken { + kind: Bogus, + range: 64..65, + }, + SimpleToken { + kind: Bogus, + range: 63..64, + 
}, + SimpleToken { + kind: Bogus, + range: 62..63, + }, + SimpleToken { + kind: Bogus, + range: 61..62, + }, + SimpleToken { + kind: Bogus, + range: 60..61, + }, + SimpleToken { + kind: Bogus, + range: 59..60, + }, + SimpleToken { + kind: Bogus, + range: 58..59, + }, + SimpleToken { + kind: Bogus, + range: 57..58, + }, + SimpleToken { + kind: Bogus, + range: 56..57, + }, + SimpleToken { + kind: Bogus, + range: 55..56, + }, + SimpleToken { + kind: Bogus, + range: 54..55, + }, + SimpleToken { + kind: Bogus, + range: 53..54, + }, + SimpleToken { + kind: Bogus, + range: 52..53, + }, + SimpleToken { + kind: Bogus, + range: 51..52, + }, + SimpleToken { + kind: Bogus, + range: 50..51, + }, + SimpleToken { + kind: Bogus, + range: 49..50, + }, + SimpleToken { + kind: Bogus, + range: 48..49, + }, + SimpleToken { + kind: Bogus, + range: 47..48, + }, + SimpleToken { + kind: Bogus, + range: 46..47, + }, + SimpleToken { + kind: Bogus, + range: 45..46, + }, + SimpleToken { + kind: Bogus, + range: 44..45, + }, + SimpleToken { + kind: Bogus, + range: 43..44, + }, + SimpleToken { + kind: Bogus, + range: 42..43, + }, + SimpleToken { + kind: Bogus, + range: 41..42, + }, + SimpleToken { + kind: Bogus, + range: 40..41, + }, + SimpleToken { + kind: Bogus, + range: 39..40, + }, + SimpleToken { + kind: Bogus, + range: 38..39, + }, + SimpleToken { + kind: Bogus, + range: 37..38, + }, + SimpleToken { + kind: Bogus, + range: 36..37, + }, + SimpleToken { + kind: Bogus, + range: 35..36, + }, + SimpleToken { + kind: Bogus, + range: 34..35, + }, + SimpleToken { + kind: Bogus, + range: 33..34, + }, + SimpleToken { + kind: Bogus, + range: 32..33, + }, + SimpleToken { + kind: Bogus, + range: 31..32, + }, + SimpleToken { + kind: Bogus, + range: 30..31, + }, + SimpleToken { + kind: Bogus, + range: 29..30, + }, + SimpleToken { + kind: Bogus, + range: 28..29, + }, + SimpleToken { + kind: Bogus, + range: 27..28, + }, + SimpleToken { + kind: Bogus, + range: 26..27, + }, + SimpleToken { + kind: Bogus, + 
range: 25..26, + }, + SimpleToken { + kind: Bogus, + range: 24..25, + }, + SimpleToken { + kind: Bogus, + range: 23..24, + }, + SimpleToken { + kind: Bogus, + range: 22..23, + }, + SimpleToken { + kind: Bogus, + range: 21..22, + }, + SimpleToken { + kind: Bogus, + range: 20..21, + }, + SimpleToken { + kind: Bogus, + range: 19..20, + }, + SimpleToken { + kind: Bogus, + range: 18..19, + }, + SimpleToken { + kind: Bogus, + range: 17..18, + }, + SimpleToken { + kind: Bogus, + range: 16..17, + }, + SimpleToken { + kind: Bogus, + range: 15..16, + }, + SimpleToken { + kind: Bogus, + range: 14..15, + }, + SimpleToken { + kind: Bogus, + range: 13..14, + }, + SimpleToken { + kind: Bogus, + range: 12..13, + }, + SimpleToken { + kind: Bogus, + range: 11..12, + }, + SimpleToken { + kind: Bogus, + range: 10..11, + }, + SimpleToken { + kind: Bogus, + range: 9..10, + }, + SimpleToken { + kind: Bogus, + range: 8..9, + }, + SimpleToken { + kind: Bogus, + range: 7..8, + }, + SimpleToken { + kind: Bogus, + range: 6..7, + }, + SimpleToken { + kind: Bogus, + range: 5..6, + }, + SimpleToken { + kind: Bogus, + range: 4..5, + }, + SimpleToken { + kind: Bogus, + range: 3..4, + }, + SimpleToken { + kind: Bogus, + range: 2..3, + }, + SimpleToken { + kind: Bogus, + range: 1..2, + }, + SimpleToken { + kind: Bogus, + range: 0..1, + }, +] diff --git a/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_followed_by_multiple_comments.snap b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_followed_by_multiple_comments.snap new file mode 100644 index 0000000000..7e2f64da75 --- /dev/null +++ b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_followed_by_multiple_comments.snap @@ -0,0 +1,222 @@ +--- +source: crates/ruff_python_trivia/src/tokenizer.rs +expression: test_case.tokenize_reverse() +--- +[ + SimpleToken { + kind: Comment, + range: 53..72, + }, + SimpleToken { + kind: Whitespace, + range: 
52..53, + }, + SimpleToken { + kind: Other, + range: 51..52, + }, + SimpleToken { + kind: Bogus, + range: 50..51, + }, + SimpleToken { + kind: Bogus, + range: 49..50, + }, + SimpleToken { + kind: Bogus, + range: 48..49, + }, + SimpleToken { + kind: Bogus, + range: 47..48, + }, + SimpleToken { + kind: Bogus, + range: 46..47, + }, + SimpleToken { + kind: Bogus, + range: 45..46, + }, + SimpleToken { + kind: Bogus, + range: 44..45, + }, + SimpleToken { + kind: Bogus, + range: 43..44, + }, + SimpleToken { + kind: Bogus, + range: 42..43, + }, + SimpleToken { + kind: Bogus, + range: 41..42, + }, + SimpleToken { + kind: Bogus, + range: 40..41, + }, + SimpleToken { + kind: Bogus, + range: 39..40, + }, + SimpleToken { + kind: Bogus, + range: 38..39, + }, + SimpleToken { + kind: Bogus, + range: 37..38, + }, + SimpleToken { + kind: Bogus, + range: 36..37, + }, + SimpleToken { + kind: Bogus, + range: 35..36, + }, + SimpleToken { + kind: Bogus, + range: 34..35, + }, + SimpleToken { + kind: Bogus, + range: 33..34, + }, + SimpleToken { + kind: Bogus, + range: 32..33, + }, + SimpleToken { + kind: Bogus, + range: 31..32, + }, + SimpleToken { + kind: Bogus, + range: 30..31, + }, + SimpleToken { + kind: Bogus, + range: 29..30, + }, + SimpleToken { + kind: Bogus, + range: 28..29, + }, + SimpleToken { + kind: Bogus, + range: 27..28, + }, + SimpleToken { + kind: Bogus, + range: 26..27, + }, + SimpleToken { + kind: Bogus, + range: 25..26, + }, + SimpleToken { + kind: Bogus, + range: 24..25, + }, + SimpleToken { + kind: Bogus, + range: 23..24, + }, + SimpleToken { + kind: Bogus, + range: 22..23, + }, + SimpleToken { + kind: Bogus, + range: 21..22, + }, + SimpleToken { + kind: Bogus, + range: 20..21, + }, + SimpleToken { + kind: Bogus, + range: 19..20, + }, + SimpleToken { + kind: Bogus, + range: 18..19, + }, + SimpleToken { + kind: Bogus, + range: 17..18, + }, + SimpleToken { + kind: Bogus, + range: 16..17, + }, + SimpleToken { + kind: Bogus, + range: 15..16, + }, + SimpleToken { + kind: 
Bogus, + range: 14..15, + }, + SimpleToken { + kind: Bogus, + range: 13..14, + }, + SimpleToken { + kind: Bogus, + range: 12..13, + }, + SimpleToken { + kind: Bogus, + range: 11..12, + }, + SimpleToken { + kind: Bogus, + range: 10..11, + }, + SimpleToken { + kind: Bogus, + range: 9..10, + }, + SimpleToken { + kind: Bogus, + range: 8..9, + }, + SimpleToken { + kind: Bogus, + range: 7..8, + }, + SimpleToken { + kind: Bogus, + range: 6..7, + }, + SimpleToken { + kind: Bogus, + range: 5..6, + }, + SimpleToken { + kind: Bogus, + range: 4..5, + }, + SimpleToken { + kind: Bogus, + range: 3..4, + }, + SimpleToken { + kind: Bogus, + range: 2..3, + }, + SimpleToken { + kind: Bogus, + range: 1..2, + }, + SimpleToken { + kind: Bogus, + range: 0..1, + }, +] diff --git a/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_with_double_escaped_backslash.snap b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_with_double_escaped_backslash.snap new file mode 100644 index 0000000000..756d0e7075 --- /dev/null +++ b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_with_double_escaped_backslash.snap @@ -0,0 +1,66 @@ +--- +source: crates/ruff_python_trivia/src/tokenizer.rs +expression: test_case.tokenize_reverse() +--- +[ + SimpleToken { + kind: Comment, + range: 14..27, + }, + SimpleToken { + kind: Whitespace, + range: 13..14, + }, + SimpleToken { + kind: Other, + range: 12..13, + }, + SimpleToken { + kind: Bogus, + range: 11..12, + }, + SimpleToken { + kind: Bogus, + range: 10..11, + }, + SimpleToken { + kind: Bogus, + range: 9..10, + }, + SimpleToken { + kind: Bogus, + range: 8..9, + }, + SimpleToken { + kind: Bogus, + range: 7..8, + }, + SimpleToken { + kind: Bogus, + range: 6..7, + }, + SimpleToken { + kind: Bogus, + range: 5..6, + }, + SimpleToken { + kind: Bogus, + range: 4..5, + }, + SimpleToken { + kind: Bogus, + range: 3..4, + }, + SimpleToken { + kind: Bogus, + range: 2..3, 
+ }, + SimpleToken { + kind: Bogus, + range: 1..2, + }, + SimpleToken { + kind: Bogus, + range: 0..1, + }, +] diff --git a/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_with_escaped_quote.snap b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_with_escaped_quote.snap new file mode 100644 index 0000000000..b9e074fbb3 --- /dev/null +++ b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_with_escaped_quote.snap @@ -0,0 +1,150 @@ +--- +source: crates/ruff_python_trivia/src/tokenizer.rs +expression: test_case.tokenize_reverse() +--- +[ + SimpleToken { + kind: Comment, + range: 35..54, + }, + SimpleToken { + kind: Whitespace, + range: 34..35, + }, + SimpleToken { + kind: Other, + range: 33..34, + }, + SimpleToken { + kind: Bogus, + range: 32..33, + }, + SimpleToken { + kind: Bogus, + range: 31..32, + }, + SimpleToken { + kind: Bogus, + range: 30..31, + }, + SimpleToken { + kind: Bogus, + range: 29..30, + }, + SimpleToken { + kind: Bogus, + range: 28..29, + }, + SimpleToken { + kind: Bogus, + range: 27..28, + }, + SimpleToken { + kind: Bogus, + range: 26..27, + }, + SimpleToken { + kind: Bogus, + range: 25..26, + }, + SimpleToken { + kind: Bogus, + range: 24..25, + }, + SimpleToken { + kind: Bogus, + range: 23..24, + }, + SimpleToken { + kind: Bogus, + range: 22..23, + }, + SimpleToken { + kind: Bogus, + range: 21..22, + }, + SimpleToken { + kind: Bogus, + range: 20..21, + }, + SimpleToken { + kind: Bogus, + range: 19..20, + }, + SimpleToken { + kind: Bogus, + range: 18..19, + }, + SimpleToken { + kind: Bogus, + range: 17..18, + }, + SimpleToken { + kind: Bogus, + range: 16..17, + }, + SimpleToken { + kind: Bogus, + range: 15..16, + }, + SimpleToken { + kind: Bogus, + range: 14..15, + }, + SimpleToken { + kind: Bogus, + range: 13..14, + }, + SimpleToken { + kind: Bogus, + range: 12..13, + }, + SimpleToken { + kind: Bogus, + range: 11..12, + }, + SimpleToken { + 
kind: Bogus, + range: 10..11, + }, + SimpleToken { + kind: Bogus, + range: 9..10, + }, + SimpleToken { + kind: Bogus, + range: 8..9, + }, + SimpleToken { + kind: Bogus, + range: 7..8, + }, + SimpleToken { + kind: Bogus, + range: 6..7, + }, + SimpleToken { + kind: Bogus, + range: 5..6, + }, + SimpleToken { + kind: Bogus, + range: 4..5, + }, + SimpleToken { + kind: Bogus, + range: 3..4, + }, + SimpleToken { + kind: Bogus, + range: 2..3, + }, + SimpleToken { + kind: Bogus, + range: 1..2, + }, + SimpleToken { + kind: Bogus, + range: 0..1, + }, +] diff --git a/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__triple_quoted_multiline_string_containing_comment.snap b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__triple_quoted_multiline_string_containing_comment.snap new file mode 100644 index 0000000000..abad16a47d --- /dev/null +++ b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__triple_quoted_multiline_string_containing_comment.snap @@ -0,0 +1,326 @@ +--- +source: crates/ruff_python_trivia/src/tokenizer.rs +expression: test_case.tokenize_reverse() +--- +[ + SimpleToken { + kind: Other, + range: 79..80, + }, + SimpleToken { + kind: Bogus, + range: 78..79, + }, + SimpleToken { + kind: Bogus, + range: 77..78, + }, + SimpleToken { + kind: Bogus, + range: 76..77, + }, + SimpleToken { + kind: Bogus, + range: 75..76, + }, + SimpleToken { + kind: Bogus, + range: 74..75, + }, + SimpleToken { + kind: Bogus, + range: 73..74, + }, + SimpleToken { + kind: Bogus, + range: 72..73, + }, + SimpleToken { + kind: Bogus, + range: 71..72, + }, + SimpleToken { + kind: Bogus, + range: 70..71, + }, + SimpleToken { + kind: Bogus, + range: 69..70, + }, + SimpleToken { + kind: Bogus, + range: 68..69, + }, + SimpleToken { + kind: Bogus, + range: 67..68, + }, + SimpleToken { + kind: Bogus, + range: 66..67, + }, + SimpleToken { + kind: Bogus, + range: 65..66, + }, + SimpleToken { + kind: Bogus, + range: 
64..65, + }, + SimpleToken { + kind: Bogus, + range: 63..64, + }, + SimpleToken { + kind: Bogus, + range: 62..63, + }, + SimpleToken { + kind: Bogus, + range: 61..62, + }, + SimpleToken { + kind: Bogus, + range: 60..61, + }, + SimpleToken { + kind: Bogus, + range: 59..60, + }, + SimpleToken { + kind: Bogus, + range: 58..59, + }, + SimpleToken { + kind: Bogus, + range: 57..58, + }, + SimpleToken { + kind: Bogus, + range: 56..57, + }, + SimpleToken { + kind: Bogus, + range: 55..56, + }, + SimpleToken { + kind: Bogus, + range: 54..55, + }, + SimpleToken { + kind: Bogus, + range: 53..54, + }, + SimpleToken { + kind: Bogus, + range: 52..53, + }, + SimpleToken { + kind: Bogus, + range: 51..52, + }, + SimpleToken { + kind: Bogus, + range: 50..51, + }, + SimpleToken { + kind: Bogus, + range: 49..50, + }, + SimpleToken { + kind: Bogus, + range: 48..49, + }, + SimpleToken { + kind: Bogus, + range: 47..48, + }, + SimpleToken { + kind: Bogus, + range: 46..47, + }, + SimpleToken { + kind: Bogus, + range: 45..46, + }, + SimpleToken { + kind: Bogus, + range: 44..45, + }, + SimpleToken { + kind: Bogus, + range: 43..44, + }, + SimpleToken { + kind: Bogus, + range: 42..43, + }, + SimpleToken { + kind: Bogus, + range: 41..42, + }, + SimpleToken { + kind: Bogus, + range: 40..41, + }, + SimpleToken { + kind: Bogus, + range: 39..40, + }, + SimpleToken { + kind: Bogus, + range: 38..39, + }, + SimpleToken { + kind: Bogus, + range: 37..38, + }, + SimpleToken { + kind: Bogus, + range: 36..37, + }, + SimpleToken { + kind: Bogus, + range: 35..36, + }, + SimpleToken { + kind: Bogus, + range: 34..35, + }, + SimpleToken { + kind: Bogus, + range: 33..34, + }, + SimpleToken { + kind: Bogus, + range: 32..33, + }, + SimpleToken { + kind: Bogus, + range: 31..32, + }, + SimpleToken { + kind: Bogus, + range: 30..31, + }, + SimpleToken { + kind: Bogus, + range: 29..30, + }, + SimpleToken { + kind: Bogus, + range: 28..29, + }, + SimpleToken { + kind: Bogus, + range: 27..28, + }, + SimpleToken { + kind: 
Bogus, + range: 26..27, + }, + SimpleToken { + kind: Bogus, + range: 25..26, + }, + SimpleToken { + kind: Bogus, + range: 24..25, + }, + SimpleToken { + kind: Bogus, + range: 23..24, + }, + SimpleToken { + kind: Bogus, + range: 22..23, + }, + SimpleToken { + kind: Bogus, + range: 21..22, + }, + SimpleToken { + kind: Bogus, + range: 20..21, + }, + SimpleToken { + kind: Bogus, + range: 19..20, + }, + SimpleToken { + kind: Bogus, + range: 18..19, + }, + SimpleToken { + kind: Bogus, + range: 17..18, + }, + SimpleToken { + kind: Bogus, + range: 16..17, + }, + SimpleToken { + kind: Bogus, + range: 15..16, + }, + SimpleToken { + kind: Bogus, + range: 14..15, + }, + SimpleToken { + kind: Bogus, + range: 13..14, + }, + SimpleToken { + kind: Bogus, + range: 12..13, + }, + SimpleToken { + kind: Bogus, + range: 11..12, + }, + SimpleToken { + kind: Bogus, + range: 10..11, + }, + SimpleToken { + kind: Bogus, + range: 9..10, + }, + SimpleToken { + kind: Bogus, + range: 8..9, + }, + SimpleToken { + kind: Bogus, + range: 7..8, + }, + SimpleToken { + kind: Bogus, + range: 6..7, + }, + SimpleToken { + kind: Bogus, + range: 5..6, + }, + SimpleToken { + kind: Bogus, + range: 4..5, + }, + SimpleToken { + kind: Bogus, + range: 3..4, + }, + SimpleToken { + kind: Bogus, + range: 2..3, + }, + SimpleToken { + kind: Bogus, + range: 1..2, + }, + SimpleToken { + kind: Bogus, + range: 0..1, + }, +] diff --git a/crates/ruff_python_trivia/src/tokenizer.rs b/crates/ruff_python_trivia/src/tokenizer.rs index cb36b06cc7..1a6fdc5151 100644 --- a/crates/ruff_python_trivia/src/tokenizer.rs +++ b/crates/ruff_python_trivia/src/tokenizer.rs @@ -1,4 +1,4 @@ -use memchr::memrchr3_iter; +use memchr::{memchr2, memchr3, memrchr3_iter}; use ruff_text_size::{TextLen, TextRange, TextSize}; use unic_ucd_ident::{is_xid_continue, is_xid_start}; @@ -18,30 +18,22 @@ pub fn first_non_trivia_token(offset: TextSize, code: &str) -> Option Option { - SimpleTokenizer::up_to(offset, code) - .skip_trivia() - .next_back() -} - 
/// Returns the number of newlines between `offset` and the first non whitespace character in the source code. pub fn lines_before(offset: TextSize, code: &str) -> u32 { - let tokens = SimpleTokenizer::up_to(offset, code); - let mut newlines = 0u32; + let mut cursor = Cursor::new(&code[TextRange::up_to(offset)]); - for token in tokens.rev() { - match token.kind() { - SimpleTokenKind::Newline => { + let mut newlines = 0u32; + while let Some(c) = cursor.bump_back() { + match c { + '\n' => { + cursor.eat_char_back('\r'); newlines += 1; } - SimpleTokenKind::Whitespace => { - // ignore + '\r' => { + newlines += 1; + } + c if is_python_whitespace(c) => { + continue; } _ => { break; @@ -54,16 +46,20 @@ pub fn lines_before(offset: TextSize, code: &str) -> u32 { /// Counts the empty lines between `offset` and the first non-whitespace character. pub fn lines_after(offset: TextSize, code: &str) -> u32 { - let tokens = SimpleTokenizer::starts_at(offset, code); - let mut newlines = 0u32; + let mut cursor = Cursor::new(&code[offset.to_usize()..]); - for token in tokens { - match token.kind() { - SimpleTokenKind::Newline => { + let mut newlines = 0u32; + while let Some(c) = cursor.bump() { + match c { + '\n' => { newlines += 1; } - SimpleTokenKind::Whitespace => { - // ignore + '\r' => { + cursor.eat_char('\n'); + newlines += 1; + } + c if is_python_whitespace(c) => { + continue; } _ => { break; @@ -278,6 +274,8 @@ impl<'a> SimpleTokenizer<'a> { } /// Creates a tokenizer that lexes tokens from the start of `source` up to `offset`. + /// + /// Consider using [`SimpleTokenizer::up_to_without_back_comment`] if you intend to lex backwards. pub fn up_to(offset: TextSize, source: &'a str) -> Self { Self::new(source, TextRange::up_to(offset)) } @@ -423,45 +421,40 @@ impl<'a> SimpleTokenizer<'a> { // For all other tokens, test if the character isn't part of a comment. c => { // Skip the test whether there's a preceding comment if it has been performed before.
- let comment_offset = if self.back_line_has_no_comment { + let comment_length = if self.back_line_has_no_comment { None } else { let bytes = self.cursor.chars().as_str().as_bytes(); - let mut line_start = 0; - let mut last_comment_offset = None; + let mut potential_comment_starts: smallvec::SmallVec<[TextSize; 2]> = + smallvec::SmallVec::new(); // Find the start of the line, or any potential comments. for index in memrchr3_iter(b'\n', b'\r', b'#', bytes) { if bytes[index] == b'#' { // Potentially a comment, but not guaranteed - last_comment_offset = Some(index); + // SAFETY: Safe, because ruff only supports files up to 4GB + potential_comment_starts.push(TextSize::try_from(index).unwrap()); } else { - line_start = index + 1; break; } } - // Verify if this is indeed a comment. Doing this only when we've found a comment is significantly - // faster because comments are rare. - last_comment_offset.filter(|last_comment_offset| { - let before_comment = - &self.cursor.chars().as_str()[line_start..*last_comment_offset]; - - before_comment.chars().all(|c| { - is_python_whitespace(c) - || SimpleTokenKind::from_non_trivia_char(c) - != SimpleTokenKind::Other - }) - }) + // No comments + if potential_comment_starts.is_empty() { + None + } else { + // The line contains at least one `#` token. The `#` can indicate the start of a + // comment, meaning the current token is commented out, or it is a regular `#` inside of a string. + self.comment_from_hash_positions(&potential_comment_starts) + } }; // From here on it is guaranteed that this line has no other comment. 
self.back_line_has_no_comment = true; - if let Some(comment_offset) = comment_offset { - let comment_length = self.cursor.chars().as_str().len() - comment_offset; + if let Some(comment_length) = comment_length { // It is a comment, bump all tokens - for _ in 0..comment_length { + for _ in 0..usize::from(comment_length) { self.cursor.bump_back().unwrap(); } @@ -519,6 +512,141 @@ impl<'a> SimpleTokenizer<'a> { pub fn skip_trivia(self) -> impl Iterator + DoubleEndedIterator + 'a { self.filter(|t| !t.kind().is_trivia()) } + + /// Given the position of `#` tokens on a line, test if any `#` is the start of a comment and, if so, return the + /// length of the comment. + /// + /// The challenge is that `#` tokens can also appear inside of strings: + /// + /// ```python + /// ' #not a comment' + /// ``` + /// + /// This looks innocent but is the `'` really the start of the new string or could it be a closing delimiter + /// of a previously started string: + /// + /// ```python + /// ' a string\ + /// ` # a comment ' + /// ``` + /// + /// The only way to reliably tell whether the `#` is a comment when the comment contains a quote char is + /// to forward lex all strings and comments and test if there's any unclosed string literal. If so, then + /// the hash cannot be a comment. + fn comment_from_hash_positions(&self, hash_positions: &[TextSize]) -> Option { + // Iterate over the `#` positions from the start to the end of the line. + // This is necessary to correctly support `a # comment # comment`. + for possible_start in hash_positions.iter().rev() { + let comment_bytes = + self.source[TextRange::new(*possible_start, self.back_offset)].as_bytes(); + + // Test if the comment contains any quotes.
If so, then it's possible that the `#` token isn't + // the start of a comment, but instead part of a string: + // ```python + // a + 'a string # not a comment' + // a + '''a string + // # not a comment''' + // ``` + match memchr2(b'\'', b'"', comment_bytes) { + // Most comments don't contain quotes, and most strings don't contain comments. + // For these it's safe to assume that they are comments. + None => return Some(self.cursor.chars().as_str().text_len() - possible_start), + // Now it gets complicated... There's no good way to know whether this is a string or not. + // It is necessary to lex all strings and comments from the start to know if it is one or the other. + Some(_) => { + if find_unterminated_string_kind( + &self.cursor.chars().as_str()[TextRange::up_to(*possible_start)], + ) + .is_none() + { + // There's no unterminated string at the comment's start position. This *must* + // be a comment. + return Some(self.cursor.chars().as_str().text_len() - possible_start); + } + + // This is a hash inside of a string: `'test # not a comment'` continue with the next potential comment on the line. 
+ } + } + } + + None + } +} + +fn find_unterminated_string_kind(input: &str) -> Option { + let mut rest = input; + + while let Some(comment_or_string_start) = memchr3(b'#', b'\'', b'\"', rest.as_bytes()) { + let c = rest.as_bytes()[comment_or_string_start]; + let after = &rest[comment_or_string_start + 1..]; + + if c == b'#' { + let comment_end = memchr2(b'\n', b'\r', after.as_bytes()).unwrap_or(after.len()); + rest = &after[comment_end..]; + } else { + let mut cursor = Cursor::new(after); + let quote_kind = if c == b'\'' { + QuoteKind::Single + } else { + QuoteKind::Double + }; + + let string_kind = if cursor.eat_char(quote_kind.as_char()) { + // `''` or `""` + if cursor.eat_char(quote_kind.as_char()) { + // `'''` or `"""` + StringKind::Triple(quote_kind) + } else { + // empty string literal, nothing more to lex + continue; + } + } else { + StringKind::Single(quote_kind) + }; + + if !is_string_terminated(string_kind, &mut cursor) { + return Some(string_kind); + } + + rest = cursor.chars().as_str(); + } + } + + None +} + +fn is_string_terminated(kind: StringKind, cursor: &mut Cursor) -> bool { + let quote_char = kind.quote_kind().as_char(); + + while let Some(c) = cursor.bump() { + match c { + '\n' | '\r' if kind.is_single() => { + // Reached the end of the line without a closing quote, this is an unterminated string literal. 
+ return false; + } + '\\' => { + // Skip over escaped quotes that match this string's quotes or double escaped backslashes + if cursor.eat_char(quote_char) || cursor.eat_char('\\') { + continue; + } + // Eat over line continuation + cursor.eat_char('\r'); + cursor.eat_char('\n'); + } + c if c == quote_char => { + if kind.is_single() || (cursor.eat_char(quote_char) && cursor.eat_char(quote_char)) + { + return true; + } + } + _ => { + // continue + } + } + } + + // Reached end without a closing quote + false } impl Iterator for SimpleTokenizer<'_> { @@ -547,6 +675,45 @@ impl DoubleEndedIterator for SimpleTokenizer<'_> { } } +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +enum StringKind { + /// `'...'` or `"..."` + Single(QuoteKind), + /// `'''...'''` or `"""..."""` + Triple(QuoteKind), +} + +impl StringKind { + const fn quote_kind(self) -> QuoteKind { + match self { + StringKind::Single(kind) => kind, + StringKind::Triple(kind) => kind, + } + } + + const fn is_single(self) -> bool { + matches!(self, StringKind::Single(_)) + } +} + +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +enum QuoteKind { + /// `'` + Single, + + /// `"` + Double, +} + +impl QuoteKind { + const fn as_char(self) -> char { + match self { + QuoteKind::Single => '\'', + QuoteKind::Double => '"', + } + } +} + #[cfg(test)] mod tests { use insta::assert_debug_snapshot; @@ -708,6 +875,72 @@ mod tests { assert_debug_snapshot!("Reverse", test_case.tokenize_reverse()); } + #[test] + fn single_quoted_multiline_string_containing_comment() { + let test_case = tokenize( + r#"'This string contains a hash looking like a comment\ +# This is not a comment'"#, + ); + + assert_debug_snapshot!(test_case.tokenize_reverse()); + } + + #[test] + fn single_quoted_multiline_string_implicit_concatenation() { + let test_case = tokenize( + r#"'This string contains a hash looking like a comment\ +# This is' "not_a_comment""#, + ); + + assert_debug_snapshot!(test_case.tokenize_reverse()); + } + + #[test] + fn
triple_quoted_multiline_string_containing_comment() { + let test_case = tokenize( + r#"'''This string contains a hash looking like a comment +# This is not a comment'''"#, + ); + + assert_debug_snapshot!(test_case.tokenize_reverse()); + } + + #[test] + fn comment_containing_triple_quoted_string() { + let test_case = tokenize("'''leading string''' # a comment '''not a string'''"); + + assert_debug_snapshot!(test_case.tokenize_reverse()); + } + + #[test] + fn comment_containing_single_quoted_string() { + let test_case = tokenize("'leading string' # a comment 'not a string'"); + + assert_debug_snapshot!(test_case.tokenize_reverse()); + } + + #[test] + fn string_followed_by_multiple_comments() { + let test_case = + tokenize(r#"'a string # containing a hash " # and another hash ' # finally a comment"#); + + assert_debug_snapshot!(test_case.tokenize_reverse()); + } + + #[test] + fn string_with_escaped_quote() { + let test_case = tokenize(r#"'a string \' # containing a hash ' # finally a comment"#); + + assert_debug_snapshot!(test_case.tokenize_reverse()); + } + + #[test] + fn string_with_double_escaped_backslash() { + let test_case = tokenize(r#"'a string \\' # a comment '"#); + + assert_debug_snapshot!(test_case.tokenize_reverse()); + } + #[test] fn lines_before_empty_string() { assert_eq!(lines_before(TextSize::new(0), ""), 0);