ruff/crates/ruff_python_parser/src/token_source.rs
Dhruv Manilawala 8499abfa7f Implement re-lexing logic for better error recovery (#11845)
## Summary

This PR implements the re-lexing logic in the parser.

This logic is only applied when recovering from an error during list
parsing. The logic is as follows:
1. During list parsing, if an unexpected token is encountered and the parser
detects that an outer context can understand it (and thus recover from it), it
invokes the re-lexing logic in the lexer
2. The lexer first checks whether it's in a parenthesized context and returns
early if it's not, so the logic is a no-op outside of parentheses
3. It then reduces the nesting level by 1. It shouldn't be reset to 0 because
that would make recovery from nested list parsing incorrect
4. Then, it scans backwards from the lexer's current position for the last
newline character, skipping over any whitespace. If it encounters any character
other than a newline or whitespace, it aborts (a standalone sketch of this scan
is shown after this list)
5. If a newline character is found, it needs to be re-lexed in a logical
context, which means the lexer needs to emit it as a `Newline` token instead of
`NonLogicalNewline`
6. If the re-lexing yields a different token than the current one, the token
source needs to update its token collection to remove all the tokens that come
after the new current position
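
To make steps 4 and 5 concrete, here's a minimal standalone sketch of the backwards scan. It is illustrative only, not the lexer's actual code, and it assumes the source is available as a `&str` with `position` being the lexer's current byte offset (on a char boundary):

```rust
/// Scan backwards from `position`, skipping whitespace, and return the offset
/// of the last newline if one is found before any other character.
fn find_newline_for_re_lexing(source: &str, position: usize) -> Option<usize> {
    for (offset, ch) in source[..position].char_indices().rev() {
        match ch {
            // Found the newline that should be re-emitted as a logical
            // `Newline` token (step 5); re-lex from this offset.
            '\n' | '\r' => return Some(offset),
            // Skip over any intervening whitespace (spaces, tabs, ...).
            c if c.is_whitespace() => continue,
            // Any other character means re-lexing doesn't apply; abort.
            _ => return None,
        }
    }
    None
}
```

For example, given an unclosed call such as `call(a, b` followed by a new statement on the next line, the lexer is still in a parenthesized context and would normally emit the newline as `NonLogicalNewline`; re-lexing it as a logical `Newline` is what lets the outer statement context recover.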

It turns out that the list parsing isn't entirely happy with the results, so it
requires some re-arranging such that the following two errors are raised
correctly:
1. Expected comma
2. Recovery context error

For (1), the following scenarios need to be considered:
* Missing comma between two elements
* A half-parsed element because the grammar doesn't allow it (for example,
named expressions)

For (2), the following scenarios need to be considered (a rough sketch of this decision follows the list):
1. The parser is at a comma, which means there's a missing element; otherwise
the comma would've been consumed by the first `eat` call above. (Also, the
parser doesn't take the re-lexing route on a comma token.)
2. It's the first element and the current token is not a comma, which means
it's an invalid element.
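
The following is a rough sketch of how these cases map to the two errors; the parameter names and returned messages are hypothetical and the logic is heavily simplified, not the parser's real API or diagnostics:

```rust
/// Simplified illustration of the error selection described above.
fn list_recovery_error(at_comma: bool, is_first_element: bool) -> &'static str {
    if at_comma {
        // Stopping *at* a comma means an element is missing: a comma that
        // separates two elements would already have been consumed by the
        // first `eat` call.
        "missing element (recovery context error)"
    } else if is_first_element {
        // The very first element failed to parse and we're not at a comma,
        // so the current token is an invalid element.
        "invalid element (recovery context error)"
    } else {
        // Otherwise, assume a missing comma between two elements or a
        // half-parsed element (e.g., a named expression the grammar rejects).
        "expected comma"
    }
}
```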

resolves: #11640 

## Test Plan

- [x] Update existing test snapshots and validate them
- [x] Add additional test cases specific to the re-lexing logic and
validate the snapshots
- [x] Run the fuzzer on 3000+ valid inputs
- [x] Run the fuzzer on invalid inputs
- [x] Run the parser on various open source projects
- [x] Make sure there are no ecosystem changes
2024-06-17 06:47:00 +00:00


use ruff_text_size::{Ranged, TextRange, TextSize};
use crate::lexer::{Lexer, LexerCheckpoint, LexicalError, Token, TokenFlags, TokenValue};
use crate::{Mode, TokenKind};
/// Token source for the parser that skips over any trivia tokens.
#[derive(Debug)]
pub(crate) struct TokenSource<'src> {
/// The underlying source for the tokens.
lexer: Lexer<'src>,
/// A vector containing all the tokens emitted by the lexer. This is returned when the parser
/// is finished consuming all the tokens. Note that unlike the emitted tokens, this vector
/// holds both the trivia and non-trivia tokens.
tokens: Vec<Token>,
}
impl<'src> TokenSource<'src> {
/// Create a new token source for the given lexer.
pub(crate) fn new(lexer: Lexer<'src>) -> Self {
// TODO(dhruvmanila): Use `allocate_tokens_vec`
TokenSource {
lexer,
tokens: vec![],
}
}
/// Create a new token source from the given source code which starts at the given offset.
pub(crate) fn from_source(source: &'src str, mode: Mode, start_offset: TextSize) -> Self {
let lexer = Lexer::new(source, mode, start_offset);
let mut source = TokenSource::new(lexer);
// Initialize the token source so that the current token is set correctly.
source.do_bump();
source
}
/// Returns the kind of the current token.
pub(crate) fn current_kind(&self) -> TokenKind {
self.lexer.current_kind()
}
/// Returns the range of the current token.
pub(crate) fn current_range(&self) -> TextRange {
self.lexer.current_range()
}
/// Returns the flags for the current token.
pub(crate) fn current_flags(&self) -> TokenFlags {
self.lexer.current_flags()
}
/// Calls the underlying [`take_value`] method on the lexer. Refer to its documentation
/// for more info.
///
/// [`take_value`]: Lexer::take_value
pub(crate) fn take_value(&mut self) -> TokenValue {
self.lexer.take_value()
}
/// Calls the underlying [`re_lex_logical_token`] method on the lexer and updates the token
/// vector accordingly.
///
/// [`re_lex_logical_token`]: Lexer::re_lex_logical_token
pub(crate) fn re_lex_logical_token(&mut self) {
if self.lexer.re_lex_logical_token() {
let current_start = self.current_range().start();
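        // Re-lexing may have moved the lexer back to an earlier position. Any
        // tokens already collected at or after the new current position are now
        // stale, so drop them; the lexer will emit them again from that position.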
while self
.tokens
.last()
.is_some_and(|last| last.start() >= current_start)
{
self.tokens.pop();
}
}
}
/// Returns the next non-trivia token without consuming it.
///
/// Use [`peek2`] to get the next two tokens.
///
/// [`peek2`]: TokenSource::peek2
pub(crate) fn peek(&mut self) -> TokenKind {
let checkpoint = self.lexer.checkpoint();
let next = self.next_non_trivia_token();
self.lexer.rewind(checkpoint);
next
}
    /// Returns the next two non-trivia tokens without consuming them.
///
/// Use [`peek`] to only get the next token.
///
/// [`peek`]: TokenSource::peek
pub(crate) fn peek2(&mut self) -> (TokenKind, TokenKind) {
let checkpoint = self.lexer.checkpoint();
let first = self.next_non_trivia_token();
let second = self.next_non_trivia_token();
self.lexer.rewind(checkpoint);
(first, second)
}
/// Bumps the token source to the next non-trivia token.
///
/// It pushes the given kind to the token vector with the current token range.
pub(crate) fn bump(&mut self, kind: TokenKind) {
self.tokens
.push(Token::new(kind, self.current_range(), self.current_flags()));
self.do_bump();
}
/// Bumps the token source to the next non-trivia token without adding the current token to the
/// token vector. It does add the trivia tokens to the token vector.
fn do_bump(&mut self) {
loop {
let kind = self.lexer.next_token();
if is_trivia(kind) {
self.tokens
.push(Token::new(kind, self.current_range(), self.current_flags()));
continue;
}
break;
}
}
/// Returns the next non-trivia token without adding it to the token vector.
fn next_non_trivia_token(&mut self) -> TokenKind {
loop {
let kind = self.lexer.next_token();
if is_trivia(kind) {
continue;
}
break kind;
}
}
    /// Creates a checkpoint to which the token source can later return using [`Self::rewind`].
pub(crate) fn checkpoint(&self) -> TokenSourceCheckpoint {
TokenSourceCheckpoint {
lexer_checkpoint: self.lexer.checkpoint(),
tokens_position: self.tokens.len(),
}
}
/// Restore the token source to the given checkpoint.
pub(crate) fn rewind(&mut self, checkpoint: TokenSourceCheckpoint) {
let TokenSourceCheckpoint {
lexer_checkpoint,
tokens_position,
} = checkpoint;
self.lexer.rewind(lexer_checkpoint);
self.tokens.truncate(tokens_position);
}
    /// Consumes the token source, returning the collected tokens and any errors encountered
    /// during lexing. The token collection includes both the trivia and non-trivia tokens.
pub(crate) fn finish(mut self) -> (Vec<Token>, Vec<LexicalError>) {
assert_eq!(
self.current_kind(),
TokenKind::EndOfFile,
"TokenSource was not fully consumed"
);
// The `EndOfFile` token shouldn't be included in the token stream, it's mainly to signal
// the parser to stop. This isn't in `do_bump` because it only needs to be done once.
if let Some(last) = self.tokens.pop() {
assert_eq!(last.kind(), TokenKind::EndOfFile);
}
(self.tokens, self.lexer.finish())
}
}
pub(crate) struct TokenSourceCheckpoint {
lexer_checkpoint: LexerCheckpoint,
tokens_position: usize,
}
/// Allocates a [`Vec`] with an approximated capacity to fit all tokens
/// of `contents`.
///
/// See [#9546](https://github.com/astral-sh/ruff/pull/9546) for a more detailed explanation.
#[allow(dead_code)]
fn allocate_tokens_vec(contents: &str) -> Vec<Token> {
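    // Rough heuristic: assume about one token per ~7 bytes of source (15% of the
    // content length), used as a lower bound to avoid repeated re-allocations.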
let lower_bound = contents.len().saturating_mul(15) / 100;
Vec::with_capacity(lower_bound)
}
fn is_trivia(token: TokenKind) -> bool {
matches!(token, TokenKind::Comment | TokenKind::NonLogicalNewline)
}