use std::{iter::FusedIterator, ops::Deref};

use ruff_python_trivia::CommentRanges;
use ruff_text_size::{Ranged as _, TextRange, TextSize};

use super::{Token, TokenKind};

/// Tokens represents a vector of lexed [`Token`].
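///
/// `Tokens` dereferences to `[Token]` (see the `Deref` impl below), so slice
/// methods can be used directly. A minimal illustrative sketch:
///
/// ```ignore
/// let first_kind = tokens.first().map(|token| token.kind());
/// ```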
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "get-size", derive(get_size2::GetSize))]
pub struct Tokens {
    raw: Vec<Token>,
}

impl Tokens {
    pub fn new(tokens: Vec<Token>) -> Tokens {
        Tokens { raw: tokens }
    }

    /// Returns an iterator over all the tokens that provides context.
    pub fn iter_with_context(&self) -> TokenIterWithContext<'_> {
        TokenIterWithContext::new(&self.raw)
    }

    /// Performs a binary search to find the index of the **first** token that starts at the given `offset`.
    ///
    /// Unlike `binary_search_by_key`, this method ensures that if multiple tokens start at the same offset,
    /// it returns the index of the first one. Multiple tokens can start at the same offset in cases where
    /// zero-length tokens are involved (like `Dedent` or `Newline` at the end of the file).
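    ///
    /// A minimal illustrative sketch (mirroring the test fixture below, where
    /// `Name` spans `4..7`):
    ///
    /// ```ignore
    /// // `Ok` with the index of the first token starting at `offset` ...
    /// assert_eq!(tokens.binary_search_by_start(TextSize::new(4)), Ok(1));
    /// // ... or `Err` with the insertion point if no token starts there.
    /// assert_eq!(tokens.binary_search_by_start(TextSize::new(5)), Err(2));
    /// ```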
    pub fn binary_search_by_start(&self, offset: TextSize) -> Result<usize, usize> {
        let partition_point = self.partition_point(|token| token.start() < offset);

        let after = &self[partition_point..];

        if after.first().is_some_and(|first| first.start() == offset) {
            Ok(partition_point)
        } else {
            Err(partition_point)
        }
    }

    /// Returns a slice of [`Token`] that are within the given `range`.
    ///
    /// The start and end offset of the given range should be either:
    /// 1. Token boundary
    /// 2. Gap between the tokens
    ///
    /// For example, considering the following tokens and their corresponding range:
    ///
    /// | Token               | Range     |
    /// |---------------------|-----------|
    /// | `Def`               | `0..3`    |
    /// | `Name`              | `4..7`    |
    /// | `Lpar`              | `7..8`    |
    /// | `Rpar`              | `8..9`    |
    /// | `Colon`             | `9..10`   |
    /// | `Newline`           | `10..11`  |
    /// | `Comment`           | `15..24`  |
    /// | `NonLogicalNewline` | `24..25`  |
    /// | `Indent`            | `25..29`  |
    /// | `Pass`              | `29..33`  |
    ///
    /// Here, for (1) a token boundary is considered either the start or end offset of any of the
    /// above tokens. For (2), the gap would be any offset between the `Newline` and `Comment`
    /// tokens, which are 12, 13, and 14.
    ///
    /// Examples:
    /// 1) `4..10` would give `Name`, `Lpar`, `Rpar`, `Colon`
    /// 2) `11..25` would give `Comment`, `NonLogicalNewline`
    /// 3) `12..25` would give the same as (2), and offset 12 is in the "gap"
    /// 4) `9..12` would give `Colon`, `Newline`, and offset 12 is in the "gap"
    /// 5) `18..27` would panic because both the start and end offsets are within a token
    ///
    /// ## Note
    ///
    /// The returned slice can contain the [`TokenKind::Unknown`] token if there was a lexical
    /// error encountered within the given range.
    ///
    /// # Panics
    ///
    /// If either the start or end offset of the given range is within a token range.
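    ///
    /// An illustrative sketch of example (1), mirroring the tests below:
    ///
    /// ```ignore
    /// let in_range = tokens.in_range(TextRange::new(4.into(), 10.into()));
    /// assert_eq!(in_range.first().unwrap().kind(), TokenKind::Name);
    /// assert_eq!(in_range.last().unwrap().kind(), TokenKind::Colon);
    /// ```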
    pub fn in_range(&self, range: TextRange) -> &[Token] {
        let tokens_after_start = self.after(range.start());

        Self::before_impl(tokens_after_start, range.end())
    }

    /// Searches the token(s) at `offset`.
    ///
    /// Returns [`TokenAt::Between`] if `offset` points directly in between two tokens
    /// (the left token ends at `offset` and the right token starts at `offset`).
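    ///
    /// A minimal illustrative sketch, using the test fixture below (where
    /// `Name` is `4..7` and `Lpar` is `7..8`):
    ///
    /// ```ignore
    /// assert!(matches!(tokens.at_offset(TextSize::new(5)), TokenAt::Single(_)));     // inside `Name`
    /// assert!(matches!(tokens.at_offset(TextSize::new(7)), TokenAt::Between(_, _))); // `Name` | `Lpar`
    /// assert!(matches!(tokens.at_offset(TextSize::new(12)), TokenAt::None));         // in the gap
    /// ```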
    pub fn at_offset(&self, offset: TextSize) -> TokenAt {
        match self.binary_search_by_start(offset) {
            // The token at `index` starts exactly at `offset`.
            // ```python
            // object.attribute
            //        ^ OFFSET
            // ```
            Ok(index) => {
                let token = self[index];
                // `token` starts exactly at `offset`. Test if the offset is right between
                // `token` and the previous token (if there's any).
                if let Some(previous) = index.checked_sub(1).map(|idx| self[idx]) {
                    if previous.end() == offset {
                        return TokenAt::Between(previous, token);
                    }
                }

                TokenAt::Single(token)
            }

            // No token found that starts exactly at the given offset. But it's possible that
            // the token starting before `offset` fully encloses `offset` (its end offset is after `offset`).
            // ```python
            // object.attribute
            //           ^ OFFSET
            // # or
            // if True:
            //     print("test")
            //            ^ OFFSET
            // ```
            Err(index) => {
                if let Some(previous) = index.checked_sub(1).map(|idx| self[idx]) {
                    if previous.range().contains_inclusive(offset) {
                        return TokenAt::Single(previous);
                    }
                }

                TokenAt::None
            }
        }
    }

    /// Returns a slice of tokens before the given [`TextSize`] offset.
    ///
    /// If the given offset is between two tokens, the returned slice will end just before the
    /// following token. In other words, if the offset is between the end of the previous token
    /// and the start of the next token, the returned slice will end just before the next token.
    ///
    /// # Panics
    ///
    /// If the given offset is inside a token range at any point
    /// other than the start of the range.
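    ///
    /// An illustrative sketch, mirroring the tests below (offset 13 falls in the
    /// gap after `Newline`, which is `10..11`):
    ///
    /// ```ignore
    /// let before = tokens.before(TextSize::new(13));
    /// assert_eq!(before.last().unwrap().kind(), TokenKind::Newline);
    /// ```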
    pub fn before(&self, offset: TextSize) -> &[Token] {
        Self::before_impl(&self.raw, offset)
    }

    fn before_impl(tokens: &[Token], offset: TextSize) -> &[Token] {
        let partition_point = tokens.partition_point(|token| token.start() < offset);
        let before = &tokens[..partition_point];

        if let Some(last) = before.last() {
            // If it's equal to the end offset, then it's at a token boundary which is
            // valid. If it's greater than the end offset, then it's in the gap between
            // the tokens which is valid as well.
            assert!(
                offset >= last.end(),
                "Offset {offset:?} is inside token `{last:?}`",
            );
        }

        before
    }

    /// Returns a slice of tokens after the given [`TextSize`] offset.
    ///
    /// If the given offset is between two tokens, the returned slice will start from the following
    /// token. In other words, if the offset is between the end of the previous token and the start
    /// of the next token, the returned slice will start from the next token.
    ///
    /// # Panics
    ///
    /// If the given offset is inside a token range at any point
    /// other than the start of the range.
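    ///
    /// An illustrative sketch, mirroring the tests below (offset 13 falls in the
    /// gap before `Comment`, which is `15..24`):
    ///
    /// ```ignore
    /// let after = tokens.after(TextSize::new(13));
    /// assert_eq!(after.first().unwrap().kind(), TokenKind::Comment);
    /// ```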
    pub fn after(&self, offset: TextSize) -> &[Token] {
        let partition_point = self.partition_point(|token| token.end() <= offset);
        let after = &self[partition_point..];

        if let Some(first) = after.first() {
            // If it's equal to the start offset, then it's at a token boundary which is
            // valid. If it's less than the start offset, then it's in the gap between
            // the tokens which is valid as well.
            assert!(
                offset <= first.start(),
                "Offset {offset:?} is inside token `{first:?}`",
            );
        }

        after
    }
}

impl<'a> IntoIterator for &'a Tokens {
    type Item = &'a Token;
    type IntoIter = std::slice::Iter<'a, Token>;

    fn into_iter(self) -> Self::IntoIter {
        self.iter()
    }
}

impl Deref for Tokens {
    type Target = [Token];

    fn deref(&self) -> &Self::Target {
        &self.raw
    }
}

/// A token that encloses a given offset or ends exactly at it.
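///
/// A `TokenAt` can also be iterated (see the `Iterator` impl below), yielding
/// zero, one, or two tokens in source order. A minimal illustrative sketch:
///
/// ```ignore
/// let mut at = tokens.at_offset(TextSize::new(7));
/// let left = at.next();  // for `Between`, the token ending at the offset ...
/// let right = at.next(); // ... then the token starting at it
/// assert!(at.next().is_none());
/// ```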
#[derive(Debug, Clone)]
pub enum TokenAt {
    /// There's no token at the given offset
    None,

    /// There's a single token at the given offset.
    Single(Token),

    /// The offset falls exactly between two tokens. E.g. `CURSOR` in `call<CURSOR>(arguments)` is
    /// positioned exactly between the `call` and `(` tokens.
    Between(Token, Token),
}

impl Iterator for TokenAt {
    type Item = Token;

    fn next(&mut self) -> Option<Self::Item> {
        match *self {
            TokenAt::None => None,
            TokenAt::Single(token) => {
                *self = TokenAt::None;
                Some(token)
            }
            TokenAt::Between(first, second) => {
                *self = TokenAt::Single(second);
                Some(first)
            }
        }
    }
}

impl FusedIterator for TokenAt {}

impl From<&Tokens> for CommentRanges {
    fn from(tokens: &Tokens) -> Self {
        let mut ranges = vec![];
        for token in tokens {
            if token.kind() == TokenKind::Comment {
                ranges.push(token.range());
            }
        }
        CommentRanges::new(ranges)
    }
}

/// An iterator over the [`Token`]s with context.
///
/// This struct is created by the [`iter_with_context`] method on [`Tokens`]. Refer to its
/// documentation for more details.
///
/// [`iter_with_context`]: Tokens::iter_with_context
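///
/// An illustrative sketch of the nesting tracking, assuming tokens lexed from
/// `f(x)`:
///
/// ```ignore
/// let mut iter = tokens.iter_with_context();
/// iter.next(); // `f` -> nesting() == 0
/// iter.next(); // `(` -> nesting() == 1
/// assert!(iter.in_parenthesized_context());
/// iter.next(); // `x` -> nesting() == 1
/// iter.next(); // `)` -> nesting() == 0
/// ```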
#[derive(Debug, Clone)]
pub struct TokenIterWithContext<'a> {
    inner: std::slice::Iter<'a, Token>,
    nesting: u32,
}

impl<'a> TokenIterWithContext<'a> {
    fn new(tokens: &'a [Token]) -> TokenIterWithContext<'a> {
        TokenIterWithContext {
            inner: tokens.iter(),
            nesting: 0,
        }
    }

    /// Return the nesting level the iterator is currently in.
    pub const fn nesting(&self) -> u32 {
        self.nesting
    }

    /// Returns `true` if the iterator is within a parenthesized context.
    pub const fn in_parenthesized_context(&self) -> bool {
        self.nesting > 0
    }

    /// Returns the next [`Token`] in the iterator without consuming it.
    pub fn peek(&self) -> Option<&'a Token> {
        self.clone().next()
    }
}

impl<'a> Iterator for TokenIterWithContext<'a> {
    type Item = &'a Token;

    fn next(&mut self) -> Option<Self::Item> {
        let token = self.inner.next()?;

        match token.kind() {
            TokenKind::Lpar | TokenKind::Lbrace | TokenKind::Lsqb => self.nesting += 1,
            TokenKind::Rpar | TokenKind::Rbrace | TokenKind::Rsqb => {
                self.nesting = self.nesting.saturating_sub(1);
            }
            // This mimics the behavior of re-lexing which reduces the nesting level on the lexer.
            // We don't need to reduce it by 1 because unlike the lexer we see the final token
            // after recovering from every unclosed parenthesis.
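            // Illustrative example (not from the original comment): after
            // `x = (1, (2,` the recovered `Newline` arrives while the nesting
            // here is still 2, so resetting straight to 0 covers every
            // unclosed parenthesis at once.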
            TokenKind::Newline if self.nesting > 0 => {
                self.nesting = 0;
            }
            _ => {}
        }

        Some(token)
    }
}

impl FusedIterator for TokenIterWithContext<'_> {}

#[cfg(test)]
mod tests {
    use std::ops::Range;

    use ruff_text_size::TextSize;

    use crate::token::{Token, TokenFlags, TokenKind};

    use super::*;

    /// Test case containing a "gap" between two tokens.
    ///
    /// Code: <https://play.ruff.rs/a3658340-6df8-42c5-be80-178744bf1193>
    const TEST_CASE_WITH_GAP: [(TokenKind, Range<u32>); 10] = [
        (TokenKind::Def, 0..3),
        (TokenKind::Name, 4..7),
        (TokenKind::Lpar, 7..8),
        (TokenKind::Rpar, 8..9),
        (TokenKind::Colon, 9..10),
        (TokenKind::Newline, 10..11),
        // Gap                ||..||
        (TokenKind::Comment, 15..24),
        (TokenKind::NonLogicalNewline, 24..25),
        (TokenKind::Indent, 25..29),
        (TokenKind::Pass, 29..33),
        // No newline at the end to keep the token set full of unique tokens
    ];

    /// Helper function to create [`Tokens`] from an iterator of (kind, range).
    fn new_tokens(tokens: impl Iterator<Item = (TokenKind, Range<u32>)>) -> Tokens {
        Tokens::new(
            tokens
                .map(|(kind, range)| {
                    Token::new(
                        kind,
                        TextRange::new(TextSize::new(range.start), TextSize::new(range.end)),
                        TokenFlags::empty(),
                    )
                })
                .collect(),
        )
    }

    #[test]
    fn tokens_after_offset_at_token_start() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let after = tokens.after(TextSize::new(8));
        assert_eq!(after.len(), 7);
        assert_eq!(after.first().unwrap().kind(), TokenKind::Rpar);
    }

    #[test]
    fn tokens_after_offset_at_token_end() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let after = tokens.after(TextSize::new(11));
        assert_eq!(after.len(), 4);
        assert_eq!(after.first().unwrap().kind(), TokenKind::Comment);
    }

    #[test]
    fn tokens_after_offset_between_tokens() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let after = tokens.after(TextSize::new(13));
        assert_eq!(after.len(), 4);
        assert_eq!(after.first().unwrap().kind(), TokenKind::Comment);
    }

    #[test]
    fn tokens_after_offset_at_last_token_end() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let after = tokens.after(TextSize::new(33));
        assert_eq!(after.len(), 0);
    }

    #[test]
    #[should_panic(expected = "Offset 5 is inside token `Name 4..7`")]
    fn tokens_after_offset_inside_token() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        tokens.after(TextSize::new(5));
    }

    #[test]
    fn tokens_before_offset_at_first_token_start() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let before = tokens.before(TextSize::new(0));
        assert_eq!(before.len(), 0);
    }

    #[test]
    fn tokens_before_offset_after_first_token_gap() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let before = tokens.before(TextSize::new(3));
        assert_eq!(before.len(), 1);
        assert_eq!(before.last().unwrap().kind(), TokenKind::Def);
    }

    #[test]
    fn tokens_before_offset_at_second_token_start() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let before = tokens.before(TextSize::new(4));
        assert_eq!(before.len(), 1);
        assert_eq!(before.last().unwrap().kind(), TokenKind::Def);
    }

    #[test]
    fn tokens_before_offset_at_token_start() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let before = tokens.before(TextSize::new(8));
        assert_eq!(before.len(), 3);
        assert_eq!(before.last().unwrap().kind(), TokenKind::Lpar);
    }

    #[test]
    fn tokens_before_offset_at_token_end() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let before = tokens.before(TextSize::new(11));
        assert_eq!(before.len(), 6);
        assert_eq!(before.last().unwrap().kind(), TokenKind::Newline);
    }

    #[test]
    fn tokens_before_offset_between_tokens() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let before = tokens.before(TextSize::new(13));
        assert_eq!(before.len(), 6);
        assert_eq!(before.last().unwrap().kind(), TokenKind::Newline);
    }

    #[test]
    fn tokens_before_offset_at_last_token_end() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let before = tokens.before(TextSize::new(33));
        assert_eq!(before.len(), 10);
        assert_eq!(before.last().unwrap().kind(), TokenKind::Pass);
    }

    #[test]
    #[should_panic(expected = "Offset 5 is inside token `Name 4..7`")]
    fn tokens_before_offset_inside_token() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        tokens.before(TextSize::new(5));
    }

    #[test]
    fn tokens_in_range_at_token_offset() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let in_range = tokens.in_range(TextRange::new(4.into(), 10.into()));
        assert_eq!(in_range.len(), 4);
        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Name);
        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Colon);
    }

    #[test]
    fn tokens_in_range_start_offset_at_token_end() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let in_range = tokens.in_range(TextRange::new(11.into(), 29.into()));
        assert_eq!(in_range.len(), 3);
        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Comment);
        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Indent);
    }

    #[test]
    fn tokens_in_range_end_offset_at_token_start() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let in_range = tokens.in_range(TextRange::new(8.into(), 15.into()));
        assert_eq!(in_range.len(), 3);
        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Rpar);
        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Newline);
    }

    #[test]
    fn tokens_in_range_start_offset_between_tokens() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let in_range = tokens.in_range(TextRange::new(13.into(), 29.into()));
        assert_eq!(in_range.len(), 3);
        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Comment);
        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Indent);
    }

    #[test]
    fn tokens_in_range_end_offset_between_tokens() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let in_range = tokens.in_range(TextRange::new(9.into(), 13.into()));
        assert_eq!(in_range.len(), 2);
        assert_eq!(in_range.first().unwrap().kind(), TokenKind::Colon);
        assert_eq!(in_range.last().unwrap().kind(), TokenKind::Newline);
    }

    #[test]
    #[should_panic(expected = "Offset 5 is inside token `Name 4..7`")]
    fn tokens_in_range_start_offset_inside_token() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        tokens.in_range(TextRange::new(5.into(), 10.into()));
    }

    #[test]
    #[should_panic(expected = "Offset 6 is inside token `Name 4..7`")]
    fn tokens_in_range_end_offset_inside_token() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        tokens.in_range(TextRange::new(0.into(), 6.into()));
    }
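
    /// An illustrative sketch (not part of the original suite): converting
    /// [`Tokens`] to [`CommentRanges`] keeps only the `Comment` token ranges.
    /// Assumes `CommentRanges` dereferences to `[TextRange]`.
    #[test]
    fn comment_ranges_from_tokens() {
        let tokens = new_tokens(TEST_CASE_WITH_GAP.into_iter());
        let comment_ranges = ruff_python_trivia::CommentRanges::from(&tokens);
        // The fixture contains a single `Comment` token spanning `15..24`.
        assert_eq!(comment_ranges.len(), 1);
    }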
}
|
|
}
|