//! Struct used to efficiently slice source code at (row, column) Locations. use once_cell::unsync::OnceCell; use rustpython_parser::ast::Location; use crate::types::Range; pub struct Locator<'a> { contents: &'a str, index: OnceCell, } impl<'a> Locator<'a> { pub const fn new(contents: &'a str) -> Self { Self { contents, index: OnceCell::new(), } } fn get_or_init_index(&self) -> &Index { self.index.get_or_init(|| Index::from(self.contents)) } /// Take the source code up to the given [`Location`]. pub fn take(&self, location: Location) -> &'a str { let index = self.get_or_init_index(); let offset = index.byte_offset(location, self.contents); &self.contents[..offset] } /// Take the source code after the given [`Location`]. pub fn skip(&self, location: Location) -> &'a str { let index = self.get_or_init_index(); let offset = index.byte_offset(location, self.contents); &self.contents[offset..] } /// Take the source code between the given [`Range`]. pub fn slice>(&self, range: R) -> &'a str { let index = self.get_or_init_index(); let range = range.into(); let start = index.byte_offset(range.location, self.contents); let end = index.byte_offset(range.end_location, self.contents); &self.contents[start..end] } /// Return the byte offset of the given [`Location`]. pub fn offset(&self, location: Location) -> usize { let index = self.get_or_init_index(); index.byte_offset(location, self.contents) } /// Return the underlying source code. pub fn contents(&self) -> &'a str { self.contents } /// Return the number of lines in the source code. pub fn count_lines(&self) -> usize { let index = self.get_or_init_index(); index.count_lines() } /// Return the number of bytes in the source code. pub const fn len(&self) -> usize { self.contents.len() } /// Return `true` if the source code is empty. pub const fn is_empty(&self) -> bool { self.contents.is_empty() } } /// Index for fast [`Location`] to byte offset conversions. #[derive(Debug, Clone)] enum Index { /// Optimized index for an ASCII only document Ascii(AsciiIndex), /// Index for UTF8 documents Utf8(Utf8Index), } impl Index { /// Truncate a [`Location`] to a byte offset in source code. fn byte_offset(&self, location: Location, contents: &str) -> usize { match self { Index::Ascii(ascii) => ascii.byte_offset(location, contents), Index::Utf8(utf8) => utf8.byte_offset(location, contents), } } /// Return the number of lines in the source code. fn count_lines(&self) -> usize { match self { Index::Ascii(ascii) => ascii.line_start_byte_offsets.len(), Index::Utf8(utf8) => utf8.line_start_byte_offsets.len(), } } } impl From<&str> for Index { fn from(contents: &str) -> Self { assert!(u32::try_from(contents.len()).is_ok()); let mut line_start_offsets: Vec = Vec::with_capacity(48); line_start_offsets.push(0); let mut utf8 = false; // SAFE because of length assertion above #[allow(clippy::cast_possible_truncation)] for (i, byte) in contents.bytes().enumerate() { utf8 |= !byte.is_ascii(); match byte { // Only track one line break for `\r\n`. b'\r' if contents.as_bytes().get(i + 1) == Some(&b'\n') => continue, b'\n' | b'\r' => { line_start_offsets.push((i + 1) as u32); } _ => {} } } if utf8 { Self::Utf8(Utf8Index::new(line_start_offsets)) } else { Self::Ascii(AsciiIndex::new(line_start_offsets)) } } } /// Index for fast [`Location`] to byte offset conversions for ASCII documents. /// /// The index stores the byte offsets for every line. It computes the byte offset for a [`Location`] /// by retrieving the line offset from its index and adding the column. #[derive(Debug, Clone, Eq, PartialEq)] struct AsciiIndex { line_start_byte_offsets: Vec, } impl AsciiIndex { fn new(line_start_positions: Vec) -> Self { Self { line_start_byte_offsets: line_start_positions, } } /// Truncate a [`Location`] to a byte offset in ASCII source code. fn byte_offset(&self, location: Location, contents: &str) -> usize { let index = &self.line_start_byte_offsets; // If start-of-line position after last line if location.row() - 1 == index.len() && location.column() == 0 { contents.len() } else { let byte_offset = index[location.row() - 1] as usize + location.column(); byte_offset.min(contents.len()) } } } /// Index for fast [`Location`] to byte offset conversions for UTF8 documents. /// /// The index stores the byte offset of every line. The column offset is lazily computed by /// adding the line start offset and then iterating to the `nth` character. #[derive(Debug, Clone, PartialEq)] struct Utf8Index { line_start_byte_offsets: Vec, } impl Utf8Index { fn new(line_byte_positions: Vec) -> Self { Self { line_start_byte_offsets: line_byte_positions, } } /// Truncate a [`Location`] to a byte offset in UTF-8 source code. fn byte_offset(&self, location: Location, contents: &str) -> usize { let index = &self.line_start_byte_offsets; if location.row() - 1 == index.len() && location.column() == 0 { contents.len() } else { // Casting is safe because the length of utf8 characters is always between 1-4 #[allow(clippy::cast_possible_truncation)] let line_start = if location.row() == 1 && contents.starts_with('\u{feff}') { '\u{feff}'.len_utf8() as u32 } else { index[location.row() - 1] }; let rest = &contents[line_start as usize..]; let column_offset = match rest.char_indices().nth(location.column()) { Some((offset, _)) => offset, None => contents.len(), }; let offset = line_start as usize + column_offset; offset.min(contents.len()) } } } #[cfg(test)] mod tests { use rustpython_parser::ast::Location; use crate::source_code::locator::{AsciiIndex, Index, Utf8Index}; fn index_ascii(content: &str) -> AsciiIndex { match Index::from(content) { Index::Ascii(ascii) => ascii, Index::Utf8(_) => { panic!("Expected ASCII index") } } } fn index_utf8(content: &str) -> Utf8Index { match Index::from(content) { Index::Utf8(utf8) => utf8, Index::Ascii(_) => { panic!("Expected UTF8 index") } } } #[test] fn ascii_index() { let contents = ""; let index = index_ascii(contents); assert_eq!(index, AsciiIndex::new(vec![0])); let contents = "x = 1"; let index = index_ascii(contents); assert_eq!(index, AsciiIndex::new(vec![0])); let contents = "x = 1\n"; let index = index_ascii(contents); assert_eq!(index, AsciiIndex::new(vec![0, 6])); let contents = "x = 1\ny = 2\nz = x + y\n"; let index = index_ascii(contents); assert_eq!(index, AsciiIndex::new(vec![0, 6, 12, 22])); } #[test] fn ascii_byte_offset() { let contents = "x = 1\ny = 2"; let index = index_ascii(contents); // First row. let loc = index.byte_offset(Location::new(1, 0), contents); assert_eq!(loc, 0); // Second row. let loc = index.byte_offset(Location::new(2, 0), contents); assert_eq!(loc, 6); // One-past-the-end. let loc = index.byte_offset(Location::new(3, 0), contents); assert_eq!(loc, 11); } #[test] fn ascii_carriage_return() { let contents = "x = 4\ry = 3"; let index = index_ascii(contents); assert_eq!(index, AsciiIndex::new(vec![0, 6])); assert_eq!(index.byte_offset(Location::new(1, 4), contents), 4); assert_eq!(index.byte_offset(Location::new(2, 0), contents), 6); assert_eq!(index.byte_offset(Location::new(2, 1), contents), 7); } #[test] fn ascii_carriage_return_newline() { let contents = "x = 4\r\ny = 3"; let index = index_ascii(contents); assert_eq!(index, AsciiIndex::new(vec![0, 7])); assert_eq!(index.byte_offset(Location::new(1, 4), contents), 4); assert_eq!(index.byte_offset(Location::new(2, 0), contents), 7); assert_eq!(index.byte_offset(Location::new(2, 1), contents), 8); } impl Utf8Index { fn line_count(&self) -> usize { self.line_start_byte_offsets.len() } } #[test] fn utf8_index() { let contents = "x = '🫣'"; let index = index_utf8(contents); assert_eq!(index.line_count(), 1); assert_eq!(index, Utf8Index::new(vec![0])); let contents = "x = '🫣'\n"; let index = index_utf8(contents); assert_eq!(index.line_count(), 2); assert_eq!(index, Utf8Index::new(vec![0, 11])); let contents = "x = '🫣'\ny = 2\nz = x + y\n"; let index = index_utf8(contents); assert_eq!(index.line_count(), 4); assert_eq!(index, Utf8Index::new(vec![0, 11, 17, 27])); let contents = "# 🫣\nclass Foo:\n \"\"\".\"\"\""; let index = index_utf8(contents); assert_eq!(index.line_count(), 3); assert_eq!(index, Utf8Index::new(vec![0, 7, 18])); } #[test] fn utf8_carriage_return() { let contents = "x = '🫣'\ry = 3"; let index = index_utf8(contents); assert_eq!(index.line_count(), 2); assert_eq!(index, Utf8Index::new(vec![0, 11])); // Second ' assert_eq!(index.byte_offset(Location::new(1, 6), contents), 9); assert_eq!(index.byte_offset(Location::new(2, 0), contents), 11); assert_eq!(index.byte_offset(Location::new(2, 1), contents), 12); } #[test] fn utf8_carriage_return_newline() { let contents = "x = '🫣'\r\ny = 3"; let index = index_utf8(contents); assert_eq!(index.line_count(), 2); assert_eq!(index, Utf8Index::new(vec![0, 12])); // Second ' assert_eq!(index.byte_offset(Location::new(1, 6), contents), 9); assert_eq!(index.byte_offset(Location::new(2, 0), contents), 12); assert_eq!(index.byte_offset(Location::new(2, 1), contents), 13); } #[test] fn utf8_byte_offset() { let contents = "x = '☃'\ny = 2"; let index = index_utf8(contents); assert_eq!(index, Utf8Index::new(vec![0, 10])); // First row. let loc = index.byte_offset(Location::new(1, 0), contents); assert_eq!(loc, 0); let loc = index.byte_offset(Location::new(1, 5), contents); assert_eq!(loc, 5); assert_eq!(&contents[loc..], "☃'\ny = 2"); let loc = index.byte_offset(Location::new(1, 6), contents); assert_eq!(loc, 8); assert_eq!(&contents[loc..], "'\ny = 2"); // Second row. let loc = index.byte_offset(Location::new(2, 0), contents); assert_eq!(loc, 10); // One-past-the-end. let loc = index.byte_offset(Location::new(3, 0), contents); assert_eq!(loc, 15); } }