diff --git a/Cargo.lock b/Cargo.lock index 59e34ecfea..41d37d784a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2177,6 +2177,7 @@ dependencies = [ "once_cell", "regex", "ruff_rustpython", + "ruff_text_size", "rustc-hash", "rustpython-common", "rustpython-parser", diff --git a/crates/ruff_diagnostics/src/diagnostic.rs b/crates/ruff_diagnostics/src/diagnostic.rs index 914bb1d765..0fdb87440c 100644 --- a/crates/ruff_diagnostics/src/diagnostic.rs +++ b/crates/ruff_diagnostics/src/diagnostic.rs @@ -58,6 +58,7 @@ impl Diagnostic { /// Set the [`Fix`] used to fix the diagnostic, if the provided function returns `Ok`. /// Otherwise, log the error. + #[inline] pub fn try_set_fix>(&mut self, func: impl FnOnce() -> Result) { match func() { Ok(fix) => self.fix = fix.into(), @@ -66,6 +67,7 @@ impl Diagnostic { } /// Set the location of the diagnostic's parent node. + #[inline] pub fn set_parent(&mut self, parent: Location) { self.parent = Some(parent); } diff --git a/crates/ruff_python_ast/Cargo.toml b/crates/ruff_python_ast/Cargo.toml index 1ecb46b9f1..cf9ec26767 100644 --- a/crates/ruff_python_ast/Cargo.toml +++ b/crates/ruff_python_ast/Cargo.toml @@ -9,6 +9,7 @@ rust-version = { workspace = true } [dependencies] ruff_rustpython = { path = "../ruff_rustpython" } +ruff_text_size = { path = "../ruff_text_size" } anyhow = { workspace = true } bitflags = { workspace = true } diff --git a/crates/ruff_python_ast/src/source_code/line_index.rs b/crates/ruff_python_ast/src/source_code/line_index.rs new file mode 100644 index 0000000000..a15697dd85 --- /dev/null +++ b/crates/ruff_python_ast/src/source_code/line_index.rs @@ -0,0 +1,418 @@ +use ruff_text_size::{TextLen, TextRange, TextSize}; +use rustpython_parser::ast::Location; +use std::fmt; +use std::fmt::{Debug, Formatter}; +use std::num::NonZeroUsize; +use std::ops::Deref; +use std::sync::Arc; + +/// Index for fast [`Location`] to [byte offset](TextSize) conversions. 
+/// +/// Cloning a [`LineIndex`] is cheap because it only requires bumping a reference count. +#[derive(Clone)] +pub struct LineIndex { + inner: Arc, +} + +struct LineIndexInner { + line_starts: Vec, + kind: IndexKind, +} + +impl LineIndex { + /// Builds the [`LineIndex`] from the source text of a file. + pub fn from_source_text(text: &str) -> Self { + assert!(u32::try_from(text.len()).is_ok()); + + let mut line_starts: Vec = Vec::with_capacity(text.len() / 88); + line_starts.push(TextSize::default()); + + let bytes = text.as_bytes(); + let mut utf8 = false; + + for (i, byte) in bytes.iter().enumerate() { + utf8 |= !byte.is_ascii(); + + match byte { + // Only track one line break for `\r\n`. + b'\r' if bytes.get(i + 1) == Some(&b'\n') => continue, + b'\n' | b'\r' => { + line_starts.push(TextSize::try_from(i + 1).unwrap()); + } + _ => {} + } + } + + let kind = if utf8 { + IndexKind::Utf8 + } else { + IndexKind::Ascii + }; + + Self { + inner: Arc::new(LineIndexInner { line_starts, kind }), + } + } + + fn kind(&self) -> IndexKind { + self.inner.kind + } + + /// Converts a [`Location`] to its [byte offset](TextSize) in the source code. + pub fn location_offset(&self, location: Location, contents: &str) -> TextSize { + let line_index = OneIndexed::new(location.row()).unwrap(); + let line_range = self.line_range(line_index, contents); + + let column_offset = match self.kind() { + IndexKind::Ascii => TextSize::try_from(location.column()).unwrap(), + IndexKind::Utf8 => { + let line = &contents[line_range]; + + // Skip the bom character + let bom_len = + usize::from(line_index.to_zero_indexed() == 0 && line.starts_with('\u{feff}')); + + match line.char_indices().nth(location.column() + bom_len) { + Some((offset, _)) => TextSize::try_from(offset).unwrap(), + None => line_range.len(), + } + } + }; + + line_range.start() + column_offset + } + + /// Return the number of lines in the source code. 
+ pub(crate) fn lines_count(&self) -> usize { + self.line_starts().len() + } + + /// Returns the [byte offset](TextSize) for the `line` with the given index. + fn line_start(&self, line: OneIndexed, contents: &str) -> TextSize { + let row_index = line.to_zero_indexed(); + let starts = self.line_starts(); + + // If start-of-line position after last line + if row_index == starts.len() { + contents.text_len() + } else { + starts[row_index] + } + } + + /// Returns the [`TextRange`] of the `line` with the given index. + /// The start points to the first character's [byte offset](TextSize), the end up to, and including + /// the newline character ending the line (if any). + fn line_range(&self, line: OneIndexed, contents: &str) -> TextRange { + let starts = self.line_starts(); + + if starts.len() == line.to_zero_indexed() { + TextRange::empty(contents.text_len()) + } else { + TextRange::new( + self.line_start(line, contents), + self.line_start(line.saturating_add(1), contents), + ) + } + } + + /// Returns the [byte offsets](TextSize) for every line + pub fn line_starts(&self) -> &[TextSize] { + &self.inner.line_starts + } +} + +impl Deref for LineIndex { + type Target = [TextSize]; + + fn deref(&self) -> &Self::Target { + self.line_starts() + } +} + +impl Debug for LineIndex { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_list().entries(self.line_starts()).finish() + } +} + +#[derive(Debug, Clone, Copy)] +enum IndexKind { + /// Optimized index for an ASCII only document + Ascii, + + /// Index for UTF8 documents + Utf8, +} + +/// Type-safe wrapper for a value whose logical range starts at `1`, for +/// instance the line or column numbers in a file +/// +/// Internally this is represented as a [`NonZeroUsize`], this enables some +/// memory optimizations +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct OneIndexed(NonZeroUsize); + +const ONE: NonZeroUsize = unwrap(NonZeroUsize::new(1)); + +impl OneIndexed { + // SAFETY: 
These constants are being initialized with non-zero values + /// The smallest value that can be represented by this integer type. + pub const MIN: Self = unwrap(Self::new(1)); + /// The largest value that can be represented by this integer type + pub const MAX: Self = unwrap(Self::new(usize::MAX)); + + /// Creates a non-zero if the given value is not zero. + pub const fn new(value: usize) -> Option { + match NonZeroUsize::new(value) { + Some(value) => Some(Self(value)), + None => None, + } + } + + /// Construct a new [`OneIndexed`] from a zero-indexed value + pub const fn from_zero_indexed(value: usize) -> Self { + Self(ONE.saturating_add(value)) + } + + /// Return the zero-indexed primitive value for this [`OneIndexed`] + pub const fn to_zero_indexed(self) -> usize { + self.0.get() - 1 + } + + /// Saturating integer addition. Computes `self + rhs`, saturating at + /// the numeric bounds instead of overflowing. + #[must_use] + pub const fn saturating_add(self, rhs: usize) -> Self { + match NonZeroUsize::new(self.0.get().saturating_add(rhs)) { + Some(value) => Self(value), + None => Self::MAX, + } + } + + /// Saturating integer subtraction. Computes `self - rhs`, saturating + /// at the numeric bounds instead of overflowing. 
+ #[must_use] + pub const fn saturating_sub(self, rhs: usize) -> Self { + match NonZeroUsize::new(self.0.get().saturating_sub(rhs)) { + Some(value) => Self(value), + None => Self::MIN, + } + } +} + +impl std::fmt::Display for OneIndexed { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Debug::fmt(&self.0.get(), f) + } +} + +/// A const `Option::unwrap` without nightly features: +/// [Tracking issue](https://github.com/rust-lang/rust/issues/67441) +const fn unwrap(option: Option) -> T { + match option { + Some(value) => value, + None => panic!("unwrapping None"), + } +} + +#[cfg(test)] +mod tests { + use crate::source_code::line_index::LineIndex; + use ruff_text_size::TextSize; + use rustpython_parser::ast::Location; + + #[test] + fn ascii_index() { + let index = LineIndex::from_source_text(""); + assert_eq!(index.line_starts(), &[TextSize::from(0)]); + + let index = LineIndex::from_source_text("x = 1"); + assert_eq!(index.line_starts(), &[TextSize::from(0)]); + + let index = LineIndex::from_source_text("x = 1\n"); + assert_eq!(index.line_starts(), &[TextSize::from(0), TextSize::from(6)]); + + let index = LineIndex::from_source_text("x = 1\ny = 2\nz = x + y\n"); + assert_eq!( + index.line_starts(), + &[ + TextSize::from(0), + TextSize::from(6), + TextSize::from(12), + TextSize::from(22) + ] + ); + } + + #[test] + fn ascii_byte_offset() { + let contents = "x = 1\ny = 2"; + let index = LineIndex::from_source_text(contents); + + // First row. + let loc = index.location_offset(Location::new(1, 0), contents); + assert_eq!(loc, TextSize::from(0)); + + // Second row. + let loc = index.location_offset(Location::new(2, 0), contents); + assert_eq!(loc, TextSize::from(6)); + + // One-past-the-end. 
+ let loc = index.location_offset(Location::new(3, 0), contents); + assert_eq!(loc, TextSize::from(11)); + } + + #[test] + fn ascii_carriage_return() { + let contents = "x = 4\ry = 3"; + let index = LineIndex::from_source_text(contents); + assert_eq!(index.line_starts(), &[TextSize::from(0), TextSize::from(6)]); + + assert_eq!( + index.location_offset(Location::new(1, 4), contents), + TextSize::from(4) + ); + assert_eq!( + index.location_offset(Location::new(2, 0), contents), + TextSize::from(6) + ); + assert_eq!( + index.location_offset(Location::new(2, 1), contents), + TextSize::from(7) + ); + } + + #[test] + fn ascii_carriage_return_newline() { + let contents = "x = 4\r\ny = 3"; + let index = LineIndex::from_source_text(contents); + assert_eq!(index.line_starts(), &[TextSize::from(0), TextSize::from(7)]); + + assert_eq!( + index.location_offset(Location::new(1, 4), contents), + TextSize::from(4) + ); + assert_eq!( + index.location_offset(Location::new(2, 0), contents), + TextSize::from(7) + ); + assert_eq!( + index.location_offset(Location::new(2, 1), contents), + TextSize::from(8) + ); + } + + #[test] + fn utf8_index() { + let index = LineIndex::from_source_text("x = '🫣'"); + assert_eq!(index.lines_count(), 1); + assert_eq!(index.line_starts(), &[TextSize::from(0)]); + + let index = LineIndex::from_source_text("x = '🫣'\n"); + assert_eq!(index.lines_count(), 2); + assert_eq!( + index.line_starts(), + &[TextSize::from(0), TextSize::from(11)] + ); + + let index = LineIndex::from_source_text("x = '🫣'\ny = 2\nz = x + y\n"); + assert_eq!(index.lines_count(), 4); + assert_eq!( + index.line_starts(), + &[ + TextSize::from(0), + TextSize::from(11), + TextSize::from(17), + TextSize::from(27) + ] + ); + + let index = LineIndex::from_source_text("# 🫣\nclass Foo:\n \"\"\".\"\"\""); + assert_eq!(index.lines_count(), 3); + assert_eq!( + index.line_starts(), + &[TextSize::from(0), TextSize::from(7), TextSize::from(18)] + ); + } + + #[test] + fn utf8_carriage_return() { + let 
contents = "x = '🫣'\ry = 3"; + let index = LineIndex::from_source_text(contents); + assert_eq!(index.lines_count(), 2); + assert_eq!( + index.line_starts(), + &[TextSize::from(0), TextSize::from(11)] + ); + + // Second ' + assert_eq!( + index.location_offset(Location::new(1, 6), contents), + TextSize::from(9) + ); + assert_eq!( + index.location_offset(Location::new(2, 0), contents), + TextSize::from(11) + ); + assert_eq!( + index.location_offset(Location::new(2, 1), contents), + TextSize::from(12) + ); + } + + #[test] + fn utf8_carriage_return_newline() { + let contents = "x = '🫣'\r\ny = 3"; + let index = LineIndex::from_source_text(contents); + assert_eq!(index.lines_count(), 2); + assert_eq!( + index.line_starts(), + &[TextSize::from(0), TextSize::from(12)] + ); + + // Second ' + assert_eq!( + index.location_offset(Location::new(1, 6), contents), + TextSize::from(9) + ); + assert_eq!( + index.location_offset(Location::new(2, 0), contents), + TextSize::from(12) + ); + assert_eq!( + index.location_offset(Location::new(2, 1), contents), + TextSize::from(13) + ); + } + + #[test] + fn utf8_byte_offset() { + let contents = "x = '☃'\ny = 2"; + let index = LineIndex::from_source_text(contents); + assert_eq!( + index.line_starts(), + &[TextSize::from(0), TextSize::from(10)] + ); + + // First row. + let loc = index.location_offset(Location::new(1, 0), contents); + assert_eq!(loc, TextSize::from(0)); + + let loc = index.location_offset(Location::new(1, 5), contents); + assert_eq!(loc, TextSize::from(5)); + assert_eq!(&"x = '☃'\ny = 2"[usize::from(loc)..], "☃'\ny = 2"); + + let loc = index.location_offset(Location::new(1, 6), contents); + assert_eq!(loc, TextSize::from(8)); + assert_eq!(&"x = '☃'\ny = 2"[usize::from(loc)..], "'\ny = 2"); + + // Second row. + let loc = index.location_offset(Location::new(2, 0), contents); + assert_eq!(loc, TextSize::from(10)); + + // One-past-the-end. 
+ let loc = index.location_offset(Location::new(3, 0), contents); + assert_eq!(loc, TextSize::from(15)); + } +} diff --git a/crates/ruff_python_ast/src/source_code/locator.rs b/crates/ruff_python_ast/src/source_code/locator.rs index ca177ee89b..becfed511a 100644 --- a/crates/ruff_python_ast/src/source_code/locator.rs +++ b/crates/ruff_python_ast/src/source_code/locator.rs @@ -1,13 +1,15 @@ //! Struct used to efficiently slice source code at (row, column) Locations. +use crate::source_code::line_index::LineIndex; use once_cell::unsync::OnceCell; +use ruff_text_size::{TextRange, TextSize}; use rustpython_parser::ast::Location; use crate::types::Range; pub struct Locator<'a> { contents: &'a str, - index: OnceCell, + index: OnceCell, } impl<'a> Locator<'a> { @@ -18,37 +20,38 @@ impl<'a> Locator<'a> { } } - fn get_or_init_index(&self) -> &Index { - self.index.get_or_init(|| Index::from(self.contents)) + fn get_or_init_index(&self) -> &LineIndex { + self.index + .get_or_init(|| LineIndex::from_source_text(self.contents)) } /// Take the source code up to the given [`Location`]. pub fn take(&self, location: Location) -> &'a str { let index = self.get_or_init_index(); - let offset = index.byte_offset(location, self.contents); - &self.contents[..offset] + let offset = index.location_offset(location, self.contents); + &self.contents[TextRange::up_to(offset)] } /// Take the source code after the given [`Location`]. pub fn skip(&self, location: Location) -> &'a str { let index = self.get_or_init_index(); - let offset = index.byte_offset(location, self.contents); - &self.contents[offset..] + let offset = index.location_offset(location, self.contents); + &self.contents[usize::from(offset)..] } /// Take the source code between the given [`Range`]. 
pub fn slice>(&self, range: R) -> &'a str { let index = self.get_or_init_index(); let range = range.into(); - let start = index.byte_offset(range.location, self.contents); - let end = index.byte_offset(range.end_location, self.contents); - &self.contents[start..end] + let start = index.location_offset(range.location, self.contents); + let end = index.location_offset(range.end_location, self.contents); + &self.contents[TextRange::new(start, end)] } /// Return the byte offset of the given [`Location`]. - pub fn offset(&self, location: Location) -> usize { + pub fn offset(&self, location: Location) -> TextSize { let index = self.get_or_init_index(); - index.byte_offset(location, self.contents) + index.location_offset(location, self.contents) } /// Return the underlying source code. @@ -59,7 +62,7 @@ impl<'a> Locator<'a> { /// Return the number of lines in the source code. pub fn count_lines(&self) -> usize { let index = self.get_or_init_index(); - index.count_lines() + index.lines_count() } /// Return the number of bytes in the source code. @@ -72,302 +75,3 @@ impl<'a> Locator<'a> { self.contents.is_empty() } } - -/// Index for fast [`Location`] to byte offset conversions. -#[derive(Debug, Clone)] -enum Index { - /// Optimized index for an ASCII only document - Ascii(AsciiIndex), - - /// Index for UTF8 documents - Utf8(Utf8Index), -} - -impl Index { - /// Truncate a [`Location`] to a byte offset in source code. - fn byte_offset(&self, location: Location, contents: &str) -> usize { - match self { - Index::Ascii(ascii) => ascii.byte_offset(location, contents), - Index::Utf8(utf8) => utf8.byte_offset(location, contents), - } - } - - /// Return the number of lines in the source code. 
- fn count_lines(&self) -> usize { - match self { - Index::Ascii(ascii) => ascii.line_start_byte_offsets.len(), - Index::Utf8(utf8) => utf8.line_start_byte_offsets.len(), - } - } -} - -impl From<&str> for Index { - fn from(contents: &str) -> Self { - assert!(u32::try_from(contents.len()).is_ok()); - - let mut line_start_offsets: Vec = Vec::with_capacity(48); - line_start_offsets.push(0); - let mut utf8 = false; - - // SAFE because of length assertion above - #[allow(clippy::cast_possible_truncation)] - for (i, byte) in contents.bytes().enumerate() { - utf8 |= !byte.is_ascii(); - - match byte { - // Only track one line break for `\r\n`. - b'\r' if contents.as_bytes().get(i + 1) == Some(&b'\n') => continue, - b'\n' | b'\r' => { - line_start_offsets.push((i + 1) as u32); - } - _ => {} - } - } - - if utf8 { - Self::Utf8(Utf8Index::new(line_start_offsets)) - } else { - Self::Ascii(AsciiIndex::new(line_start_offsets)) - } - } -} - -/// Index for fast [`Location`] to byte offset conversions for ASCII documents. -/// -/// The index stores the byte offsets for every line. It computes the byte offset for a [`Location`] -/// by retrieving the line offset from its index and adding the column. -#[derive(Debug, Clone, Eq, PartialEq)] -struct AsciiIndex { - line_start_byte_offsets: Vec, -} - -impl AsciiIndex { - fn new(line_start_positions: Vec) -> Self { - Self { - line_start_byte_offsets: line_start_positions, - } - } - - /// Truncate a [`Location`] to a byte offset in ASCII source code. - fn byte_offset(&self, location: Location, contents: &str) -> usize { - let index = &self.line_start_byte_offsets; - - // If start-of-line position after last line - if location.row() - 1 == index.len() && location.column() == 0 { - contents.len() - } else { - let byte_offset = index[location.row() - 1] as usize + location.column(); - byte_offset.min(contents.len()) - } - } -} - -/// Index for fast [`Location`] to byte offset conversions for UTF8 documents. 
-/// -/// The index stores the byte offset of every line. The column offset is lazily computed by -/// adding the line start offset and then iterating to the `nth` character. -#[derive(Debug, Clone, PartialEq)] -struct Utf8Index { - line_start_byte_offsets: Vec, -} - -impl Utf8Index { - fn new(line_byte_positions: Vec) -> Self { - Self { - line_start_byte_offsets: line_byte_positions, - } - } - - /// Truncate a [`Location`] to a byte offset in UTF-8 source code. - fn byte_offset(&self, location: Location, contents: &str) -> usize { - let index = &self.line_start_byte_offsets; - - if location.row() - 1 == index.len() && location.column() == 0 { - contents.len() - } else { - // Casting is safe because the length of utf8 characters is always between 1-4 - #[allow(clippy::cast_possible_truncation)] - let line_start = if location.row() == 1 && contents.starts_with('\u{feff}') { - '\u{feff}'.len_utf8() as u32 - } else { - index[location.row() - 1] - }; - - let rest = &contents[line_start as usize..]; - - let column_offset = match rest.char_indices().nth(location.column()) { - Some((offset, _)) => offset, - None => contents.len(), - }; - - let offset = line_start as usize + column_offset; - offset.min(contents.len()) - } - } -} - -#[cfg(test)] -mod tests { - use rustpython_parser::ast::Location; - - use crate::source_code::locator::{AsciiIndex, Index, Utf8Index}; - - fn index_ascii(content: &str) -> AsciiIndex { - match Index::from(content) { - Index::Ascii(ascii) => ascii, - Index::Utf8(_) => { - panic!("Expected ASCII index") - } - } - } - - fn index_utf8(content: &str) -> Utf8Index { - match Index::from(content) { - Index::Utf8(utf8) => utf8, - Index::Ascii(_) => { - panic!("Expected UTF8 index") - } - } - } - - #[test] - fn ascii_index() { - let contents = ""; - let index = index_ascii(contents); - assert_eq!(index, AsciiIndex::new(vec![0])); - - let contents = "x = 1"; - let index = index_ascii(contents); - assert_eq!(index, AsciiIndex::new(vec![0])); - - let 
contents = "x = 1\n"; - let index = index_ascii(contents); - assert_eq!(index, AsciiIndex::new(vec![0, 6])); - - let contents = "x = 1\ny = 2\nz = x + y\n"; - let index = index_ascii(contents); - assert_eq!(index, AsciiIndex::new(vec![0, 6, 12, 22])); - } - - #[test] - fn ascii_byte_offset() { - let contents = "x = 1\ny = 2"; - let index = index_ascii(contents); - - // First row. - let loc = index.byte_offset(Location::new(1, 0), contents); - assert_eq!(loc, 0); - - // Second row. - let loc = index.byte_offset(Location::new(2, 0), contents); - assert_eq!(loc, 6); - - // One-past-the-end. - let loc = index.byte_offset(Location::new(3, 0), contents); - assert_eq!(loc, 11); - } - - #[test] - fn ascii_carriage_return() { - let contents = "x = 4\ry = 3"; - let index = index_ascii(contents); - assert_eq!(index, AsciiIndex::new(vec![0, 6])); - - assert_eq!(index.byte_offset(Location::new(1, 4), contents), 4); - assert_eq!(index.byte_offset(Location::new(2, 0), contents), 6); - assert_eq!(index.byte_offset(Location::new(2, 1), contents), 7); - } - - #[test] - fn ascii_carriage_return_newline() { - let contents = "x = 4\r\ny = 3"; - let index = index_ascii(contents); - assert_eq!(index, AsciiIndex::new(vec![0, 7])); - - assert_eq!(index.byte_offset(Location::new(1, 4), contents), 4); - assert_eq!(index.byte_offset(Location::new(2, 0), contents), 7); - assert_eq!(index.byte_offset(Location::new(2, 1), contents), 8); - } - - impl Utf8Index { - fn line_count(&self) -> usize { - self.line_start_byte_offsets.len() - } - } - - #[test] - fn utf8_index() { - let contents = "x = '🫣'"; - let index = index_utf8(contents); - assert_eq!(index.line_count(), 1); - assert_eq!(index, Utf8Index::new(vec![0])); - - let contents = "x = '🫣'\n"; - let index = index_utf8(contents); - assert_eq!(index.line_count(), 2); - assert_eq!(index, Utf8Index::new(vec![0, 11])); - - let contents = "x = '🫣'\ny = 2\nz = x + y\n"; - let index = index_utf8(contents); - assert_eq!(index.line_count(), 4); - 
assert_eq!(index, Utf8Index::new(vec![0, 11, 17, 27])); - - let contents = "# 🫣\nclass Foo:\n \"\"\".\"\"\""; - let index = index_utf8(contents); - assert_eq!(index.line_count(), 3); - assert_eq!(index, Utf8Index::new(vec![0, 7, 18])); - } - - #[test] - fn utf8_carriage_return() { - let contents = "x = '🫣'\ry = 3"; - let index = index_utf8(contents); - assert_eq!(index.line_count(), 2); - assert_eq!(index, Utf8Index::new(vec![0, 11])); - - // Second ' - assert_eq!(index.byte_offset(Location::new(1, 6), contents), 9); - assert_eq!(index.byte_offset(Location::new(2, 0), contents), 11); - assert_eq!(index.byte_offset(Location::new(2, 1), contents), 12); - } - - #[test] - fn utf8_carriage_return_newline() { - let contents = "x = '🫣'\r\ny = 3"; - let index = index_utf8(contents); - assert_eq!(index.line_count(), 2); - assert_eq!(index, Utf8Index::new(vec![0, 12])); - - // Second ' - assert_eq!(index.byte_offset(Location::new(1, 6), contents), 9); - assert_eq!(index.byte_offset(Location::new(2, 0), contents), 12); - assert_eq!(index.byte_offset(Location::new(2, 1), contents), 13); - } - - #[test] - fn utf8_byte_offset() { - let contents = "x = '☃'\ny = 2"; - let index = index_utf8(contents); - assert_eq!(index, Utf8Index::new(vec![0, 10])); - - // First row. - let loc = index.byte_offset(Location::new(1, 0), contents); - assert_eq!(loc, 0); - - let loc = index.byte_offset(Location::new(1, 5), contents); - assert_eq!(loc, 5); - assert_eq!(&contents[loc..], "☃'\ny = 2"); - - let loc = index.byte_offset(Location::new(1, 6), contents); - assert_eq!(loc, 8); - assert_eq!(&contents[loc..], "'\ny = 2"); - - // Second row. - let loc = index.byte_offset(Location::new(2, 0), contents); - assert_eq!(loc, 10); - - // One-past-the-end. 
- let loc = index.byte_offset(Location::new(3, 0), contents); - assert_eq!(loc, 15); - } -} diff --git a/crates/ruff_python_ast/src/source_code/mod.rs b/crates/ruff_python_ast/src/source_code/mod.rs index 2f06c7fc4a..87eb82a592 100644 --- a/crates/ruff_python_ast/src/source_code/mod.rs +++ b/crates/ruff_python_ast/src/source_code/mod.rs @@ -1,13 +1,16 @@ mod generator; mod indexer; +mod line_index; mod locator; mod stylist; +pub use crate::source_code::line_index::{LineIndex, OneIndexed}; pub use generator::Generator; pub use indexer::Indexer; pub use locator::Locator; use rustpython_parser as parser; use rustpython_parser::{lexer, Mode, ParseError}; + pub use stylist::{LineEnding, Stylist}; /// Run round-trip source code generation on a given Python code. diff --git a/crates/ruff_python_formatter/src/cst/helpers.rs b/crates/ruff_python_formatter/src/cst/helpers.rs index 19b8cdb7fa..9aac063f99 100644 --- a/crates/ruff_python_formatter/src/cst/helpers.rs +++ b/crates/ruff_python_formatter/src/cst/helpers.rs @@ -3,6 +3,7 @@ use rustpython_parser::ast::Location; use ruff_python_ast::newlines::StrExt; use ruff_python_ast::source_code::Locator; use ruff_python_ast::types::Range; +use ruff_text_size::TextRange; /// Return `true` if the given string is a radix literal (e.g., `0b101`). pub fn is_radix_literal(content: &str) -> bool { @@ -55,7 +56,7 @@ pub fn expand_indented_block( let mut nesting = 0; let mut colon = None; for (start, tok, _end) in rustpython_parser::lexer::lex_located( - &contents[start_index..end_index], + &contents[TextRange::new(start_index, end_index)], rustpython_parser::Mode::Module, location, ) @@ -80,7 +81,7 @@ pub fn expand_indented_block( // From here, we have two options: simple statement or compound statement. 
let indent = rustpython_parser::lexer::lex_located( - &contents[colon_index..end_index], + &contents[TextRange::new(colon_index, end_index)], rustpython_parser::Mode::Module, colon_location, ) @@ -97,7 +98,7 @@ pub fn expand_indented_block( // Compound statement: from the colon to the end of the block. let mut offset = 0; - for (index, line) in contents[end_index..] + for (index, line) in contents[usize::from(end_index)..] .universal_newlines() .skip(1) .enumerate() diff --git a/crates/ruff_python_formatter/src/format/builders.rs b/crates/ruff_python_formatter/src/format/builders.rs index 67b9729ebb..0cde20034a 100644 --- a/crates/ruff_python_formatter/src/format/builders.rs +++ b/crates/ruff_python_formatter/src/format/builders.rs @@ -80,10 +80,7 @@ impl Format> for Literal { f.write_element(FormatElement::StaticTextSlice { text, - range: TextRange::new( - start_index.try_into().unwrap(), - end_index.try_into().unwrap(), - ), + range: TextRange::new(start_index, end_index), }) } } diff --git a/crates/ruff_python_formatter/src/format/numbers.rs b/crates/ruff_python_formatter/src/format/numbers.rs index 5ef3edffaf..eb315f1b19 100644 --- a/crates/ruff_python_formatter/src/format/numbers.rs +++ b/crates/ruff_python_formatter/src/format/numbers.rs @@ -3,7 +3,7 @@ use rustpython_parser::ast::Location; use ruff_formatter::prelude::*; use ruff_formatter::{write, Format}; use ruff_python_ast::types::Range; -use ruff_text_size::TextSize; +use ruff_text_size::{TextRange, TextSize}; use crate::context::ASTFormatContext; use crate::format::builders::literal; @@ -20,9 +20,10 @@ impl Format> for FloatAtom { let start_index = locator.offset(self.range.location); let end_index = locator.offset(self.range.end_location); - if let Some(dot_index) = contents[start_index..end_index].find('.') { - let integer = &contents[start_index..start_index + dot_index]; - let fractional = &contents[start_index + dot_index + 1..end_index]; + let content = &contents[TextRange::new(start_index, 
end_index)]; + if let Some(dot_index) = content.find('.') { + let integer = &content[..dot_index]; + let fractional = &content[dot_index + 1..]; if integer.is_empty() { write!(f, [text("0")])?; @@ -80,11 +81,10 @@ impl Format> for FloatLiteral { let start_index = locator.offset(self.range.location); let end_index = locator.offset(self.range.end_location); + let content = &contents[TextRange::new(start_index, end_index)]; + // Scientific notation - if let Some(exponent_index) = contents[start_index..end_index] - .find('e') - .or_else(|| contents[start_index..end_index].find('E')) - { + if let Some(exponent_index) = content.find('e').or_else(|| content.find('E')) { // Write the base. write!( f, @@ -100,7 +100,7 @@ impl Format> for FloatLiteral { write!(f, [text("e")])?; // Write the exponent, omitting the sign if it's positive. - let plus = contents[start_index + exponent_index + 1..end_index].starts_with('+'); + let plus = content[exponent_index + 1..].starts_with('+'); write!( f, [literal(Range::new( @@ -137,10 +137,11 @@ impl Format> for IntLiteral { let end_index = locator.offset(self.range.end_location); for prefix in ["0b", "0B", "0o", "0O", "0x", "0X"] { - if contents[start_index..end_index].starts_with(prefix) { + let content = &contents[TextRange::new(start_index, end_index)]; + if content.starts_with(prefix) { // In each case, the prefix must be lowercase, while the suffix must be uppercase. 
- let prefix = &contents[start_index..start_index + prefix.len()]; - let suffix = &contents[start_index + prefix.len()..end_index]; + let prefix = &content[..prefix.len()]; + let suffix = &content[prefix.len()..]; if prefix.bytes().any(|b| b.is_ascii_uppercase()) || suffix.bytes().any(|b| b.is_ascii_lowercase()) @@ -185,9 +186,11 @@ impl Format> for ComplexLiteral { let start_index = locator.offset(self.range.location); let end_index = locator.offset(self.range.end_location); - if contents[start_index..end_index].ends_with('j') { + let content = &contents[TextRange::new(start_index, end_index)]; + + if content.ends_with('j') { write!(f, [literal(self.range)])?; - } else if contents[start_index..end_index].ends_with('J') { + } else if content.ends_with('J') { write!( f, [literal(Range::new( diff --git a/crates/ruff_python_formatter/src/format/strings.rs b/crates/ruff_python_formatter/src/format/strings.rs index 6ce5c1b30a..499e72c49b 100644 --- a/crates/ruff_python_formatter/src/format/strings.rs +++ b/crates/ruff_python_formatter/src/format/strings.rs @@ -4,7 +4,7 @@ use ruff_formatter::prelude::*; use ruff_formatter::{write, Format}; use ruff_python_ast::str::{leading_quote, trailing_quote}; use ruff_python_ast::types::Range; -use ruff_text_size::TextSize; +use ruff_text_size::{TextRange, TextSize}; use crate::context::ASTFormatContext; use crate::cst::Expr; @@ -22,7 +22,7 @@ impl Format> for StringLiteralPart { let end_index = locator.offset(self.range.end_location); // Extract leading and trailing quotes. - let contents = &contents[start_index..end_index]; + let contents = &contents[TextRange::new(start_index, end_index)]; let leading_quote = leading_quote(contents).unwrap(); let trailing_quote = trailing_quote(contents).unwrap(); let body = &contents[leading_quote.len()..contents.len() - trailing_quote.len()];