From e5f30ff5a8e50d18e0a742541491ae118686b7bc Mon Sep 17 00:00:00 2001
From: Charlie Marsh
Date: Thu, 3 Nov 2022 23:23:38 -0400
Subject: [PATCH] Use a rope to manage string slicing (#576)

---
Review notes (not part of the commit message):

* `Rope::line_to_char` returns a char index, but `&self.contents[..]` is
  indexed by byte. All slicing now goes through a private `byte_offset`
  helper that converts with `Rope::char_to_byte`, so sources containing
  multi-byte characters are sliced on the correct boundaries.
* A regression test with a multi-byte character replaces the deleted
  `compute_offsets` tests.
* The unrelated whitespace-only re-indent of Cargo.toml's `members` entry
  was dropped from this patch.

 Cargo.lock                     | 17 +++++++
 Cargo.toml                     |  1 +
 benches/source_code_locator.rs |  9 ++--
 src/source_code_locator.rs     | 98 +++++++++++++----------------------------
 4 files changed, 55 insertions(+), 70 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 677404486b..025a1dcbda 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2209,6 +2209,16 @@ dependencies = [
  "winapi 0.3.9",
 ]
 
+[[package]]
+name = "ropey"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbd22239fafefc42138ca5da064f3c17726a80d2379d817a3521240e78dd0064"
+dependencies = [
+ "smallvec",
+ "str_indices",
+]
+
 [[package]]
 name = "ruff"
 version = "0.0.99"
@@ -2239,6 +2249,7 @@ dependencies = [
  "path-absolutize",
  "rayon",
  "regex",
+ "ropey",
  "rustpython-ast",
  "rustpython-common",
  "rustpython-parser",
@@ -2559,6 +2570,12 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
 
+[[package]]
+name = "str_indices"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9d9199fa80c817e074620be84374a520062ebac833f358d74b37060ce4a0f2c0"
+
 [[package]]
 name = "string_cache"
 version = "0.8.4"
diff --git a/Cargo.toml b/Cargo.toml
index 7b10e90bc4..f4402479a3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -31,6 +31,7 @@ once_cell = { version = "1.13.1" }
 path-absolutize = { version = "3.0.14", features = ["once_cell_cache", "use_unix_paths_on_wasm"] }
 rayon = { version = "1.5.3" }
 regex = { version = "1.6.0" }
+ropey = { version = "1.5.0" }
 rustpython-ast = { features = ["unparse"], git = "https://github.com/RustPython/RustPython.git", rev = "77b821a1941019fe34f73ce17cea013ae1b98fd0" }
 rustpython-common = { git = "https://github.com/RustPython/RustPython.git", rev = "77b821a1941019fe34f73ce17cea013ae1b98fd0" }
 rustpython-parser = { features = ["lalrpop"], git = "https://github.com/RustPython/RustPython.git", rev = "77b821a1941019fe34f73ce17cea013ae1b98fd0" }
diff --git a/benches/source_code_locator.rs b/benches/source_code_locator.rs
index c412533601..473ea38578 100644
--- a/benches/source_code_locator.rs
+++ b/benches/source_code_locator.rs
@@ -1,13 +1,16 @@
 use std::path::Path;
 
 use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use ropey::Rope;
 use ruff::fs;
-use ruff::source_code_locator::compute_offsets;
 
 fn criterion_benchmark(c: &mut Criterion) {
     let contents = fs::read_file(Path::new("resources/test/fixtures/D.py")).unwrap();
-    c.bench_function("compute_offsets", |b| {
-        b.iter(|| compute_offsets(black_box(&contents)))
+    c.bench_function("rope", |b| {
+        b.iter(|| {
+            let rope = Rope::from_str(black_box(&contents));
+            rope.line_to_char(black_box(4));
+        })
     });
 }
 
diff --git a/src/source_code_locator.rs b/src/source_code_locator.rs
index 796eedcdb9..74fd2cde77 100644
--- a/src/source_code_locator.rs
+++ b/src/source_code_locator.rs
@@ -1,60 +1,44 @@
 //! Struct used to efficiently slice source code at (row, column) Locations.
 
 use once_cell::unsync::OnceCell;
+use ropey::Rope;
 use rustpython_ast::Location;
 
 use crate::ast::types::Range;
 
 pub struct SourceCodeLocator<'a> {
     contents: &'a str,
-    offsets: OnceCell<Vec<Vec<usize>>>,
-}
-
-pub fn compute_offsets(contents: &str) -> Vec<Vec<usize>> {
-    let mut offsets = vec![vec![]];
-    let mut line_index = 0;
-    let mut char_index = 0;
-    let mut newline = false;
-    for (i, char) in contents.char_indices() {
-        offsets[line_index].push(i);
-
-        newline = char == '\n';
-        if newline {
-            line_index += 1;
-            offsets.push(vec![]);
-            char_index = i + char.len_utf8();
-        }
-    }
-    // If we end in a newline, add an extra character to indicate the start of that
-    // line.
-    if newline {
-        offsets[line_index].push(char_index);
-    }
-    offsets
+    rope: OnceCell<Rope>,
 }
 
 impl<'a> SourceCodeLocator<'a> {
     pub fn new(contents: &'a str) -> Self {
         SourceCodeLocator {
             contents,
-            offsets: OnceCell::new(),
+            rope: OnceCell::new(),
         }
     }
 
-    fn get_or_init_offsets(&self) -> &Vec<Vec<usize>> {
-        self.offsets.get_or_init(|| compute_offsets(self.contents))
+    fn get_or_init_rope(&self) -> &Rope {
+        self.rope.get_or_init(|| Rope::from_str(self.contents))
+    }
+
+    /// Convert a `Location` (1-indexed row, character column) into a byte offset
+    /// into `contents`. The rope is indexed by `char` while `&str` slicing is
+    /// indexed by byte, so the char index must be translated before slicing.
+    fn byte_offset(&self, location: &Location) -> usize {
+        let rope = self.get_or_init_rope();
+        rope.char_to_byte(rope.line_to_char(location.row() - 1) + location.column())
     }
 
     pub fn slice_source_code_at(&self, location: &Location) -> &'a str {
-        let offsets = self.get_or_init_offsets();
-        let offset = offsets[location.row() - 1][location.column()];
+        let offset = self.byte_offset(location);
         &self.contents[offset..]
     }
 
     pub fn slice_source_code_range(&self, range: &Range) -> &'a str {
-        let offsets = self.get_or_init_offsets();
-        let start = offsets[range.location.row() - 1][range.location.column()];
-        let end = offsets[range.end_location.row() - 1][range.end_location.column()];
+        let start = self.byte_offset(&range.location);
+        let end = self.byte_offset(&range.end_location);
         &self.contents[start..end]
     }
 
@@ -63,11 +47,10 @@ impl<'a> SourceCodeLocator<'a> {
         outer: &Range,
         inner: &Range,
     ) -> (&'a str, &'a str, &'a str) {
-        let offsets = self.get_or_init_offsets();
-        let outer_start = offsets[outer.location.row() - 1][outer.location.column()];
-        let outer_end = offsets[outer.end_location.row() - 1][outer.end_location.column()];
-        let inner_start = offsets[inner.location.row() - 1][inner.location.column()];
-        let inner_end = offsets[inner.end_location.row() - 1][inner.end_location.column()];
+        let outer_start = self.byte_offset(&outer.location);
+        let outer_end = self.byte_offset(&outer.end_location);
+        let inner_start = self.byte_offset(&inner.location);
+        let inner_end = self.byte_offset(&inner.end_location);
         (
             &self.contents[outer_start..inner_start],
             &self.contents[inner_start..inner_end],
@@ -75,41 +58,22 @@ impl<'a> SourceCodeLocator<'a> {
         )
     }
 }
 
 #[cfg(test)]
 mod tests {
+    use rustpython_ast::Location;
+
     use crate::source_code_locator::SourceCodeLocator;
 
     #[test]
-    fn source_code_locator_init() {
-        let content = "x = 1";
-        let locator = SourceCodeLocator::new(content);
-        let offsets = locator.get_or_init_offsets();
-        assert_eq!(offsets.len(), 1);
-        assert_eq!(offsets[0], [0, 1, 2, 3, 4]);
-
-        let content = "x = 1\n";
-        let locator = SourceCodeLocator::new(content);
-        let offsets = locator.get_or_init_offsets();
-        assert_eq!(offsets.len(), 2);
-        assert_eq!(offsets[0], [0, 1, 2, 3, 4, 5]);
-        assert_eq!(offsets[1], [6]);
-
-        let content = "x = 1\ny = 2\nz = x + y\n";
-        let locator = SourceCodeLocator::new(content);
-        let offsets = locator.get_or_init_offsets();
-        assert_eq!(offsets.len(), 4);
-        assert_eq!(offsets[0], [0, 1, 2, 3, 4, 5]);
-        assert_eq!(offsets[1], [6, 7, 8, 9, 10, 11]);
-        assert_eq!(offsets[2], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21]);
-        assert_eq!(offsets[3], [22]);
-
-        let content = "# \u{4e9c}\nclass Foo:\n    \"\"\".\"\"\"";
-        let locator = SourceCodeLocator::new(content);
-        let offsets = locator.get_or_init_offsets();
-        assert_eq!(offsets.len(), 3);
-        assert_eq!(offsets[0], [0, 1, 2, 5]);
-        assert_eq!(offsets[1], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
-        assert_eq!(offsets[2], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]);
+    fn slice_multibyte_source() {
+        // The first line holds a three-byte character, so char and byte offsets diverge.
+        let content = "# \u{4e9c}\nclass Foo:\n    \"\"\".\"\"\"";
+        let locator = SourceCodeLocator::new(content);
+        assert_eq!(
+            locator.slice_source_code_at(&Location::new(2, 0)),
+            "class Foo:\n    \"\"\".\"\"\""
+        );
+        assert_eq!(locator.slice_source_code_at(&Location::new(3, 4)), "\"\"\".\"\"\"");
     }
 }