From 36cce347fd490a16eca9d7e4b03e71941913d816 Mon Sep 17 00:00:00 2001 From: Micha Reiser Date: Tue, 11 Nov 2025 10:43:37 +0100 Subject: [PATCH] Reduce notebook memory footprint (#21319) --- crates/ruff_db/src/diagnostic/render/full.rs | 14 ++-- crates/ruff_notebook/src/index.rs | 57 +++++++++---- crates/ruff_notebook/src/notebook.rs | 84 +++++++++----------- 3 files changed, 84 insertions(+), 71 deletions(-) diff --git a/crates/ruff_db/src/diagnostic/render/full.rs b/crates/ruff_db/src/diagnostic/render/full.rs index c87413a84e..0784297755 100644 --- a/crates/ruff_db/src/diagnostic/render/full.rs +++ b/crates/ruff_db/src/diagnostic/render/full.rs @@ -112,16 +112,16 @@ impl std::fmt::Display for Diff<'_> { // `None`, indicating a regular script file, all the lines will be in one "cell" under the // `None` key. let cells = if let Some(notebook_index) = &self.notebook_index { - let mut last_cell = OneIndexed::MIN; + let mut last_cell_index = OneIndexed::MIN; let mut cells: Vec<(Option, TextSize)> = Vec::new(); - for (row, cell) in notebook_index.iter() { - if cell != last_cell { - let offset = source_code.line_start(row); - cells.push((Some(last_cell), offset)); - last_cell = cell; + for cell in notebook_index.iter() { + if cell.cell_index() != last_cell_index { + let offset = source_code.line_start(cell.start_row()); + cells.push((Some(last_cell_index), offset)); + last_cell_index = cell.cell_index(); } } - cells.push((Some(last_cell), source_text.text_len())); + cells.push((Some(last_cell_index), source_text.text_len())); cells } else { vec![(None, source_text.text_len())] diff --git a/crates/ruff_notebook/src/index.rs b/crates/ruff_notebook/src/index.rs index eff605aa6d..951914e74a 100644 --- a/crates/ruff_notebook/src/index.rs +++ b/crates/ruff_notebook/src/index.rs @@ -8,37 +8,40 @@ use ruff_source_file::{LineColumn, OneIndexed, SourceLocation}; /// [`ruff_text_size::TextSize`] to jupyter notebook cell/row/column. 
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)] pub struct NotebookIndex { - /// Enter a row (1-based), get back the cell (1-based) - pub(super) row_to_cell: Vec, - /// Enter a row (1-based), get back the row in cell (1-based) - pub(super) row_to_row_in_cell: Vec, + /// Stores the starting row and the absolute cell index for every Python (valid) cell. + /// + /// The index in this vector corresponds to the Python cell index (valid cell index). + pub(super) cell_starts: Vec, } impl NotebookIndex { - pub fn new(row_to_cell: Vec, row_to_row_in_cell: Vec) -> Self { - Self { - row_to_cell, - row_to_row_in_cell, + fn find_cell(&self, row: OneIndexed) -> Option { + match self + .cell_starts + .binary_search_by_key(&row, |start| start.start_row) + { + Ok(cell_index) => Some(self.cell_starts[cell_index]), + Err(insertion_point) => Some(self.cell_starts[insertion_point.checked_sub(1)?]), } } - /// Returns the cell number (1-based) for the given row (1-based). + /// Returns the (raw) cell number (1-based) for the given row (1-based). pub fn cell(&self, row: OneIndexed) -> Option { - self.row_to_cell.get(row.to_zero_indexed()).copied() + self.find_cell(row).map(|start| start.raw_cell_index) } /// Returns the row number (1-based) in the cell (1-based) for the /// given row (1-based). pub fn cell_row(&self, row: OneIndexed) -> Option { - self.row_to_row_in_cell.get(row.to_zero_indexed()).copied() + self.find_cell(row) + .map(|start| OneIndexed::from_zero_indexed(row.get() - start.start_row.get())) } - /// Returns an iterator over the row:cell-number pairs (both 1-based). - pub fn iter(&self) -> impl Iterator { - self.row_to_cell - .iter() - .enumerate() - .map(|(row, cell)| (OneIndexed::from_zero_indexed(row), *cell)) + /// Returns an iterator over the starting rows of each cell (1-based). + /// + /// This yields one entry per Python cell (skipping over Markdown cells). 
+ pub fn iter(&self) -> impl Iterator + '_ { + self.cell_starts.iter().copied() } /// Translates the given [`LineColumn`] based on the indexing table. @@ -67,3 +70,23 @@ impl NotebookIndex { } } } + +#[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize)] +pub struct CellStart { + /// The row in the concatenated notebook source code at which + /// this cell starts. + pub(super) start_row: OneIndexed, + + /// The absolute index of this cell in the notebook. + pub(super) raw_cell_index: OneIndexed, +} + +impl CellStart { + pub fn start_row(&self) -> OneIndexed { + self.start_row + } + + pub fn cell_index(&self) -> OneIndexed { + self.raw_cell_index + } +} diff --git a/crates/ruff_notebook/src/notebook.rs b/crates/ruff_notebook/src/notebook.rs index 124202e927..4dc01971fc 100644 --- a/crates/ruff_notebook/src/notebook.rs +++ b/crates/ruff_notebook/src/notebook.rs @@ -18,7 +18,7 @@ use ruff_text_size::TextSize; use crate::cell::CellOffsets; use crate::index::NotebookIndex; use crate::schema::{Cell, RawNotebook, SortAlphabetically, SourceValue}; -use crate::{CellMetadata, RawNotebookMetadata, schema}; +use crate::{CellMetadata, CellStart, RawNotebookMetadata, schema}; /// Run round-trip source code generation on a given Jupyter notebook file path. pub fn round_trip(path: &Path) -> anyhow::Result { @@ -320,11 +320,19 @@ impl Notebook { /// The index building is expensive as it needs to go through the content of /// every valid code cell. 
fn build_index(&self) -> NotebookIndex { - let mut row_to_cell = Vec::new(); - let mut row_to_row_in_cell = Vec::new(); + let mut cell_starts = Vec::with_capacity(self.valid_code_cells.len()); + + let mut current_row = OneIndexed::MIN; for &cell_index in &self.valid_code_cells { - let line_count = match &self.raw.cells[cell_index as usize].source() { + let raw_cell_index = cell_index as usize; + // Record the starting row of this cell + cell_starts.push(CellStart { + start_row: current_row, + raw_cell_index: OneIndexed::from_zero_indexed(raw_cell_index), + }); + + let line_count = match &self.raw.cells[raw_cell_index].source() { SourceValue::String(string) => { if string.is_empty() { 1 @@ -342,17 +350,11 @@ impl Notebook { } } }; - row_to_cell.extend(std::iter::repeat_n( - OneIndexed::from_zero_indexed(cell_index as usize), - line_count, - )); - row_to_row_in_cell.extend((0..line_count).map(OneIndexed::from_zero_indexed)); + + current_row = current_row.saturating_add(line_count); } - NotebookIndex { - row_to_cell, - row_to_row_in_cell, - } + NotebookIndex { cell_starts } } /// Return the notebook content. @@ -456,7 +458,7 @@ mod tests { use ruff_source_file::OneIndexed; - use crate::{Cell, Notebook, NotebookError, NotebookIndex}; + use crate::{Cell, CellStart, Notebook, NotebookError, NotebookIndex}; /// Construct a path to a Jupyter notebook in the `resources/test/fixtures/jupyter` directory. 
fn notebook_path(path: impl AsRef) -> std::path::PathBuf { @@ -548,39 +550,27 @@ print("after empty cells") assert_eq!( notebook.index(), &NotebookIndex { - row_to_cell: vec![ - OneIndexed::from_zero_indexed(0), - OneIndexed::from_zero_indexed(0), - OneIndexed::from_zero_indexed(0), - OneIndexed::from_zero_indexed(0), - OneIndexed::from_zero_indexed(0), - OneIndexed::from_zero_indexed(0), - OneIndexed::from_zero_indexed(2), - OneIndexed::from_zero_indexed(2), - OneIndexed::from_zero_indexed(2), - OneIndexed::from_zero_indexed(2), - OneIndexed::from_zero_indexed(2), - OneIndexed::from_zero_indexed(4), - OneIndexed::from_zero_indexed(6), - OneIndexed::from_zero_indexed(6), - OneIndexed::from_zero_indexed(7) - ], - row_to_row_in_cell: vec![ - OneIndexed::from_zero_indexed(0), - OneIndexed::from_zero_indexed(1), - OneIndexed::from_zero_indexed(2), - OneIndexed::from_zero_indexed(3), - OneIndexed::from_zero_indexed(4), - OneIndexed::from_zero_indexed(5), - OneIndexed::from_zero_indexed(0), - OneIndexed::from_zero_indexed(1), - OneIndexed::from_zero_indexed(2), - OneIndexed::from_zero_indexed(3), - OneIndexed::from_zero_indexed(4), - OneIndexed::from_zero_indexed(0), - OneIndexed::from_zero_indexed(0), - OneIndexed::from_zero_indexed(1), - OneIndexed::from_zero_indexed(0) + cell_starts: vec![ + CellStart { + start_row: OneIndexed::MIN, + raw_cell_index: OneIndexed::MIN + }, + CellStart { + start_row: OneIndexed::from_zero_indexed(6), + raw_cell_index: OneIndexed::from_zero_indexed(2) + }, + CellStart { + start_row: OneIndexed::from_zero_indexed(11), + raw_cell_index: OneIndexed::from_zero_indexed(4) + }, + CellStart { + start_row: OneIndexed::from_zero_indexed(12), + raw_cell_index: OneIndexed::from_zero_indexed(6) + }, + CellStart { + start_row: OneIndexed::from_zero_indexed(14), + raw_cell_index: OneIndexed::from_zero_indexed(7) + } ], } );