From b96aa4605b00f036927a25e7a8ff34127d7a5d89 Mon Sep 17 00:00:00 2001 From: Micha Reiser Date: Thu, 7 Aug 2025 11:16:04 +0200 Subject: [PATCH] [ty] Reduce size of member table (#19572) --- crates/ruff_python_ast/src/name.rs | 11 + crates/ty/src/lib.rs | 4 +- .../src/semantic_index/member.rs | 759 ++++++++++++++++-- .../src/semantic_index/place.rs | 78 +- .../src/types/ide_support.rs | 4 +- 5 files changed, 722 insertions(+), 134 deletions(-) diff --git a/crates/ruff_python_ast/src/name.rs b/crates/ruff_python_ast/src/name.rs index ae87bbed6e..a1e1376282 100644 --- a/crates/ruff_python_ast/src/name.rs +++ b/crates/ruff_python_ast/src/name.rs @@ -36,6 +36,10 @@ impl Name { pub fn as_str(&self) -> &str { self.0.as_str() } + + pub fn push_str(&mut self, s: &str) { + self.0.push_str(s); + } } impl Debug for Name { @@ -44,6 +48,13 @@ impl Debug for Name { } } +impl std::fmt::Write for Name { + fn write_str(&mut self, s: &str) -> std::fmt::Result { + self.0.push_str(s); + Ok(()) + } +} + impl AsRef for Name { #[inline] fn as_ref(&self) -> &str { diff --git a/crates/ty/src/lib.rs b/crates/ty/src/lib.rs index 45103c3f48..f16192bcd5 100644 --- a/crates/ty/src/lib.rs +++ b/crates/ty/src/lib.rs @@ -156,7 +156,9 @@ fn run_check(args: CheckCommand) -> anyhow::Result { match std::env::var(EnvVars::TY_MEMORY_REPORT).as_deref() { Ok("short") => write!(stdout, "{}", db.salsa_memory_dump().display_short())?, Ok("mypy_primer") => write!(stdout, "{}", db.salsa_memory_dump().display_mypy_primer())?, - Ok("full") => write!(stdout, "{}", db.salsa_memory_dump().display_full())?, + Ok("full") => { + write!(stdout, "{}", db.salsa_memory_dump().display_full())?; + } Ok(other) => { tracing::warn!( "Unknown value for `TY_MEMORY_REPORT`: `{other}`. Valid values are `short`, `mypy_primer`, and `full`." diff --git a/crates/ty_python_semantic/src/semantic_index/member.rs b/crates/ty_python_semantic/src/semantic_index/member.rs index 8728eee4d6..c1ea125156 100644 --- a/crates/ty_python_semantic/src/semantic_index/member.rs +++ b/crates/ty_python_semantic/src/semantic_index/member.rs @@ -2,9 +2,10 @@ use bitflags::bitflags; use hashbrown::hash_table::Entry; use ruff_index::{IndexVec, newtype_index}; use ruff_python_ast::{self as ast, name::Name}; +use ruff_text_size::{TextLen as _, TextRange, TextSize}; use rustc_hash::FxHasher; use smallvec::SmallVec; -use std::hash::{Hash as _, Hasher as _}; +use std::hash::{Hash, Hasher as _}; use std::ops::{Deref, DerefMut}; /// A member access, e.g. `x.y` or `x[1]` or `x["foo"]`. @@ -25,7 +26,7 @@ impl Member { /// Returns the left most part of the member expression, e.g. `x` in `x.y.z`. /// /// This is the symbol on which the member access is performed. - pub(crate) fn symbol_name(&self) -> &Name { + pub(crate) fn symbol_name(&self) -> &str { self.expression.symbol_name() } @@ -78,10 +79,14 @@ impl Member { /// a method context, or whether the `` actually refers to the first /// parameter of the method (i.e. `self`). To answer those questions, /// use [`Self::as_instance_attribute`]. - pub(super) fn as_instance_attribute_candidate(&self) -> Option<&Name> { - match &*self.expression.segments { - [MemberSegment::Attribute(name)] => Some(name), - _ => None, + pub(super) fn as_instance_attribute_candidate(&self) -> Option<&str> { + let mut segments = self.expression().segments(); + let first_segment = segments.next()?; + + if first_segment.kind == SegmentKind::Attribute && segments.next().is_none() { + Some(first_segment.text) + } else { + None } } @@ -100,11 +105,11 @@ impl Member { /// Does the place expression have the form `self.{name}` (`self` is the first parameter of the method)? pub(super) fn is_instance_attribute_named(&self, name: &str) -> bool { - self.as_instance_attribute().map(Name::as_str) == Some(name) + self.as_instance_attribute() == Some(name) } /// Return `Some()` if the place expression is an instance attribute. - pub(crate) fn as_instance_attribute(&self) -> Option<&Name> { + pub(crate) fn as_instance_attribute(&self) -> Option<&str> { if self.is_instance_attribute() { debug_assert!(self.as_instance_attribute_candidate().is_some()); self.as_instance_attribute_candidate() @@ -142,63 +147,119 @@ impl get_size2::GetSize for MemberFlags {} /// * An integer-based subscript, e.g. `[1]` in `x[1]` /// * A string-based subscript, e.g. `["foo"]` in `x["foo"]` /// -/// Internally, the segments are stored in reverse order. This allows constructing -/// a `MemberExpr` from an ast expression without having to reverse the segments. -#[derive(Clone, Debug, PartialEq, Eq, get_size2::GetSize, Hash)] +/// Uses a compact representation where the entire expression is stored as a single path. +/// For example, `foo.bar[0]["baz"]` is stored as: +/// - path: `foobar0baz` +/// - segments: stores where each segment starts and its kind (attribute, int subscript, string subscript) +/// +/// The symbol name can be extracted from the path by taking the text up to the first segment's start offset. +#[derive(Clone, Debug, PartialEq, Eq, get_size2::GetSize)] pub(crate) struct MemberExpr { - symbol_name: Name, - segments: SmallVec<[MemberSegment; 1]>, + /// The entire path as a single Name + path: Name, + /// Metadata for each segment (in forward order) + segments: Segments, } impl MemberExpr { - pub(super) fn new(symbol_name: Name, segments: SmallVec<[MemberSegment; 1]>) -> Self { - debug_assert!( - !segments.is_empty(), - "A member without segments is a symbol." - ); + pub(super) fn try_from_expr(expression: ast::ExprRef<'_>) -> Option { + fn visit(expr: ast::ExprRef) -> Option<(Name, SmallVec<[SegmentInfo; 8]>)> { + use std::fmt::Write as _; - Self { - symbol_name, - segments, + match expr { + ast::ExprRef::Name(name) => { + Some((name.id.clone(), smallvec::SmallVec::new_const())) + } + ast::ExprRef::Attribute(attribute) => { + let (mut path, mut segments) = visit(ast::ExprRef::from(&attribute.value))?; + + let start_offset = path.text_len(); + let _ = write!(path, "{}", attribute.attr.id); + segments.push(SegmentInfo::new(SegmentKind::Attribute, start_offset)); + + Some((path, segments)) + } + ast::ExprRef::Subscript(subscript) => { + let (mut path, mut segments) = visit((&subscript.value).into())?; + let start_offset = path.text_len(); + + match &*subscript.slice { + ast::Expr::NumberLiteral(ast::ExprNumberLiteral { + value: ast::Number::Int(index), + .. + }) => { + let _ = write!(path, "{index}"); + segments + .push(SegmentInfo::new(SegmentKind::IntSubscript, start_offset)); + } + ast::Expr::StringLiteral(string) => { + let _ = write!(path, "{}", string.value); + segments + .push(SegmentInfo::new(SegmentKind::StringSubscript, start_offset)); + } + _ => { + return None; + } + } + + Some((path, segments)) + } + _ => None, + } + } + + let (path, segments) = visit(expression)?; + + if segments.is_empty() { + None + } else { + Some(Self { + path, + segments: Segments::from_vec(segments), + }) } } + fn segment_infos(&self) -> impl Iterator + '_ { + self.segments.iter() + } + + fn segments(&self) -> impl Iterator> + '_ { + SegmentsIterator::new(self.path.as_str(), self.segment_infos()) + } + fn shrink_to_fit(&mut self) { - self.segments.shrink_to_fit(); - self.segments.shrink_to_fit(); + self.path.shrink_to_fit(); } /// Returns the left most part of the member expression, e.g. `x` in `x.y.z`. /// /// This is the symbol on which the member access is performed. - pub(crate) fn symbol_name(&self) -> &Name { - &self.symbol_name + pub(crate) fn symbol_name(&self) -> &str { + self.as_ref().symbol_name() } - /// Returns the segments of the member expression, e.g. `[MemberSegment::Attribute("y"), MemberSegment::IntSubscript(1)]` for `x.y[1]`. - pub(crate) fn member_segments( - &self, - ) -> impl ExactSizeIterator + DoubleEndedIterator { - self.segments.iter().rev() + pub(super) fn num_segments(&self) -> usize { + self.segments.len() } pub(crate) fn as_ref(&self) -> MemberExprRef { MemberExprRef { - name: self.symbol_name.as_str(), - segments: self.segments.as_slice(), + path: self.path.as_str(), + segments: SegmentsRef::from(&self.segments), } } } impl std::fmt::Display for MemberExpr { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.write_str(self.symbol_name.as_str())?; + f.write_str(self.symbol_name())?; - for segment in self.member_segments() { - match segment { - MemberSegment::Attribute(name) => write!(f, ".{name}")?, - MemberSegment::IntSubscript(int) => write!(f, "[{int}]")?, - MemberSegment::StringSubscript(string) => write!(f, "[\"{string}\"]")?, + for segment in self.segments() { + match segment.kind { + SegmentKind::Attribute => write!(f, ".{}", segment.text)?, + SegmentKind::IntSubscript => write!(f, "[{}]", segment.text)?, + SegmentKind::StringSubscript => write!(f, "[\"{}\"]", segment.text)?, } } @@ -230,42 +291,43 @@ impl PartialEq<&MemberExpr> for MemberExprRef<'_> { } } -#[derive(Debug, Clone, PartialEq, Eq, Hash, get_size2::GetSize)] -pub(crate) enum MemberSegment { - /// An attribute access, e.g. `.y` in `x.y` - Attribute(Name), - /// An integer-based index access, e.g. `[1]` in `x[1]` - IntSubscript(ast::Int), - /// A string-based index access, e.g. `["foo"]` in `x["foo"]` - StringSubscript(String), -} - /// Reference to a member expression. -#[derive(Debug, PartialEq, Eq, Hash, Copy, Clone)] +#[derive(Debug, Clone, PartialEq, Eq)] pub(crate) struct MemberExprRef<'a> { - name: &'a str, - segments: &'a [MemberSegment], + path: &'a str, + segments: SegmentsRef<'a>, } impl<'a> MemberExprRef<'a> { pub(super) fn symbol_name(&self) -> &'a str { - self.name + let end = self + .segments + .iter() + .next() + .map(SegmentInfo::offset) + .unwrap_or(self.path.text_len()); + + let range = TextRange::new(TextSize::default(), end); + + &self.path[range] } - /// Create a new `MemberExprRef` from a name and segments. - /// - /// Note that the segments are expected to be in reverse order, i.e. the last segment is the first one in the expression. - pub(super) fn from_raw(name: &'a str, segments: &'a [MemberSegment]) -> Self { - debug_assert!( - !segments.is_empty(), - "A member without segments is a symbol." - ); - Self { name, segments } + #[cfg(test)] + fn segments(&self) -> impl Iterator> + '_ { + SegmentsIterator::new(self.path, self.segments.iter()) } - /// Returns a slice over the member segments. The segments are in reverse order, - pub(super) fn rev_member_segments(&self) -> &'a [MemberSegment] { - self.segments + pub(super) fn parent(&self) -> Option> { + let parent_segments = self.segments.parent()?; + + // The removed segment is always the last one. Find its start offset. + let last_segment = self.segments.iter().last()?; + let path_end = last_segment.offset(); + + Some(MemberExprRef { + path: &self.path[TextRange::new(TextSize::default(), path_end)], + segments: parent_segments, + }) } } @@ -275,6 +337,14 @@ impl<'a> From<&'a MemberExpr> for MemberExprRef<'a> { } } +impl Hash for MemberExprRef<'_> { + fn hash(&self, state: &mut H) { + // Path on its own isn't 100% unique, but it should avoid + // most collisions and avoids iterating all segments. + self.path.hash(state); + } +} + /// Uniquely identifies a member in a scope. #[newtype_index] #[derive(get_size2::GetSize, salsa::Update)] @@ -315,10 +385,8 @@ impl MemberTable { self.members.iter() } - fn hash_member_expression_ref(member: MemberExprRef) -> u64 { - let mut h = FxHasher::default(); - member.hash(&mut h); - h.finish() + fn hash_member_expression_ref(member: &MemberExprRef) -> u64 { + hash_single(member) } /// Returns the ID of the member with the given expression, if it exists. @@ -327,7 +395,7 @@ impl MemberTable { member: impl Into>, ) -> Option { let member = member.into(); - let hash = Self::hash_member_expression_ref(member); + let hash = Self::hash_member_expression_ref(&member); self.map .find(hash, |id| self.members[*id].expression == member) .copied() @@ -369,12 +437,14 @@ impl MemberTableBuilder { /// /// Members are identified by their expression, which is hashed to find the entry in the table. pub(super) fn add(&mut self, mut member: Member) -> (ScopedMemberId, bool) { - let hash = MemberTable::hash_member_expression_ref(member.expression.as_ref()); + let member_ref = member.expression.as_ref(); + let hash = MemberTable::hash_member_expression_ref(&member_ref); let entry = self.table.map.entry( hash, - |id| self.table.members[*id].expression == member.expression, + |id| self.table.members[*id].expression.as_ref() == member.expression.as_ref(), |id| { - MemberTable::hash_member_expression_ref(self.table.members[*id].expression.as_ref()) + let ref_expr = self.table.members[*id].expression.as_ref(); + MemberTable::hash_member_expression_ref(&ref_expr) }, ); @@ -402,7 +472,8 @@ impl MemberTableBuilder { let mut table = self.table; table.members.shrink_to_fit(); table.map.shrink_to_fit(|id| { - MemberTable::hash_member_expression_ref(table.members[*id].expression.as_ref()) + let ref_expr = table.members[*id].expression.as_ref(); + MemberTable::hash_member_expression_ref(&ref_expr) }); table } @@ -421,3 +492,545 @@ impl DerefMut for MemberTableBuilder { &mut self.table } } + +/// Representation of segments that can be either inline or heap-allocated. +/// +/// Design choices: +/// - Uses `Box<[SegmentInfo]>` instead of `ThinVec` because even with a `ThinVec`, the size of `Segments` is still 128 bytes. +/// - Uses u64 for inline storage. That's the largest size without increasing the overall size of `Segments` and allows to encode up to 7 segments. +#[derive(Clone, Debug, PartialEq, Eq, get_size2::GetSize)] +enum Segments { + /// Inline storage for up to 7 segments with 6-bit relative offsets (max 63 bytes per segment) + Small(SmallSegments), + /// Heap storage for expressions that don't fit inline + Heap(Box<[SegmentInfo]>), +} + +static_assertions::assert_eq_size!(SmallSegments, u64); +static_assertions::assert_eq_size!(Segments, [u64; 2]); + +impl Segments { + fn from_vec(segments: SmallVec<[SegmentInfo; 8]>) -> Self { + debug_assert!( + !segments.is_empty(), + "Segments cannot be empty. A member without segments is a symbol" + ); + if let Some(small) = SmallSegments::try_from_slice(&segments) { + Self::Small(small) + } else { + Self::Heap(segments.into_vec().into_boxed_slice()) + } + } + + fn len(&self) -> usize { + match self { + Self::Small(small) => small.len(), + Self::Heap(segments) => segments.len(), + } + } + + fn iter(&self) -> impl Iterator + '_ { + match self { + Self::Small(small) => itertools::Either::Left(small.iter()), + Self::Heap(heap) => itertools::Either::Right(heap.iter().copied()), + } + } +} + +/// Segment metadata - packed into a single u32 +/// Layout: [kind: 2 bits][offset: 30 bits] +/// - Bits 0-1: `SegmentKind` (0=Attribute, 1=IntSubscript, 2=StringSubscript) +/// - Bits 2-31: Absolute offset from start of path (up to 1,073,741,823 bytes) +#[derive(Clone, Copy, PartialEq, Eq, Hash, get_size2::GetSize)] +struct SegmentInfo(u32); + +const KIND_MASK: u32 = 0b11; +const OFFSET_SHIFT: u32 = 2; +const MAX_OFFSET: u32 = (1 << 30) - 1; // 2^30 - 1 + +impl SegmentInfo { + const fn new(kind: SegmentKind, offset: TextSize) -> Self { + assert!(offset.to_u32() < MAX_OFFSET); + + let value = (offset.to_u32() << OFFSET_SHIFT) | (kind as u32); + Self(value) + } + + const fn kind(self) -> SegmentKind { + match self.0 & KIND_MASK { + 0 => SegmentKind::Attribute, + 1 => SegmentKind::IntSubscript, + 2 => SegmentKind::StringSubscript, + _ => panic!("Invalid SegmentKind bits"), + } + } + + const fn offset(self) -> TextSize { + TextSize::new(self.0 >> OFFSET_SHIFT) + } +} + +impl std::fmt::Debug for SegmentInfo { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SegmentInfo") + .field("kind", &self.kind()) + .field("offset", &self.offset()) + .finish() + } +} + +struct Segment<'a> { + kind: SegmentKind, + text: &'a str, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, get_size2::GetSize)] +#[repr(u8)] +enum SegmentKind { + Attribute = 0, + IntSubscript = 1, + StringSubscript = 2, +} + +/// Iterator over segments that converts `SegmentInfo` to `Segment` with text slices. +struct SegmentsIterator<'a, I> { + path: &'a str, + segment_infos: I, + current: Option, + next: Option, +} + +impl<'a, I> SegmentsIterator<'a, I> +where + I: Iterator, +{ + fn new(path: &'a str, mut segment_infos: I) -> Self { + let current = segment_infos.next(); + let next = segment_infos.next(); + + Self { + path, + segment_infos, + current, + next, + } + } +} + +impl<'a, I> Iterator for SegmentsIterator<'a, I> +where + I: Iterator, +{ + type Item = Segment<'a>; + + fn next(&mut self) -> Option { + let info = self.current.take()?; + let end = self + .next + .map(SegmentInfo::offset) + .unwrap_or(self.path.text_len()); + + self.current = self.next; + self.next = self.segment_infos.next(); + + Some(Segment { + kind: info.kind(), + text: &self.path[TextRange::new(info.offset(), end)], + }) + } +} + +const INLINE_COUNT_BITS: u32 = 3; +const INLINE_COUNT_MASK: u64 = (1 << INLINE_COUNT_BITS) - 1; +const INLINE_SEGMENT_BITS: u32 = 8; +const INLINE_SEGMENT_MASK: u64 = (1 << INLINE_SEGMENT_BITS) - 1; +const INLINE_KIND_BITS: u32 = 2; +const INLINE_KIND_MASK: u64 = (1 << INLINE_KIND_BITS) - 1; +const INLINE_PREV_LEN_BITS: u32 = 6; +const INLINE_PREV_LEN_MASK: u64 = (1 << INLINE_PREV_LEN_BITS) - 1; +const INLINE_MAX_SEGMENTS: usize = 7; +const INLINE_MAX_RELATIVE_OFFSET: u32 = (1 << INLINE_PREV_LEN_BITS) - 1; // 63 + +/// Compact representation that can store up to 7 segments inline in a u64. +/// +/// Layout: +/// - Bits 0-2: Number of segments minus 1 (0-6, representing 1-7 segments) +/// - Bits 3-10: Segment 0 (2 bits kind + 6 bits relative offset, max 63 bytes) +/// - Bits 11-18: Segment 1 (2 bits kind + 6 bits relative offset, max 63 bytes) +/// - Bits 19-26: Segment 2 (2 bits kind + 6 bits relative offset, max 63 bytes) +/// - Bits 27-34: Segment 3 (2 bits kind + 6 bits relative offset, max 63 bytes) +/// - Bits 35-42: Segment 4 (2 bits kind + 6 bits relative offset, max 63 bytes) +/// - Bits 43-50: Segment 5 (2 bits kind + 6 bits relative offset, max 63 bytes) +/// - Bits 51-58: Segment 6 (2 bits kind + 6 bits relative offset, max 63 bytes) +/// - Bits 59-63: Unused (5 bits) +/// +/// Constraints: +/// - Maximum 7 segments (realistic limit for member access chains) +/// - Maximum 63-byte relative offset per segment (sufficient for most identifiers) +/// - Never empty (`segments.len()` >= 1) +/// +#[derive(Clone, Copy, PartialEq, Eq, get_size2::GetSize)] +#[repr(transparent)] +struct SmallSegments(u64); + +impl SmallSegments { + fn try_from_slice(segments: &[SegmentInfo]) -> Option { + if segments.is_empty() || segments.len() > INLINE_MAX_SEGMENTS { + return None; + } + + // Pack into inline representation + // Store count minus 1 (since segments are never empty, range 0-6 represents 1-7 segments) + let mut packed = (segments.len() - 1) as u64; + let mut prev_offset = TextSize::new(0); + + for (i, segment) in segments.iter().enumerate() { + // Compute relative offset on-the-fly + let relative_offset = segment.offset() - prev_offset; + if relative_offset > TextSize::from(INLINE_MAX_RELATIVE_OFFSET) { + return None; + } + + let kind = segment.kind() as u64; + let relative_offset_val = u64::from(relative_offset.to_u32()); + let segment_data = (relative_offset_val << INLINE_KIND_BITS) | kind; + let shift = INLINE_COUNT_BITS + + (u32::try_from(i).expect("i is bounded by INLINE_MAX_SEGMENTS") + * INLINE_SEGMENT_BITS); + packed |= segment_data << shift; + + prev_offset = segment.offset(); + } + + Some(Self(packed)) + } + + #[expect( + clippy::cast_possible_truncation, + reason = "INLINE_COUNT_MASK ensures value is at most 7" + )] + const fn len(self) -> usize { + // Add 1 because we store count minus 1 + ((self.0 & INLINE_COUNT_MASK) + 1) as usize + } + + fn iter(self) -> SmallSegmentsInfoIterator { + SmallSegmentsInfoIterator { + segments: self, + index: 0, + next_offset: TextSize::new(0), + } + } + + /// Returns the parent member expression, e.g. `x.b` from `x.b.c`, or `None` if the parent is + /// the `symbol` itself (e, g. parent of `x.a` is just `x`). + const fn parent(self) -> Option { + let len = self.len(); + if len <= 1 { + return None; + } + + // Simply copy the packed value but update the count + let mut new_packed = self.0; + + // Clear the count bits and set the new count (len - 2, since we store count - 1) + new_packed &= !INLINE_COUNT_MASK; + new_packed |= (len - 2) as u64; + + Some(Self(new_packed)) + } +} + +impl std::fmt::Debug for SmallSegments { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_list().entries(self.iter()).finish() + } +} + +struct SmallSegmentsInfoIterator { + segments: SmallSegments, + index: usize, + next_offset: TextSize, +} + +impl Iterator for SmallSegmentsInfoIterator { + type Item = SegmentInfo; + + fn next(&mut self) -> Option { + let count = self.segments.len(); + if self.index >= count { + return None; + } + + // Extract the relative offset and kind for the current segment + let shift = INLINE_COUNT_BITS + + (u32::try_from(self.index).expect("index is bounded by INLINE_MAX_SEGMENTS") + * INLINE_SEGMENT_BITS); + let segment_data = (self.segments.0 >> shift) & INLINE_SEGMENT_MASK; + let kind = (segment_data & INLINE_KIND_MASK) as u8; + let relative_offset = ((segment_data >> INLINE_KIND_BITS) & INLINE_PREV_LEN_MASK) as u32; + + // Update the running absolute offset + self.next_offset += TextSize::new(relative_offset); + + let kind = match kind { + 0 => SegmentKind::Attribute, + 1 => SegmentKind::IntSubscript, + 2 => SegmentKind::StringSubscript, + _ => panic!("Invalid SegmentKind bits"), + }; + + self.index += 1; + Some(SegmentInfo::new(kind, self.next_offset)) + } +} + +/// Reference view of segments, can be either small (inline) or heap-allocated. +#[derive(Clone, Copy, Debug)] +enum SegmentsRef<'a> { + Small(SmallSegments), + Heap(&'a [SegmentInfo]), +} + +impl<'a> SegmentsRef<'a> { + fn len(&self) -> usize { + match self { + Self::Small(small) => small.len(), + Self::Heap(segments) => segments.len(), + } + } + + fn iter(&self) -> impl Iterator + '_ { + match self { + Self::Small(small) => itertools::Either::Left(small.iter()), + Self::Heap(heap) => itertools::Either::Right(heap.iter().copied()), + } + } + + /// Returns a parent view with one fewer segment, or None if <= 1 segment + fn parent(&self) -> Option> { + match self { + Self::Small(small) => small.parent().map(SegmentsRef::Small), + Self::Heap(segments) => { + let len = segments.len(); + if len <= 1 { + None + } else { + Some(SegmentsRef::Heap(&segments[..len - 1])) + } + } + } + } +} + +impl<'a> From<&'a Segments> for SegmentsRef<'a> { + fn from(segments: &'a Segments) -> Self { + match segments { + Segments::Small(small) => SegmentsRef::Small(*small), + Segments::Heap(heap) => SegmentsRef::Heap(heap), + } + } +} + +impl PartialEq for SegmentsRef<'_> { + fn eq(&self, other: &Self) -> bool { + let len = self.len(); + if len != other.len() { + return false; + } + self.iter().eq(other.iter()) + } +} + +impl Eq for SegmentsRef<'_> {} + +/// Helper function to hash a single value and return the hash. +fn hash_single(value: &T) -> u64 { + let mut hasher = FxHasher::default(); + value.hash(&mut hasher); + hasher.finish() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_member_expr_ref_hash_and_eq_small_heap() { + // For expression: foo.bar[0]["baz"] + // The path would be: "foobar0baz" (no dots or brackets in the path) + let path = "foobar0baz"; + + let segments = vec![ + SegmentInfo::new(SegmentKind::Attribute, TextSize::new(3)), // .bar at offset 3 + SegmentInfo::new(SegmentKind::IntSubscript, TextSize::new(6)), // [0] at offset 6 + SegmentInfo::new(SegmentKind::StringSubscript, TextSize::new(7)), // ["baz"] at offset 7 + ]; + + // Create Small version. + let small_segments = SmallSegments::try_from_slice(&segments).unwrap(); + let member_ref_small = MemberExprRef { + path, + segments: SegmentsRef::Small(small_segments), + }; + + // Create Heap version with the same data. + let heap_segments: Box<[SegmentInfo]> = segments.into_boxed_slice(); + let member_ref_heap = MemberExprRef { + path, + segments: SegmentsRef::Heap(&heap_segments), + }; + + // Test hash equality (MemberExprRef only hashes the path). + assert_eq!( + hash_single(&member_ref_small), + hash_single(&member_ref_heap) + ); + + // Test equality in both directions. + assert_eq!(member_ref_small, member_ref_heap); + assert_eq!(member_ref_heap, member_ref_small); + } + + #[test] + fn test_member_expr_ref_different_segments() { + // For expressions: foo.bar[0] vs foo.bar["0"] + // Both have the same path "foobar0" but different segment types + let path = "foobar0"; + + // First expression: foo.bar[0] + let segments1 = vec![ + SegmentInfo::new(SegmentKind::Attribute, TextSize::new(3)), // .bar at offset 3 + SegmentInfo::new(SegmentKind::IntSubscript, TextSize::new(6)), // [0] at offset 6 + ]; + + // Second expression: foo.bar["0"] + let segments2 = vec![ + SegmentInfo::new(SegmentKind::Attribute, TextSize::new(3)), // .bar at offset 3 + SegmentInfo::new(SegmentKind::StringSubscript, TextSize::new(6)), // ["0"] at offset 6 + ]; + + // Create MemberExprRef instances + let small1 = SmallSegments::try_from_slice(&segments1).unwrap(); + let member_ref1 = MemberExprRef { + path, + segments: SegmentsRef::Small(small1), + }; + + let small2 = SmallSegments::try_from_slice(&segments2).unwrap(); + let member_ref2 = MemberExprRef { + path, + segments: SegmentsRef::Small(small2), + }; + + // Test inequality + assert_ne!(member_ref1, member_ref2); + assert_ne!(member_ref2, member_ref1); + + // Test hash equality (MemberExprRef only hashes the path, not segments) + assert_eq!(hash_single(&member_ref1), hash_single(&member_ref2)); + } + + #[test] + fn test_member_expr_ref_parent() { + use ruff_python_parser::parse_expression; + + // Parse a real Python expression + let parsed = parse_expression(r#"foo.bar[0]["baz"]"#).unwrap(); + let expr = parsed.expr(); + + // Convert to MemberExpr + let member_expr = MemberExpr::try_from_expr(ast::ExprRef::from(expr)).unwrap(); + let member_ref = member_expr.as_ref(); + + // Verify the initial state: foo.bar[0]["baz"] + assert_eq!(member_ref.symbol_name(), "foo"); + let segments: Vec<_> = member_ref.segments().map(|s| (s.kind, s.text)).collect(); + assert_eq!( + segments, + vec![ + (SegmentKind::Attribute, "bar"), + (SegmentKind::IntSubscript, "0"), + (SegmentKind::StringSubscript, "baz") + ] + ); + + // Test parent() removes the last segment ["baz"] -> foo.bar[0] + let parent1 = member_ref.parent().unwrap(); + assert_eq!(parent1.symbol_name(), "foo"); + let parent1_segments: Vec<_> = parent1.segments().map(|s| (s.kind, s.text)).collect(); + assert_eq!( + parent1_segments, + vec![ + (SegmentKind::Attribute, "bar"), + (SegmentKind::IntSubscript, "0") + ] + ); + + // Test parent of parent removes [0] -> foo.bar + let parent2 = parent1.parent().unwrap(); + assert_eq!(parent2.symbol_name(), "foo"); + let parent2_segments: Vec<_> = parent2.segments().map(|s| (s.kind, s.text)).collect(); + assert_eq!(parent2_segments, vec![(SegmentKind::Attribute, "bar")]); + + // Test parent of single segment is a symbol and not a member. + let parent3 = parent2.parent(); + assert!(parent3.is_none()); + } + + #[test] + fn test_member_expr_small_vs_heap_allocation() { + use ruff_python_parser::parse_expression; + + // Test Small allocation: 7 segments (maximum for inline storage) + // Create expression with exactly 7 segments: x.a.b.c.d.e.f.g + let small_expr = parse_expression("x.a.b.c.d.e.f.g").unwrap(); + let small_member = + MemberExpr::try_from_expr(ast::ExprRef::from(small_expr.expr())).unwrap(); + + // Should use Small allocation + assert!(matches!(small_member.segments, Segments::Small(_))); + assert_eq!(small_member.num_segments(), 7); + + // Test Heap allocation: 8 segments (exceeds inline capacity) + // Create expression with 8 segments: x.a.b.c.d.e.f.g.h + let heap_expr = parse_expression("x.a.b.c.d.e.f.g.h").unwrap(); + let heap_member = MemberExpr::try_from_expr(ast::ExprRef::from(heap_expr.expr())).unwrap(); + + // Should use Heap allocation + assert!(matches!(heap_member.segments, Segments::Heap(_))); + assert_eq!(heap_member.num_segments(), 8); + + // Test Small allocation with relative offset limit + // Create expression where relative offsets are small enough: a.b[0]["c"] + let small_offset_expr = parse_expression(r#"a.b[0]["c"]"#).unwrap(); + let small_offset_member = + MemberExpr::try_from_expr(ast::ExprRef::from(small_offset_expr.expr())).unwrap(); + + // Should use Small allocation (3 segments, small offsets) + assert!(matches!(small_offset_member.segments, Segments::Small(_))); + assert_eq!(small_offset_member.num_segments(), 3); + + // Test Small allocation with maximum 63-byte relative offset limit + // Create expression where one segment has exactly 63 bytes (the limit) + let segment_63_bytes = "a".repeat(63); + let max_offset_expr_code = format!("x.{segment_63_bytes}.y"); + let max_offset_expr = parse_expression(&max_offset_expr_code).unwrap(); + let max_offset_member = + MemberExpr::try_from_expr(ast::ExprRef::from(max_offset_expr.expr())).unwrap(); + // Should still use Small allocation (exactly at the limit) + assert!(matches!(max_offset_member.segments, Segments::Small(_))); + assert_eq!(max_offset_member.num_segments(), 2); + + // Test that heap allocation works for segment content that would exceed relative offset limits + // This would require very long identifiers (>63 bytes between segments), which is uncommon + // but we can test by creating an expression with long attribute names + let long_name = "a".repeat(64); // 64 bytes (exceeds 63-byte limit) + let long_expr_code = format!("x.{long_name}.y"); + let long_expr = parse_expression(&long_expr_code).unwrap(); + let long_member = MemberExpr::try_from_expr(ast::ExprRef::from(long_expr.expr())).unwrap(); + // Should use Heap allocation due to large relative offset + assert!(matches!(long_member.segments, Segments::Heap(_))); + assert_eq!(long_member.num_segments(), 2); + } +} diff --git a/crates/ty_python_semantic/src/semantic_index/place.rs b/crates/ty_python_semantic/src/semantic_index/place.rs index b36b683a2d..f16c606747 100644 --- a/crates/ty_python_semantic/src/semantic_index/place.rs +++ b/crates/ty_python_semantic/src/semantic_index/place.rs @@ -1,6 +1,5 @@ use crate::semantic_index::member::{ - Member, MemberExpr, MemberExprRef, MemberSegment, MemberTable, MemberTableBuilder, - ScopedMemberId, + Member, MemberExpr, MemberExprRef, MemberTable, MemberTableBuilder, ScopedMemberId, }; use crate::semantic_index::scope::FileScopeId; use crate::semantic_index::symbol::{ScopedSymbolId, Symbol, SymbolTable, SymbolTableBuilder}; @@ -37,48 +36,14 @@ impl PlaceExpr { /// * attribute: `x.y` /// * subscripts with integer or string literals: `x[0]`, `x['key']` pub(crate) fn try_from_expr<'e>(expr: impl Into>) -> Option { - let mut current = expr.into(); - let mut segments = smallvec::SmallVec::new_const(); + let expr = expr.into(); - loop { - match current { - ast::ExprRef::Name(name) => { - if segments.is_empty() { - return Some(PlaceExpr::Symbol(Symbol::new(name.id.clone()))); - } - - return Some(PlaceExpr::Member(Member::new(MemberExpr::new( - name.id.clone(), - segments, - )))); - } - ast::ExprRef::Attribute(attribute) => { - segments.push(MemberSegment::Attribute(attribute.attr.id.clone())); - current = ast::ExprRef::from(&attribute.value); - } - ast::ExprRef::Subscript(subscript) => { - match &*subscript.slice { - ast::Expr::NumberLiteral(ast::ExprNumberLiteral { - value: ast::Number::Int(index), - .. - }) => { - segments.push(MemberSegment::IntSubscript(index.clone())); - } - ast::Expr::StringLiteral(string) => { - segments.push(MemberSegment::StringSubscript(string.value.to_string())); - } - _ => { - return None; - } - } - - current = ast::ExprRef::from(&subscript.value); - } - _ => { - return None; - } - } + if let ast::ExprRef::Name(name) = expr { + return Some(PlaceExpr::Symbol(Symbol::new(name.id.clone()))); } + + let member_expression = MemberExpr::try_from_expr(expr)?; + Some(Self::Member(Member::new(member_expression))) } } @@ -132,7 +97,7 @@ impl<'a> PlaceExprRef<'a> { pub(crate) fn num_member_segments(self) -> usize { match self { PlaceExprRef::Symbol(_) => 0, - PlaceExprRef::Member(member) => member.expression().member_segments().len(), + PlaceExprRef::Member(member) => member.expression().num_segments(), } } } @@ -517,24 +482,20 @@ enum ParentPlaceIterState<'a> { impl<'a> ParentPlaceIterState<'a> { fn parent_state( - expression: MemberExprRef<'a>, + expression: &MemberExprRef<'a>, symbols: &'a SymbolTable, members: &'a MemberTable, ) -> Self { - let segments = expression.rev_member_segments(); - let segments = &segments[1..]; - - if segments.is_empty() { - Self::Symbol { - symbol_name: expression.symbol_name(), - symbols, - } - } else { - Self::Member { - next_member: MemberExprRef::from_raw(expression.symbol_name(), segments), + match expression.parent() { + Some(parent) => Self::Member { + next_member: parent, symbols, members, - } + }, + None => Self::Symbol { + symbol_name: expression.symbol_name(), + symbols, + }, } } } @@ -549,9 +510,10 @@ impl<'a> ParentPlaceIter<'a> { symbol_table: &'a SymbolTable, member_table: &'a MemberTable, ) -> Self { + let expr_ref = expression.as_ref(); ParentPlaceIter { state: Some(ParentPlaceIterState::parent_state( - expression.as_ref(), + &expr_ref, symbol_table, member_table, )), @@ -578,7 +540,7 @@ impl Iterator for ParentPlaceIter<'_> { next_member, } => { self.state = Some(ParentPlaceIterState::parent_state( - next_member, + &next_member, symbols, members, )); diff --git a/crates/ty_python_semantic/src/types/ide_support.rs b/crates/ty_python_semantic/src/types/ide_support.rs index 06d30244f6..27e1a0fcdb 100644 --- a/crates/ty_python_semantic/src/types/ide_support.rs +++ b/crates/ty_python_semantic/src/types/ide_support.rs @@ -309,12 +309,12 @@ impl<'db> AllMembers<'db> { let Some(name) = place_expr.as_instance_attribute() else { continue; }; - let result = ty.member(db, name.as_str()); + let result = ty.member(db, name); let Some(ty) = result.place.ignore_possibly_unbound() else { continue; }; self.members.insert(Member { - name: name.clone(), + name: Name::new(name), ty, }); }