From b2a49d81401a737290177ba75794f768b10bacd1 Mon Sep 17 00:00:00 2001
From: Carl Meyer <carl@astral.sh>
Date: Wed, 17 Jul 2024 17:50:58 -0700
Subject: [PATCH] [red-knot] better docs for use-def maps (#12357)

Add better doc comments and comments, as well as one debug assertion, to
use-def map building.
---
 .../src/semantic_index.rs                     |   5 +-
 .../src/semantic_index/builder.rs             |  20 +-
 .../src/semantic_index/use_def.rs             | 258 +++++++++++++++---
 3 files changed, 231 insertions(+), 52 deletions(-)
diff --git a/crates/red_knot_python_semantic/src/semantic_index.rs b/crates/red_knot_python_semantic/src/semantic_index.rs
index 1ef5bfaee0..32a4648bb1 100644
--- a/crates/red_knot_python_semantic/src/semantic_index.rs
+++ b/crates/red_knot_python_semantic/src/semantic_index.rs
@@ -15,7 +15,6 @@ use crate::semantic_index::expression::Expression;
 use crate::semantic_index::symbol::{
     FileScopeId, NodeWithScopeKey, NodeWithScopeRef, Scope, ScopeId, ScopedSymbolId, SymbolTable,
 };
-use crate::semantic_index::use_def::UseDefMap;
 use crate::Db;
 
 pub mod ast_ids;
@@ -23,7 +22,9 @@ mod builder;
 pub mod definition;
 pub mod expression;
 pub mod symbol;
-pub mod use_def;
+mod use_def;
+
+pub(crate) use self::use_def::UseDefMap;
 
 type SymbolMap = hashbrown::HashMap<ScopedSymbolId, (), ()>;
 
diff --git a/crates/red_knot_python_semantic/src/semantic_index/builder.rs b/crates/red_knot_python_semantic/src/semantic_index/builder.rs
index c9285116c3..0a6733db23 100644
--- a/crates/red_knot_python_semantic/src/semantic_index/builder.rs
+++ b/crates/red_knot_python_semantic/src/semantic_index/builder.rs
@@ -139,11 +139,11 @@ impl<'db> SemanticIndexBuilder<'db> {
         self.current_use_def_map().snapshot()
     }
 
-    fn flow_set(&mut self, state: &FlowSnapshot) {
-        self.current_use_def_map().set(state);
+    fn flow_restore(&mut self, state: FlowSnapshot) {
+        self.current_use_def_map().restore(state);
     }
 
-    fn flow_merge(&mut self, state: &FlowSnapshot) {
+    fn flow_merge(&mut self, state: FlowSnapshot) {
         self.current_use_def_map().merge(state);
     }
 
@@ -397,20 +397,20 @@ where
                 let mut post_clauses: Vec<FlowSnapshot> = vec![self.flow_snapshot()];
                 for clause in &node.elif_else_clauses {
                     // we can only take an elif/else clause if none of the previous ones were taken
-                    self.flow_set(&pre_if);
+                    self.flow_restore(pre_if.clone());
                     self.visit_elif_else_clause(clause);
                     post_clauses.push(self.flow_snapshot());
                     if clause.test.is_none() {
                         last_clause_is_else = true;
                     }
                 }
-                let mut post_clause_iter = post_clauses.iter();
+                let mut post_clause_iter = post_clauses.into_iter();
                 if last_clause_is_else {
                     // if the last clause was an else, the pre_if state can't directly reach the
-                    // post-state; we have to enter one of the clauses.
-                    self.flow_set(post_clause_iter.next().unwrap());
+                    // post-state; we must enter one of the clauses.
+                    self.flow_restore(post_clause_iter.next().unwrap());
                 } else {
-                    self.flow_set(&pre_if);
+                    self.flow_restore(pre_if);
                 }
                 for post_clause_state in post_clause_iter {
                     self.flow_merge(post_clause_state);
@@ -483,9 +483,9 @@ where
                 let pre_if = self.flow_snapshot();
                 self.visit_expr(body);
                 let post_body = self.flow_snapshot();
-                self.flow_set(&pre_if);
+                self.flow_restore(pre_if);
                 self.visit_expr(orelse);
-                self.flow_merge(&post_body);
+                self.flow_merge(post_body);
             }
             _ => {
                 walk_expr(self, expr);
diff --git a/crates/red_knot_python_semantic/src/semantic_index/use_def.rs b/crates/red_knot_python_semantic/src/semantic_index/use_def.rs
index 4aa0aa0f76..9e501a30a8 100644
--- a/crates/red_knot_python_semantic/src/semantic_index/use_def.rs
+++ b/crates/red_knot_python_semantic/src/semantic_index/use_def.rs
@@ -1,3 +1,130 @@
+//! Build a map from each use of a symbol to the definitions visible from that use.
+//!
+//! Let's take this code sample:
+//!
+//! ```python
+//! x = 1
+//! x = 2
+//! y = x
+//! if flag:
+//!     x = 3
+//! else:
+//!     x = 4
+//! z = x
+//! ```
+//!
+//! In this snippet, we have four definitions of `x` (the statements assigning `1`, `2`, `3`,
+//! and `4` to it), and two uses of `x` (the `y = x` and `z = x` assignments). The first
+//! [`Definition`] of `x` is never visible to any use, because it's immediately replaced by the
+//! second definition, before any use happens. (A linter could thus flag the statement `x = 1`
+//! as likely superfluous.)
+//!
+//! The first use of `x` has one definition visible to it: the assignment `x = 2`.
+//!
+//! Things get a bit more complex when we have branches. We will definitely take either the `if` or
+//! the `else` branch. Thus, the second use of `x` has two definitions visible to it: `x = 3` and
+//! `x = 4`. The `x = 2` definition is no longer visible, because it must be replaced by either `x
+//! = 3` or `x = 4`, no matter which branch was taken. We don't know which branch was taken, so we
+//! must consider both definitions as visible, which means eventually we would (in type inference)
+//! look at these two definitions and infer a type of `Literal[3, 4]` -- the union of `Literal[3]`
+//! and `Literal[4]` -- for the second use of `x`.
+//!
+//! So that's one question our use-def map needs to answer: given a specific use of a symbol, which
+//! definition(s) is/are visible from that use. In
+//! [`AstIds`](crate::semantic_index::ast_ids::AstIds) we number all uses (that means a `Name` node
+//! with `Load` context) so we have a `ScopedUseId` to efficiently represent each use.
+//!
+//! The other case we need to handle is when a symbol is referenced from a different scope (the
+//! most obvious example of this is an import). We call this "public" use of a symbol. So the other
+//! question we need to be able to answer is, what are the publicly-visible definitions of each
+//! symbol?
+//!
+//! Technically, public use of a symbol could also occur from any point in control flow of the
+//! scope where the symbol is defined (via inline imports and import cycles, in the case of an
+//! import, or via a function call partway through the local scope that ends up using a symbol from
+//! the scope via a global or nonlocal reference.) But modeling this fully accurately requires
+//! whole-program analysis that isn't tractable for an efficient incremental compiler, since it
+//! means a given symbol could have a different type every place it's referenced throughout the
+//! program, depending on the shape of arbitrarily-sized call/import graphs. So we follow other
+//! Python type-checkers in making the simplifying assumption that usually the scope will finish
+//! execution before its symbols are made visible to other scopes; for instance, most imports will
+//! import from a complete module, not a partially-executed module. (We may want to get a little
+//! smarter than this in the future, in particular for closures, but for now this is where we
+//! start.)
+//!
+//! So this means that the publicly-visible definitions of a symbol are the definitions still
+//! visible at the end of the scope.
+//!
+//! The data structure we build to answer these two questions is the `UseDefMap`. It has a
+//! `definitions_by_use` vector indexed by [`ScopedUseId`] and a `public_definitions` vector
+//! indexed by [`ScopedSymbolId`]. The values in each of these vectors are (in principle) a list of
+//! visible definitions at that use, or at the end of the scope for that symbol.
+//!
+//! In order to avoid vectors-of-vectors and all the allocations that would entail, we don't
+//! actually store these "list of visible definitions" as a vector of [`Definition`] IDs. Instead,
+//! the values in `definitions_by_use` and `public_definitions` are a [`Definitions`] struct that
+//! keeps a [`Range`] into a third vector of [`Definition`] IDs, `all_definitions`. The trick with
+//! this representation is that it requires that the definitions visible at any given use of a
+//! symbol are stored sequentially in `all_definitions`.
+//!
+//! There is another special kind of possible "definition" for a symbol: it might be unbound in the
+//! scope. (This isn't equivalent to "zero visible definitions", since we may go through an `if`
+//! that has a definition for the symbol, leaving us with one visible definition, but still also
+//! the "unbound" possibility, since we might not have taken the `if` branch.)
+//!
+//! The simplest way to model "unbound" would be as an actual [`Definition`] itself: the initial
+//! visible [`Definition`] for each symbol in a scope. But actually modeling it this way would
+//! dramatically increase the number of [`Definition`] that Salsa must track. Since "unbound" is a
+//! special definition in that all symbols share it, and it doesn't have any additional per-symbol
+//! state, we can represent it more efficiently: we use the `may_be_unbound` boolean on the
+//! [`Definitions`] struct. If this flag is `true`, it means the symbol/use really has one
+//! additional visible "definition", which is the unbound state. If this flag is `false`, it means
+//! we've eliminated the possibility of unbound: every path we've followed includes a definition
+//! for this symbol.
+//!
+//! To build a [`UseDefMap`], the [`UseDefMapBuilder`] is notified of each new use and definition
+//! as they are encountered by the
+//! [`SemanticIndexBuilder`](crate::semantic_index::builder::SemanticIndexBuilder) AST visit. For
+//! each symbol, the builder tracks the currently-visible definitions for that symbol. When we hit
+//! a use of a symbol, it records the currently-visible definitions for that symbol as the visible
+//! definitions for that use. When we reach the end of the scope, it records the currently-visible
+//! definitions for each symbol as the public definitions of that symbol.
+//!
+//! Let's walk through the above example. Initially we record for `x` that it has no visible
+//! definitions, and may be unbound. When we see `x = 1`, we record that as the sole visible
+//! definition of `x`, and flip `may_be_unbound` to `false`. Then we see `x = 2`, and it replaces
+//! `x = 1` as the sole visible definition of `x`. When we get to `y = x`, we record that the
+//! visible definitions for that use of `x` are just the `x = 2` definition.
+//!
+//! Then we hit the `if` branch. We visit the `test` node (`flag` in this case), since that will
+//! happen regardless. Then we take a pre-branch snapshot of the currently visible definitions for
+//! all symbols, which we'll need later. Then we go ahead and visit the `if` body. When we see `x =
+//! 3`, it replaces `x = 2` as the sole visible definition of `x`. At the end of the `if` body, we
+//! take another snapshot of the currently-visible definitions; we'll call this the post-if-body
+//! snapshot.
+//!
+//! Now we need to visit the `else` clause. The conditions when entering the `else` clause should
+//! be the pre-if conditions; if we are entering the `else` clause, we know that the `if` test
+//! failed and we didn't execute the `if` body. So we first reset the builder to the pre-if state,
+//! using the snapshot we took previously (meaning we now have `x = 2` as the sole visible
+//! definition for `x` again), then visit the `else` clause, where `x = 4` replaces `x = 2` as the
+//! sole visible definition of `x`.
+//!
+//! Now we reach the end of the if/else, and want to visit the following code. The state here needs
+//! to reflect that we might have gone through the `if` branch, or we might have gone through the
+//! `else` branch, and we don't know which. So we need to "merge" our current builder state
+//! (reflecting the end-of-else state, with `x = 4` as the only visible definition) with our
+//! post-if-body snapshot (which has `x = 3` as the only visible definition). The result of this
+//! merge is that we now have two visible definitions of `x`: `x = 3` and `x = 4`.
+//!
+//! The [`UseDefMapBuilder`] itself just exposes methods for taking a snapshot, resetting to a
+//! snapshot, and merging a snapshot into the current state. The logic using these methods lives in
+//! [`SemanticIndexBuilder`](crate::semantic_index::builder::SemanticIndexBuilder), e.g. where it
+//! visits a `StmtIf` node.
+//!
+//! (In the future we may have some other questions we want to answer as well, such as "is this
+//! definition used?", which will require tracking a bit more info in our map, e.g. a "used" bit
+//! for each [`Definition`] which is flipped to true when we record that definition for a use.)
 use crate::semantic_index::ast_ids::ScopedUseId;
 use crate::semantic_index::definition::Definition;
 use crate::semantic_index::symbol::ScopedSymbolId;
@@ -8,18 +135,19 @@ use std::ops::Range;
 #[derive(Debug, PartialEq, Eq)]
 pub(crate) struct UseDefMap<'db> {
     // TODO store constraints with definitions for type narrowing
+    /// Definition IDs array for `definitions_by_use` and `public_definitions` to slice into.
     all_definitions: Vec<Definition<'db>>,
 
     /// Definitions that can reach a [`ScopedUseId`].
     definitions_by_use: IndexVec<ScopedUseId, Definitions>,
 
-    /// Definitions of a symbol visible to other scopes.
+    /// Definitions of each symbol visible at end of scope.
     public_definitions: IndexVec<ScopedSymbolId, Definitions>,
 }
 
 impl<'db> UseDefMap<'db> {
     pub(crate) fn use_definitions(&self, use_id: ScopedUseId) -> &[Definition<'db>] {
-        &self.all_definitions[self.definitions_by_use[use_id].definitions.clone()]
+        &self.all_definitions[self.definitions_by_use[use_id].definitions_range.clone()]
     }
 
     pub(crate) fn use_may_be_unbound(&self, use_id: ScopedUseId) -> bool {
@@ -27,7 +155,7 @@ impl<'db> UseDefMap<'db> {
     }
 
     pub(crate) fn public_definitions(&self, symbol: ScopedSymbolId) -> &[Definition<'db>] {
-        &self.all_definitions[self.public_definitions[symbol].definitions.clone()]
+        &self.all_definitions[self.public_definitions[symbol].definitions_range.clone()]
     }
 
     pub(crate) fn public_may_be_unbound(&self, symbol: ScopedSymbolId) -> bool {
@@ -35,32 +163,45 @@ impl<'db> UseDefMap<'db> {
     }
 }
 
+/// Definitions visible for a symbol at a particular use (or end-of-scope).
 #[derive(Clone, Debug, PartialEq, Eq)]
 struct Definitions {
-    definitions: Range<usize>,
+    /// [`Range`] in `all_definitions` of the visible definition IDs.
+    definitions_range: Range<usize>,
+    /// Is the symbol possibly unbound at this point?
     may_be_unbound: bool,
 }
 
-impl Default for Definitions {
-    fn default() -> Self {
+impl Definitions {
+    /// The default state of a symbol is "no definitions, may be unbound", aka definitely-unbound.
+    fn unbound() -> Self {
         Self {
-            definitions: Range::default(),
+            definitions_range: Range::default(),
             may_be_unbound: true,
         }
     }
 }
 
-#[derive(Debug)]
+impl Default for Definitions {
+    fn default() -> Self {
+        Definitions::unbound()
+    }
+}
+
+/// A snapshot of the visible definitions for each symbol at a particular point in control flow.
+#[derive(Clone, Debug)]
 pub(super) struct FlowSnapshot {
     definitions_by_symbol: IndexVec<ScopedSymbolId, Definitions>,
 }
 
 pub(super) struct UseDefMapBuilder<'db> {
+    /// Definition IDs array for `definitions_by_use` and `definitions_by_symbol` to slice into.
     all_definitions: Vec<Definition<'db>>,
 
+    /// Visible definitions at each so-far-recorded use.
     definitions_by_use: IndexVec<ScopedUseId, Definitions>,
 
-    /// builder state: currently visible definitions for each symbol
+    /// Currently visible definitions for each symbol.
     definitions_by_symbol: IndexVec<ScopedSymbolId, Definitions>,
 }
 
@@ -74,7 +215,7 @@ impl<'db> UseDefMapBuilder<'db> {
     }
 
     pub(super) fn add_symbol(&mut self, symbol: ScopedSymbolId) {
-        let new_symbol = self.definitions_by_symbol.push(Definitions::default());
+        let new_symbol = self.definitions_by_symbol.push(Definitions::unbound());
         debug_assert_eq!(symbol, new_symbol);
     }
 
@@ -83,69 +224,106 @@ impl<'db> UseDefMapBuilder<'db> {
         symbol: ScopedSymbolId,
         definition: Definition<'db>,
     ) {
+        // We have a new definition of a symbol; this replaces any previous definitions in this
+        // path.
         let def_idx = self.all_definitions.len();
         self.all_definitions.push(definition);
         self.definitions_by_symbol[symbol] = Definitions {
             #[allow(clippy::range_plus_one)]
-            definitions: def_idx..(def_idx + 1),
+            definitions_range: def_idx..(def_idx + 1),
             may_be_unbound: false,
         };
     }
 
     pub(super) fn record_use(&mut self, symbol: ScopedSymbolId, use_id: ScopedUseId) {
+        // We have a use of a symbol; clone the currently visible definitions for that symbol, and
+        // record them as the visible definitions for this use.
         let new_use = self
             .definitions_by_use
             .push(self.definitions_by_symbol[symbol].clone());
         debug_assert_eq!(use_id, new_use);
     }
 
+    /// Take a snapshot of the current visible-symbols state.
     pub(super) fn snapshot(&self) -> FlowSnapshot {
         FlowSnapshot {
             definitions_by_symbol: self.definitions_by_symbol.clone(),
         }
     }
 
-    pub(super) fn set(&mut self, state: &FlowSnapshot) {
+    /// Restore the current builder visible-definitions state to the given snapshot.
+    pub(super) fn restore(&mut self, snapshot: FlowSnapshot) {
+        // We never remove symbols from `definitions_by_symbol` (its an IndexVec, and the symbol
+        // IDs need to line up), so the current number of recorded symbols must always be equal or
+        // greater than the number of symbols in a previously-recorded snapshot.
         let num_symbols = self.definitions_by_symbol.len();
-        self.definitions_by_symbol = state.definitions_by_symbol.clone();
+        debug_assert!(num_symbols >= snapshot.definitions_by_symbol.len());
+
+        // Restore the current visible-definitions state to the given snapshot.
+        self.definitions_by_symbol = snapshot.definitions_by_symbol;
+
+        // If the snapshot we are restoring is missing some symbols we've recorded since, we need
+        // to fill them in so the symbol IDs continue to line up. Since they don't exist in the
+        // snapshot, the correct state to fill them in with is "unbound", the default.
         self.definitions_by_symbol
-            .resize(num_symbols, Definitions::default());
+            .resize(num_symbols, Definitions::unbound());
     }
 
-    pub(super) fn merge(&mut self, state: &FlowSnapshot) {
-        for (symbol_id, to_merge) in state.definitions_by_symbol.iter_enumerated() {
+    /// Merge the given snapshot into the current state, reflecting that we might have taken either
+    /// path to get here. The new visible-definitions state for each symbol should include
+    /// definitions from both the prior state and the snapshot.
+    #[allow(clippy::needless_pass_by_value)]
+    pub(super) fn merge(&mut self, snapshot: FlowSnapshot) {
+        // The tricky thing about merging two Ranges pointing into `all_definitions` is that if the
+        // two Ranges aren't already adjacent in `all_definitions`, we will have to copy at least
+        // one or the other of the ranges to the end of `all_definitions` so as to make them
+        // adjacent. We can't ever move things around in `all_definitions` because previously
+        // recorded uses may still have ranges pointing to any part of it; all we can do is append.
+        // It's possible we may end up with some old entries in `all_definitions` that nobody is
+        // pointing to, but that's OK.
+
+        for (symbol_id, to_merge) in snapshot.definitions_by_symbol.iter_enumerated() {
             let current = &mut self.definitions_by_symbol[symbol_id];
-            // if the symbol can be unbound in either predecessor, it can be unbound
+
+            // If the symbol can be unbound in either predecessor, it can be unbound post-merge.
             current.may_be_unbound |= to_merge.may_be_unbound;
-            // merge the definition ranges
-            if current.definitions == to_merge.definitions {
-                // ranges already identical, nothing to do!
-            } else if current.definitions.end == to_merge.definitions.start {
-                // ranges adjacent (current first), just merge them
-                current.definitions = (current.definitions.start)..(to_merge.definitions.end);
-            } else if current.definitions.start == to_merge.definitions.end {
-                // ranges adjacent (to_merge first), just merge them
-                current.definitions = (to_merge.definitions.start)..(current.definitions.end);
-            } else if current.definitions.end == self.all_definitions.len() {
-                // ranges not adjacent but current is at end, copy only to_merge
+
+            // Merge the definition ranges.
+            if current.definitions_range == to_merge.definitions_range {
+                // Ranges already identical, nothing to do!
+            } else if current.definitions_range.end == to_merge.definitions_range.start {
+                // Ranges are adjacent (`current` first), just merge them into one range.
+                current.definitions_range =
+                    (current.definitions_range.start)..(to_merge.definitions_range.end);
+            } else if current.definitions_range.start == to_merge.definitions_range.end {
+                // Ranges are adjacent (`to_merge` first), just merge them into one range.
+                current.definitions_range =
+                    (to_merge.definitions_range.start)..(current.definitions_range.end);
+            } else if current.definitions_range.end == self.all_definitions.len() {
+                // Ranges are not adjacent, `current` is at the end of `all_definitions`, we need
+                // to copy `to_merge` to the end so they are adjacent and can be merged into one
+                // range.
                 self.all_definitions
-                    .extend_from_within(to_merge.definitions.clone());
-                current.definitions.end = self.all_definitions.len();
-            } else if to_merge.definitions.end == self.all_definitions.len() {
-                // ranges not adjacent but to_merge is at end, copy only current
+                    .extend_from_within(to_merge.definitions_range.clone());
+                current.definitions_range.end = self.all_definitions.len();
+            } else if to_merge.definitions_range.end == self.all_definitions.len() {
+                // Ranges are not adjacent, `to_merge` is at the end of `all_definitions`, we need
+                // to copy `current` to the end so they are adjacent and can be merged into one
+                // range.
                 self.all_definitions
-                    .extend_from_within(current.definitions.clone());
-                current.definitions.start = to_merge.definitions.start;
-                current.definitions.end = self.all_definitions.len();
+                    .extend_from_within(current.definitions_range.clone());
+                current.definitions_range.start = to_merge.definitions_range.start;
+                current.definitions_range.end = self.all_definitions.len();
             } else {
-                // ranges not adjacent and neither at end, must copy both
+                // Ranges are not adjacent and neither one is at the end of `all_definitions`, we
+                // have to copy both to the end so they are adjacent and we can merge them.
                 let start = self.all_definitions.len();
                 self.all_definitions
-                    .extend_from_within(current.definitions.clone());
+                    .extend_from_within(current.definitions_range.clone());
                 self.all_definitions
-                    .extend_from_within(to_merge.definitions.clone());
-                current.definitions.start = start;
-                current.definitions.end = self.all_definitions.len();
+                    .extend_from_within(to_merge.definitions_range.clone());
+                current.definitions_range.start = start;
+                current.definitions_range.end = self.all_definitions.len();
             }
         }
     }