From e15b9c5572227df15e09b806a7b707ac748cd89f Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Thu, 27 Jul 2023 13:01:56 -0400 Subject: [PATCH] Cache name resolutions in the semantic model (#6047) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary This PR stores the mapping from `ExprName` node to resolved `BindingId`, which lets us skip scope lookups in `resolve_call_path`. It's enabled by #6045, since that PR ensures that when we analyze a node (and thus call `resolve_call_path`), we'll have already visited its `ExprName` elements. In more detail: imagine that we're traversing over `foo.bar()`. When we read `foo`, it will be an `ExprName`, which we'll then resolve to a binding via `handle_node_load`. With this change, we then store that binding in a map. Later, if we call `collect_call_path` on `foo.bar`, we'll identify `foo` (the "head" of the attribute) and grab the resolved binding in that map. _Almost_ all names are now resolved in advance, though it's not a strict requirement, and some rules break that pattern (e.g., if we're analyzing arguments, and they need to inspect their annotations, which are visited in a deferred manner). This improves performance by 4-6% on the all-rules benchmark. It looks like it hurts performance (1-2% drop) in the default-rules benchmark, presumedly because those rules don't call `resolve_call_path` nearly as much, and so we're paying for these extra writes. Here's the benchmark data: ``` linter/default-rules/numpy/globals.py time: [67.270 µs 67.380 µs 67.489 µs] thrpt: [43.720 MiB/s 43.792 MiB/s 43.863 MiB/s] change: time: [+0.4747% +0.7752% +1.0626%] (p = 0.00 < 0.05) thrpt: [-1.0514% -0.7693% -0.4724%] Change within noise threshold. Found 1 outliers among 100 measurements (1.00%) 1 (1.00%) high severe linter/default-rules/pydantic/types.py time: [1.4067 ms 1.4105 ms 1.4146 ms] thrpt: [18.028 MiB/s 18.081 MiB/s 18.129 MiB/s] change: time: [+1.3152% +1.6953% +2.0414%] (p = 0.00 < 0.05) thrpt: [-2.0006% -1.6671% -1.2981%] Performance has regressed. linter/default-rules/numpy/ctypeslib.py time: [637.67 µs 638.96 µs 640.28 µs] thrpt: [26.006 MiB/s 26.060 MiB/s 26.113 MiB/s] change: time: [+1.5859% +1.8109% +2.0353%] (p = 0.00 < 0.05) thrpt: [-1.9947% -1.7787% -1.5611%] Performance has regressed. linter/default-rules/large/dataset.py time: [3.2289 ms 3.2336 ms 3.2383 ms] thrpt: [12.563 MiB/s 12.581 MiB/s 12.599 MiB/s] change: time: [+0.8029% +0.9898% +1.1740%] (p = 0.00 < 0.05) thrpt: [-1.1604% -0.9801% -0.7965%] Change within noise threshold. linter/all-rules/numpy/globals.py time: [134.05 µs 134.15 µs 134.26 µs] thrpt: [21.977 MiB/s 21.995 MiB/s 22.012 MiB/s] change: time: [-4.4571% -4.1175% -3.8268%] (p = 0.00 < 0.05) thrpt: [+3.9791% +4.2943% +4.6651%] Performance has improved. Found 8 outliers among 100 measurements (8.00%) 2 (2.00%) low mild 3 (3.00%) high mild 3 (3.00%) high severe linter/all-rules/pydantic/types.py time: [2.5627 ms 2.5669 ms 2.5720 ms] thrpt: [9.9158 MiB/s 9.9354 MiB/s 9.9516 MiB/s] change: time: [-5.8304% -5.6374% -5.4452%] (p = 0.00 < 0.05) thrpt: [+5.7587% +5.9742% +6.1914%] Performance has improved. Found 7 outliers among 100 measurements (7.00%) 6 (6.00%) high mild 1 (1.00%) high severe linter/all-rules/numpy/ctypeslib.py time: [1.3949 ms 1.3956 ms 1.3964 ms] thrpt: [11.925 MiB/s 11.931 MiB/s 11.937 MiB/s] change: time: [-6.2496% -6.0856% -5.9293%] (p = 0.00 < 0.05) thrpt: [+6.3030% +6.4799% +6.6662%] Performance has improved. Found 7 outliers among 100 measurements (7.00%) 3 (3.00%) high mild 4 (4.00%) high severe linter/all-rules/large/dataset.py time: [5.5951 ms 5.6019 ms 5.6093 ms] thrpt: [7.2527 MiB/s 7.2623 MiB/s 7.2711 MiB/s] change: time: [-5.1781% -4.9783% -4.8070%] (p = 0.00 < 0.05) thrpt: [+5.0497% +5.2391% +5.4608%] Performance has improved. ``` Still playing with this (the concepts need better names, documentation, etc.), but opening up for feedback. --- crates/ruff/src/checkers/ast/mod.rs | 4 +- crates/ruff_python_semantic/src/model.rs | 121 ++++++++++++++++------- 2 files changed, 85 insertions(+), 40 deletions(-) diff --git a/crates/ruff/src/checkers/ast/mod.rs b/crates/ruff/src/checkers/ast/mod.rs index 0deea82933..a538da7287 100644 --- a/crates/ruff/src/checkers/ast/mod.rs +++ b/crates/ruff/src/checkers/ast/mod.rs @@ -1547,10 +1547,10 @@ impl<'a> Checker<'a> { } fn handle_node_load(&mut self, expr: &Expr) { - let Expr::Name(ast::ExprName { id, .. }) = expr else { + let Expr::Name(expr) = expr else { return; }; - self.semantic.resolve_load(id, expr.range()); + self.semantic.resolve_load(expr); } fn handle_node_store(&mut self, id: &'a str, expr: &Expr) { diff --git a/crates/ruff_python_semantic/src/model.rs b/crates/ruff_python_semantic/src/model.rs index 85c61400fd..ee338cd8c1 100644 --- a/crates/ruff_python_semantic/src/model.rs +++ b/crates/ruff_python_semantic/src/model.rs @@ -1,10 +1,10 @@ use std::path::Path; use bitflags::bitflags; -use ruff_python_ast::{Expr, Ranged, Stmt}; -use ruff_text_size::TextRange; +use ruff_python_ast::{self as ast, Expr, Ranged, Stmt}; +use ruff_text_size::{TextRange, TextSize}; use rustc_hash::FxHashMap; -use smallvec::SmallVec; +use smallvec::smallvec; use ruff_python_ast::call_path::{collect_call_path, from_unqualified_name, CallPath}; use ruff_python_ast::helpers::from_relative_import; @@ -118,6 +118,10 @@ pub struct SemanticModel<'a> { /// Exceptions that have been handled by the current scope. pub handled_exceptions: Vec, + + /// Map from [`ast::ExprName`] node (represented as a [`NameId`]) to the [`Binding`] to which + /// it resolved (represented as a [`BindingId`]). + resolved_names: FxHashMap, } impl<'a> SemanticModel<'a> { @@ -141,6 +145,7 @@ impl<'a> SemanticModel<'a> { rebinding_scopes: FxHashMap::default(), flags: SemanticModelFlags::new(path), handled_exceptions: Vec::default(), + resolved_names: FxHashMap::default(), } } @@ -270,29 +275,32 @@ impl<'a> SemanticModel<'a> { } } - /// Resolve a `load` reference to `symbol` at `range`. - pub fn resolve_load(&mut self, symbol: &str, range: TextRange) -> ReadResult { + /// Resolve a `load` reference to an [`ast::ExprName`]. + pub fn resolve_load(&mut self, name: &ast::ExprName) -> ReadResult { // PEP 563 indicates that if a forward reference can be resolved in the module scope, we // should prefer it over local resolutions. if self.in_forward_reference() { - if let Some(binding_id) = self.scopes.global().get(symbol) { + if let Some(binding_id) = self.scopes.global().get(name.id.as_str()) { if !self.bindings[binding_id].is_unbound() { // Mark the binding as used. let reference_id = self.resolved_references - .push(ScopeId::global(), range, self.flags); + .push(ScopeId::global(), name.range, self.flags); self.bindings[binding_id].references.push(reference_id); // Mark any submodule aliases as used. if let Some(binding_id) = - self.resolve_submodule(symbol, ScopeId::global(), binding_id) + self.resolve_submodule(name.id.as_str(), ScopeId::global(), binding_id) { - let reference_id = - self.resolved_references - .push(ScopeId::global(), range, self.flags); + let reference_id = self.resolved_references.push( + ScopeId::global(), + name.range, + self.flags, + ); self.bindings[binding_id].references.push(reference_id); } + self.resolved_names.insert(name.into(), binding_id); return ReadResult::Resolved(binding_id); } } @@ -310,7 +318,7 @@ impl<'a> SemanticModel<'a> { // def __init__(self): // print(__class__) // ``` - if seen_function && matches!(symbol, "__class__") { + if seen_function && matches!(name.id.as_str(), "__class__") { return ReadResult::ImplicitGlobal; } // Do not allow usages of class symbols unless it is the immediate parent, e.g.: @@ -332,18 +340,20 @@ impl<'a> SemanticModel<'a> { } } - if let Some(binding_id) = scope.get(symbol) { + if let Some(binding_id) = scope.get(name.id.as_str()) { // Mark the binding as used. - let reference_id = self - .resolved_references - .push(self.scope_id, range, self.flags); + let reference_id = + self.resolved_references + .push(self.scope_id, name.range, self.flags); self.bindings[binding_id].references.push(reference_id); // Mark any submodule aliases as used. - if let Some(binding_id) = self.resolve_submodule(symbol, scope_id, binding_id) { + if let Some(binding_id) = + self.resolve_submodule(name.id.as_str(), scope_id, binding_id) + { let reference_id = self.resolved_references - .push(self.scope_id, range, self.flags); + .push(self.scope_id, name.range, self.flags); self.bindings[binding_id].references.push(reference_id); } @@ -383,7 +393,7 @@ impl<'a> SemanticModel<'a> { // The `x` in `print(x)` should be treated as unresolved. BindingKind::Deletion | BindingKind::UnboundException(None) => { self.unresolved_references.push( - range, + name.range, self.exceptions(), UnresolvedReferenceFlags::empty(), ); @@ -409,24 +419,30 @@ impl<'a> SemanticModel<'a> { // Mark the binding as used. let reference_id = self.resolved_references - .push(self.scope_id, range, self.flags); + .push(self.scope_id, name.range, self.flags); self.bindings[binding_id].references.push(reference_id); // Mark any submodule aliases as used. if let Some(binding_id) = - self.resolve_submodule(symbol, scope_id, binding_id) + self.resolve_submodule(name.id.as_str(), scope_id, binding_id) { - let reference_id = - self.resolved_references - .push(self.scope_id, range, self.flags); + let reference_id = self.resolved_references.push( + self.scope_id, + name.range, + self.flags, + ); self.bindings[binding_id].references.push(reference_id); } + self.resolved_names.insert(name.into(), binding_id); return ReadResult::Resolved(binding_id); } - // Otherwise, treat it as resolved. - _ => return ReadResult::Resolved(binding_id), + _ => { + // Otherwise, treat it as resolved. + self.resolved_names.insert(name.into(), binding_id); + return ReadResult::Resolved(binding_id); + } } } @@ -446,7 +462,7 @@ impl<'a> SemanticModel<'a> { // print(__qualname__) // ``` if index == 0 && scope.kind.is_class() { - if matches!(symbol, "__module__" | "__qualname__") { + if matches!(name.id.as_str(), "__module__" | "__qualname__") { return ReadResult::ImplicitGlobal; } } @@ -457,14 +473,14 @@ impl<'a> SemanticModel<'a> { if import_starred { self.unresolved_references.push( - range, + name.range, self.exceptions(), UnresolvedReferenceFlags::WILDCARD_IMPORT, ); ReadResult::WildcardImport } else { self.unresolved_references.push( - range, + name.range, self.exceptions(), UnresolvedReferenceFlags::empty(), ); @@ -585,13 +601,30 @@ impl<'a> SemanticModel<'a> { /// /// ...then `resolve_call_path(${python_version})` will resolve to `sys.version_info`. pub fn resolve_call_path(&'a self, value: &'a Expr) -> Option> { - let call_path = collect_call_path(value)?; - let (head, tail) = call_path.split_first()?; - let binding = self.find_binding(head)?; + /// Return the [`ast::ExprName`] at the head of the expression, if any. + const fn match_head(value: &Expr) -> Option<&ast::ExprName> { + match value { + Expr::Attribute(ast::ExprAttribute { value, .. }) => match_head(value), + Expr::Name(name) => Some(name), + _ => None, + } + } + + // If the name was already resolved, look it up; otherwise, search for the symbol. + let head = match_head(value)?; + let binding = self + .resolved_names + .get(&head.into()) + .map(|id| self.binding(*id)) + .or_else(|| self.find_binding(&head.id))?; + match &binding.kind { BindingKind::Import(Import { qualified_name: name, }) => { + let call_path = collect_call_path(value)?; + let (_, tail) = call_path.split_first()?; + let mut source_path: CallPath = from_unqualified_name(name); source_path.extend_from_slice(tail); Some(source_path) @@ -599,6 +632,9 @@ impl<'a> SemanticModel<'a> { BindingKind::SubmoduleImport(SubmoduleImport { qualified_name: name, }) => { + let call_path = collect_call_path(value)?; + let (_, tail) = call_path.split_first()?; + let name = name.split('.').next().unwrap_or(name); let mut source_path: CallPath = from_unqualified_name(name); source_path.extend_from_slice(tail); @@ -607,6 +643,9 @@ impl<'a> SemanticModel<'a> { BindingKind::FromImport(FromImport { qualified_name: name, }) => { + let call_path = collect_call_path(value)?; + let (_, tail) = call_path.split_first()?; + if name.starts_with('.') { let mut source_path = from_relative_import(self.module_path?, name); if source_path.is_empty() { @@ -621,12 +660,7 @@ impl<'a> SemanticModel<'a> { Some(source_path) } } - BindingKind::Builtin => { - let mut source_path: CallPath = SmallVec::with_capacity(1 + call_path.len()); - source_path.push(""); - source_path.extend_from_slice(call_path.as_slice()); - Some(source_path) - } + BindingKind::Builtin => Some(smallvec!["", head.id.as_str()]), _ => None, } } @@ -1504,3 +1538,14 @@ impl Ranged for ImportedName { self.range } } + +/// A unique identifier for an [`ast::ExprName`]. No two names can even appear at the same location +/// in the source code, so the starting offset is a cheap and sufficient unique identifier. +#[derive(Debug, Hash, PartialEq, Eq)] +struct NameId(TextSize); + +impl From<&ast::ExprName> for NameId { + fn from(name: &ast::ExprName) -> Self { + Self(name.start()) + } +}