From a825b2db068fd6e854fe602c03ede93393da89e3 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Thu, 7 Dec 2023 00:02:46 -0500 Subject: [PATCH] Shard the registry cache by package (#583) ## Summary This PR modifies the cache structure in a few ways. Most notably, we now shard the set of registry wheels by package, and index them lazily when computing the install plan. This applies both to built wheels: Screen Shot 2023-12-06 at 4 42 19 PM And remote wheels: Screen Shot 2023-12-06 at 4 42 30 PM For other distributions, we now consistently cache using the package name, which is really just for clarity and debuggability (we could consider omitting these): Screen Shot 2023-12-06 at 4 58 30 PM Obliquely closes https://github.com/astral-sh/puffin/issues/575. --- crates/puffin-cache/src/lib.rs | 124 ++++++++++++------ crates/puffin-cache/src/wheel.rs | 6 +- crates/puffin-client/src/registry_client.rs | 10 +- .../src/distribution_database.rs | 12 +- .../src/index/built_wheel_index.rs | 42 ++---- .../src/index/registry_wheel_index.rs | 107 +++++++++------ crates/puffin-distribution/src/source_dist.rs | 36 +++-- crates/puffin-installer/src/plan.rs | 53 ++++---- crates/puffin-normalize/src/package_name.rs | 6 +- 9 files changed, 227 insertions(+), 169 deletions(-) diff --git a/crates/puffin-cache/src/lib.rs b/crates/puffin-cache/src/lib.rs index 216b6573a..196d50c82 100644 --- a/crates/puffin-cache/src/lib.rs +++ b/crates/puffin-cache/src/lib.rs @@ -1,6 +1,7 @@ use std::fmt::{Display, Formatter}; use std::io; use std::io::Write; +use std::ops::Deref; use std::path::{Path, PathBuf}; use std::sync::Arc; @@ -37,8 +38,32 @@ impl CacheEntry { } #[must_use] - pub fn with_file(self, file: String) -> Self { - Self { file, ..self } + pub fn with_file(self, file: impl Into) -> Self { + Self { + file: file.into(), + ..self + } + } +} + +/// A subdirectory within the cache. +#[derive(Debug, Clone)] +pub struct CacheShard(PathBuf); + +impl CacheShard { + pub fn entry(&self, file: impl Into) -> CacheEntry { + CacheEntry { + dir: self.0.clone(), + file: file.into(), + } + } +} + +impl Deref for CacheShard { + type Target = Path; + + fn deref(&self) -> &Self::Target { + &self.0 } } @@ -82,6 +107,11 @@ impl Cache { self.root.join(cache_bucket.to_str()) } + /// Compute an entry in the cache. + pub fn shard(&self, cache_bucket: CacheBucket, dir: impl AsRef) -> CacheShard { + CacheShard(self.bucket(cache_bucket).join(dir.as_ref())) + } + /// Compute an entry in the cache. pub fn entry( &self, @@ -120,17 +150,17 @@ impl Cache { /// are subdirectories of the cache root. #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)] pub enum CacheBucket { - /// Wheels (excluding built wheels), their metadata and cache policy. + /// Wheels (excluding built wheels), alongside their metadata and cache policy. /// /// There are three kinds from cache entries: Wheel metadata and policy as JSON files, the /// wheels themselves, and the unzipped wheel archives. If a wheel file is over an in-memory /// size threshold, we first download the zip file into the cache, then unzip it into a - /// directory with the same name, omitting the `.whl` extension. + /// directory with the same name (exclusive of the `.whl` extension). /// /// Cache structure: - /// * `wheel-metadata-v0/pypi/{foo-1.0.0-py3-none-any.json, foo-1.0.0-py3-none-any.whl}` - /// * `wheel-metadata-v0//{foo-1.0.0-py3-none-any.json, foo-1.0.0-py3-none-any.whl}` - /// * `wheel-metadata-v0/url//{foo-1.0.0-py3-none-any.json, foo-1.0.0-py3-none-any.whl}` + /// * `wheel-metadata-v0/pypi/foo/{foo-1.0.0-py3-none-any.json, foo-1.0.0-py3-none-any.whl}` + /// * `wheel-metadata-v0//foo/{foo-1.0.0-py3-none-any.json, foo-1.0.0-py3-none-any.whl}` + /// * `wheel-metadata-v0/url//foo/{foo-1.0.0-py3-none-any.json, foo-1.0.0-py3-none-any.whl}` /// /// See `puffin_client::RegistryClient::wheel_metadata` for information on how wheel metadata /// is fetched. @@ -151,11 +181,13 @@ pub enum CacheBucket { /// wheel-v0 /// ├── pypi /// │ ... - /// │ ├── pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.json + /// │ ├── pandas + /// │ │ └── pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.json /// │ ... /// └── url /// └── 4b8be67c801a7ecb - /// └── flask-3.0.0-py3-none-any.json + /// └── flask + /// └── flask-3.0.0-py3-none-any.json /// ``` /// /// We get the following `requirement.txt` from `pip-compile`: @@ -176,16 +208,18 @@ pub enum CacheBucket { /// wheel-v0 /// ├── pypi /// │ ... - /// │ ├── pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - /// │ ├── pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64 + /// │ ├── pandas + /// │ │ ├── pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + /// │ │ ├── pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64 /// │ ... /// └── url /// └── 4b8be67c801a7ecb - /// └── flask-3.0.0-py3-none-any.whl - /// ├── flask - /// │ └── ... - /// └── flask-3.0.0.dist-info - /// └── ... + /// └── flask + /// └── flask-3.0.0-py3-none-any.whl + /// ├── flask + /// │ └── ... + /// └── flask-3.0.0.dist-info + /// └── ... /// ``` /// /// If we run first `pip-compile` and then `pip-sync` on the same machine, we get both: @@ -194,24 +228,26 @@ pub enum CacheBucket { /// wheels-v0 /// ├── pypi /// │ ├── ... - /// │ ├── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.json - /// │ ├── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - /// │ ├── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64 - /// │ │ ├── pandas - /// │ │ │ ├── ... - /// │ │ ├── pandas-2.1.3.dist-info - /// │ │ │ ├── ... - /// │ │ └── pandas.libs + /// │ ├── pandas + /// │ │ ├── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.json + /// │ │ ├── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + /// │ │ └── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64 + /// │ │ ├── pandas + /// │ │ │ ├── ... + /// │ │ ├── pandas-2.1.3.dist-info + /// │ │ │ ├── ... + /// │ │ └── pandas.libs /// │ ├── ... /// └── url /// └── 4b8be67c801a7ecb - /// ├── flask-3.0.0-py3-none-any.json - /// ├── flask-3.0.0-py3-none-any.json - /// └── flask-3.0.0-py3-none-any - /// ├── flask - /// │ └── ... - /// └── flask-3.0.0.dist-info - /// └── ... + /// └── flask + /// ├── flask-3.0.0-py3-none-any.json + /// ├── flask-3.0.0-py3-none-any.json + /// └── flask-3.0.0-py3-none-any + /// ├── flask + /// │ └── ... + /// └── flask-3.0.0.dist-info + /// └── ... Wheels, /// Wheels built from source distributions, their extracted metadata and the cache policy of /// the source distribution. @@ -225,14 +261,14 @@ pub enum CacheBucket { /// /// Source distributions are built into zipped wheel files (as PEP 517 specifies) and unzipped /// lazily before installing. So when resolving, we only build the wheel and store the archive - /// file in the cache, when installing, we unpack it under the same name, omitting the `.whl` - /// extension. You may find a mix of wheel archive zip files and unzipped wheel directories in - /// the cache. + /// file in the cache, when installing, we unpack it under the same name (exclusive of the + /// `.whl` extension). You may find a mix of wheel archive zip files and unzipped wheel + /// directories in the cache. /// /// Cache structure: - /// * `built-wheels-v0/pypi/foo-1.0.0.zip/{metadata.json, foo-1.0.0-py3-none-any.whl, ...other wheels}` - /// * `built-wheels-v0//foo-1.0.0.zip/{metadata.json, foo-1.0.0-py3-none-any.whl, ...other wheels}` - /// * `built-wheels-v0/url//foo-1.0.0.zip/{metadata.json, foo-1.0.0-py3-none-any.whl, ...other wheels}` + /// * `built-wheels-v0/pypi/foo/foo-1.0.0.zip/{metadata.json, foo-1.0.0-py3-none-any.whl, ...other wheels}` + /// * `built-wheels-v0//foo/foo-1.0.0.zip/{metadata.json, foo-1.0.0-py3-none-any.whl, ...other wheels}` + /// * `built-wheels-v0/url//foo/foo-1.0.0.zip/{metadata.json, foo-1.0.0-py3-none-any.whl, ...other wheels}` /// * `built-wheels-v0/git///foo-1.0.0.zip/{metadata.json, foo-1.0.0-py3-none-any.whl, ...other wheels}` /// /// But the url filename does not need to be a valid source dist filename @@ -261,14 +297,16 @@ pub enum CacheBucket { /// │ ├── metadata.json /// │ └── pydantic_extra_types-2.1.0-py3-none-any.whl /// ├── pypi - /// │ └── django-allauth-0.51.0.tar.gz - /// │ ├── django_allauth-0.51.0-py3-none-any.whl - /// │ └── metadata.json + /// │ └── django + /// │ └── django-allauth-0.51.0.tar.gz + /// │ ├── django_allauth-0.51.0-py3-none-any.whl + /// │ └── metadata.json /// └── url /// └── 6781bd6440ae72c2 - /// └── werkzeug-3.0.1.tar.gz - /// ├── metadata.json - /// └── werkzeug-3.0.1-py3-none-any.whl + /// └── werkzeug + /// └── werkzeug-3.0.1.tar.gz + /// ├── metadata.json + /// └── werkzeug-3.0.1-py3-none-any.whl /// ``` /// /// The inside of a `metadata.json`: diff --git a/crates/puffin-cache/src/wheel.rs b/crates/puffin-cache/src/wheel.rs index d336b51b7..e3312e83b 100644 --- a/crates/puffin-cache/src/wheel.rs +++ b/crates/puffin-cache/src/wheel.rs @@ -10,7 +10,7 @@ use crate::{digest, CanonicalUrl}; /// Cache wheels and their metadata, both from remote wheels and built from source distributions. /// -/// Use [`WheelCache::wheel_dir`] for remote wheel metadata caching and +/// Use [`WheelCache::remote_wheel_dir`] for remote wheel metadata caching and /// [`WheelCache::built_wheel_dir`] for built source distributions metadata caching. pub enum WheelCache<'a> { /// Either pypi or an alternative index, which we key by index URL. @@ -38,8 +38,8 @@ impl<'a> WheelCache<'a> { } /// Metadata of a remote wheel. See [`CacheBucket::Wheels`] - pub fn wheel_dir(&self) -> PathBuf { - self.bucket() + pub fn remote_wheel_dir(&self, package_name: impl AsRef) -> PathBuf { + self.bucket().join(package_name) } /// Metadata of a built source distribution. See [`CacheBucket::BuiltWheels`] diff --git a/crates/puffin-client/src/registry_client.rs b/crates/puffin-client/src/registry_client.rs index f6d53a0e3..cbd744fc9 100644 --- a/crates/puffin-client/src/registry_client.rs +++ b/crates/puffin-client/src/registry_client.rs @@ -241,7 +241,7 @@ impl RegistryClient { let cache_entry = self.cache.entry( CacheBucket::Wheels, - WheelCache::Index(&index).wheel_dir(), + WheelCache::Index(&index).remote_wheel_dir(filename.name.as_ref()), format!("{}.json", filename.stem()), ); @@ -276,7 +276,7 @@ impl RegistryClient { let cache_entry = self.cache.entry( CacheBucket::Wheels, - cache_shard.wheel_dir(), + cache_shard.remote_wheel_dir(filename.name.as_ref()), format!("{}.json", filename.stem()), ); @@ -311,13 +311,13 @@ impl RegistryClient { // The range request version failed (this is bad, the webserver should support this), fall // back to downloading the entire file and the reading the file from the zip the regular way - debug!("Range requests not supported for {filename}, downloading whole wheel"); + debug!("Range requests not supported for {filename}; downloading wheel"); // TODO(konstin): Download the wheel into a cache shared with the installer instead // Note that this branch is only hit when you're not using and the server where // you host your wheels for some reasons doesn't support range requests // (tbh we should probably warn here and tell users to get a better registry because - // their current one makes resolution unnecessary slow) - let temp_download = tempfile_in(cache_shard.wheel_dir()).map_err(Error::CacheWrite)?; + // their current one makes resolution unnecessary slow). + let temp_download = tempfile_in(self.cache.root()).map_err(Error::CacheWrite)?; let mut writer = BufWriter::new(tokio::fs::File::from_std(temp_download)); let mut reader = self.stream_external(url).await?.compat(); tokio::io::copy(&mut reader, &mut writer) diff --git a/crates/puffin-distribution/src/distribution_database.rs b/crates/puffin-distribution/src/distribution_database.rs index c25c897c8..61c1b8847 100644 --- a/crates/puffin-distribution/src/distribution_database.rs +++ b/crates/puffin-distribution/src/distribution_database.rs @@ -15,7 +15,7 @@ use url::Url; use distribution_filename::{WheelFilename, WheelFilenameError}; use distribution_types::direct_url::DirectGitUrl; -use distribution_types::{BuiltDist, Dist, RemoteSource, SourceDist}; +use distribution_types::{BuiltDist, Dist, Metadata, RemoteSource, SourceDist}; use platform_tags::Tags; use puffin_cache::{Cache, CacheBucket, WheelCache}; use puffin_client::RegistryClient; @@ -171,7 +171,7 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context> let cache_entry = self.cache.entry( CacheBucket::Wheels, - WheelCache::Index(&wheel.index).wheel_dir(), + WheelCache::Index(&wheel.index).remote_wheel_dir(filename.name.as_ref()), filename.stem(), ); @@ -193,12 +193,11 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context> let cache_entry = self.cache.entry( CacheBucket::Wheels, - WheelCache::Index(&wheel.index).wheel_dir(), + WheelCache::Index(&wheel.index).remote_wheel_dir(filename.name.as_ref()), filename.to_string(), ); // Download the wheel into the cache. - // TODO(charlie): Use an atomic write, and remove any existing files or directories. fs::create_dir_all(&cache_entry.dir).await?; let mut writer = fs::File::create(cache_entry.path()).await?; tokio::io::copy(&mut reader.compat(), &mut writer).await?; @@ -225,10 +224,9 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context> let reader = self.client.stream_external(&wheel.url).await?; // Download the wheel into the cache. - // TODO(charlie): Use an atomic write, and remove any existing files or directories. let cache_entry = self.cache.entry( CacheBucket::Wheels, - WheelCache::Url(&wheel.url).wheel_dir(), + WheelCache::Url(&wheel.url).remote_wheel_dir(wheel.name().as_ref()), wheel.filename.to_string(), ); fs::create_dir_all(&cache_entry.dir).await?; @@ -252,7 +250,7 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context> Dist::Built(BuiltDist::Path(wheel)) => { let cache_entry = self.cache.entry( CacheBucket::Wheels, - WheelCache::Url(&wheel.url).wheel_dir(), + WheelCache::Url(&wheel.url).remote_wheel_dir(wheel.name().as_ref()), wheel.filename.stem(), ); diff --git a/crates/puffin-distribution/src/index/built_wheel_index.rs b/crates/puffin-distribution/src/index/built_wheel_index.rs index c39f08091..3b52c8340 100644 --- a/crates/puffin-distribution/src/index/built_wheel_index.rs +++ b/crates/puffin-distribution/src/index/built_wheel_index.rs @@ -1,24 +1,22 @@ -use std::path::PathBuf; - use fs_err as fs; use tracing::warn; use distribution_types::CachedWheel; use platform_tags::Tags; +use puffin_cache::CacheShard; use crate::index::iter_directories; /// A local index of built distributions for a specific source distribution. -#[derive(Debug)] -pub struct BuiltWheelIndex<'a> { - directory: PathBuf, - tags: &'a Tags, -} +pub struct BuiltWheelIndex; -impl<'a> BuiltWheelIndex<'a> { - /// Create a new index of built distributions. +impl BuiltWheelIndex { + /// Find the "best" distribution in the index for a given source distribution. /// - /// The `directory` should be the directory containing the built distributions for a specific + /// This lookup prefers newer versions over older versions, and aims to maximize compatibility + /// with the target platform. + /// + /// The `shard` should point to a directory containing the built distributions for a specific /// source distribution. For example, given the built wheel cache structure: /// ```text /// built-wheels-v0/ @@ -28,27 +26,16 @@ impl<'a> BuiltWheelIndex<'a> { /// └── metadata.json /// ``` /// - /// The `directory` should be `built-wheels-v0/pypi/django-allauth-0.51.0.tar.gz`. - pub fn new(directory: impl Into, tags: &'a Tags) -> Self { - Self { - directory: directory.into(), - tags, - } - } - - /// Find the "best" distribution in the index. - /// - /// This lookup prefers newer versions over older versions, and aims to maximize compatibility - /// with the target platform. - pub fn find(&self) -> Option { + /// The `shard` should be `built-wheels-v0/pypi/django-allauth-0.51.0.tar.gz`. + pub fn find(shard: &CacheShard, tags: &Tags) -> Option { let mut candidate: Option = None; - for subdir in iter_directories(self.directory.read_dir().ok()?) { + for subdir in iter_directories(shard.read_dir().ok()?) { match CachedWheel::from_path(&subdir) { Ok(None) => {} Ok(Some(dist_info)) => { // Pick the wheel with the highest priority - let compatibility = dist_info.filename.compatibility(self.tags); + let compatibility = dist_info.filename.compatibility(tags); // Only consider wheels that are compatible with our tags. if compatibility.is_none() { @@ -62,7 +49,7 @@ impl<'a> BuiltWheelIndex<'a> { if let Some(existing) = candidate.as_ref() { // Override if the wheel is newer, or "more" compatible. if dist_info.filename.version > existing.filename.version - || compatibility > existing.filename.compatibility(self.tags) + || compatibility > existing.filename.compatibility(tags) { candidate = Some(dist_info); } @@ -75,8 +62,7 @@ impl<'a> BuiltWheelIndex<'a> { "Invalid cache entry at {}, removing. {err}", subdir.display() ); - let result = fs::remove_dir_all(&subdir); - if let Err(err) = result { + if let Err(err) = fs::remove_dir_all(&subdir) { warn!( "Failed to remove invalid cache entry at {}: {err}", subdir.display() diff --git a/crates/puffin-distribution/src/index/registry_wheel_index.rs b/crates/puffin-distribution/src/index/registry_wheel_index.rs index f7593435c..31af9489b 100644 --- a/crates/puffin-distribution/src/index/registry_wheel_index.rs +++ b/crates/puffin-distribution/src/index/registry_wheel_index.rs @@ -1,11 +1,13 @@ +use std::collections::hash_map::Entry; use std::collections::BTreeMap; + use std::path::Path; use fs_err as fs; use fxhash::FxHashMap; use tracing::warn; -use distribution_types::{CachedRegistryDist, CachedWheel, Metadata}; +use distribution_types::{CachedRegistryDist, CachedWheel}; use pep440_rs::Version; use platform_tags::Tags; use puffin_cache::{Cache, CacheBucket, WheelCache}; @@ -15,55 +17,87 @@ use pypi_types::IndexUrls; use crate::index::iter_directories; /// A local index of distributions that originate from a registry, like `PyPI`. -#[derive(Debug, Default)] -pub struct RegistryWheelIndex(FxHashMap>); +#[derive(Debug)] +pub struct RegistryWheelIndex<'a> { + cache: &'a Cache, + tags: &'a Tags, + index_urls: &'a IndexUrls, + index: FxHashMap>, +} -impl RegistryWheelIndex { - /// Build an index of cached distributions from a directory. - pub fn from_directory(cache: &Cache, tags: &Tags, index_urls: &IndexUrls) -> Self { - let mut index = Self::default(); +impl<'a> RegistryWheelIndex<'a> { + /// Initialize an index of cached distributions from a directory. + pub fn new(cache: &'a Cache, tags: &'a Tags, index_urls: &'a IndexUrls) -> Self { + Self { + cache, + tags, + index_urls, + index: FxHashMap::default(), + } + } + + /// Return an iterator over available wheels for a given package. + /// + /// If the package is not yet indexed, this will index the package by reading from the cache. + pub fn get( + &mut self, + name: &PackageName, + ) -> impl Iterator { + let versions = match self.index.entry(name.clone()) { + Entry::Occupied(entry) => entry.into_mut(), + Entry::Vacant(entry) => { + entry.insert(Self::index(name, self.cache, self.tags, self.index_urls)) + } + }; + versions.iter().rev() + } + + /// Add a package to the index by reading from the cache. + fn index( + package: &PackageName, + cache: &Cache, + tags: &Tags, + index_urls: &IndexUrls, + ) -> BTreeMap { + let mut versions = BTreeMap::new(); for index_url in index_urls { // Index all the wheels that were downloaded directly from the registry. - // TODO(charlie): Shard the cache by package name, and do this lazily. - let wheel_dir = cache - .bucket(CacheBucket::Wheels) - .join(WheelCache::Index(index_url).wheel_dir()); + let wheel_dir = cache.shard( + CacheBucket::Wheels, + WheelCache::Index(index_url).remote_wheel_dir(package.to_string()), + ); - index.add_directory(wheel_dir, tags); + Self::add_directory(&*wheel_dir, tags, &mut versions); // Index all the built wheels, created by downloading and building source distributions // from the registry. - // TODO(charlie): Shard the cache by package name, and do this lazily. - let built_wheel_dir = cache - .bucket(CacheBucket::BuiltWheels) - .join(WheelCache::Index(index_url).wheel_dir()); + let built_wheel_dir = cache.shard( + CacheBucket::BuiltWheels, + WheelCache::Index(index_url).built_wheel_dir(package.to_string()), + ); + // Built wheels have one more level of indirection, as they are keyed by the source + // distribution filename. let Ok(read_dir) = built_wheel_dir.read_dir() else { continue; }; for subdir in iter_directories(read_dir) { - index.add_directory(subdir, tags); + Self::add_directory(subdir, tags, &mut versions); } } - index - } - - /// Returns a distribution from the index, if it exists. - pub fn by_name( - &self, - name: &PackageName, - ) -> impl Iterator { - // Using static to extend the lifetime - static DEFAULT_MAP: BTreeMap = BTreeMap::new(); - self.0.get(name).unwrap_or(&DEFAULT_MAP).iter().rev() + versions } /// Add the wheels in a given directory to the index. /// /// Each subdirectory in the given path is expected to be that of an unzipped wheel. - fn add_directory(&mut self, path: impl AsRef, tags: &Tags) { + fn add_directory( + path: impl AsRef, + tags: &Tags, + versions: &mut BTreeMap, + ) { let Ok(read_dir) = path.as_ref().read_dir() else { return; }; @@ -76,20 +110,13 @@ impl RegistryWheelIndex { // Pick the wheel with the highest priority let compatibility = dist_info.filename.compatibility(tags); - if let Some(existing) = self - .0 - .get_mut(dist_info.name()) - .and_then(|package| package.get_mut(&dist_info.filename.version)) - { + if let Some(existing) = versions.get_mut(&dist_info.filename.version) { // Override if we have better compatibility if compatibility > existing.filename.compatibility(tags) { *existing = dist_info; } } else if compatibility.is_some() { - self.0 - .entry(dist_info.name().clone()) - .or_default() - .insert(dist_info.filename.version.clone(), dist_info); + versions.insert(dist_info.filename.version.clone(), dist_info); } } Err(err) => { @@ -97,8 +124,8 @@ impl RegistryWheelIndex { "Invalid cache entry at {}, removing. {err}", wheel_dir.display() ); - let result = fs::remove_dir_all(&wheel_dir); - if let Err(err) = result { + + if let Err(err) = fs::remove_dir_all(&wheel_dir) { warn!( "Failed to remove invalid cache entry at {}: {err}", wheel_dir.display() diff --git a/crates/puffin-distribution/src/source_dist.rs b/crates/puffin-distribution/src/source_dist.rs index f6e4d55ec..6402792e4 100644 --- a/crates/puffin-distribution/src/source_dist.rs +++ b/crates/puffin-distribution/src/source_dist.rs @@ -24,7 +24,9 @@ use distribution_types::direct_url::{DirectArchiveUrl, DirectGitUrl}; use distribution_types::{GitSourceDist, Metadata, PathSourceDist, RemoteSource, SourceDist}; use install_wheel_rs::read_dist_info; use platform_tags::Tags; -use puffin_cache::{digest, CacheBucket, CacheEntry, CachedByTimestamp, CanonicalUrl, WheelCache}; +use puffin_cache::{ + digest, CacheBucket, CacheEntry, CacheShard, CachedByTimestamp, CanonicalUrl, WheelCache, +}; use puffin_client::{CachedClient, CachedClientError, DataWithCachePolicy}; use puffin_fs::write_atomic; use puffin_git::{Fetch, GitSource}; @@ -161,11 +163,17 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { let DirectArchiveUrl { url, subdirectory } = DirectArchiveUrl::from(&direct_url_source_dist.url); + // For direct URLs, cache directly under the hash of the URL itself. + let cache_shard = self.build_context.cache().shard( + CacheBucket::BuiltWheels, + WheelCache::Url(&url).remote_wheel_dir(direct_url_source_dist.name().as_ref()), + ); + self.url( source_dist, filename, &url, - WheelCache::Url(&url), + cache_shard, subdirectory.as_deref(), ) .await? @@ -174,11 +182,21 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { let url = Url::parse(®istry_source_dist.file.url).map_err(|err| { SourceDistError::UrlParse(registry_source_dist.file.url.clone(), err) })?; + + // For registry source distributions, shard by distribution, then by filename. + // Ex) `pypi/requests/requests-2.25.1.tar.gz` + let cache_shard = self.build_context.cache().shard( + CacheBucket::BuiltWheels, + WheelCache::Index(®istry_source_dist.index) + .remote_wheel_dir(registry_source_dist.name.as_ref()) + .join(®istry_source_dist.file.filename), + ); + self.url( source_dist, ®istry_source_dist.file.filename, &url, - WheelCache::Index(®istry_source_dist.index), + cache_shard, None, ) .await? @@ -197,14 +215,10 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { source_dist: &'data SourceDist, filename: &'data str, url: &'data Url, - cache_shard: WheelCache<'data>, + cache_shard: CacheShard, subdirectory: Option<&'data Path>, ) -> Result { - let cache_entry = self.build_context.cache().entry( - CacheBucket::BuiltWheels, - cache_shard.built_wheel_dir(filename), - METADATA_JSON.to_string(), - ); + let cache_entry = cache_shard.entry(METADATA_JSON.to_string()); let response_callback = |response| async { // New or changed source distribution, delete all built wheels @@ -345,10 +359,10 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { source_dist: &SourceDist, path_source_dist: &PathSourceDist, ) -> Result { - let cache_shard = WheelCache::Path(&path_source_dist.url); let cache_entry = self.build_context.cache().entry( CacheBucket::BuiltWheels, - cache_shard.built_wheel_dir(source_dist.name().to_string()), + WheelCache::Path(&path_source_dist.url) + .remote_wheel_dir(path_source_dist.name().as_ref()), METADATA_JSON.to_string(), ); diff --git a/crates/puffin-installer/src/plan.rs b/crates/puffin-installer/src/plan.rs index 5bd35ec05..826de3b4e 100644 --- a/crates/puffin-installer/src/plan.rs +++ b/crates/puffin-installer/src/plan.rs @@ -3,8 +3,7 @@ use tracing::debug; use distribution_types::direct_url::{git_reference, DirectUrl}; use distribution_types::{ - BuiltDist, CachedDirectUrlDist, CachedDist, Dist, InstalledDist, Metadata, RemoteSource, - SourceDist, + BuiltDist, CachedDirectUrlDist, CachedDist, Dist, InstalledDist, Metadata, SourceDist, }; use pep508_rs::{Requirement, VersionOrUrl}; use platform_tags::Tags; @@ -45,7 +44,7 @@ impl InstallPlan { SitePackages::try_from_executable(venv).context("Failed to list installed packages")?; // Index all the already-downloaded wheels in the cache. - let registry_index = RegistryWheelIndex::from_directory(cache, tags, index_urls); + let mut registry_index = RegistryWheelIndex::new(cache, tags, index_urls); let mut local = vec![]; let mut remote = vec![]; @@ -89,9 +88,8 @@ impl InstallPlan { // Identify any locally-available distributions that satisfy the requirement. match requirement.version_or_url.as_ref() { None => { - // TODO(charlie): This doesn't respect built wheels. if let Some((_version, distribution)) = - registry_index.by_name(&requirement.name).next() + registry_index.get(&requirement.name).next() { debug!("Requirement already cached: {distribution}"); local.push(CachedDist::Registry(distribution.clone())); @@ -99,12 +97,16 @@ impl InstallPlan { } } Some(VersionOrUrl::VersionSpecifier(specifier)) => { - if let Some((_version, distribution)) = registry_index - .by_name(&requirement.name) - .find(|(version, dist)| { - specifier.contains(version) - && requirement.is_satisfied_by(&dist.filename.version) - }) + if let Some(distribution) = + registry_index + .get(&requirement.name) + .find_map(|(version, distribution)| { + if specifier.contains(version) { + Some(distribution) + } else { + None + } + }) { debug!("Requirement already cached: {distribution}"); local.push(CachedDist::Registry(distribution.clone())); @@ -124,7 +126,7 @@ impl InstallPlan { // advance. let cache_entry = cache.entry( CacheBucket::Wheels, - WheelCache::Url(&wheel.url).wheel_dir(), + WheelCache::Url(&wheel.url).remote_wheel_dir(wheel.name().as_ref()), wheel.filename.stem(), ); @@ -145,7 +147,7 @@ impl InstallPlan { // advance. let cache_entry = cache.entry( CacheBucket::Wheels, - WheelCache::Url(&wheel.url).wheel_dir(), + WheelCache::Url(&wheel.url).remote_wheel_dir(wheel.name().as_ref()), wheel.filename.stem(), ); @@ -164,14 +166,12 @@ impl InstallPlan { Dist::Source(SourceDist::DirectUrl(sdist)) => { // Find the most-compatible wheel from the cache, since we don't know // the filename in advance. - let cache_entry = cache.entry( + let cache_shard = cache.shard( CacheBucket::BuiltWheels, - WheelCache::Url(&sdist.url).wheel_dir(), - sdist.filename()?.to_string(), + WheelCache::Url(&sdist.url).remote_wheel_dir(sdist.name().as_ref()), ); - let index = BuiltWheelIndex::new(cache_entry.path(), tags); - if let Some(wheel) = index.find() { + if let Some(wheel) = BuiltWheelIndex::find(&cache_shard, tags) { let cached_dist = wheel.into_url_dist(url.clone()); debug!("URL source requirement already cached: {cached_dist}"); local.push(CachedDist::Url(cached_dist.clone())); @@ -181,14 +181,13 @@ impl InstallPlan { Dist::Source(SourceDist::Path(sdist)) => { // Find the most-compatible wheel from the cache, since we don't know // the filename in advance. - let cache_entry = cache.entry( + let cache_shard = cache.shard( CacheBucket::BuiltWheels, - WheelCache::Path(&sdist.url).wheel_dir(), - sdist.name().to_string(), + WheelCache::Path(&sdist.url) + .remote_wheel_dir(sdist.name().as_ref()), ); - let index = BuiltWheelIndex::new(cache_entry.path(), tags); - if let Some(wheel) = index.find() { + if let Some(wheel) = BuiltWheelIndex::find(&cache_shard, tags) { let cached_dist = wheel.into_url_dist(url.clone()); debug!("Path source requirement already cached: {cached_dist}"); local.push(CachedDist::Url(cached_dist.clone())); @@ -199,14 +198,12 @@ impl InstallPlan { // Find the most-compatible wheel from the cache, since we don't know // the filename in advance. if let Ok(Some(reference)) = git_reference(&sdist.url) { - let cache_entry = cache.entry( + let cache_shard = cache.shard( CacheBucket::BuiltWheels, - WheelCache::Git(&sdist.url).wheel_dir(), - reference.to_string(), + WheelCache::Git(&sdist.url).built_wheel_dir(reference), ); - let index = BuiltWheelIndex::new(cache_entry.path(), tags); - if let Some(wheel) = index.find() { + if let Some(wheel) = BuiltWheelIndex::find(&cache_shard, tags) { let cached_dist = wheel.into_url_dist(url.clone()); debug!("Git source requirement already cached: {cached_dist}"); local.push(CachedDist::Url(cached_dist.clone())); diff --git a/crates/puffin-normalize/src/package_name.rs b/crates/puffin-normalize/src/package_name.rs index ada7b9d8c..48145e59c 100644 --- a/crates/puffin-normalize/src/package_name.rs +++ b/crates/puffin-normalize/src/package_name.rs @@ -1,5 +1,3 @@ -use std::fmt; -use std::fmt::{Display, Formatter}; use std::str::FromStr; use serde::{Deserialize, Deserializer, Serialize}; @@ -54,8 +52,8 @@ impl<'de> Deserialize<'de> for PackageName { } } -impl Display for PackageName { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { +impl std::fmt::Display for PackageName { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.0.fmt(f) } }