Shard the registry cache by package (#583)

## Summary

This PR modifies the cache structure in a few ways. Most notably, we now
shard the set of registry wheels by package, and index them lazily when
computing the install plan.

This applies both to built wheels:

<img width="989" alt="Screen Shot 2023-12-06 at 4 42 19 PM"
src="https://github.com/astral-sh/puffin/assets/1309177/0e8a306f-befd-4be9-a63e-2303389837bb">

And remote wheels:

<img width="836" alt="Screen Shot 2023-12-06 at 4 42 30 PM"
src="https://github.com/astral-sh/puffin/assets/1309177/7fd908cd-dd86-475e-9779-07ed067b4a1a">

For other distributions, we now consistently cache using the package
name, which is purely for clarity and debuggability (we could consider
omitting these package-name directories):

<img width="955" alt="Screen Shot 2023-12-06 at 4 58 30 PM"
src="https://github.com/astral-sh/puffin/assets/1309177/3e8d0f99-df45-429a-9175-d57b54a72e56">

Obliquely closes https://github.com/astral-sh/puffin/issues/575.
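
Concretely, a registry wheel now lives under a per-package directory inside its bucket, and call sites resolve that directory through the new `Cache::shard` helper rather than scanning the whole bucket. A rough sketch of the shape (illustrative only; it leans on the `Cache`, `CacheBucket`, `CacheShard`, and `WheelCache` APIs from the diff below, and the `flask` literal is just an example):

```rust
use puffin_cache::{Cache, CacheBucket, CacheShard, WheelCache};
use url::Url;

/// Illustrative only: the shard that holds every cached `flask` wheel
/// downloaded from a direct URL, e.g. `wheels-v0/url/<digest(url)>/flask/`.
fn flask_wheel_shard(cache: &Cache, url: &Url) -> CacheShard {
    cache.shard(
        CacheBucket::Wheels,
        WheelCache::Url(url).remote_wheel_dir("flask"),
    )
}
```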
Charlie Marsh 2023-12-07 00:02:46 -05:00 committed by GitHub
parent aa065f5c97
commit a825b2db06
9 changed files with 227 additions and 169 deletions

View File

@@ -1,6 +1,7 @@
 use std::fmt::{Display, Formatter};
 use std::io;
 use std::io::Write;
+use std::ops::Deref;
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
@@ -37,8 +38,32 @@ impl CacheEntry {
     }

     #[must_use]
-    pub fn with_file(self, file: String) -> Self {
-        Self { file, ..self }
+    pub fn with_file(self, file: impl Into<String>) -> Self {
+        Self {
+            file: file.into(),
+            ..self
+        }
     }
 }
+
+/// A subdirectory within the cache.
+#[derive(Debug, Clone)]
+pub struct CacheShard(PathBuf);
+
+impl CacheShard {
+    pub fn entry(&self, file: impl Into<String>) -> CacheEntry {
+        CacheEntry {
+            dir: self.0.clone(),
+            file: file.into(),
+        }
+    }
+}
+
+impl Deref for CacheShard {
+    type Target = Path;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
@@ -82,6 +107,11 @@ impl Cache {
         self.root.join(cache_bucket.to_str())
     }

+    /// Compute an entry in the cache.
+    pub fn shard(&self, cache_bucket: CacheBucket, dir: impl AsRef<Path>) -> CacheShard {
+        CacheShard(self.bucket(cache_bucket).join(dir.as_ref()))
+    }
+
     /// Compute an entry in the cache.
     pub fn entry(
         &self,
@@ -120,17 +150,17 @@ impl Cache {
 /// are subdirectories of the cache root.
 #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
 pub enum CacheBucket {
-    /// Wheels (excluding built wheels), their metadata and cache policy.
+    /// Wheels (excluding built wheels), alongside their metadata and cache policy.
     ///
     /// There are three kinds from cache entries: Wheel metadata and policy as JSON files, the
     /// wheels themselves, and the unzipped wheel archives. If a wheel file is over an in-memory
     /// size threshold, we first download the zip file into the cache, then unzip it into a
-    /// directory with the same name, omitting the `.whl` extension.
+    /// directory with the same name (exclusive of the `.whl` extension).
    ///
     /// Cache structure:
-    /// * `wheel-metadata-v0/pypi/{foo-1.0.0-py3-none-any.json, foo-1.0.0-py3-none-any.whl}`
-    /// * `wheel-metadata-v0/<digest(index-url)>/{foo-1.0.0-py3-none-any.json, foo-1.0.0-py3-none-any.whl}`
-    /// * `wheel-metadata-v0/url/<digest(url)>/{foo-1.0.0-py3-none-any.json, foo-1.0.0-py3-none-any.whl}`
+    /// * `wheel-metadata-v0/pypi/foo/{foo-1.0.0-py3-none-any.json, foo-1.0.0-py3-none-any.whl}`
+    /// * `wheel-metadata-v0/<digest(index-url)>/foo/{foo-1.0.0-py3-none-any.json, foo-1.0.0-py3-none-any.whl}`
+    /// * `wheel-metadata-v0/url/<digest(url)>/foo/{foo-1.0.0-py3-none-any.json, foo-1.0.0-py3-none-any.whl}`
     ///
     /// See `puffin_client::RegistryClient::wheel_metadata` for information on how wheel metadata
     /// is fetched.
@@ -151,11 +181,13 @@ pub enum CacheBucket {
     /// wheel-v0
     /// ├── pypi
     /// │   ...
-    /// │   ├── pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.json
+    /// │   ├── pandas
+    /// │   │   └── pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.json
     /// │   ...
     /// └── url
     ///     └── 4b8be67c801a7ecb
-    ///         └── flask-3.0.0-py3-none-any.json
+    ///         └── flask
+    ///             └── flask-3.0.0-py3-none-any.json
     /// ```
     ///
     /// We get the following `requirement.txt` from `pip-compile`:
@@ -176,16 +208,18 @@ pub enum CacheBucket {
     /// wheel-v0
     /// ├── pypi
     /// │   ...
-    /// │   ├── pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
-    /// │   ├── pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64
+    /// │   ├── pandas
+    /// │   │   ├── pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+    /// │   │   ├── pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64
     /// │   ...
     /// └── url
     ///     └── 4b8be67c801a7ecb
-    ///         └── flask-3.0.0-py3-none-any.whl
-    ///             ├── flask
-    ///             │   └── ...
-    ///             └── flask-3.0.0.dist-info
-    ///                 └── ...
+    ///         └── flask
+    ///             └── flask-3.0.0-py3-none-any.whl
+    ///                 ├── flask
+    ///                 │   └── ...
+    ///                 └── flask-3.0.0.dist-info
+    ///                     └── ...
     /// ```
     ///
     /// If we run first `pip-compile` and then `pip-sync` on the same machine, we get both:
@@ -194,24 +228,26 @@ pub enum CacheBucket {
     /// wheels-v0
     /// ├── pypi
     /// │   ├── ...
-    /// │   ├── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.json
-    /// │   ├── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
-    /// │   ├── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64
-    /// │   │   ├── pandas
-    /// │   │   │   ├── ...
-    /// │   │   ├── pandas-2.1.3.dist-info
-    /// │   │   │   ├── ...
-    /// │   │   └── pandas.libs
+    /// │   ├── pandas
+    /// │   │   ├── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.json
+    /// │   │   ├── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+    /// │   │   └── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64
+    /// │   │       ├── pandas
+    /// │   │       │   ├── ...
+    /// │   │       ├── pandas-2.1.3.dist-info
+    /// │   │       │   ├── ...
+    /// │   │       └── pandas.libs
     /// │   ├── ...
     /// └── url
     ///     └── 4b8be67c801a7ecb
-    ///         ├── flask-3.0.0-py3-none-any.json
-    ///         ├── flask-3.0.0-py3-none-any.json
-    ///         └── flask-3.0.0-py3-none-any
-    ///             ├── flask
-    ///             │   └── ...
-    ///             └── flask-3.0.0.dist-info
-    ///                 └── ...
+    ///         └── flask
+    ///             ├── flask-3.0.0-py3-none-any.json
+    ///             ├── flask-3.0.0-py3-none-any.json
+    ///             └── flask-3.0.0-py3-none-any
+    ///                 ├── flask
+    ///                 │   └── ...
+    ///                 └── flask-3.0.0.dist-info
+    ///                     └── ...
     Wheels,
     /// Wheels built from source distributions, their extracted metadata and the cache policy of
     /// the source distribution.
@@ -225,14 +261,14 @@ pub enum CacheBucket {
     ///
     /// Source distributions are built into zipped wheel files (as PEP 517 specifies) and unzipped
     /// lazily before installing. So when resolving, we only build the wheel and store the archive
-    /// file in the cache, when installing, we unpack it under the same name, omitting the `.whl`
-    /// extension. You may find a mix of wheel archive zip files and unzipped wheel directories in
-    /// the cache.
+    /// file in the cache, when installing, we unpack it under the same name (exclusive of the
+    /// `.whl` extension). You may find a mix of wheel archive zip files and unzipped wheel
+    /// directories in the cache.
     ///
     /// Cache structure:
-    /// * `built-wheels-v0/pypi/foo-1.0.0.zip/{metadata.json, foo-1.0.0-py3-none-any.whl, ...other wheels}`
-    /// * `built-wheels-v0/<digest(index-url)>/foo-1.0.0.zip/{metadata.json, foo-1.0.0-py3-none-any.whl, ...other wheels}`
-    /// * `built-wheels-v0/url/<digest(url)>/foo-1.0.0.zip/{metadata.json, foo-1.0.0-py3-none-any.whl, ...other wheels}`
+    /// * `built-wheels-v0/pypi/foo/foo-1.0.0.zip/{metadata.json, foo-1.0.0-py3-none-any.whl, ...other wheels}`
+    /// * `built-wheels-v0/<digest(index-url)>/foo/foo-1.0.0.zip/{metadata.json, foo-1.0.0-py3-none-any.whl, ...other wheels}`
+    /// * `built-wheels-v0/url/<digest(url)>/foo/foo-1.0.0.zip/{metadata.json, foo-1.0.0-py3-none-any.whl, ...other wheels}`
     /// * `built-wheels-v0/git/<digest(url)>/<git sha>/foo-1.0.0.zip/{metadata.json, foo-1.0.0-py3-none-any.whl, ...other wheels}`
     ///
     /// But the url filename does not need to be a valid source dist filename
@@ -261,14 +297,16 @@ pub enum CacheBucket {
     /// │   ├── metadata.json
     /// │   └── pydantic_extra_types-2.1.0-py3-none-any.whl
     /// ├── pypi
-    /// │   └── django-allauth-0.51.0.tar.gz
-    /// │       ├── django_allauth-0.51.0-py3-none-any.whl
-    /// │       └── metadata.json
+    /// │   └── django
+    /// │       └── django-allauth-0.51.0.tar.gz
+    /// │           ├── django_allauth-0.51.0-py3-none-any.whl
+    /// │           └── metadata.json
     /// └── url
     ///     └── 6781bd6440ae72c2
-    ///         └── werkzeug-3.0.1.tar.gz
-    ///             ├── metadata.json
-    ///             └── werkzeug-3.0.1-py3-none-any.whl
+    ///         └── werkzeug
+    ///             └── werkzeug-3.0.1.tar.gz
+    ///                 ├── metadata.json
+    ///                 └── werkzeug-3.0.1-py3-none-any.whl
     /// ```
     ///
     /// The inside of a `metadata.json`:
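
Because `CacheShard` derefs to `Path`, call sites can treat a shard like any other directory (`read_dir`, `join`, and so on) while still minting `CacheEntry` values scoped to it. A hypothetical sketch of both uses:

```rust
use std::io;
use std::path::PathBuf;

use puffin_cache::{CacheEntry, CacheShard};

/// Illustrative only: list the contents of a shard via its `Deref<Target = Path>` impl.
fn shard_contents(shard: &CacheShard) -> io::Result<Vec<PathBuf>> {
    let mut paths = Vec::new();
    for entry in shard.read_dir()? {
        paths.push(entry?.path());
    }
    Ok(paths)
}

/// Illustrative only: a metadata entry that lives inside an existing shard.
fn metadata_entry(shard: &CacheShard) -> CacheEntry {
    shard.entry("metadata.json")
}
```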

View File

@@ -10,7 +10,7 @@ use crate::{digest, CanonicalUrl};
 /// Cache wheels and their metadata, both from remote wheels and built from source distributions.
 ///
-/// Use [`WheelCache::wheel_dir`] for remote wheel metadata caching and
+/// Use [`WheelCache::remote_wheel_dir`] for remote wheel metadata caching and
 /// [`WheelCache::built_wheel_dir`] for built source distributions metadata caching.
 pub enum WheelCache<'a> {
     /// Either pypi or an alternative index, which we key by index URL.
@@ -38,8 +38,8 @@ impl<'a> WheelCache<'a> {
     }

     /// Metadata of a remote wheel. See [`CacheBucket::Wheels`]
-    pub fn wheel_dir(&self) -> PathBuf {
-        self.bucket()
+    pub fn remote_wheel_dir(&self, package_name: impl AsRef<Path>) -> PathBuf {
+        self.bucket().join(package_name)
     }

     /// Metadata of a built source distribution. See [`CacheBucket::BuiltWheels`]

View File

@@ -241,7 +241,7 @@ impl RegistryClient {
         let cache_entry = self.cache.entry(
             CacheBucket::Wheels,
-            WheelCache::Index(&index).wheel_dir(),
+            WheelCache::Index(&index).remote_wheel_dir(filename.name.as_ref()),
             format!("{}.json", filename.stem()),
         );
@@ -276,7 +276,7 @@ impl RegistryClient {
         let cache_entry = self.cache.entry(
             CacheBucket::Wheels,
-            cache_shard.wheel_dir(),
+            cache_shard.remote_wheel_dir(filename.name.as_ref()),
             format!("{}.json", filename.stem()),
         );
@@ -311,13 +311,13 @@ impl RegistryClient {
         // The range request version failed (this is bad, the webserver should support this), fall
         // back to downloading the entire file and the reading the file from the zip the regular way
-        debug!("Range requests not supported for {filename}, downloading whole wheel");
+        debug!("Range requests not supported for {filename}; downloading wheel");

         // TODO(konstin): Download the wheel into a cache shared with the installer instead
         // Note that this branch is only hit when you're not using and the server where
         // you host your wheels for some reasons doesn't support range requests
         // (tbh we should probably warn here and tell users to get a better registry because
-        // their current one makes resolution unnecessary slow)
-        let temp_download = tempfile_in(cache_shard.wheel_dir()).map_err(Error::CacheWrite)?;
+        // their current one makes resolution unnecessary slow).
+        let temp_download = tempfile_in(self.cache.root()).map_err(Error::CacheWrite)?;
         let mut writer = BufWriter::new(tokio::fs::File::from_std(temp_download));
         let mut reader = self.stream_external(url).await?.compat();
         tokio::io::copy(&mut reader, &mut writer)

View File

@@ -15,7 +15,7 @@ use url::Url;
 use distribution_filename::{WheelFilename, WheelFilenameError};
 use distribution_types::direct_url::DirectGitUrl;
-use distribution_types::{BuiltDist, Dist, RemoteSource, SourceDist};
+use distribution_types::{BuiltDist, Dist, Metadata, RemoteSource, SourceDist};
 use platform_tags::Tags;
 use puffin_cache::{Cache, CacheBucket, WheelCache};
 use puffin_client::RegistryClient;
@@ -171,7 +171,7 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context>
                 let cache_entry = self.cache.entry(
                     CacheBucket::Wheels,
-                    WheelCache::Index(&wheel.index).wheel_dir(),
+                    WheelCache::Index(&wheel.index).remote_wheel_dir(filename.name.as_ref()),
                     filename.stem(),
                 );
@@ -193,12 +193,11 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context>
                 let cache_entry = self.cache.entry(
                     CacheBucket::Wheels,
-                    WheelCache::Index(&wheel.index).wheel_dir(),
+                    WheelCache::Index(&wheel.index).remote_wheel_dir(filename.name.as_ref()),
                     filename.to_string(),
                 );

                 // Download the wheel into the cache.
-                // TODO(charlie): Use an atomic write, and remove any existing files or directories.
                 fs::create_dir_all(&cache_entry.dir).await?;
                 let mut writer = fs::File::create(cache_entry.path()).await?;
                 tokio::io::copy(&mut reader.compat(), &mut writer).await?;
@@ -225,10 +224,9 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context>
                 let reader = self.client.stream_external(&wheel.url).await?;

                 // Download the wheel into the cache.
-                // TODO(charlie): Use an atomic write, and remove any existing files or directories.
                 let cache_entry = self.cache.entry(
                     CacheBucket::Wheels,
-                    WheelCache::Url(&wheel.url).wheel_dir(),
+                    WheelCache::Url(&wheel.url).remote_wheel_dir(wheel.name().as_ref()),
                     wheel.filename.to_string(),
                 );
                 fs::create_dir_all(&cache_entry.dir).await?;
@@ -252,7 +250,7 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context>
             Dist::Built(BuiltDist::Path(wheel)) => {
                 let cache_entry = self.cache.entry(
                     CacheBucket::Wheels,
-                    WheelCache::Url(&wheel.url).wheel_dir(),
+                    WheelCache::Url(&wheel.url).remote_wheel_dir(wheel.name().as_ref()),
                     wheel.filename.stem(),
                 );

View File

@@ -1,24 +1,22 @@
-use std::path::PathBuf;
-
 use fs_err as fs;
 use tracing::warn;

 use distribution_types::CachedWheel;
 use platform_tags::Tags;
+use puffin_cache::CacheShard;

 use crate::index::iter_directories;

 /// A local index of built distributions for a specific source distribution.
-#[derive(Debug)]
-pub struct BuiltWheelIndex<'a> {
-    directory: PathBuf,
-    tags: &'a Tags,
-}
+pub struct BuiltWheelIndex;

-impl<'a> BuiltWheelIndex<'a> {
-    /// Create a new index of built distributions.
+impl BuiltWheelIndex {
+    /// Find the "best" distribution in the index for a given source distribution.
     ///
-    /// The `directory` should be the directory containing the built distributions for a specific
+    /// This lookup prefers newer versions over older versions, and aims to maximize compatibility
+    /// with the target platform.
+    ///
+    /// The `shard` should point to a directory containing the built distributions for a specific
     /// source distribution. For example, given the built wheel cache structure:
     /// ```text
     /// built-wheels-v0/
@@ -28,27 +26,16 @@ impl<'a> BuiltWheelIndex<'a> {
     ///             └── metadata.json
     /// ```
     ///
-    /// The `directory` should be `built-wheels-v0/pypi/django-allauth-0.51.0.tar.gz`.
-    pub fn new(directory: impl Into<PathBuf>, tags: &'a Tags) -> Self {
-        Self {
-            directory: directory.into(),
-            tags,
-        }
-    }
-
-    /// Find the "best" distribution in the index.
-    ///
-    /// This lookup prefers newer versions over older versions, and aims to maximize compatibility
-    /// with the target platform.
-    pub fn find(&self) -> Option<CachedWheel> {
+    /// The `shard` should be `built-wheels-v0/pypi/django-allauth-0.51.0.tar.gz`.
+    pub fn find(shard: &CacheShard, tags: &Tags) -> Option<CachedWheel> {
         let mut candidate: Option<CachedWheel> = None;

-        for subdir in iter_directories(self.directory.read_dir().ok()?) {
+        for subdir in iter_directories(shard.read_dir().ok()?) {
             match CachedWheel::from_path(&subdir) {
                 Ok(None) => {}
                 Ok(Some(dist_info)) => {
                     // Pick the wheel with the highest priority
-                    let compatibility = dist_info.filename.compatibility(self.tags);
+                    let compatibility = dist_info.filename.compatibility(tags);

                     // Only consider wheels that are compatible with our tags.
                     if compatibility.is_none() {
@@ -62,7 +49,7 @@ impl<'a> BuiltWheelIndex<'a> {
                     if let Some(existing) = candidate.as_ref() {
                         // Override if the wheel is newer, or "more" compatible.
                         if dist_info.filename.version > existing.filename.version
-                            || compatibility > existing.filename.compatibility(self.tags)
+                            || compatibility > existing.filename.compatibility(tags)
                         {
                             candidate = Some(dist_info);
                         }
@@ -75,8 +62,7 @@ impl<'a> BuiltWheelIndex<'a> {
                         "Invalid cache entry at {}, removing. {err}",
                         subdir.display()
                     );
-                    let result = fs::remove_dir_all(&subdir);
-                    if let Err(err) = result {
+                    if let Err(err) = fs::remove_dir_all(&subdir) {
                         warn!(
                             "Failed to remove invalid cache entry at {}: {err}",
                             subdir.display()
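
With the constructor gone, `BuiltWheelIndex` is stateless: callers resolve a shard for the source distribution they care about and query it directly, so nothing is read from disk for packages the plan never touches. A hypothetical call site (assumes `BuiltWheelIndex` is in scope; the `name` parameter stands in for the package name):

```rust
use distribution_types::CachedWheel;
use platform_tags::Tags;
use puffin_cache::{Cache, CacheBucket, WheelCache};
use url::Url;

/// Illustrative only: find the best already-built wheel for a URL source distribution.
fn best_built_wheel(cache: &Cache, url: &Url, name: &str, tags: &Tags) -> Option<CachedWheel> {
    let shard = cache.shard(
        CacheBucket::BuiltWheels,
        WheelCache::Url(url).remote_wheel_dir(name),
    );
    BuiltWheelIndex::find(&shard, tags)
}
```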

View File

@@ -1,11 +1,13 @@
+use std::collections::hash_map::Entry;
 use std::collections::BTreeMap;
 use std::path::Path;

 use fs_err as fs;
 use fxhash::FxHashMap;
 use tracing::warn;

-use distribution_types::{CachedRegistryDist, CachedWheel, Metadata};
+use distribution_types::{CachedRegistryDist, CachedWheel};
 use pep440_rs::Version;
 use platform_tags::Tags;
 use puffin_cache::{Cache, CacheBucket, WheelCache};
@@ -15,55 +17,87 @@ use pypi_types::IndexUrls;
 use crate::index::iter_directories;

 /// A local index of distributions that originate from a registry, like `PyPI`.
-#[derive(Debug, Default)]
-pub struct RegistryWheelIndex(FxHashMap<PackageName, BTreeMap<Version, CachedRegistryDist>>);
+#[derive(Debug)]
+pub struct RegistryWheelIndex<'a> {
+    cache: &'a Cache,
+    tags: &'a Tags,
+    index_urls: &'a IndexUrls,
+    index: FxHashMap<PackageName, BTreeMap<Version, CachedRegistryDist>>,
+}

-impl RegistryWheelIndex {
-    /// Build an index of cached distributions from a directory.
-    pub fn from_directory(cache: &Cache, tags: &Tags, index_urls: &IndexUrls) -> Self {
-        let mut index = Self::default();
+impl<'a> RegistryWheelIndex<'a> {
+    /// Initialize an index of cached distributions from a directory.
+    pub fn new(cache: &'a Cache, tags: &'a Tags, index_urls: &'a IndexUrls) -> Self {
+        Self {
+            cache,
+            tags,
+            index_urls,
+            index: FxHashMap::default(),
+        }
+    }
+
+    /// Return an iterator over available wheels for a given package.
+    ///
+    /// If the package is not yet indexed, this will index the package by reading from the cache.
+    pub fn get(
+        &mut self,
+        name: &PackageName,
+    ) -> impl Iterator<Item = (&Version, &CachedRegistryDist)> {
+        let versions = match self.index.entry(name.clone()) {
+            Entry::Occupied(entry) => entry.into_mut(),
+            Entry::Vacant(entry) => {
+                entry.insert(Self::index(name, self.cache, self.tags, self.index_urls))
+            }
+        };
+        versions.iter().rev()
+    }
+
+    /// Add a package to the index by reading from the cache.
+    fn index(
+        package: &PackageName,
+        cache: &Cache,
+        tags: &Tags,
+        index_urls: &IndexUrls,
+    ) -> BTreeMap<Version, CachedRegistryDist> {
+        let mut versions = BTreeMap::new();

         for index_url in index_urls {
             // Index all the wheels that were downloaded directly from the registry.
-            // TODO(charlie): Shard the cache by package name, and do this lazily.
-            let wheel_dir = cache
-                .bucket(CacheBucket::Wheels)
-                .join(WheelCache::Index(index_url).wheel_dir());
-            index.add_directory(wheel_dir, tags);
+            let wheel_dir = cache.shard(
+                CacheBucket::Wheels,
+                WheelCache::Index(index_url).remote_wheel_dir(package.to_string()),
+            );
+            Self::add_directory(&*wheel_dir, tags, &mut versions);

             // Index all the built wheels, created by downloading and building source distributions
             // from the registry.
-            // TODO(charlie): Shard the cache by package name, and do this lazily.
-            let built_wheel_dir = cache
-                .bucket(CacheBucket::BuiltWheels)
-                .join(WheelCache::Index(index_url).wheel_dir());
+            let built_wheel_dir = cache.shard(
+                CacheBucket::BuiltWheels,
+                WheelCache::Index(index_url).built_wheel_dir(package.to_string()),
+            );

+            // Built wheels have one more level of indirection, as they are keyed by the source
+            // distribution filename.
             let Ok(read_dir) = built_wheel_dir.read_dir() else {
                 continue;
             };
             for subdir in iter_directories(read_dir) {
-                index.add_directory(subdir, tags);
+                Self::add_directory(subdir, tags, &mut versions);
             }
         }

-        index
+        versions
     }

-    /// Returns a distribution from the index, if it exists.
-    pub fn by_name(
-        &self,
-        name: &PackageName,
-    ) -> impl Iterator<Item = (&Version, &CachedRegistryDist)> {
-        // Using static to extend the lifetime
-        static DEFAULT_MAP: BTreeMap<Version, CachedRegistryDist> = BTreeMap::new();
-        self.0.get(name).unwrap_or(&DEFAULT_MAP).iter().rev()
-    }
-
     /// Add the wheels in a given directory to the index.
     ///
     /// Each subdirectory in the given path is expected to be that of an unzipped wheel.
-    fn add_directory(&mut self, path: impl AsRef<Path>, tags: &Tags) {
+    fn add_directory(
+        path: impl AsRef<Path>,
+        tags: &Tags,
+        versions: &mut BTreeMap<Version, CachedRegistryDist>,
+    ) {
         let Ok(read_dir) = path.as_ref().read_dir() else {
             return;
         };
@@ -76,20 +110,13 @@ impl RegistryWheelIndex {
                     // Pick the wheel with the highest priority
                     let compatibility = dist_info.filename.compatibility(tags);

-                    if let Some(existing) = self
-                        .0
-                        .get_mut(dist_info.name())
-                        .and_then(|package| package.get_mut(&dist_info.filename.version))
-                    {
+                    if let Some(existing) = versions.get_mut(&dist_info.filename.version) {
                         // Override if we have better compatibility
                         if compatibility > existing.filename.compatibility(tags) {
                             *existing = dist_info;
                         }
                     } else if compatibility.is_some() {
-                        self.0
-                            .entry(dist_info.name().clone())
-                            .or_default()
-                            .insert(dist_info.filename.version.clone(), dist_info);
+                        versions.insert(dist_info.filename.version.clone(), dist_info);
                     }
                 }
                 Err(err) => {
@@ -97,8 +124,8 @@ impl RegistryWheelIndex {
                         "Invalid cache entry at {}, removing. {err}",
                         wheel_dir.display()
                     );
-                    let result = fs::remove_dir_all(&wheel_dir);
-                    if let Err(err) = result {
+                    if let Err(err) = fs::remove_dir_all(&wheel_dir) {
                         warn!(
                             "Failed to remove invalid cache entry at {}: {err}",
                             wheel_dir.display()
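
The laziness in `get` is just the standard `hash_map::Entry` pattern: the first lookup for a package populates its `BTreeMap` by reading that package's shard, and later lookups reuse the cached map. The same pattern reduced to a self-contained sketch (hypothetical types; `scan` stands in for reading a shard from disk):

```rust
use std::collections::hash_map::Entry;
use std::collections::{BTreeMap, HashMap};

/// Illustrative only: index versions per package on first access, then reuse them.
struct LazyIndex {
    index: HashMap<String, BTreeMap<u64, String>>,
}

impl LazyIndex {
    fn get(&mut self, name: &str) -> impl Iterator<Item = (&u64, &String)> {
        let versions = match self.index.entry(name.to_string()) {
            Entry::Occupied(entry) => entry.into_mut(),
            Entry::Vacant(entry) => entry.insert(Self::scan(name)),
        };
        // Newest first, mirroring `versions.iter().rev()` above.
        versions.iter().rev()
    }

    /// Stand-in for reading one package's shard from the cache.
    fn scan(name: &str) -> BTreeMap<u64, String> {
        BTreeMap::from([(1, format!("{name}-1.0"))])
    }
}
```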

View File

@@ -24,7 +24,9 @@ use distribution_types::direct_url::{DirectArchiveUrl, DirectGitUrl};
 use distribution_types::{GitSourceDist, Metadata, PathSourceDist, RemoteSource, SourceDist};
 use install_wheel_rs::read_dist_info;
 use platform_tags::Tags;
-use puffin_cache::{digest, CacheBucket, CacheEntry, CachedByTimestamp, CanonicalUrl, WheelCache};
+use puffin_cache::{
+    digest, CacheBucket, CacheEntry, CacheShard, CachedByTimestamp, CanonicalUrl, WheelCache,
+};
 use puffin_client::{CachedClient, CachedClientError, DataWithCachePolicy};
 use puffin_fs::write_atomic;
 use puffin_git::{Fetch, GitSource};
@@ -161,11 +163,17 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T>
                 let DirectArchiveUrl { url, subdirectory } =
                     DirectArchiveUrl::from(&direct_url_source_dist.url);

+                // For direct URLs, cache directly under the hash of the URL itself.
+                let cache_shard = self.build_context.cache().shard(
+                    CacheBucket::BuiltWheels,
+                    WheelCache::Url(&url).remote_wheel_dir(direct_url_source_dist.name().as_ref()),
+                );
+
                 self.url(
                     source_dist,
                     filename,
                     &url,
-                    WheelCache::Url(&url),
+                    cache_shard,
                     subdirectory.as_deref(),
                 )
                 .await?
@@ -174,11 +182,21 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T>
                 let url = Url::parse(&registry_source_dist.file.url).map_err(|err| {
                     SourceDistError::UrlParse(registry_source_dist.file.url.clone(), err)
                 })?;

+                // For registry source distributions, shard by distribution, then by filename.
+                // Ex) `pypi/requests/requests-2.25.1.tar.gz`
+                let cache_shard = self.build_context.cache().shard(
+                    CacheBucket::BuiltWheels,
+                    WheelCache::Index(&registry_source_dist.index)
+                        .remote_wheel_dir(registry_source_dist.name.as_ref())
+                        .join(&registry_source_dist.file.filename),
+                );
+
                 self.url(
                     source_dist,
                     &registry_source_dist.file.filename,
                     &url,
-                    WheelCache::Index(&registry_source_dist.index),
+                    cache_shard,
                     None,
                 )
                 .await?
@@ -197,14 +215,10 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T>
         source_dist: &'data SourceDist,
         filename: &'data str,
         url: &'data Url,
-        cache_shard: WheelCache<'data>,
+        cache_shard: CacheShard,
         subdirectory: Option<&'data Path>,
     ) -> Result<BuiltWheelMetadata, SourceDistError> {
-        let cache_entry = self.build_context.cache().entry(
-            CacheBucket::BuiltWheels,
-            cache_shard.built_wheel_dir(filename),
-            METADATA_JSON.to_string(),
-        );
+        let cache_entry = cache_shard.entry(METADATA_JSON.to_string());

         let response_callback = |response| async {
             // New or changed source distribution, delete all built wheels
@@ -345,10 +359,10 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T>
         source_dist: &SourceDist,
         path_source_dist: &PathSourceDist,
     ) -> Result<BuiltWheelMetadata, SourceDistError> {
-        let cache_shard = WheelCache::Path(&path_source_dist.url);
         let cache_entry = self.build_context.cache().entry(
             CacheBucket::BuiltWheels,
-            cache_shard.built_wheel_dir(source_dist.name().to_string()),
+            WheelCache::Path(&path_source_dist.url)
+                .remote_wheel_dir(path_source_dist.name().as_ref()),
             METADATA_JSON.to_string(),
         );

View File

@@ -3,8 +3,7 @@ use tracing::debug;
 use distribution_types::direct_url::{git_reference, DirectUrl};
 use distribution_types::{
-    BuiltDist, CachedDirectUrlDist, CachedDist, Dist, InstalledDist, Metadata, RemoteSource,
-    SourceDist,
+    BuiltDist, CachedDirectUrlDist, CachedDist, Dist, InstalledDist, Metadata, SourceDist,
 };
 use pep508_rs::{Requirement, VersionOrUrl};
 use platform_tags::Tags;
@@ -45,7 +44,7 @@ impl InstallPlan {
             SitePackages::try_from_executable(venv).context("Failed to list installed packages")?;

         // Index all the already-downloaded wheels in the cache.
-        let registry_index = RegistryWheelIndex::from_directory(cache, tags, index_urls);
+        let mut registry_index = RegistryWheelIndex::new(cache, tags, index_urls);

         let mut local = vec![];
         let mut remote = vec![];
@@ -89,9 +88,8 @@ impl InstallPlan {
             // Identify any locally-available distributions that satisfy the requirement.
             match requirement.version_or_url.as_ref() {
                 None => {
-                    // TODO(charlie): This doesn't respect built wheels.
                     if let Some((_version, distribution)) =
-                        registry_index.by_name(&requirement.name).next()
+                        registry_index.get(&requirement.name).next()
                     {
                         debug!("Requirement already cached: {distribution}");
                         local.push(CachedDist::Registry(distribution.clone()));
@@ -99,12 +97,16 @@ impl InstallPlan {
                     }
                 }
                 Some(VersionOrUrl::VersionSpecifier(specifier)) => {
-                    if let Some((_version, distribution)) = registry_index
-                        .by_name(&requirement.name)
-                        .find(|(version, dist)| {
-                            specifier.contains(version)
-                                && requirement.is_satisfied_by(&dist.filename.version)
-                        })
+                    if let Some(distribution) =
+                        registry_index
+                            .get(&requirement.name)
+                            .find_map(|(version, distribution)| {
+                                if specifier.contains(version) {
+                                    Some(distribution)
+                                } else {
+                                    None
+                                }
+                            })
                     {
                         debug!("Requirement already cached: {distribution}");
                         local.push(CachedDist::Registry(distribution.clone()));
@@ -124,7 +126,7 @@ impl InstallPlan {
                     // advance.
                     let cache_entry = cache.entry(
                         CacheBucket::Wheels,
-                        WheelCache::Url(&wheel.url).wheel_dir(),
+                        WheelCache::Url(&wheel.url).remote_wheel_dir(wheel.name().as_ref()),
                         wheel.filename.stem(),
                     );
@@ -145,7 +147,7 @@ impl InstallPlan {
                    // advance.
                     let cache_entry = cache.entry(
                         CacheBucket::Wheels,
-                        WheelCache::Url(&wheel.url).wheel_dir(),
+                        WheelCache::Url(&wheel.url).remote_wheel_dir(wheel.name().as_ref()),
                         wheel.filename.stem(),
                     );
@@ -164,14 +166,12 @@ impl InstallPlan {
                 Dist::Source(SourceDist::DirectUrl(sdist)) => {
                     // Find the most-compatible wheel from the cache, since we don't know
                     // the filename in advance.
-                    let cache_entry = cache.entry(
+                    let cache_shard = cache.shard(
                         CacheBucket::BuiltWheels,
-                        WheelCache::Url(&sdist.url).wheel_dir(),
-                        sdist.filename()?.to_string(),
+                        WheelCache::Url(&sdist.url).remote_wheel_dir(sdist.name().as_ref()),
                     );
-                    let index = BuiltWheelIndex::new(cache_entry.path(), tags);

-                    if let Some(wheel) = index.find() {
+                    if let Some(wheel) = BuiltWheelIndex::find(&cache_shard, tags) {
                         let cached_dist = wheel.into_url_dist(url.clone());
                         debug!("URL source requirement already cached: {cached_dist}");
                         local.push(CachedDist::Url(cached_dist.clone()));
@@ -181,14 +181,13 @@ impl InstallPlan {
                 Dist::Source(SourceDist::Path(sdist)) => {
                     // Find the most-compatible wheel from the cache, since we don't know
                     // the filename in advance.
-                    let cache_entry = cache.entry(
+                    let cache_shard = cache.shard(
                         CacheBucket::BuiltWheels,
-                        WheelCache::Path(&sdist.url).wheel_dir(),
-                        sdist.name().to_string(),
+                        WheelCache::Path(&sdist.url)
+                            .remote_wheel_dir(sdist.name().as_ref()),
                     );
-                    let index = BuiltWheelIndex::new(cache_entry.path(), tags);

-                    if let Some(wheel) = index.find() {
+                    if let Some(wheel) = BuiltWheelIndex::find(&cache_shard, tags) {
                         let cached_dist = wheel.into_url_dist(url.clone());
                         debug!("Path source requirement already cached: {cached_dist}");
                         local.push(CachedDist::Url(cached_dist.clone()));
@@ -199,14 +198,12 @@ impl InstallPlan {
                     // Find the most-compatible wheel from the cache, since we don't know
                     // the filename in advance.
                     if let Ok(Some(reference)) = git_reference(&sdist.url) {
-                        let cache_entry = cache.entry(
+                        let cache_shard = cache.shard(
                             CacheBucket::BuiltWheels,
-                            WheelCache::Git(&sdist.url).wheel_dir(),
-                            reference.to_string(),
+                            WheelCache::Git(&sdist.url).built_wheel_dir(reference),
                         );
-                        let index = BuiltWheelIndex::new(cache_entry.path(), tags);

-                        if let Some(wheel) = index.find() {
+                        if let Some(wheel) = BuiltWheelIndex::find(&cache_shard, tags) {
                             let cached_dist = wheel.into_url_dist(url.clone());
                             debug!("Git source requirement already cached: {cached_dist}");
                             local.push(CachedDist::Url(cached_dist.clone()));
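
Taken together, the install plan now owns a mutable `RegistryWheelIndex` and only the shards of packages that actually appear in the requirements are read. A hypothetical outline of that flow (error handling and the URL/path/Git branches elided; assumes `RegistryWheelIndex` is in scope):

```rust
use pep508_rs::Requirement;
use platform_tags::Tags;
use puffin_cache::Cache;
use pypi_types::IndexUrls;

/// Illustrative only: per-requirement lookups are what drive the lazy indexing.
fn plan_registry_installs(
    requirements: &[Requirement],
    cache: &Cache,
    tags: &Tags,
    index_urls: &IndexUrls,
) {
    let mut registry_index = RegistryWheelIndex::new(cache, tags, index_urls);
    for requirement in requirements {
        // Only the shard for `requirement.name` is read from disk here.
        if let Some((_version, distribution)) = registry_index.get(&requirement.name).next() {
            println!("Requirement already cached: {distribution}");
        }
    }
}
```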

View File

@@ -1,5 +1,3 @@
-use std::fmt;
-use std::fmt::{Display, Formatter};
 use std::str::FromStr;

 use serde::{Deserialize, Deserializer, Serialize};
@@ -54,8 +52,8 @@ impl<'de> Deserialize<'de> for PackageName {
     }
 }

-impl Display for PackageName {
-    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+impl std::fmt::Display for PackageName {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         self.0.fmt(f)
     }
 }