From 5370484307ba69301b0c6291e6a44e8373ee4d98 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Tue, 5 Dec 2023 17:41:22 -0500 Subject: [PATCH] Remove `.whl` extension for cached, unzipped wheels (#574) ## Summary This PR uses the wheel stem (e.g., `foo-1.2.3-py3-none-any`) instead of the wheel name (e.g., `foo-1.2.3-py3-none-any.whl`) when storing unzipped wheels in the cache, which removes a class of confusing issues around overwrites and directory-vs.-file collisions. For now, we retain _both_ the zipped and unzipped wheels in the cache, though we can easily change this by storing the zipped wheels in a temporary directory. Closes https://github.com/astral-sh/puffin/issues/573. ## Test Plan Some examples from my local cache: Screen Shot 2023-12-05 at 4 09 55 PM Screen Shot 2023-12-05 at 4 12 14 PM Screen Shot 2023-12-05 at 4 09 50 PM --- crates/distribution-filename/src/wheel.rs | 104 ++++++++++-------- crates/distribution-types/src/cached.rs | 9 +- crates/puffin-cache/src/lib.rs | 32 ++++-- .../src/distribution_database.rs | 51 +++++---- crates/puffin-distribution/src/download.rs | 6 +- crates/puffin-distribution/src/source_dist.rs | 14 ++- crates/puffin-installer/src/plan.rs | 10 +- crates/puffin-installer/src/registry_index.rs | 5 - 8 files changed, 133 insertions(+), 98 deletions(-) diff --git a/crates/distribution-filename/src/wheel.rs b/crates/distribution-filename/src/wheel.rs index d7720a85b..6cfae56a4 100644 --- a/crates/distribution-filename/src/wheel.rs +++ b/crates/distribution-filename/src/wheel.rs @@ -1,8 +1,8 @@ -#[cfg(feature = "serde")] -use serde::{de, Deserialize, Deserializer, Serialize, Serializer}; use std::fmt::{Display, Formatter}; use std::str::FromStr; +#[cfg(feature = "serde")] +use serde::{de, Deserialize, Deserializer, Serialize, Serializer}; use thiserror::Error; use url::Url; @@ -23,13 +23,69 @@ impl FromStr for WheelFilename { type Err = WheelFilenameError; fn from_str(filename: &str) -> Result { - let basename = filename.strip_suffix(".whl").ok_or_else(|| { + let stem = filename.strip_suffix(".whl").ok_or_else(|| { WheelFilenameError::InvalidWheelFileName( filename.to_string(), "Must end with .whl".to_string(), ) })?; + Self::parse(stem, filename) + } +} +impl Display for WheelFilename { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}-{}-{}.whl", + self.name.as_dist_info_name(), + self.version, + self.get_tag() + ) + } +} + +impl WheelFilename { + /// Returns `true` if the wheel is compatible with the given tags. + pub fn is_compatible(&self, compatible_tags: &Tags) -> bool { + compatible_tags.is_compatible(&self.python_tag, &self.abi_tag, &self.platform_tag) + } + + /// Return the [`TagPriority`] score of the wheel with the given tags, or `None` if the wheel is + /// incompatible. + pub fn compatibility(&self, compatible_tags: &Tags) -> Option { + compatible_tags.compatibility(&self.python_tag, &self.abi_tag, &self.platform_tag) + } + + /// Get the tag for this wheel. + pub fn get_tag(&self) -> String { + format!( + "{}-{}-{}", + self.python_tag.join("."), + self.abi_tag.join("."), + self.platform_tag.join(".") + ) + } + + /// The wheel filename without the extension. + pub fn stem(&self) -> String { + format!( + "{}-{}-{}", + self.name.as_dist_info_name(), + self.version, + self.get_tag() + ) + } + + /// Parse a wheel filename from the stem (e.g., `foo-1.2.3-py3-none-any`). + pub fn from_stem(stem: &str) -> Result { + Self::parse(stem, stem) + } + + /// Parse a wheel filename from the stem (e.g., `foo-1.2.3-py3-none-any`). + /// + /// The originating `filename` is used for high-fidelity error messages. + fn parse(stem: &str, filename: &str) -> Result { // The wheel filename should contain either five or six entries. If six, then the third // entry is the build tag. If five, then the third entry is the Python tag. // https://www.python.org/dev/peps/pep-0427/#file-name-convention @@ -39,7 +95,7 @@ impl FromStr for WheelFilename { // is used to break ties. This might mean that we generate identical // `WheelName` values for multiple distinct wheels, but it's not clear // if this is a problem in practice. - let mut parts = basename.split('-'); + let mut parts = stem.split('-'); let name = parts .next() @@ -112,46 +168,6 @@ impl FromStr for WheelFilename { } } -impl Display for WheelFilename { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{}-{}-{}.whl", - self.name.as_dist_info_name(), - self.version, - self.get_tag() - ) - } -} - -impl WheelFilename { - /// Returns `true` if the wheel is compatible with the given tags. - pub fn is_compatible(&self, compatible_tags: &Tags) -> bool { - compatible_tags.is_compatible(&self.python_tag, &self.abi_tag, &self.platform_tag) - } - - /// Return the [`TagPriority`] score of the wheel with the given tags, or `None` if the wheel is - /// incompatible. - pub fn compatibility(&self, compatible_tags: &Tags) -> Option { - compatible_tags.compatibility(&self.python_tag, &self.abi_tag, &self.platform_tag) - } - - /// Get the tag for this wheel. - pub fn get_tag(&self) -> String { - format!( - "{}-{}-{}", - self.python_tag.join("."), - self.abi_tag.join("."), - self.platform_tag.join(".") - ) - } - - /// The wheel filename without the extension - pub fn stem(&self) -> String { - format!("{}-{}-{}", self.name, self.version, self.get_tag()) - } -} - impl TryFrom<&Url> for WheelFilename { type Error = WheelFilenameError; diff --git a/crates/distribution-types/src/cached.rs b/crates/distribution-types/src/cached.rs index b3a9e891f..f26acbbe7 100644 --- a/crates/distribution-types/src/cached.rs +++ b/crates/distribution-types/src/cached.rs @@ -1,5 +1,4 @@ use std::path::{Path, PathBuf}; -use std::str::FromStr; use anyhow::Result; use url::Url; @@ -125,6 +124,7 @@ impl CachedDist { } impl CachedDirectUrlDist { + /// Initialize a [`CachedDirectUrlDist`] from a [`WheelFilename`], [`Url`], and [`Path`]. pub fn from_url(filename: WheelFilename, url: Url, path: PathBuf) -> Self { Self { filename, @@ -135,7 +135,7 @@ impl CachedDirectUrlDist { } impl CachedRegistryDist { - /// Try to parse a distribution from a cached directory name (like `django-5.0a1`). + /// Try to parse a distribution from a cached directory name (like `typing-extensions-4.8.0-py3-none-any`). pub fn try_from_path(path: &Path) -> Result> { let Some(file_name) = path.file_name() else { return Ok(None); @@ -143,9 +143,12 @@ impl CachedRegistryDist { let Some(file_name) = file_name.to_str() else { return Ok(None); }; - let Ok(filename) = WheelFilename::from_str(file_name) else { + let Ok(filename) = WheelFilename::from_stem(file_name) else { return Ok(None); }; + if path.is_file() { + return Ok(None); + } let path = path.to_path_buf(); diff --git a/crates/puffin-cache/src/lib.rs b/crates/puffin-cache/src/lib.rs index 6d12e2a8f..5b89189cf 100644 --- a/crates/puffin-cache/src/lib.rs +++ b/crates/puffin-cache/src/lib.rs @@ -33,6 +33,11 @@ impl CacheEntry { // TODO(konstin): Cache this to avoid allocations? self.dir.join(&self.file) } + + #[must_use] + pub fn with_file(self, file: String) -> Self { + Self { file, ..self } + } } /// The main cache abstraction. @@ -115,10 +120,10 @@ impl Cache { pub enum CacheBucket { /// Wheels (excluding built wheels), their metadata and cache policy. /// - /// There are two kinds from cache entries: Wheel metadata and policy as json files, and the - /// wheels themselves. If a wheel file is over an in-memory size threshold, we first download - /// the zip file into the cache, then unzip it into a directory with the same name. Unlike the - /// built wheels cache, you should only see unpacked wheel directories after running puffin. + /// There are three kinds from cache entries: Wheel metadata and policy as JSON files, the + /// wheels themselves, and the unzipped wheel archives. If a wheel file is over an in-memory + /// size threshold, we first download the zip file into the cache, then unzip it into a + /// directory with the same name, omitting the `.whl` extension. /// /// Cache structure: /// * `wheel-metadata-v0/pypi/{foo-1.0.0-py3-none-any.json, foo-1.0.0-py3-none-any.whl}` @@ -170,6 +175,7 @@ pub enum CacheBucket { /// ├── pypi /// │ ... /// │ ├── pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + /// │ ├── pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64 /// │ ... /// └── url /// └── 4b8be67c801a7ecb @@ -188,6 +194,7 @@ pub enum CacheBucket { /// │ ├── ... /// │ ├── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.json /// │ ├── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + /// │ ├── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64 /// │ │ ├── pandas /// │ │ │ ├── ... /// │ │ ├── pandas-2.1.3.dist-info @@ -197,7 +204,8 @@ pub enum CacheBucket { /// └── url /// └── 4b8be67c801a7ecb /// ├── flask-3.0.0-py3-none-any.json - /// └── flask-3.0.0-py3-none-any.whl + /// ├── flask-3.0.0-py3-none-any.json + /// └── flask-3.0.0-py3-none-any /// ├── flask /// │ └── ... /// └── flask-3.0.0.dist-info @@ -207,17 +215,17 @@ pub enum CacheBucket { /// the source distribution. /// /// The structure is similar of that of the `Wheel` bucket, except we have an additional layer - /// for the source dist filename and the metadata is on the source dist level, not on the wheel - /// level. + /// for the source distribution filename and the metadata is at the source distribution-level, + /// not at the wheel level. /// - /// TODO(konstin): The cache policy should be on the source dist level, the metadata we can put - /// next to the wheels as in the `Wheels` bucket. + /// TODO(konstin): The cache policy should be on the source distribution level, the metadata we + /// can put next to the wheels as in the `Wheels` bucket. /// /// Source distributions are built into zipped wheel files (as PEP 517 specifies) and unzipped /// lazily before installing. So when resolving, we only build the wheel and store the archive - /// file in the cache, when installing, we unpack it and replace the zip file with an unzipped - /// directory under the same. You may find a mix of wheel archive zip files and unzipped - /// wheel directories in the cache. + /// file in the cache, when installing, we unpack it under the same name, omitting the `.whl` + /// extension. You may find a mix of wheel archive zip files and unzipped wheel directories in + /// the cache. /// /// Cache structure: /// * `built-wheels-v0/pypi/foo-1.0.0.zip/{metadata.json, foo-1.0.0-py3-none-any.whl, ...other wheels}` diff --git a/crates/puffin-distribution/src/distribution_database.rs b/crates/puffin-distribution/src/distribution_database.rs index d1c1bafae..c25c897c8 100644 --- a/crates/puffin-distribution/src/distribution_database.rs +++ b/crates/puffin-distribution/src/distribution_database.rs @@ -149,13 +149,7 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context> DistributionDatabaseError::Url(wheel.file.url.to_string(), err) })?; let filename = WheelFilename::from_str(&wheel.file.filename)?; - let wheel_filename = &wheel.file.filename; - let cache_entry = self.cache.entry( - CacheBucket::Wheels, - WheelCache::Index(&wheel.index).wheel_dir(), - wheel_filename.to_string(), - ); let reader = self.client.stream_external(&url).await?; // If the file is greater than 5MB, write it to disk; otherwise, keep it in memory. @@ -175,6 +169,12 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context> ByteSize::b(small_size as u64) ); + let cache_entry = self.cache.entry( + CacheBucket::Wheels, + WheelCache::Index(&wheel.index).wheel_dir(), + filename.stem(), + ); + // Read into a buffer. let mut buffer = Vec::with_capacity(small_size); let mut reader = tokio::io::BufReader::new(reader.compat()); @@ -182,15 +182,21 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context> LocalWheel::InMemory(InMemoryWheel { dist: dist.clone(), - filename, - buffer, target: cache_entry.path(), + buffer, + filename, }) } else { let size = small_size.map_or("unknown size".to_string(), |size| size.to_string()); debug!("Fetching disk-based wheel from registry: {dist} ({size})"); + let cache_entry = self.cache.entry( + CacheBucket::Wheels, + WheelCache::Index(&wheel.index).wheel_dir(), + filename.to_string(), + ); + // Download the wheel into the cache. // TODO(charlie): Use an atomic write, and remove any existing files or directories. fs::create_dir_all(&cache_entry.dir).await?; @@ -199,9 +205,9 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context> LocalWheel::Disk(DiskWheel { dist: dist.clone(), - filename, path: cache_entry.path(), - target: cache_entry.path(), + target: cache_entry.with_file(filename.stem()).path(), + filename, }) }; @@ -215,28 +221,25 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context> Dist::Built(BuiltDist::DirectUrl(wheel)) => { debug!("Fetching disk-based wheel from URL: {}", wheel.url); - // Create a directory for the wheel. - let wheel_filename = wheel.filename()?; - let cache_entry = self.cache.entry( - CacheBucket::Wheels, - WheelCache::Url(&wheel.url).wheel_dir(), - wheel_filename.to_string(), - ); - // Fetch the wheel. let reader = self.client.stream_external(&wheel.url).await?; // Download the wheel into the cache. // TODO(charlie): Use an atomic write, and remove any existing files or directories. + let cache_entry = self.cache.entry( + CacheBucket::Wheels, + WheelCache::Url(&wheel.url).wheel_dir(), + wheel.filename.to_string(), + ); fs::create_dir_all(&cache_entry.dir).await?; let mut writer = fs::File::create(&cache_entry.path()).await?; tokio::io::copy(&mut reader.compat(), &mut writer).await?; let local_wheel = LocalWheel::Disk(DiskWheel { dist: dist.clone(), - filename: wheel.filename.clone(), path: cache_entry.path(), - target: cache_entry.path(), + target: cache_entry.with_file(wheel.filename.stem()).path(), + filename: wheel.filename.clone(), }); if let Some(reporter) = self.reporter.as_ref() { @@ -250,14 +253,14 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context> let cache_entry = self.cache.entry( CacheBucket::Wheels, WheelCache::Url(&wheel.url).wheel_dir(), - wheel.filename.to_string(), + wheel.filename.stem(), ); Ok(LocalWheel::Disk(DiskWheel { dist: dist.clone(), - filename: wheel.filename.clone(), path: wheel.path.clone(), target: cache_entry.path(), + filename: wheel.filename.clone(), })) } @@ -268,9 +271,9 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context> let built_wheel = self.builder.download_and_build(source_dist).await?; Ok(LocalWheel::Built(BuiltWheel { dist: dist.clone(), + path: built_wheel.path, + target: built_wheel.target, filename: built_wheel.filename, - path: built_wheel.path.clone(), - target: built_wheel.path, })) } } diff --git a/crates/puffin-distribution/src/download.rs b/crates/puffin-distribution/src/download.rs index fc4554fd6..f98ad6191 100644 --- a/crates/puffin-distribution/src/download.rs +++ b/crates/puffin-distribution/src/download.rs @@ -15,7 +15,7 @@ use crate::error::Error; pub struct InMemoryWheel { /// The remote distribution from which this wheel was downloaded. pub(crate) dist: Dist, - /// The parsed filename + /// The parsed filename. pub(crate) filename: WheelFilename, /// The contents of the wheel. pub(crate) buffer: Vec, @@ -28,7 +28,7 @@ pub struct InMemoryWheel { pub struct DiskWheel { /// The remote distribution from which this wheel was downloaded. pub(crate) dist: Dist, - /// The parsed filename + /// The parsed filename. pub(crate) filename: WheelFilename, /// The path to the downloaded wheel. pub(crate) path: PathBuf, @@ -41,7 +41,7 @@ pub struct DiskWheel { pub struct BuiltWheel { /// The remote source distribution from which this wheel was built. pub(crate) dist: Dist, - /// The parsed filename + /// The parsed filename. pub(crate) filename: WheelFilename, /// The path to the built wheel. pub(crate) path: PathBuf, diff --git a/crates/puffin-distribution/src/source_dist.rs b/crates/puffin-distribution/src/source_dist.rs index 40c134943..e496ded6f 100644 --- a/crates/puffin-distribution/src/source_dist.rs +++ b/crates/puffin-distribution/src/source_dist.rs @@ -92,8 +92,13 @@ type Metadata21s = FxHashMap; /// The information about the wheel we either just built or got from the cache #[derive(Debug, Clone)] pub struct BuiltWheelMetadata { + /// The path to the built wheel. pub path: PathBuf, + /// The expected path to the downloaded wheel's entry in the cache. + pub target: PathBuf, + /// The parsed filename. pub filename: WheelFilename, + /// The metadata of the built wheel. pub metadata: Metadata21, } @@ -105,6 +110,7 @@ impl BuiltWheelMetadata { ) -> Self { Self { path: cache_entry.dir.join(&cached_data.disk_filename), + target: cache_entry.dir.join(filename.stem()), filename: filename.clone(), metadata: cached_data.metadata.clone(), } @@ -206,6 +212,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { // Read the metadata from the wheel. let filename = WheelFilename::from_str(&disk_filename)?; let path = wheel_dir.join(disk_filename); + let target = wheel_dir.join(filename.stem()); let metadata = read_metadata(&filename, &path)?; if let Some(task) = task { @@ -216,6 +223,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { BuiltWheelMetadata { path, + target, filename, metadata, } @@ -452,8 +460,12 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { } } + let path = cache_entry.dir.join(&disk_filename); + let target = cache_entry.dir.join(filename.stem()); + Ok(BuiltWheelMetadata { - path: cache_entry.dir.join(&disk_filename), + path, + target, filename, metadata, }) diff --git a/crates/puffin-installer/src/plan.rs b/crates/puffin-installer/src/plan.rs index a8b9a41e3..f891896a0 100644 --- a/crates/puffin-installer/src/plan.rs +++ b/crates/puffin-installer/src/plan.rs @@ -112,11 +112,9 @@ impl InstallPlan { Some(VersionOrUrl::Url(url)) => { // TODO(konstin): Add source dist url support. It's more tricky since we don't // know yet whether source dist is fresh in the cache. - if let Ok((disk_filename, filename)) = - url.filename().and_then(|disk_filename| { - let filename = WheelFilename::from_str(disk_filename)?; - Ok((disk_filename, filename)) - }) + if let Ok(filename) = url + .filename() + .and_then(|disk_filename| Ok(WheelFilename::from_str(disk_filename)?)) { if requirement.name != filename.name { bail!( @@ -129,7 +127,7 @@ impl InstallPlan { let cache_entry = cache.entry( CacheBucket::Wheels, WheelCache::Url(url).wheel_dir(), - disk_filename.to_string(), + filename.stem(), ); // Ignore zipped wheels, which represent intermediary cached artifacts. diff --git a/crates/puffin-installer/src/registry_index.rs b/crates/puffin-installer/src/registry_index.rs index 17af2e546..fbe7e41c9 100644 --- a/crates/puffin-installer/src/registry_index.rs +++ b/crates/puffin-installer/src/registry_index.rs @@ -41,11 +41,6 @@ impl RegistryIndex { } }; - // Ignore zipped wheels, which represent intermediary cached artifacts. - if !path.is_dir() { - continue; - } - match CachedRegistryDist::try_from_path(&path) { Ok(None) => {} Ok(Some(dist_info)) => {