Remove `.whl` extension for cached, unzipped wheels (#574)

## Summary

This PR uses the wheel stem (e.g., `foo-1.2.3-py3-none-any`) instead of
the wheel name (e.g., `foo-1.2.3-py3-none-any.whl`) when storing
unzipped wheels in the cache, which removes a class of confusing issues
around overwrites and directory-vs.-file collisions.

For now, we retain _both_ the zipped and unzipped wheels in the cache,
though we can easily change this by storing the zipped wheels in a
temporary directory.

Closes https://github.com/astral-sh/puffin/issues/573.

## Test Plan

Some examples from my local cache:

<img width="835" alt="Screen Shot 2023-12-05 at 4 09 55 PM"
src="https://github.com/astral-sh/puffin/assets/1309177/784146aa-b080-416e-9767-40c843fe5d6a">
<img width="847" alt="Screen Shot 2023-12-05 at 4 12 14 PM"
src="https://github.com/astral-sh/puffin/assets/1309177/4bc7f30f-bef3-47f1-b4e8-da9cabf87f28">
<img width="637" alt="Screen Shot 2023-12-05 at 4 09 50 PM"
src="https://github.com/astral-sh/puffin/assets/1309177/25ca4944-4a06-4a08-ac85-c6f7d8b5c8ea">
This commit is contained in:
Charlie Marsh 2023-12-05 17:41:22 -05:00 committed by GitHub
parent a15da36d74
commit 5370484307
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 133 additions and 98 deletions

View File

@ -1,8 +1,8 @@
#[cfg(feature = "serde")]
use serde::{de, Deserialize, Deserializer, Serialize, Serializer};
use std::fmt::{Display, Formatter};
use std::str::FromStr;
#[cfg(feature = "serde")]
use serde::{de, Deserialize, Deserializer, Serialize, Serializer};
use thiserror::Error;
use url::Url;
@ -23,13 +23,69 @@ impl FromStr for WheelFilename {
type Err = WheelFilenameError;
fn from_str(filename: &str) -> Result<Self, Self::Err> {
let basename = filename.strip_suffix(".whl").ok_or_else(|| {
let stem = filename.strip_suffix(".whl").ok_or_else(|| {
WheelFilenameError::InvalidWheelFileName(
filename.to_string(),
"Must end with .whl".to_string(),
)
})?;
Self::parse(stem, filename)
}
}
impl Display for WheelFilename {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{}-{}-{}.whl",
self.name.as_dist_info_name(),
self.version,
self.get_tag()
)
}
}
impl WheelFilename {
/// Returns `true` if the wheel is compatible with the given tags.
pub fn is_compatible(&self, compatible_tags: &Tags) -> bool {
compatible_tags.is_compatible(&self.python_tag, &self.abi_tag, &self.platform_tag)
}
/// Return the [`TagPriority`] score of the wheel with the given tags, or `None` if the wheel is
/// incompatible.
pub fn compatibility(&self, compatible_tags: &Tags) -> Option<TagPriority> {
compatible_tags.compatibility(&self.python_tag, &self.abi_tag, &self.platform_tag)
}
/// Get the tag for this wheel.
pub fn get_tag(&self) -> String {
format!(
"{}-{}-{}",
self.python_tag.join("."),
self.abi_tag.join("."),
self.platform_tag.join(".")
)
}
/// The wheel filename without the extension.
pub fn stem(&self) -> String {
format!(
"{}-{}-{}",
self.name.as_dist_info_name(),
self.version,
self.get_tag()
)
}
/// Parse a wheel filename from the stem (e.g., `foo-1.2.3-py3-none-any`).
pub fn from_stem(stem: &str) -> Result<Self, WheelFilenameError> {
Self::parse(stem, stem)
}
/// Parse a wheel filename from the stem (e.g., `foo-1.2.3-py3-none-any`).
///
/// The originating `filename` is used for high-fidelity error messages.
fn parse(stem: &str, filename: &str) -> Result<Self, WheelFilenameError> {
// The wheel filename should contain either five or six entries. If six, then the third
// entry is the build tag. If five, then the third entry is the Python tag.
// https://www.python.org/dev/peps/pep-0427/#file-name-convention
@ -39,7 +95,7 @@ impl FromStr for WheelFilename {
// is used to break ties. This might mean that we generate identical
// `WheelName` values for multiple distinct wheels, but it's not clear
// if this is a problem in practice.
let mut parts = basename.split('-');
let mut parts = stem.split('-');
let name = parts
.next()
@ -112,46 +168,6 @@ impl FromStr for WheelFilename {
}
}
impl Display for WheelFilename {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{}-{}-{}.whl",
self.name.as_dist_info_name(),
self.version,
self.get_tag()
)
}
}
impl WheelFilename {
/// Returns `true` if the wheel is compatible with the given tags.
pub fn is_compatible(&self, compatible_tags: &Tags) -> bool {
compatible_tags.is_compatible(&self.python_tag, &self.abi_tag, &self.platform_tag)
}
/// Return the [`TagPriority`] score of the wheel with the given tags, or `None` if the wheel is
/// incompatible.
pub fn compatibility(&self, compatible_tags: &Tags) -> Option<TagPriority> {
compatible_tags.compatibility(&self.python_tag, &self.abi_tag, &self.platform_tag)
}
/// Get the tag for this wheel.
pub fn get_tag(&self) -> String {
format!(
"{}-{}-{}",
self.python_tag.join("."),
self.abi_tag.join("."),
self.platform_tag.join(".")
)
}
/// The wheel filename without the extension
pub fn stem(&self) -> String {
format!("{}-{}-{}", self.name, self.version, self.get_tag())
}
}
impl TryFrom<&Url> for WheelFilename {
type Error = WheelFilenameError;

View File

@ -1,5 +1,4 @@
use std::path::{Path, PathBuf};
use std::str::FromStr;
use anyhow::Result;
use url::Url;
@ -125,6 +124,7 @@ impl CachedDist {
}
impl CachedDirectUrlDist {
/// Initialize a [`CachedDirectUrlDist`] from a [`WheelFilename`], [`Url`], and [`Path`].
pub fn from_url(filename: WheelFilename, url: Url, path: PathBuf) -> Self {
Self {
filename,
@ -135,7 +135,7 @@ impl CachedDirectUrlDist {
}
impl CachedRegistryDist {
/// Try to parse a distribution from a cached directory name (like `django-5.0a1`).
/// Try to parse a distribution from a cached directory name (like `typing-extensions-4.8.0-py3-none-any`).
pub fn try_from_path(path: &Path) -> Result<Option<Self>> {
let Some(file_name) = path.file_name() else {
return Ok(None);
@ -143,9 +143,12 @@ impl CachedRegistryDist {
let Some(file_name) = file_name.to_str() else {
return Ok(None);
};
let Ok(filename) = WheelFilename::from_str(file_name) else {
let Ok(filename) = WheelFilename::from_stem(file_name) else {
return Ok(None);
};
if path.is_file() {
return Ok(None);
}
let path = path.to_path_buf();

View File

@ -33,6 +33,11 @@ impl CacheEntry {
// TODO(konstin): Cache this to avoid allocations?
self.dir.join(&self.file)
}
#[must_use]
pub fn with_file(self, file: String) -> Self {
Self { file, ..self }
}
}
/// The main cache abstraction.
@ -115,10 +120,10 @@ impl Cache {
pub enum CacheBucket {
/// Wheels (excluding built wheels), their metadata and cache policy.
///
/// There are two kinds from cache entries: Wheel metadata and policy as json files, and the
/// wheels themselves. If a wheel file is over an in-memory size threshold, we first download
/// the zip file into the cache, then unzip it into a directory with the same name. Unlike the
/// built wheels cache, you should only see unpacked wheel directories after running puffin.
/// There are three kinds from cache entries: Wheel metadata and policy as JSON files, the
/// wheels themselves, and the unzipped wheel archives. If a wheel file is over an in-memory
/// size threshold, we first download the zip file into the cache, then unzip it into a
/// directory with the same name, omitting the `.whl` extension.
///
/// Cache structure:
/// * `wheel-metadata-v0/pypi/{foo-1.0.0-py3-none-any.json, foo-1.0.0-py3-none-any.whl}`
@ -170,6 +175,7 @@ pub enum CacheBucket {
/// ├── pypi
/// │ ...
/// │ ├── pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
/// │ ├── pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64
/// │ ...
/// └── url
/// └── 4b8be67c801a7ecb
@ -188,6 +194,7 @@ pub enum CacheBucket {
/// │ ├── ...
/// │ ├── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.json
/// │ ├── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
/// │ ├── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64
/// │ │ ├── pandas
/// │ │ │ ├── ...
/// │ │ ├── pandas-2.1.3.dist-info
@ -197,7 +204,8 @@ pub enum CacheBucket {
/// └── url
/// └── 4b8be67c801a7ecb
/// ├── flask-3.0.0-py3-none-any.json
/// └── flask-3.0.0-py3-none-any.whl
/// ├── flask-3.0.0-py3-none-any.json
/// └── flask-3.0.0-py3-none-any
/// ├── flask
/// │ └── ...
/// └── flask-3.0.0.dist-info
@ -207,17 +215,17 @@ pub enum CacheBucket {
/// the source distribution.
///
/// The structure is similar of that of the `Wheel` bucket, except we have an additional layer
/// for the source dist filename and the metadata is on the source dist level, not on the wheel
/// level.
/// for the source distribution filename and the metadata is at the source distribution-level,
/// not at the wheel level.
///
/// TODO(konstin): The cache policy should be on the source dist level, the metadata we can put
/// next to the wheels as in the `Wheels` bucket.
/// TODO(konstin): The cache policy should be on the source distribution level, the metadata we
/// can put next to the wheels as in the `Wheels` bucket.
///
/// Source distributions are built into zipped wheel files (as PEP 517 specifies) and unzipped
/// lazily before installing. So when resolving, we only build the wheel and store the archive
/// file in the cache, when installing, we unpack it and replace the zip file with an unzipped
/// directory under the same. You may find a mix of wheel archive zip files and unzipped
/// wheel directories in the cache.
/// file in the cache, when installing, we unpack it under the same name, omitting the `.whl`
/// extension. You may find a mix of wheel archive zip files and unzipped wheel directories in
/// the cache.
///
/// Cache structure:
/// * `built-wheels-v0/pypi/foo-1.0.0.zip/{metadata.json, foo-1.0.0-py3-none-any.whl, ...other wheels}`

View File

@ -149,13 +149,7 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context>
DistributionDatabaseError::Url(wheel.file.url.to_string(), err)
})?;
let filename = WheelFilename::from_str(&wheel.file.filename)?;
let wheel_filename = &wheel.file.filename;
let cache_entry = self.cache.entry(
CacheBucket::Wheels,
WheelCache::Index(&wheel.index).wheel_dir(),
wheel_filename.to_string(),
);
let reader = self.client.stream_external(&url).await?;
// If the file is greater than 5MB, write it to disk; otherwise, keep it in memory.
@ -175,6 +169,12 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context>
ByteSize::b(small_size as u64)
);
let cache_entry = self.cache.entry(
CacheBucket::Wheels,
WheelCache::Index(&wheel.index).wheel_dir(),
filename.stem(),
);
// Read into a buffer.
let mut buffer = Vec::with_capacity(small_size);
let mut reader = tokio::io::BufReader::new(reader.compat());
@ -182,15 +182,21 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context>
LocalWheel::InMemory(InMemoryWheel {
dist: dist.clone(),
filename,
buffer,
target: cache_entry.path(),
buffer,
filename,
})
} else {
let size =
small_size.map_or("unknown size".to_string(), |size| size.to_string());
debug!("Fetching disk-based wheel from registry: {dist} ({size})");
let cache_entry = self.cache.entry(
CacheBucket::Wheels,
WheelCache::Index(&wheel.index).wheel_dir(),
filename.to_string(),
);
// Download the wheel into the cache.
// TODO(charlie): Use an atomic write, and remove any existing files or directories.
fs::create_dir_all(&cache_entry.dir).await?;
@ -199,9 +205,9 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context>
LocalWheel::Disk(DiskWheel {
dist: dist.clone(),
filename,
path: cache_entry.path(),
target: cache_entry.path(),
target: cache_entry.with_file(filename.stem()).path(),
filename,
})
};
@ -215,28 +221,25 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context>
Dist::Built(BuiltDist::DirectUrl(wheel)) => {
debug!("Fetching disk-based wheel from URL: {}", wheel.url);
// Create a directory for the wheel.
let wheel_filename = wheel.filename()?;
let cache_entry = self.cache.entry(
CacheBucket::Wheels,
WheelCache::Url(&wheel.url).wheel_dir(),
wheel_filename.to_string(),
);
// Fetch the wheel.
let reader = self.client.stream_external(&wheel.url).await?;
// Download the wheel into the cache.
// TODO(charlie): Use an atomic write, and remove any existing files or directories.
let cache_entry = self.cache.entry(
CacheBucket::Wheels,
WheelCache::Url(&wheel.url).wheel_dir(),
wheel.filename.to_string(),
);
fs::create_dir_all(&cache_entry.dir).await?;
let mut writer = fs::File::create(&cache_entry.path()).await?;
tokio::io::copy(&mut reader.compat(), &mut writer).await?;
let local_wheel = LocalWheel::Disk(DiskWheel {
dist: dist.clone(),
filename: wheel.filename.clone(),
path: cache_entry.path(),
target: cache_entry.path(),
target: cache_entry.with_file(wheel.filename.stem()).path(),
filename: wheel.filename.clone(),
});
if let Some(reporter) = self.reporter.as_ref() {
@ -250,14 +253,14 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context>
let cache_entry = self.cache.entry(
CacheBucket::Wheels,
WheelCache::Url(&wheel.url).wheel_dir(),
wheel.filename.to_string(),
wheel.filename.stem(),
);
Ok(LocalWheel::Disk(DiskWheel {
dist: dist.clone(),
filename: wheel.filename.clone(),
path: wheel.path.clone(),
target: cache_entry.path(),
filename: wheel.filename.clone(),
}))
}
@ -268,9 +271,9 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context>
let built_wheel = self.builder.download_and_build(source_dist).await?;
Ok(LocalWheel::Built(BuiltWheel {
dist: dist.clone(),
path: built_wheel.path,
target: built_wheel.target,
filename: built_wheel.filename,
path: built_wheel.path.clone(),
target: built_wheel.path,
}))
}
}

View File

@ -15,7 +15,7 @@ use crate::error::Error;
pub struct InMemoryWheel {
/// The remote distribution from which this wheel was downloaded.
pub(crate) dist: Dist,
/// The parsed filename
/// The parsed filename.
pub(crate) filename: WheelFilename,
/// The contents of the wheel.
pub(crate) buffer: Vec<u8>,
@ -28,7 +28,7 @@ pub struct InMemoryWheel {
pub struct DiskWheel {
/// The remote distribution from which this wheel was downloaded.
pub(crate) dist: Dist,
/// The parsed filename
/// The parsed filename.
pub(crate) filename: WheelFilename,
/// The path to the downloaded wheel.
pub(crate) path: PathBuf,
@ -41,7 +41,7 @@ pub struct DiskWheel {
pub struct BuiltWheel {
/// The remote source distribution from which this wheel was built.
pub(crate) dist: Dist,
/// The parsed filename
/// The parsed filename.
pub(crate) filename: WheelFilename,
/// The path to the built wheel.
pub(crate) path: PathBuf,

View File

@ -92,8 +92,13 @@ type Metadata21s = FxHashMap<WheelFilename, DiskFilenameAndMetadata>;
/// The information about the wheel we either just built or got from the cache
#[derive(Debug, Clone)]
pub struct BuiltWheelMetadata {
/// The path to the built wheel.
pub path: PathBuf,
/// The expected path to the downloaded wheel's entry in the cache.
pub target: PathBuf,
/// The parsed filename.
pub filename: WheelFilename,
/// The metadata of the built wheel.
pub metadata: Metadata21,
}
@ -105,6 +110,7 @@ impl BuiltWheelMetadata {
) -> Self {
Self {
path: cache_entry.dir.join(&cached_data.disk_filename),
target: cache_entry.dir.join(filename.stem()),
filename: filename.clone(),
metadata: cached_data.metadata.clone(),
}
@ -206,6 +212,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
// Read the metadata from the wheel.
let filename = WheelFilename::from_str(&disk_filename)?;
let path = wheel_dir.join(disk_filename);
let target = wheel_dir.join(filename.stem());
let metadata = read_metadata(&filename, &path)?;
if let Some(task) = task {
@ -216,6 +223,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
BuiltWheelMetadata {
path,
target,
filename,
metadata,
}
@ -452,8 +460,12 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> {
}
}
let path = cache_entry.dir.join(&disk_filename);
let target = cache_entry.dir.join(filename.stem());
Ok(BuiltWheelMetadata {
path: cache_entry.dir.join(&disk_filename),
path,
target,
filename,
metadata,
})

View File

@ -112,11 +112,9 @@ impl InstallPlan {
Some(VersionOrUrl::Url(url)) => {
// TODO(konstin): Add source dist url support. It's more tricky since we don't
// know yet whether source dist is fresh in the cache.
if let Ok((disk_filename, filename)) =
url.filename().and_then(|disk_filename| {
let filename = WheelFilename::from_str(disk_filename)?;
Ok((disk_filename, filename))
})
if let Ok(filename) = url
.filename()
.and_then(|disk_filename| Ok(WheelFilename::from_str(disk_filename)?))
{
if requirement.name != filename.name {
bail!(
@ -129,7 +127,7 @@ impl InstallPlan {
let cache_entry = cache.entry(
CacheBucket::Wheels,
WheelCache::Url(url).wheel_dir(),
disk_filename.to_string(),
filename.stem(),
);
// Ignore zipped wheels, which represent intermediary cached artifacts.

View File

@ -41,11 +41,6 @@ impl RegistryIndex {
}
};
// Ignore zipped wheels, which represent intermediary cached artifacts.
if !path.is_dir() {
continue;
}
match CachedRegistryDist::try_from_path(&path) {
Ok(None) => {}
Ok(Some(dist_info)) => {