From fb35875f24d923280aced423ecbe6a693d2d87ee Mon Sep 17 00:00:00 2001
From: Ankit Saini <74284503+nkitsaini@users.noreply.github.com>
Date: Thu, 27 Feb 2025 04:11:57 +0530
Subject: [PATCH] Use hash instead of full wheel name in wheels bucket (#11738)

## Summary

Closes #2410

This changes the names of files in the `wheels` bucket to use a hash instead of the full wheel name, so as not to exceed the maximum filename length on various systems.
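For a concrete picture of the new keys, here is a minimal sketch (the expected values mirror the snapshot tests added in this PR; it assumes the `uv-distribution-filename` workspace crate is available as a dependency):

```rust
use std::str::FromStr;

use uv_distribution_filename::WheelFilename;

fn main() {
    // Short names keep the readable `{version}-{tags}` form.
    let short = WheelFilename::from_str("django_allauth-0.51.0-py3-none-any.whl").unwrap();
    assert_eq!(short.cache_key(), "0.51.0-py3-none-any");

    // Names that would push the key past 64 characters fall back to
    // `truncated(version)-digest(tags)`, keeping path lengths bounded.
    let long = WheelFilename::from_str(
        "numpy-1.26.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.musllinux_1_2.whl",
    )
    .unwrap();
    assert_eq!(long.cache_key(), "1.26.2-5a2adc379b2dc214");
}
```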
This only addresses the primary concern of #2410. It still does _not_ address:
- The 260-character path limit on Windows: https://github.com/astral-sh/uv/issues/2410#issuecomment-2062020882. To solve this we would need to opt in to longer path limits on Windows ([ref](https://github.com/astral-sh/uv/issues/2410#issuecomment-2150532658)), but I think that is a separate issue and should be a separate PR.
- Exceeding the filename limit while building a wheel from a source distribution. As far as I understand, this is out of uv's control: the name of the output wheel is decided by the build backend used by the project. For wheels built from source distributions, pip also uses the wheel names in its cache, so I have not touched the `sdists` cache.

I have added a `filename: WheelFilename` field to `Archive`, so we can use it while indexing instead of relying on the filename on disk. Another way to do this would have been to read `.dist-info/WHEEL` and `.dist-info/METADATA` and reconstruct the `WheelFilename`, but that seems less robust and would be slower.
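This also means `uv cache prune` can no longer recognize entries in the `wheels` bucket by parsing a wheel stem; the new `find_archive_references` helper instead skips pointer files (`.http`, `.rev`, `.msgpack`) by looking for an extension after the last `-`-separated segment. A self-contained sketch of that check (`is_pointer_file` is a hypothetical name; in the patch the logic is inlined):

```rust
/// Hypothetical standalone version of the skip heuristic used for the
/// `wheels` bucket: pointer files carry an extension after the last
/// dash-separated segment, while unzipped-archive links (named after the
/// cache key) do not.
fn is_pointer_file(filename: &str) -> bool {
    filename
        .rsplit_once('-') // strip version/tags, which may themselves contain a dot
        .is_none_or(|(_, suffix)| suffix.contains('.'))
}

fn main() {
    assert!(is_pointer_file("0.51.0-py3-none-any.http")); // skipped
    assert!(is_pointer_file("1.26.2-5a2adc379b2dc214.msgpack")); // skipped
    assert!(!is_pointer_file("1.26.2-5a2adc379b2dc214")); // resolved as a link
}
```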
## Test Plan

Tested by installing `yt-dlp`, `httpie`, and `sqlalchemy`, and verifying that cache files in the `wheels` bucket use the hash.

---------

Co-authored-by: Charlie Marsh
---
 Cargo.lock                                    |   1 +
 crates/uv-cache-key/src/digest.rs             |   6 +-
 crates/uv-cache/src/lib.rs                    | 169 +++++++-----------
 crates/uv-client/src/registry_client.rs       |   4 +-
 crates/uv-distribution-filename/Cargo.toml    |   1 +
 crates/uv-distribution-filename/src/wheel.rs  |  55 +++++-
 crates/uv-distribution/src/archive.rs         |   6 +-
 .../src/distribution_database.rs              |  18 +-
 .../uv-distribution/src/index/cached_wheel.rs |  19 +-
 .../src/source/built_wheel_metadata.rs        |   4 +-
 crates/uv-installer/src/plan.rs               |   4 +-
 crates/uv/tests/it/cache_prune.rs             |   2 +-
 12 files changed, 148 insertions(+), 141 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index adb63c292..cd650bede 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5033,6 +5033,7 @@ dependencies = [
  "smallvec",
  "thiserror 2.0.11",
  "url",
+ "uv-cache-key",
  "uv-normalize",
  "uv-pep440",
  "uv-platform-tags",
diff --git a/crates/uv-cache-key/src/digest.rs b/crates/uv-cache-key/src/digest.rs
index 8a74ce018..a2387868d 100644
--- a/crates/uv-cache-key/src/digest.rs
+++ b/crates/uv-cache-key/src/digest.rs
@@ -1,7 +1,9 @@
-use crate::cache_key::{CacheKey, CacheKeyHasher};
-use seahash::SeaHasher;
 use std::hash::{Hash, Hasher};
 
+use seahash::SeaHasher;
+
+use crate::cache_key::{CacheKey, CacheKeyHasher};
+
 /// Compute a hex string hash of a `CacheKey` object.
 ///
 /// The value returned by [`cache_digest`] should be stable across releases and platforms.
diff --git a/crates/uv-cache/src/lib.rs b/crates/uv-cache/src/lib.rs
index cac37f2d8..aeabeca55 100644
--- a/crates/uv-cache/src/lib.rs
+++ b/crates/uv-cache/src/lib.rs
@@ -360,43 +360,7 @@ impl Cache {
     /// Returns the number of entries removed from the cache.
     pub fn remove(&self, name: &PackageName) -> Result<Removal, io::Error> {
         // Collect the set of referenced archives.
-        let before = {
-            let mut references = FxHashSet::default();
-            for bucket in CacheBucket::iter() {
-                let bucket = self.bucket(bucket);
-                if bucket.is_dir() {
-                    for entry in walkdir::WalkDir::new(bucket) {
-                        let entry = entry?;
-
-                        // Ignore any `.lock` files.
-                        if entry
-                            .path()
-                            .extension()
-                            .is_some_and(|ext| ext.eq_ignore_ascii_case("lock"))
-                        {
-                            continue;
-                        }
-
-                        // Identify entries that match the wheel stem pattern (e.g., `typing-extensions-4.8.0-py3-none-any`).
-                        let Some(filename) = entry
-                            .path()
-                            .file_name()
-                            .and_then(|file_name| file_name.to_str())
-                        else {
-                            continue;
-                        };
-
-                        if WheelFilename::from_stem(filename).is_err() {
-                            continue;
-                        }
-                        if let Ok(target) = self.resolve_link(entry.path()) {
-                            references.insert(target);
-                        }
-                    }
-                }
-            }
-            references
-        };
+        let before = self.find_archive_references()?;
 
         // Remove any entries for the package from the cache.
         let mut summary = Removal::default();
@@ -405,42 +369,7 @@ impl Cache {
         }
 
         // Collect the set of referenced archives after the removal.
-        let after = {
-            let mut references = FxHashSet::default();
-            for bucket in CacheBucket::iter() {
-                let bucket = self.bucket(bucket);
-                if bucket.is_dir() {
-                    for entry in walkdir::WalkDir::new(bucket) {
-                        let entry = entry?;
-
-                        // Ignore any `.lock` files.
-                        if entry
-                            .path()
-                            .extension()
-                            .is_some_and(|ext| ext.eq_ignore_ascii_case("lock"))
-                        {
-                            continue;
-                        }
-
-                        // Identify entries that match the wheel stem pattern (e.g., `typing-extensions-4.8.0-py3-none-any`).
-                        let Some(filename) = entry
-                            .path()
-                            .file_name()
-                            .and_then(|file_name| file_name.to_str())
-                        else {
-                            continue;
-                        };
-                        if WheelFilename::from_stem(filename).is_err() {
-                            continue;
-                        }
-                        if let Ok(target) = self.resolve_link(entry.path()) {
-                            references.insert(target);
-                        }
-                    }
-                }
-            }
-            references
-        };
+        let after = self.find_archive_references()?;
 
         if before != after {
             // Remove any archives that are no longer referenced.
@@ -562,40 +491,7 @@ impl Cache {
         }
 
         // Fourth, remove any unused archives (by searching for archives that are not symlinked).
-        let mut references = FxHashSet::default();
-
-        for bucket in CacheBucket::iter() {
-            let bucket = self.bucket(bucket);
-            if bucket.is_dir() {
-                for entry in walkdir::WalkDir::new(bucket) {
-                    let entry = entry?;
-
-                    // Ignore any `.lock` files.
-                    if entry
-                        .path()
-                        .extension()
-                        .is_some_and(|ext| ext.eq_ignore_ascii_case("lock"))
-                    {
-                        continue;
-                    }
-
-                    // Identify entries that match the wheel stem pattern (e.g., `typing-extensions-4.8.0-py3-none-any`).
-                    let Some(filename) = entry
-                        .path()
-                        .file_name()
-                        .and_then(|file_name| file_name.to_str())
-                    else {
-                        continue;
-                    };
-                    if WheelFilename::from_stem(filename).is_err() {
-                        continue;
-                    }
-                    if let Ok(target) = self.resolve_link(entry.path()) {
-                        references.insert(target);
-                    }
-                }
-            }
-        }
+        let references = self.find_archive_references()?;
 
         match fs_err::read_dir(self.bucket(CacheBucket::Archive)) {
             Ok(entries) => {
@@ -615,6 +511,63 @@ impl Cache {
         Ok(summary)
     }
 
+    /// Find all references to entries in the archive bucket.
+    ///
+    /// Archive entries are often referenced by symlinks in other cache buckets. This method
+    /// searches for all such references.
+    fn find_archive_references(&self) -> Result<FxHashSet<PathBuf>, io::Error> {
+        let mut references = FxHashSet::default();
+        for bucket in CacheBucket::iter() {
+            let bucket_path = self.bucket(bucket);
+            if bucket_path.is_dir() {
+                for entry in walkdir::WalkDir::new(bucket_path) {
+                    let entry = entry?;
+
+                    // Ignore any `.lock` files.
+                    if entry
+                        .path()
+                        .extension()
+                        .is_some_and(|ext| ext.eq_ignore_ascii_case("lock"))
+                    {
+                        continue;
+                    }
+
+                    let Some(filename) = entry
+                        .path()
+                        .file_name()
+                        .and_then(|file_name| file_name.to_str())
+                    else {
+                        continue;
+                    };
+
+                    if bucket == CacheBucket::Wheels {
+                        // In the `wheels` bucket, we often use a hash of the filename as the
+                        // directory name, so we can't rely on the stem.
+                        //
+                        // Instead, we skip any entry that contains an extension (e.g., `.whl`,
+                        // `.http`, `.rev`, and `.msgpack` files).
+                        if filename
+                            .rsplit_once('-') // strip version/tags, which might contain a dot ('.')
+                            .is_none_or(|(_, suffix)| suffix.contains('.'))
+                        {
+                            continue;
+                        }
+                    } else {
+                        // For other buckets, only include entries that match the wheel stem
+                        // pattern (e.g., `typing-extensions-4.8.0-py3-none-any`).
+                        if WheelFilename::from_stem(filename).is_err() {
+                            continue;
+                        }
+                    }
+
+                    if let Ok(target) = self.resolve_link(entry.path()) {
+                        references.insert(target);
+                    }
+                }
+            }
+        }
+        Ok(references)
+    }
+
     /// Create a link to a directory in the archive bucket.
     ///
     /// On Windows, we write structured data ([`Link`]) to a file containing the archive ID and
@@ -1040,7 +993,7 @@ impl CacheBucket {
             Self::Simple => "simple-v15",
             // Note that when bumping this, you'll also need to bump it
             // in `crates/uv/tests/it/cache_prune.rs`.
-            Self::Wheels => "wheels-v4",
+            Self::Wheels => "wheels-v5",
             // Note that when bumping this, you'll also need to bump
             // `ARCHIVE_VERSION` in `crates/uv-cache/src/lib.rs`.
             Self::Archive => "archive-v0",
diff --git a/crates/uv-client/src/registry_client.rs b/crates/uv-client/src/registry_client.rs
index 76f1d7134..ec636775d 100644
--- a/crates/uv-client/src/registry_client.rs
+++ b/crates/uv-client/src/registry_client.rs
@@ -618,7 +618,7 @@ impl RegistryClient {
         let cache_entry = self.cache.entry(
             CacheBucket::Wheels,
             WheelCache::Index(index).wheel_dir(filename.name.as_ref()),
-            format!("{}.msgpack", filename.stem()),
+            format!("{}.msgpack", filename.cache_key()),
         );
         let cache_control = match self.connectivity {
             Connectivity::Online => CacheControl::from(
@@ -688,7 +688,7 @@ impl RegistryClient {
         let cache_entry = self.cache.entry(
             CacheBucket::Wheels,
             cache_shard.wheel_dir(filename.name.as_ref()),
-            format!("{}.msgpack", filename.stem()),
+            format!("{}.msgpack", filename.cache_key()),
         );
         let cache_control = match self.connectivity {
             Connectivity::Online => CacheControl::from(
diff --git a/crates/uv-distribution-filename/Cargo.toml b/crates/uv-distribution-filename/Cargo.toml
index ee2eca251..f30e79b3b 100644
--- a/crates/uv-distribution-filename/Cargo.toml
+++ b/crates/uv-distribution-filename/Cargo.toml
@@ -16,6 +16,7 @@ doctest = false
 workspace = true
 
 [dependencies]
+uv-cache-key = { workspace = true }
 uv-normalize = { workspace = true }
 uv-pep440 = { workspace = true }
 uv-platform-tags = { workspace = true }
diff --git a/crates/uv-distribution-filename/src/wheel.rs b/crates/uv-distribution-filename/src/wheel.rs
index 699261194..bc37bf707 100644
--- a/crates/uv-distribution-filename/src/wheel.rs
+++ b/crates/uv-distribution-filename/src/wheel.rs
@@ -1,11 +1,13 @@
 use std::fmt::{Display, Formatter};
+use std::hash::Hash;
+use std::str::FromStr;
 
 use memchr::memchr;
 use serde::{de, Deserialize, Deserializer, Serialize, Serializer};
-use std::str::FromStr;
 use thiserror::Error;
 use url::Url;
 
+use uv_cache_key::cache_digest;
 use uv_normalize::{InvalidNameError, PackageName};
 use uv_pep440::{Version, VersionParseError};
 use uv_platform_tags::{
@@ -104,6 +106,30 @@ impl WheelFilename {
         )
     }
 
+    /// Returns a consistent cache key with a maximum length of 64 characters.
+    ///
+    /// Prefers `{version}-{tags}` if such an identifier fits within the maximum allowed length;
+    /// otherwise, uses a truncated version of the version and a digest of the tags.
+    pub fn cache_key(&self) -> String {
+        const CACHE_KEY_MAX_LEN: usize = 64;
+
+        let full = format!("{}-{}", self.version, self.tags);
+        if full.len() <= CACHE_KEY_MAX_LEN {
+            return full;
+        }
+
+        // Create a digest of the tag string (instead of its individual fields) to retain
+        // compatibility across platforms, Rust versions, etc.
+        let digest = cache_digest(&format!("{}", self.tags));
+
+        // Truncate the version, but avoid trailing dots, plus signs, etc. to avoid ambiguity.
+        let version_width = CACHE_KEY_MAX_LEN - 1 /* dash */ - 16 /* digest */;
+        let version = format!("{:.version_width$}", self.version);
+        let version = version.trim_end_matches(['.', '+']);
+
+        format!("{version}-{digest}")
+    }
+
     /// Return the wheel's Python tags.
     pub fn python_tags(&self) -> &[LanguageTag] {
         match &self.tags {
@@ -450,4 +476,31 @@ mod tests {
             );
         }
     }
+
+    #[test]
+    fn cache_key() {
+        // Short names should use the `version-tags` format.
+        let filename = WheelFilename::from_str("django_allauth-0.51.0-py3-none-any.whl").unwrap();
+        insta::assert_snapshot!(filename.cache_key(), @"0.51.0-py3-none-any");
+
+        // Common `manylinux` names should still use the `version-tags` format.
+        let filename = WheelFilename::from_str(
+            "numpy-1.26.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
+        )
+        .unwrap();
+        insta::assert_snapshot!(filename.cache_key(), @"1.26.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64");
+
+        // But larger names should use the `truncated(version)-digest(tags)` format.
+        let filename = WheelFilename::from_str(
+            "numpy-1.26.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.musllinux_1_2.whl",
+        )
+        .unwrap();
+        insta::assert_snapshot!(filename.cache_key(), @"1.26.2-5a2adc379b2dc214");
+
+        // Larger versions should get truncated.
+        let filename = WheelFilename::from_str(
+            "example-1.2.3.4.5.6.7.8.9.0.1.2.3.4.5.6.7.8.9.0.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl"
+        ).unwrap();
+        insta::assert_snapshot!(filename.cache_key(), @"1.2.3.4.5.6.7.8.9.0.1.2.3.4.5.6.7.8.9.0.1.2-80bf8598e9647cf7");
+    }
 }
diff --git a/crates/uv-distribution/src/archive.rs b/crates/uv-distribution/src/archive.rs
index aa5ebc3e0..d5668249d 100644
--- a/crates/uv-distribution/src/archive.rs
+++ b/crates/uv-distribution/src/archive.rs
@@ -1,4 +1,5 @@
 use uv_cache::{ArchiveId, Cache, ARCHIVE_VERSION};
+use uv_distribution_filename::WheelFilename;
 use uv_distribution_types::Hashed;
 use uv_pypi_types::{HashDigest, HashDigests};
 
@@ -9,16 +10,19 @@ pub struct Archive {
     pub id: ArchiveId,
     /// The computed hashes of the archive.
     pub hashes: HashDigests,
+    /// The filename of the wheel.
+    pub filename: WheelFilename,
     /// The version of the archive bucket.
     pub version: u8,
 }
 
 impl Archive {
     /// Create a new [`Archive`] with the given ID and hashes.
-    pub(crate) fn new(id: ArchiveId, hashes: HashDigests) -> Self {
+    pub(crate) fn new(id: ArchiveId, hashes: HashDigests, filename: WheelFilename) -> Self {
         Self {
             id,
             hashes,
+            filename,
             version: ARCHIVE_VERSION,
         }
     }
diff --git a/crates/uv-distribution/src/distribution_database.rs b/crates/uv-distribution/src/distribution_database.rs
index b9e595cf3..ee32d28bf 100644
--- a/crates/uv-distribution/src/distribution_database.rs
+++ b/crates/uv-distribution/src/distribution_database.rs
@@ -189,7 +189,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
         let wheel_entry = self.build_context.cache().entry(
             CacheBucket::Wheels,
             WheelCache::Index(&wheel.index).wheel_dir(wheel.name().as_ref()),
-            wheel.filename.stem(),
+            wheel.filename.cache_key(),
         );
 
         // If the URL is a file URL, load the wheel directly.
@@ -262,7 +262,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
         let wheel_entry = self.build_context.cache().entry(
             CacheBucket::Wheels,
             WheelCache::Url(&wheel.url).wheel_dir(wheel.name().as_ref()),
-            wheel.filename.stem(),
+            wheel.filename.cache_key(),
        );
 
         // Download and unzip.
@@ -317,7 +317,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
         let cache_entry = self.build_context.cache().entry(
             CacheBucket::Wheels,
             WheelCache::Url(&wheel.url).wheel_dir(wheel.name().as_ref()),
-            wheel.filename.stem(),
+            wheel.filename.cache_key(),
         );
 
         self.load_wheel(
@@ -526,7 +526,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
         };
 
         // Create an entry for the HTTP cache.
-        let http_entry = wheel_entry.with_file(format!("{}.http", filename.stem()));
+        let http_entry = wheel_entry.with_file(format!("{}.http", filename.cache_key()));
 
         let download = |response: reqwest::Response| {
             async {
@@ -581,6 +581,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
                 Ok(Archive::new(
                     id,
                     hashers.into_iter().map(HashDigest::from).collect(),
+                    filename.clone(),
                 ))
             }
             .instrument(info_span!("wheel", wheel = %dist))
@@ -658,7 +659,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
         };
 
         // Create an entry for the HTTP cache.
-        let http_entry = wheel_entry.with_file(format!("{}.http", filename.stem()));
+        let http_entry = wheel_entry.with_file(format!("{}.http", filename.cache_key()));
 
         let download = |response: reqwest::Response| {
             async {
@@ -745,7 +746,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
                     reporter.on_download_complete(dist.name(), progress);
                 }
 
-                Ok(Archive::new(id, hashes))
+                Ok(Archive::new(id, hashes, filename.clone()))
             }
             .instrument(info_span!("wheel", wheel = %dist))
         };
@@ -823,7 +824,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
         let modified = Timestamp::from_path(path).map_err(Error::CacheRead)?;
 
         // Attempt to read the archive pointer from the cache.
-        let pointer_entry = wheel_entry.with_file(format!("{}.rev", filename.stem()));
+        let pointer_entry = wheel_entry.with_file(format!("{}.rev", filename.cache_key()));
         let pointer = LocalArchivePointer::read_from(&pointer_entry)?;
 
         // Extract the archive from the pointer.
@@ -846,6 +847,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
             let archive = Archive::new(
                 self.unzip_wheel(path, wheel_entry.path()).await?,
                 HashDigests::empty(),
+                filename.clone(),
             );
 
             // Write the archive pointer to the cache.
@@ -892,7 +894,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
             .map_err(Error::CacheWrite)?;
 
         // Create an archive.
-        let archive = Archive::new(id, hashes);
+        let archive = Archive::new(id, hashes, filename.clone());
 
         // Write the archive pointer to the cache.
         let pointer = LocalArchivePointer {
diff --git a/crates/uv-distribution/src/index/cached_wheel.rs b/crates/uv-distribution/src/index/cached_wheel.rs
index 1f9f658c8..a53bf562b 100644
--- a/crates/uv-distribution/src/index/cached_wheel.rs
+++ b/crates/uv-distribution/src/index/cached_wheel.rs
@@ -116,14 +116,10 @@ impl CachedWheel {
         }
     }
 
-    /// Read a cached wheel from a `.http` pointer (e.g., `anyio-4.0.0-py3-none-any.http`).
+    /// Read a cached wheel from a `.http` pointer.
     pub fn from_http_pointer(path: impl AsRef<Path>, cache: &Cache) -> Option<Self> {
         let path = path.as_ref();
 
-        // Determine the wheel filename.
-        let filename = path.file_stem()?.to_str()?;
-        let filename = WheelFilename::from_stem(filename).ok()?;
-
         // Read the pointer.
         let pointer = HttpArchivePointer::read_from(path).ok()??;
         let cache_info = pointer.to_cache_info();
@@ -135,26 +131,21 @@ impl CachedWheel {
         }
 
-        let Archive { id, hashes, .. } = archive;
+        let Archive { id, hashes, filename, .. } = archive;
         let entry = cache.entry(CacheBucket::Archive, "", id);
 
         // Convert to a cached wheel.
         Some(Self {
             filename,
             entry,
             hashes,
             cache_info,
         })
     }
 
-    /// Read a cached wheel from a `.rev` pointer (e.g., `anyio-4.0.0-py3-none-any.rev`).
+    /// Read a cached wheel from a `.rev` pointer.
     pub fn from_local_pointer(path: impl AsRef<Path>, cache: &Cache) -> Option<Self> {
         let path = path.as_ref();
 
-        // Determine the wheel filename.
-        let filename = path.file_stem()?.to_str()?;
-        let filename = WheelFilename::from_stem(filename).ok()?;
-
         // Read the pointer.
         let pointer = LocalArchivePointer::read_from(path).ok()??;
         let cache_info = pointer.to_cache_info();
@@ -166,11 +157,11 @@ impl CachedWheel {
         }
 
-        let Archive { id, hashes, .. } = archive;
+        let Archive { id, hashes, filename, .. } = archive;
+        let entry = cache.entry(CacheBucket::Archive, "", id);
 
         // Convert to a cached wheel.
-        let entry = cache.entry(CacheBucket::Archive, "", id);
         Some(Self {
             filename,
             entry,
             hashes,
             cache_info,
diff --git a/crates/uv-distribution/src/source/built_wheel_metadata.rs b/crates/uv-distribution/src/source/built_wheel_metadata.rs
index cd979eea0..43eb2cbea 100644
--- a/crates/uv-distribution/src/source/built_wheel_metadata.rs
+++ b/crates/uv-distribution/src/source/built_wheel_metadata.rs
@@ -29,8 +29,8 @@ pub(crate) struct BuiltWheelMetadata {
 impl BuiltWheelMetadata {
     /// Find a compatible wheel in the cache.
     pub(crate) fn find_in_cache(tags: &Tags, cache_shard: &CacheShard) -> Option<Self> {
-        for directory in files(cache_shard) {
-            if let Some(metadata) = Self::from_path(directory, cache_shard) {
+        for file in files(cache_shard) {
+            if let Some(metadata) = Self::from_path(file, cache_shard) {
                 // Validate that the wheel is compatible with the target platform.
                 if metadata.filename.is_compatible(tags) {
                     return Some(metadata);
diff --git a/crates/uv-installer/src/plan.rs b/crates/uv-installer/src/plan.rs
index d9aa7e8a2..157e2c21c 100644
--- a/crates/uv-installer/src/plan.rs
+++ b/crates/uv-installer/src/plan.rs
@@ -159,7 +159,7 @@ impl<'a> Planner<'a> {
                         CacheBucket::Wheels,
                         WheelCache::Url(&wheel.url).wheel_dir(wheel.name().as_ref()),
                     )
-                    .entry(format!("{}.http", wheel.filename.stem()));
+                    .entry(format!("{}.http", wheel.filename.cache_key()));
 
                 // Read the HTTP pointer.
                 if let Some(pointer) = HttpArchivePointer::read_from(&cache_entry)?
                 {
@@ -210,7 +210,7 @@ impl<'a> Planner<'a> {
                         CacheBucket::Wheels,
                         WheelCache::Url(&wheel.url).wheel_dir(wheel.name().as_ref()),
                     )
-                    .entry(format!("{}.rev", wheel.filename.stem()));
+                    .entry(format!("{}.rev", wheel.filename.cache_key()));
 
                 if let Some(pointer) = LocalArchivePointer::read_from(&cache_entry)? {
                     let timestamp = Timestamp::from_path(&wheel.install_path)?;
diff --git a/crates/uv/tests/it/cache_prune.rs b/crates/uv/tests/it/cache_prune.rs
index bb4bd876b..43e52d1d3 100644
--- a/crates/uv/tests/it/cache_prune.rs
+++ b/crates/uv/tests/it/cache_prune.rs
@@ -158,7 +158,7 @@ fn prune_stale_symlink() -> Result<()> {
         .success();
 
     // Remove the wheels directory, causing the symlink to become stale.
-    let wheels = context.cache_dir.child("wheels-v4");
+    let wheels = context.cache_dir.child("wheels-v5");
     fs_err::remove_dir_all(wheels)?;
 
     let filters: Vec<_> = context