Use hash instead of full wheel name in wheels bucket (#11738)

## Summary
Closes #2410 
This changes the names of files in the `wheels` bucket to use a hash
instead of the full wheel name, so as not to exceed the maximum filename
length on various systems.

This only addresses the primary concern of #2410. It still does _not_
address:
- The 260-character path limit on Windows:
https://github.com/astral-sh/uv/issues/2410#issuecomment-2062020882
To solve this, we need to opt in to longer path limits on Windows
([ref](https://github.com/astral-sh/uv/issues/2410#issuecomment-2150532658)),
but I think that is a separate issue and should be a separate PR.
- Exceeding the filename limit while building a wheel from a source
distribution.
As I understand it, this is out of uv's control: the name of the output
wheel is decided by the build backend used by the project. For wheels
built from source distributions, pip also uses the wheel names in its
cache, so I have not touched the `sdists` cache.


I have added a `filename: WheelFilename` field to `Archive` so we can
use it while indexing instead of relying on the filename on disk. An
alternative would have been to read `.dist-info/WHEEL` and
`.dist-info/METADATA` and reconstruct the `WheelFilename`, but that
seems less robust and would be slower.
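
For illustration, here is a condensed, self-contained sketch of the naming scheme (the real implementation is `WheelFilename::cache_key` in the diff below; the `digest` helper here is a hypothetical stand-in for uv's seahash-based `cache_digest`, which, unlike `DefaultHasher`, is stable across platforms and releases):

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

// Stand-in for uv's `cache_digest`: a 16-hex-character hash of the tag string.
fn digest(s: &str) -> String {
    let mut hasher = DefaultHasher::new();
    s.hash(&mut hasher);
    format!("{:016x}", hasher.finish())
}

/// Prefer the readable `{version}-{tags}` form; if that exceeds 64 characters,
/// fall back to a truncated version plus a 16-character digest of the tags.
fn cache_key(version: &str, tags: &str) -> String {
    const CACHE_KEY_MAX_LEN: usize = 64;
    let full = format!("{version}-{tags}");
    if full.len() <= CACHE_KEY_MAX_LEN {
        return full;
    }
    let version_width = CACHE_KEY_MAX_LEN - 1 /* dash */ - 16 /* digest */;
    let truncated: String = version.chars().take(version_width).collect();
    // Trim trailing dots and plus signs to avoid ambiguous truncations.
    let truncated = truncated.trim_end_matches(['.', '+']);
    format!("{truncated}-{}", digest(tags))
}

fn main() {
    // Short names keep the `version-tags` form.
    assert_eq!(cache_key("0.51.0", "py3-none-any"), "0.51.0-py3-none-any");
    // Longer names fall back to `truncated(version)-digest(tags)`.
    let key = cache_key(
        "1.26.2",
        "cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.musllinux_1_2",
    );
    assert!(key.len() <= 64);
}
```
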
## Test Plan

Tested by installing `yt-dlp`, `httpie`, and `sqlalchemy`, and verifying
that cache files in the `wheels` bucket use the hash-based names.
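
As an additional sanity check, a sketch (assuming the uv workspace crates are available as dependencies) asserting that even very long wheel names map to keys within the 64-character bound:

```rust
use std::str::FromStr;

use uv_distribution_filename::WheelFilename;

fn main() {
    // A wheel whose stem would previously have been used verbatim on disk.
    let filename = WheelFilename::from_str(
        "numpy-1.26.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.musllinux_1_2.whl",
    )
    .unwrap();
    // The cache key is bounded regardless of the original name's length.
    assert!(filename.cache_key().len() <= 64);
    println!("{}", filename.cache_key());
}
```
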

---------

Co-authored-by: Charlie Marsh <charlie.r.marsh@gmail.com>
Authored by Ankit Saini, 2025-02-27 04:11:57 +05:30; committed by GitHub
parent f9497432dc
commit fb35875f24
12 changed files with 148 additions and 141 deletions

Cargo.lock (generated)
View File

@@ -5033,6 +5033,7 @@ dependencies = [
"smallvec",
"thiserror 2.0.11",
"url",
"uv-cache-key",
"uv-normalize",
"uv-pep440",
"uv-platform-tags",

View File

@@ -1,7 +1,9 @@
use crate::cache_key::{CacheKey, CacheKeyHasher};
use seahash::SeaHasher;
use std::hash::{Hash, Hasher};
use seahash::SeaHasher;
use crate::cache_key::{CacheKey, CacheKeyHasher};
/// Compute a hex string hash of a `CacheKey` object.
///
/// The value returned by [`cache_digest`] should be stable across releases and platforms.

View File

@@ -360,43 +360,7 @@ impl Cache {
/// Returns the number of entries removed from the cache.
pub fn remove(&self, name: &PackageName) -> Result<Removal, io::Error> {
// Collect the set of referenced archives.
let before = {
let mut references = FxHashSet::default();
for bucket in CacheBucket::iter() {
let bucket = self.bucket(bucket);
if bucket.is_dir() {
for entry in walkdir::WalkDir::new(bucket) {
let entry = entry?;
// Ignore any `.lock` files.
if entry
.path()
.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("lock"))
{
continue;
}
// Identify entries that match the wheel stem pattern (e.g., `typing-extensions-4.8.0-py3-none-any`).
let Some(filename) = entry
.path()
.file_name()
.and_then(|file_name| file_name.to_str())
else {
continue;
};
if WheelFilename::from_stem(filename).is_err() {
continue;
}
if let Ok(target) = self.resolve_link(entry.path()) {
references.insert(target);
}
}
}
}
references
};
let before = self.find_archive_references()?;
// Remove any entries for the package from the cache.
let mut summary = Removal::default();
@@ -405,42 +369,7 @@ impl Cache {
}
// Collect the set of referenced archives after the removal.
let after = {
let mut references = FxHashSet::default();
for bucket in CacheBucket::iter() {
let bucket = self.bucket(bucket);
if bucket.is_dir() {
for entry in walkdir::WalkDir::new(bucket) {
let entry = entry?;
// Ignore any `.lock` files.
if entry
.path()
.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("lock"))
{
continue;
}
// Identify entries that match the wheel stem pattern (e.g., `typing-extensions-4.8.0-py3-none-any`).
let Some(filename) = entry
.path()
.file_name()
.and_then(|file_name| file_name.to_str())
else {
continue;
};
if WheelFilename::from_stem(filename).is_err() {
continue;
}
if let Ok(target) = self.resolve_link(entry.path()) {
references.insert(target);
}
}
}
}
references
};
let after = self.find_archive_references()?;
if before != after {
// Remove any archives that are no longer referenced.
@@ -562,40 +491,7 @@ impl Cache {
}
// Fourth, remove any unused archives (by searching for archives that are not symlinked).
let mut references = FxHashSet::default();
for bucket in CacheBucket::iter() {
let bucket = self.bucket(bucket);
if bucket.is_dir() {
for entry in walkdir::WalkDir::new(bucket) {
let entry = entry?;
// Ignore any `.lock` files.
if entry
.path()
.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("lock"))
{
continue;
}
// Identify entries that match the wheel stem pattern (e.g., `typing-extensions-4.8.0-py3-none-any`).
let Some(filename) = entry
.path()
.file_name()
.and_then(|file_name| file_name.to_str())
else {
continue;
};
if WheelFilename::from_stem(filename).is_err() {
continue;
}
if let Ok(target) = self.resolve_link(entry.path()) {
references.insert(target);
}
}
}
}
let references = self.find_archive_references()?;
match fs_err::read_dir(self.bucket(CacheBucket::Archive)) {
Ok(entries) => {
@@ -615,6 +511,63 @@ impl Cache {
Ok(summary)
}
/// Find all references to entries in the archive bucket.
///
/// Archive entries are often referenced by symlinks in other cache buckets. This method
/// searches for all such references.
fn find_archive_references(&self) -> Result<FxHashSet<PathBuf>, io::Error> {
let mut references = FxHashSet::default();
for bucket in CacheBucket::iter() {
let bucket_path = self.bucket(bucket);
if bucket_path.is_dir() {
for entry in walkdir::WalkDir::new(bucket_path) {
let entry = entry?;
// Ignore any `.lock` files.
if entry
.path()
.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("lock"))
{
continue;
}
let Some(filename) = entry
.path()
.file_name()
.and_then(|file_name| file_name.to_str())
else {
continue;
};
if bucket == CacheBucket::Wheels {
// In the `wheels` bucket, we often use a hash of the filename as the
// directory name, so we can't rely on the stem.
//
// Instead, we skip if it contains an extension (e.g., `.whl`, `.http`,
// `.rev`, and `.msgpack` files).
if filename
.rsplit_once('-') // strip version/tags, might contain a dot ('.')
.is_none_or(|(_, suffix)| suffix.contains('.'))
{
continue;
}
} else {
// For other buckets only include entries that match the wheel stem pattern (e.g., `typing-extensions-4.8.0-py3-none-any`).
if WheelFilename::from_stem(filename).is_err() {
continue;
}
}
if let Ok(target) = self.resolve_link(entry.path()) {
references.insert(target);
}
}
}
}
Ok(references)
}
/// Create a link to a directory in the archive bucket.
///
/// On Windows, we write structured data ([`Link`]) to a file containing the archive ID and
@@ -1040,7 +993,7 @@ impl CacheBucket {
Self::Simple => "simple-v15",
// Note that when bumping this, you'll also need to bump it
// in `crates/uv/tests/it/cache_prune.rs`.
Self::Wheels => "wheels-v4",
Self::Wheels => "wheels-v5",
// Note that when bumping this, you'll also need to bump
// `ARCHIVE_VERSION` in `crates/uv-cache/src/lib.rs`.
Self::Archive => "archive-v0",

View File

@@ -618,7 +618,7 @@ impl RegistryClient {
let cache_entry = self.cache.entry(
CacheBucket::Wheels,
WheelCache::Index(index).wheel_dir(filename.name.as_ref()),
format!("{}.msgpack", filename.stem()),
format!("{}.msgpack", filename.cache_key()),
);
let cache_control = match self.connectivity {
Connectivity::Online => CacheControl::from(
@@ -688,7 +688,7 @@ impl RegistryClient {
let cache_entry = self.cache.entry(
CacheBucket::Wheels,
cache_shard.wheel_dir(filename.name.as_ref()),
format!("{}.msgpack", filename.stem()),
format!("{}.msgpack", filename.cache_key()),
);
let cache_control = match self.connectivity {
Connectivity::Online => CacheControl::from(

View File

@@ -16,6 +16,7 @@ doctest = false
workspace = true
[dependencies]
uv-cache-key = { workspace = true }
uv-normalize = { workspace = true }
uv-pep440 = { workspace = true }
uv-platform-tags = { workspace = true }

View File

@@ -1,11 +1,13 @@
use std::fmt::{Display, Formatter};
use std::hash::Hash;
use std::str::FromStr;
use memchr::memchr;
use serde::{de, Deserialize, Deserializer, Serialize, Serializer};
use std::str::FromStr;
use thiserror::Error;
use url::Url;
use uv_cache_key::cache_digest;
use uv_normalize::{InvalidNameError, PackageName};
use uv_pep440::{Version, VersionParseError};
use uv_platform_tags::{
@@ -104,6 +106,30 @@ impl WheelFilename {
)
}
/// Returns a consistent cache key with a maximum length of 64 characters.
///
/// Prefers `{version}-{tags}` if such an identifier fits within the maximum allowed length;
/// otherwise, uses a truncated version of the version and a digest of the tags.
pub fn cache_key(&self) -> String {
const CACHE_KEY_MAX_LEN: usize = 64;
let full = format!("{}-{}", self.version, self.tags);
if full.len() <= CACHE_KEY_MAX_LEN {
return full;
}
// Create a digest of the tag string (instead of its individual fields) to retain
// compatibility across platforms, Rust versions, etc.
let digest = cache_digest(&format!("{}", self.tags));
// Truncate the version, but avoid trailing dots, plus signs, etc. to avoid ambiguity.
let version_width = CACHE_KEY_MAX_LEN - 1 /* dash */ - 16 /* digest */;
let version = format!("{:.version_width$}", self.version);
let version = version.trim_end_matches(['.', '+']);
format!("{version}-{digest}")
}
/// Return the wheel's Python tags.
pub fn python_tags(&self) -> &[LanguageTag] {
match &self.tags {
@@ -450,4 +476,31 @@ mod tests {
);
}
}
#[test]
fn cache_key() {
// Short names should use `version-tags` format.
let filename = WheelFilename::from_str("django_allauth-0.51.0-py3-none-any.whl").unwrap();
insta::assert_snapshot!(filename.cache_key(), @"0.51.0-py3-none-any");
// Common `manylinux` names should still use the `version-tags` format.
let filename = WheelFilename::from_str(
"numpy-1.26.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
)
.unwrap();
insta::assert_snapshot!(filename.cache_key(), @"1.26.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64");
// But larger names should use the `truncated(version)-digest(tags)` format.
let filename = WheelFilename::from_str(
"numpy-1.26.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.musllinux_1_2.whl",
)
.unwrap();
insta::assert_snapshot!(filename.cache_key(), @"1.26.2-5a2adc379b2dc214");
// Larger versions should get truncated.
let filename = WheelFilename::from_str(
"example-1.2.3.4.5.6.7.8.9.0.1.2.3.4.5.6.7.8.9.0.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl"
).unwrap();
insta::assert_snapshot!(filename.cache_key(), @"1.2.3.4.5.6.7.8.9.0.1.2.3.4.5.6.7.8.9.0.1.2-80bf8598e9647cf7");
}
}

View File

@@ -1,4 +1,5 @@
use uv_cache::{ArchiveId, Cache, ARCHIVE_VERSION};
use uv_distribution_filename::WheelFilename;
use uv_distribution_types::Hashed;
use uv_pypi_types::{HashDigest, HashDigests};
@@ -9,16 +10,19 @@ pub struct Archive {
pub id: ArchiveId,
/// The computed hashes of the archive.
pub hashes: HashDigests,
/// The filename of the wheel.
pub filename: WheelFilename,
/// The version of the archive bucket.
pub version: u8,
}
impl Archive {
/// Create a new [`Archive`] with the given ID and hashes.
pub(crate) fn new(id: ArchiveId, hashes: HashDigests) -> Self {
pub(crate) fn new(id: ArchiveId, hashes: HashDigests, filename: WheelFilename) -> Self {
Self {
id,
hashes,
filename,
version: ARCHIVE_VERSION,
}
}

View File

@@ -189,7 +189,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
let wheel_entry = self.build_context.cache().entry(
CacheBucket::Wheels,
WheelCache::Index(&wheel.index).wheel_dir(wheel.name().as_ref()),
wheel.filename.stem(),
wheel.filename.cache_key(),
);
// If the URL is a file URL, load the wheel directly.
@@ -262,7 +262,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
let wheel_entry = self.build_context.cache().entry(
CacheBucket::Wheels,
WheelCache::Url(&wheel.url).wheel_dir(wheel.name().as_ref()),
wheel.filename.stem(),
wheel.filename.cache_key(),
);
// Download and unzip.
@@ -317,7 +317,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
let cache_entry = self.build_context.cache().entry(
CacheBucket::Wheels,
WheelCache::Url(&wheel.url).wheel_dir(wheel.name().as_ref()),
wheel.filename.stem(),
wheel.filename.cache_key(),
);
self.load_wheel(
@@ -526,7 +526,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
};
// Create an entry for the HTTP cache.
let http_entry = wheel_entry.with_file(format!("{}.http", filename.stem()));
let http_entry = wheel_entry.with_file(format!("{}.http", filename.cache_key()));
let download = |response: reqwest::Response| {
async {
@@ -581,6 +581,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
Ok(Archive::new(
id,
hashers.into_iter().map(HashDigest::from).collect(),
filename.clone(),
))
}
.instrument(info_span!("wheel", wheel = %dist))
@@ -658,7 +659,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
};
// Create an entry for the HTTP cache.
let http_entry = wheel_entry.with_file(format!("{}.http", filename.stem()));
let http_entry = wheel_entry.with_file(format!("{}.http", filename.cache_key()));
let download = |response: reqwest::Response| {
async {
@@ -745,7 +746,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
reporter.on_download_complete(dist.name(), progress);
}
Ok(Archive::new(id, hashes))
Ok(Archive::new(id, hashes, filename.clone()))
}
.instrument(info_span!("wheel", wheel = %dist))
};
@@ -823,7 +824,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
let modified = Timestamp::from_path(path).map_err(Error::CacheRead)?;
// Attempt to read the archive pointer from the cache.
let pointer_entry = wheel_entry.with_file(format!("{}.rev", filename.stem()));
let pointer_entry = wheel_entry.with_file(format!("{}.rev", filename.cache_key()));
let pointer = LocalArchivePointer::read_from(&pointer_entry)?;
// Extract the archive from the pointer.
@@ -846,6 +847,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
let archive = Archive::new(
self.unzip_wheel(path, wheel_entry.path()).await?,
HashDigests::empty(),
filename.clone(),
);
// Write the archive pointer to the cache.
@@ -892,7 +894,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
.map_err(Error::CacheWrite)?;
// Create an archive.
let archive = Archive::new(id, hashes);
let archive = Archive::new(id, hashes, filename.clone());
// Write the archive pointer to the cache.
let pointer = LocalArchivePointer {

View File

@@ -116,14 +116,10 @@ impl CachedWheel {
}
}
/// Read a cached wheel from a `.http` pointer (e.g., `anyio-4.0.0-py3-none-any.http`).
/// Read a cached wheel from a `.http` pointer
pub fn from_http_pointer(path: impl AsRef<Path>, cache: &Cache) -> Option<Self> {
let path = path.as_ref();
// Determine the wheel filename.
let filename = path.file_stem()?.to_str()?;
let filename = WheelFilename::from_stem(filename).ok()?;
// Read the pointer.
let pointer = HttpArchivePointer::read_from(path).ok()??;
let cache_info = pointer.to_cache_info();
@@ -135,26 +131,21 @@ impl CachedWheel {
}
let Archive { id, hashes, .. } = archive;
let entry = cache.entry(CacheBucket::Archive, "", id);
// Convert to a cached wheel.
Some(Self {
filename,
filename: archive.filename,
entry,
hashes,
cache_info,
})
}
/// Read a cached wheel from a `.rev` pointer (e.g., `anyio-4.0.0-py3-none-any.rev`).
/// Read a cached wheel from a `.rev` pointer
pub fn from_local_pointer(path: impl AsRef<Path>, cache: &Cache) -> Option<Self> {
let path = path.as_ref();
// Determine the wheel filename.
let filename = path.file_stem()?.to_str()?;
let filename = WheelFilename::from_stem(filename).ok()?;
// Read the pointer.
let pointer = LocalArchivePointer::read_from(path).ok()??;
let cache_info = pointer.to_cache_info();
@@ -166,11 +157,11 @@ impl CachedWheel {
}
let Archive { id, hashes, .. } = archive;
let entry = cache.entry(CacheBucket::Archive, "", id);
// Convert to a cached wheel.
let entry = cache.entry(CacheBucket::Archive, "", id);
Some(Self {
filename,
filename: archive.filename,
entry,
hashes,
cache_info,

View File

@@ -29,8 +29,8 @@ pub(crate) struct BuiltWheelMetadata {
impl BuiltWheelMetadata {
/// Find a compatible wheel in the cache.
pub(crate) fn find_in_cache(tags: &Tags, cache_shard: &CacheShard) -> Option<Self> {
for directory in files(cache_shard) {
if let Some(metadata) = Self::from_path(directory, cache_shard) {
for file in files(cache_shard) {
if let Some(metadata) = Self::from_path(file, cache_shard) {
// Validate that the wheel is compatible with the target platform.
if metadata.filename.is_compatible(tags) {
return Some(metadata);

View File

@@ -159,7 +159,7 @@ impl<'a> Planner<'a> {
CacheBucket::Wheels,
WheelCache::Url(&wheel.url).wheel_dir(wheel.name().as_ref()),
)
.entry(format!("{}.http", wheel.filename.stem()));
.entry(format!("{}.http", wheel.filename.cache_key()));
// Read the HTTP pointer.
if let Some(pointer) = HttpArchivePointer::read_from(&cache_entry)? {
@@ -210,7 +210,7 @@ impl<'a> Planner<'a> {
CacheBucket::Wheels,
WheelCache::Url(&wheel.url).wheel_dir(wheel.name().as_ref()),
)
.entry(format!("{}.rev", wheel.filename.stem()));
.entry(format!("{}.rev", wheel.filename.cache_key()));
if let Some(pointer) = LocalArchivePointer::read_from(&cache_entry)? {
let timestamp = Timestamp::from_path(&wheel.install_path)?;

View File

@@ -158,7 +158,7 @@ fn prune_stale_symlink() -> Result<()> {
.success();
// Remove the wheels directory, causing the symlink to become stale.
let wheels = context.cache_dir.child("wheels-v4");
let wheels = context.cache_dir.child("wheels-v5");
fs_err::remove_dir_all(wheels)?;
let filters: Vec<_> = context