Use a stable cache key

This commit is contained in:
Charlie Marsh 2025-11-09 13:37:29 -05:00
parent 203594d482
commit ed48a81fa7
10 changed files with 198 additions and 68 deletions

3
Cargo.lock generated
View File

@ -5833,9 +5833,11 @@ name = "uv-distribution"
version = "0.0.1" version = "0.0.1"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"blake2",
"either", "either",
"fs-err", "fs-err",
"futures", "futures",
"hex",
"indoc", "indoc",
"insta", "insta",
"nanoid", "nanoid",
@ -5856,6 +5858,7 @@ dependencies = [
"uv-auth", "uv-auth",
"uv-cache", "uv-cache",
"uv-cache-info", "uv-cache-info",
"uv-cache-key",
"uv-client", "uv-client",
"uv-configuration", "uv-configuration",
"uv-distribution-filename", "uv-distribution-filename",

View File

@ -704,7 +704,10 @@ impl RegistryClient {
pub async fn fetch_simple_index( pub async fn fetch_simple_index(
&self, &self,
index_url: &IndexUrl, index_url: &IndexUrl,
download_concurrency: &Semaphore,
) -> Result<SimpleIndexMetadata, Error> { ) -> Result<SimpleIndexMetadata, Error> {
let _permit = download_concurrency.acquire().await;
// Format the URL for PyPI. // Format the URL for PyPI.
let mut url = index_url.url().clone(); let mut url = index_url.url().clone();
url.path_segments_mut() url.path_segments_mut()
@ -1306,15 +1309,10 @@ pub struct VersionSourceDist {
#[rkyv(derive(Debug))] #[rkyv(derive(Debug))]
pub struct SimpleIndexMetadata { pub struct SimpleIndexMetadata {
/// The list of project names available in the index. /// The list of project names available in the index.
projects: Vec<PackageName>, pub projects: Vec<PackageName>,
} }
impl SimpleIndexMetadata { impl SimpleIndexMetadata {
/// Iterate over the projects in the index.
pub fn iter(&self) -> impl Iterator<Item = &PackageName> {
self.projects.iter()
}
/// Create a [`SimpleIndexMetadata`] from a [`PypiSimpleIndex`]. /// Create a [`SimpleIndexMetadata`] from a [`PypiSimpleIndex`].
fn from_pypi_index(index: PypiSimpleIndex) -> Self { fn from_pypi_index(index: PypiSimpleIndex) -> Self {
Self { Self {

View File

@ -19,6 +19,7 @@ workspace = true
uv-auth = { workspace = true } uv-auth = { workspace = true }
uv-cache = { workspace = true } uv-cache = { workspace = true }
uv-cache-info = { workspace = true } uv-cache-info = { workspace = true }
uv-cache-key = { workspace = true }
uv-client = { workspace = true } uv-client = { workspace = true }
uv-configuration = { workspace = true } uv-configuration = { workspace = true }
uv-distribution-filename = { workspace = true } uv-distribution-filename = { workspace = true }
@ -39,9 +40,11 @@ uv-types = { workspace = true }
uv-workspace = { workspace = true } uv-workspace = { workspace = true }
anyhow = { workspace = true } anyhow = { workspace = true }
blake2 = { workspace = true }
either = { workspace = true } either = { workspace = true }
fs-err = { workspace = true } fs-err = { workspace = true }
futures = { workspace = true } futures = { workspace = true }
hex = { workspace = true }
nanoid = { workspace = true } nanoid = { workspace = true }
owo-colors = { workspace = true } owo-colors = { workspace = true }
reqwest = { workspace = true } reqwest = { workspace = true }

View File

@ -10,7 +10,7 @@ use tempfile::TempDir;
use tokio::io::{AsyncRead, AsyncSeekExt, ReadBuf}; use tokio::io::{AsyncRead, AsyncSeekExt, ReadBuf};
use tokio::sync::Semaphore; use tokio::sync::Semaphore;
use tokio_util::compat::FuturesAsyncReadCompatExt; use tokio_util::compat::FuturesAsyncReadCompatExt;
use tracing::{Instrument, info_span, instrument, warn, debug}; use tracing::{Instrument, debug, info_span, instrument, warn};
use url::Url; use url::Url;
use uv_cache::{ArchiveId, CacheBucket, CacheEntry, WheelCache}; use uv_cache::{ArchiveId, CacheBucket, CacheEntry, WheelCache};
@ -383,7 +383,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
hashes: HashPolicy<'_>, hashes: HashPolicy<'_>,
) -> Result<LocalWheel, Error> { ) -> Result<LocalWheel, Error> {
// If the metadata is available in a remote cache, fetch it. // If the metadata is available in a remote cache, fetch it.
if let Some(wheel) = self.get_remote_wheel(dist, tags, hashes).await? { if let Ok(Some(wheel)) = self.get_remote_wheel(dist, tags, hashes).await {
return Ok(wheel); return Ok(wheel);
} }
@ -558,7 +558,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
} }
// If the metadata is available in a remote cache, fetch it. // If the metadata is available in a remote cache, fetch it.
if let Some(metadata) = self.get_remote_metadata(source, hashes).await? { if let Ok(Some(metadata)) = self.get_remote_metadata(source, hashes).await {
return Ok(metadata); return Ok(metadata);
} }
@ -584,34 +584,31 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
/// Fetch a wheel from a remote cache, if available. /// Fetch a wheel from a remote cache, if available.
async fn get_remote_wheel( async fn get_remote_wheel(
&self, &self,
dist: &SourceDist, source: &SourceDist,
tags: &Tags, tags: &Tags,
hashes: HashPolicy<'_>, hashes: HashPolicy<'_>,
) -> Result<Option<LocalWheel>, Error> { ) -> Result<Option<LocalWheel>, Error> {
let Some(index) = self let Some(index) = self
.resolver .resolver
.get_cached_distribution(dist, Some(tags), &self.client) .get_cached_distribution(&BuildableSource::Dist(source), Some(tags), &self.client)
.await? .await?
else { else {
return Ok(None); return Ok(None);
}; };
let Some(entries) = index.get(dist.name()) else { for prioritized_dist in index.iter() {
return Ok(None);
};
for (.., prioritized_dist) in entries.iter() {
let Some(compatible_dist) = prioritized_dist.get() else { let Some(compatible_dist) = prioritized_dist.get() else {
continue; continue;
}; };
match compatible_dist { match compatible_dist {
CompatibleDist::InstalledDist(..) => {} CompatibleDist::InstalledDist(..) => {}
CompatibleDist::SourceDist { sdist, .. } => { CompatibleDist::SourceDist { sdist, .. } => {
debug!("Found cached remote source distribution for: {dist}"); debug!("Found cached remote source distribution for: {source}");
let dist = SourceDist::Registry(sdist.clone()); let dist = SourceDist::Registry(sdist.clone());
return self.build_wheel_inner(&dist, tags, hashes).await.map(Some); return self.build_wheel_inner(&dist, tags, hashes).await.map(Some);
} }
CompatibleDist::CompatibleWheel { wheel, .. } CompatibleDist::CompatibleWheel { wheel, .. }
| CompatibleDist::IncompatibleWheel { wheel, .. } => { | CompatibleDist::IncompatibleWheel { wheel, .. } => {
debug!("Found cached remote built distribution for: {dist}"); debug!("Found cached remote built distribution for: {source}");
let dist = BuiltDist::Registry(RegistryBuiltDist { let dist = BuiltDist::Registry(RegistryBuiltDist {
wheels: vec![wheel.clone()], wheels: vec![wheel.clone()],
best_wheel_index: 0, best_wheel_index: 0,
@ -630,30 +627,21 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
source: &BuildableSource<'_>, source: &BuildableSource<'_>,
hashes: HashPolicy<'_>, hashes: HashPolicy<'_>,
) -> Result<Option<ArchiveMetadata>, Error> { ) -> Result<Option<ArchiveMetadata>, Error> {
// TODO(charlie): If the distribution is unnamed, we should be able to infer the name
// from the list of available distributions in the index, since we expect exactly one
// package name per cache entry.
let BuildableSource::Dist(dist) = source else {
return Ok(None);
};
let Some(index) = self let Some(index) = self
.resolver .resolver
.get_cached_distribution(dist, None, &self.client) .get_cached_distribution(source, None, &self.client)
.await? .await?
else { else {
return Ok(None); return Ok(None);
}; };
let Some(entries) = index.get(dist.name()) else { for prioritized_dist in index.iter() {
return Ok(None);
};
for (.., prioritized_dist) in entries.iter() {
let Some(compatible_dist) = prioritized_dist.get() else { let Some(compatible_dist) = prioritized_dist.get() else {
continue; continue;
}; };
match compatible_dist { match compatible_dist {
CompatibleDist::InstalledDist(..) => {} CompatibleDist::InstalledDist(..) => {}
CompatibleDist::SourceDist { sdist, .. } => { CompatibleDist::SourceDist { sdist, .. } => {
debug!("Found cached remote source distribution for: {dist}"); debug!("Found cached remote source distribution for: {source}");
let dist = SourceDist::Registry(sdist.clone()); let dist = SourceDist::Registry(sdist.clone());
return self return self
.build_wheel_metadata_inner(&BuildableSource::Dist(&dist), hashes) .build_wheel_metadata_inner(&BuildableSource::Dist(&dist), hashes)
@ -662,7 +650,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
} }
CompatibleDist::CompatibleWheel { wheel, .. } CompatibleDist::CompatibleWheel { wheel, .. }
| CompatibleDist::IncompatibleWheel { wheel, .. } => { | CompatibleDist::IncompatibleWheel { wheel, .. } => {
debug!("Found cached remote built distribution for: {dist}"); debug!("Found cached remote built distribution for: {source}");
let dist = BuiltDist::Registry(RegistryBuiltDist { let dist = BuiltDist::Registry(RegistryBuiltDist {
wheels: vec![wheel.clone()], wheels: vec![wheel.clone()],
best_wheel_index: 0, best_wheel_index: 0,

View File

@ -1,21 +1,25 @@
use std::borrow::Cow;
use std::collections::BTreeMap; use std::collections::BTreeMap;
use std::collections::btree_map::Entry; use std::collections::btree_map::Entry;
use std::path::Path;
use std::sync::Arc; use std::sync::Arc;
use blake2::Digest;
use rustc_hash::FxHashMap; use rustc_hash::FxHashMap;
use tokio::sync::Mutex; use tokio::sync::Mutex;
use tracing::instrument; use tracing::{debug, instrument, warn};
use uv_auth::PyxTokenStore; use uv_auth::PyxTokenStore;
use uv_client::{MetadataFormat, VersionFiles}; use uv_cache_key::RepositoryUrl;
use uv_client::{MetadataFormat, SimpleIndexMetadata, VersionFiles};
use uv_configuration::BuildOptions; use uv_configuration::BuildOptions;
use uv_distribution_filename::{DistFilename, SourceDistFilename, WheelFilename}; use uv_distribution_filename::{DistFilename, SourceDistFilename, WheelFilename};
use uv_distribution_types::{ use uv_distribution_types::{
File, HashComparison, HashPolicy, IncompatibleSource, IncompatibleWheel, IndexFormat, BuildableSource, File, HashComparison, HashPolicy, IncompatibleSource, IncompatibleWheel,
IndexMetadata, IndexUrl, Name, PrioritizedDist, RegistryBuiltWheel, RegistrySourceDist, IndexFormat, IndexMetadata, IndexUrl, PrioritizedDist, RegistryBuiltWheel, RegistrySourceDist,
SourceDist, SourceDistCompatibility, WheelCompatibility, SourceDist, SourceDistCompatibility, SourceUrl, WheelCompatibility,
}; };
use uv_git_types::GitHubRepository; use uv_git_types::{GitOid, GitUrl};
use uv_normalize::PackageName; use uv_normalize::PackageName;
use uv_pep440::Version; use uv_pep440::Version;
use uv_pep508::VerbatimUrl; use uv_pep508::VerbatimUrl;
@ -49,12 +53,12 @@ impl<'a, T: BuildContext> RemoteCacheResolver<'a, T> {
/// Return the cached Git index for the given distribution, if any. /// Return the cached Git index for the given distribution, if any.
pub(crate) async fn get_cached_distribution( pub(crate) async fn get_cached_distribution(
&self, &self,
dist: &SourceDist, source: &BuildableSource<'_>,
tags: Option<&Tags>, tags: Option<&Tags>,
client: &ManagedClient<'a>, client: &ManagedClient<'a>,
) -> Result<Option<GitIndex>, Error> { ) -> Result<Option<GitIndex>, Error> {
// Fetch the entries for the given distribution. // Fetch the entries for the given distribution.
let entries = self.get_or_fetch_index(dist, client).await?; let entries = self.get_or_fetch_index(source, client).await?;
if entries.is_empty() { if entries.is_empty() {
return Ok(None); return Ok(None);
} }
@ -72,38 +76,81 @@ impl<'a, T: BuildContext> RemoteCacheResolver<'a, T> {
/// Fetch the remote Git index for the given distribution. /// Fetch the remote Git index for the given distribution.
async fn get_or_fetch_index( async fn get_or_fetch_index(
&self, &self,
dist: &SourceDist, source: &BuildableSource<'_>,
client: &ManagedClient<'a>, client: &ManagedClient<'a>,
) -> Result<Vec<GitIndexEntry>, Error> { ) -> Result<Vec<GitIndexEntry>, Error> {
#[derive(Debug)]
struct BuildableGitSource<'a> {
git: &'a GitUrl,
subdirectory: Option<&'a Path>,
name: Option<&'a PackageName>,
}
let Some(workspace) = &self.workspace else { let Some(workspace) = &self.workspace else {
return Ok(Vec::default()); return Ok(Vec::default());
}; };
let SourceDist::Git(dist) = dist else { let source = match source {
BuildableSource::Dist(SourceDist::Git(dist)) => BuildableGitSource {
git: &dist.git,
subdirectory: dist.subdirectory.as_deref(),
name: Some(&dist.name),
},
BuildableSource::Url(SourceUrl::Git(url)) => BuildableGitSource {
git: url.git,
subdirectory: url.subdirectory,
name: None,
},
_ => {
return Ok(Vec::default());
}
};
let Some(precise) = self.build_context.git().get_precise(source.git) else {
return Ok(Vec::default()); return Ok(Vec::default());
}; };
// TODO(charlie): Handle subdirectories. // Determine the cache key for the Git source.
if dist.subdirectory.is_some() { let cache_key = GitCacheKey {
return Ok(Vec::default()); repository: RepositoryUrl::new(source.git.repository()),
} precise,
subdirectory: source.subdirectory,
let Some(repo) = GitHubRepository::parse(dist.git.repository()) else {
return Ok(Vec::default());
}; };
let digest = cache_key.digest();
let Some(precise) = self.build_context.git().get_precise(&dist.git) else {
return Ok(Vec::default());
};
// Store the index entries in a cache, to avoid redundant fetches.
let index = IndexUrl::from( let index = IndexUrl::from(
VerbatimUrl::parse_url(format!( VerbatimUrl::parse_url(format!(
"http://localhost:8000/v1/git/{workspace}/{}/{}/{precise}", "http://localhost:8000/v1/git/{workspace}/{}/{}/{}",
repo.owner, repo.repo &digest[..2],
&digest[2..4],
&digest[4..],
)) ))
.unwrap(), .unwrap(),
); );
debug!("Using remote Git index URL: {}", index);
// Determine the package name.
let name = if let Some(name) = source.name {
Cow::Borrowed(name)
} else {
// Fetch the list of packages from the Simple API.
let SimpleIndexMetadata { projects } = client
.manual(|client, semaphore| client.fetch_simple_index(&index, semaphore))
.await?;
// Ensure that the index contains exactly one package.
let mut packages = projects.into_iter();
let Some(name) = packages.next() else {
debug!("Remote Git index at `{index}` contains no packages");
return Ok(Vec::default());
};
if packages.next().is_some() {
debug!("Remote Git index at `{index}` contains multiple packages");
return Ok(Vec::default());
}
Cow::Owned(name)
};
// Store the index entries in a cache, to avoid redundant fetches.
{ {
let cache = self.cache.lock().await; let cache = self.cache.lock().await;
if let Some(entries) = cache.get(&index) { if let Some(entries) = cache.get(&index) {
@ -118,8 +165,8 @@ impl<'a, T: BuildContext> RemoteCacheResolver<'a, T> {
}; };
let archives = client let archives = client
.manual(|client, semaphore| { .manual(|client, semaphore| {
client.package_metadata( client.simple_detail(
dist.name(), name.as_ref(),
Some(metadata.as_ref()), Some(metadata.as_ref()),
self.build_context.capabilities(), self.build_context.capabilities(),
semaphore, semaphore,
@ -137,6 +184,13 @@ impl<'a, T: BuildContext> RemoteCacheResolver<'a, T> {
let files = rkyv::deserialize::<VersionFiles, rkyv::rancor::Error>(&datum.files) let files = rkyv::deserialize::<VersionFiles, rkyv::rancor::Error>(&datum.files)
.expect("archived version files always deserializes"); .expect("archived version files always deserializes");
for (filename, file) in files.all() { for (filename, file) in files.all() {
if *filename.name() != *name {
warn!(
"Skipping file `{filename}` from remote Git index at `{index}` due to name mismatch (expected: `{name}`)"
);
continue;
}
entries.push(GitIndexEntry { entries.push(GitIndexEntry {
filename, filename,
file, file,
@ -165,6 +219,9 @@ struct GitIndexEntry {
} }
/// A set of [`PrioritizedDist`] from a Git index. /// A set of [`PrioritizedDist`] from a Git index.
///
/// In practice, it's assumed that the [`GitIndex`] will only contain distributions for a single
/// package.
#[derive(Debug, Clone, Default)] #[derive(Debug, Clone, Default)]
pub(crate) struct GitIndex(FxHashMap<PackageName, GitIndexDistributions>); pub(crate) struct GitIndex(FxHashMap<PackageName, GitIndexDistributions>);
@ -177,7 +234,6 @@ impl GitIndex {
hasher: &HashStrategy, hasher: &HashStrategy,
build_options: &BuildOptions, build_options: &BuildOptions,
) -> Self { ) -> Self {
// Collect compatible distributions.
let mut index = FxHashMap::<PackageName, GitIndexDistributions>::default(); let mut index = FxHashMap::<PackageName, GitIndexDistributions>::default();
for entry in entries { for entry in entries {
let distributions = index.entry(entry.filename.name().clone()).or_default(); let distributions = index.entry(entry.filename.name().clone()).or_default();
@ -193,9 +249,11 @@ impl GitIndex {
Self(index) Self(index)
} }
/// Get the [`GitIndexDistributions`] for the given package name. /// Returns an [`Iterator`] over the distributions.
pub(crate) fn get(&self, package_name: &PackageName) -> Option<&GitIndexDistributions> { pub(crate) fn iter(&self) -> impl Iterator<Item = &PrioritizedDist> {
self.0.get(package_name) self.0
.iter()
.flat_map(|(.., distributions)| distributions.0.iter().map(|(.., dist)| dist))
} }
} }
@ -204,11 +262,6 @@ impl GitIndex {
pub(crate) struct GitIndexDistributions(BTreeMap<Version, PrioritizedDist>); pub(crate) struct GitIndexDistributions(BTreeMap<Version, PrioritizedDist>);
impl GitIndexDistributions { impl GitIndexDistributions {
/// Returns an [`Iterator`] over the distributions.
pub(crate) fn iter(&self) -> impl Iterator<Item = (&Version, &PrioritizedDist)> {
self.0.iter()
}
/// Add the given [`File`] to the [`GitIndexDistributions`] for the given package. /// Add the given [`File`] to the [`GitIndexDistributions`] for the given package.
fn add_file( fn add_file(
&mut self, &mut self,
@ -219,8 +272,7 @@ impl GitIndexDistributions {
build_options: &BuildOptions, build_options: &BuildOptions,
index: IndexUrl, index: IndexUrl,
) { ) {
// No `requires-python` here: for source distributions, we don't have that information; // TODO(charlie): Incorporate `Requires-Python`, yanked status, etc.
// for wheels, we read it lazily only when selected.
match filename { match filename {
DistFilename::WheelFilename(filename) => { DistFilename::WheelFilename(filename) => {
let version = filename.version.clone(); let version = filename.version.clone();
@ -366,3 +418,39 @@ impl GitIndexCache {
self.0.insert(index, entries) self.0.insert(index, entries)
} }
} }
/// Cache key identifying a Git source: a repository pinned to a precise commit, optionally
/// narrowed to a subdirectory within that repository.
///
/// The key is turned into a hex string by [`GitCacheKey::digest`] and rendered in readable form
/// by its [`std::fmt::Display`] implementation.
#[derive(Debug, Clone, PartialEq, Eq)]
struct GitCacheKey<'a> {
    /// The URL of the Git repository.
    repository: RepositoryUrl,
    /// The precise commit that the source was resolved to.
    precise: GitOid,
    /// The subdirectory of the repository containing the source, if any.
    subdirectory: Option<&'a Path>,
}
impl GitCacheKey<'_> {
/// Compute the digest for the Git cache key.
fn digest(&self) -> String {
let mut hasher = blake2::Blake2b::<blake2::digest::consts::U32>::new();
hasher.update(self.repository.as_str().as_bytes());
hasher.update(b"/");
hasher.update(self.precise.as_str().as_bytes());
if let Some(subdirectory) = self
.subdirectory
.and_then(|subdirectory| subdirectory.to_str())
{
hasher.update(b"?subdirectory=");
hasher.update(subdirectory.as_bytes());
}
hex::encode(hasher.finalize())
}
}
impl std::fmt::Display for GitCacheKey<'_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}/{}", self.repository, self.precise.as_str())?;
if let Some(subdirectory) = &self.subdirectory {
write!(f, "?subdirectory={}", subdirectory.display())?;
}
Ok(())
}
}

50
hash_test.py Normal file
View File

@ -0,0 +1,50 @@
from hashlib import blake2b
from os import fspath
from typing import Optional, Union
# Anything accepted by `os.fspath`: a text or bytes path, or a path-like object yielding either.
Pathish = Union[str, bytes, "os.PathLike[str]", "os.PathLike[bytes]"]


def git_cache_digest(repository: str, precise: str, subdirectory: Optional[Pathish] = None) -> str:
    """
    Reproduces the Rust digest() exactly:
    - blake2b with 32-byte (256-bit) digest
    - bytes fed in this order:
      repository + "/" + precise [+ "?subdirectory=" + subdirectory]
    - subdirectory is included only if it is representable as UTF-8
      (mirrors Rust Path::to_str() -> Option<&str>)
    - hex output is lowercase

    Args:
        repository: The repository URL string, hashed as UTF-8.
        precise: The commit identifier string, hashed as UTF-8.
        subdirectory: Optional path within the repository. ``None`` and paths that
            cannot be represented as UTF-8 contribute nothing to the digest.

    Returns:
        The 64-character lowercase hexadecimal digest.
    """
    h = blake2b(digest_size=32)

    # repository and precise are Rust &str equivalents: encode as UTF-8
    h.update(repository.encode("utf-8"))
    h.update(b"/")
    h.update(precise.encode("utf-8"))

    if subdirectory is not None:
        # Normalize to either str or bytes using fspath (handles PathLike)
        p = fspath(subdirectory)

        # Obtain the UTF-8 bytes of the path, or None when no UTF-8 form exists.
        try:
            if isinstance(p, bytes):
                # Validate the raw bytes; valid UTF-8 round-trips to the same bytes.
                p.decode("utf-8")
                p_bytes = p
            else:
                # A str can still be unrepresentable: lone surrogates (e.g. produced by
                # os.fsdecode for non-UTF-8 file names) cannot be encoded. Skip the
                # subdirectory in that case instead of raising.
                p_bytes = p.encode("utf-8")
        except (UnicodeDecodeError, UnicodeEncodeError):
            p_bytes = None

        if p_bytes is not None:
            h.update(b"?subdirectory=")
            h.update(p_bytes)

    return h.hexdigest()
# Example: digest for the anyio repository pinned to one commit, with no subdirectory
# (the `subdirectory` argument is left at its default of None).
digest = git_cache_digest(
    "https://github.com/agronholm/anyio",
    "64b753b19c9a49e3ae395cde457cf82d51f7e999",
)

# Emit the lowercase hex digest, intended to be identical to the Rust version.
print(digest)