Use a stable cache key

This commit is contained in:
Charlie Marsh 2025-11-09 13:37:29 -05:00
parent 203594d482
commit ed48a81fa7
10 changed files with 198 additions and 68 deletions

3
Cargo.lock generated
View File

@ -5833,9 +5833,11 @@ name = "uv-distribution"
version = "0.0.1"
dependencies = [
"anyhow",
"blake2",
"either",
"fs-err",
"futures",
"hex",
"indoc",
"insta",
"nanoid",
@ -5856,6 +5858,7 @@ dependencies = [
"uv-auth",
"uv-cache",
"uv-cache-info",
"uv-cache-key",
"uv-client",
"uv-configuration",
"uv-distribution-filename",

View File

@ -704,7 +704,10 @@ impl RegistryClient {
pub async fn fetch_simple_index(
&self,
index_url: &IndexUrl,
download_concurrency: &Semaphore,
) -> Result<SimpleIndexMetadata, Error> {
let _permit = download_concurrency.acquire().await;
// Format the URL for PyPI.
let mut url = index_url.url().clone();
url.path_segments_mut()
@ -1306,15 +1309,10 @@ pub struct VersionSourceDist {
#[rkyv(derive(Debug))]
pub struct SimpleIndexMetadata {
/// The list of project names available in the index.
projects: Vec<PackageName>,
pub projects: Vec<PackageName>,
}
impl SimpleIndexMetadata {
/// Iterate over the projects in the index.
pub fn iter(&self) -> impl Iterator<Item = &PackageName> {
self.projects.iter()
}
/// Create a [`SimpleIndexMetadata`] from a [`PypiSimpleIndex`].
fn from_pypi_index(index: PypiSimpleIndex) -> Self {
Self {

View File

@ -19,6 +19,7 @@ workspace = true
uv-auth = { workspace = true }
uv-cache = { workspace = true }
uv-cache-info = { workspace = true }
uv-cache-key = { workspace = true }
uv-client = { workspace = true }
uv-configuration = { workspace = true }
uv-distribution-filename = { workspace = true }
@ -39,9 +40,11 @@ uv-types = { workspace = true }
uv-workspace = { workspace = true }
anyhow = { workspace = true }
blake2 = { workspace = true }
either = { workspace = true }
fs-err = { workspace = true }
futures = { workspace = true }
hex = { workspace = true }
nanoid = { workspace = true }
owo-colors = { workspace = true }
reqwest = { workspace = true }

View File

@ -10,7 +10,7 @@ use tempfile::TempDir;
use tokio::io::{AsyncRead, AsyncSeekExt, ReadBuf};
use tokio::sync::Semaphore;
use tokio_util::compat::FuturesAsyncReadCompatExt;
use tracing::{Instrument, info_span, instrument, warn, debug};
use tracing::{Instrument, debug, info_span, instrument, warn};
use url::Url;
use uv_cache::{ArchiveId, CacheBucket, CacheEntry, WheelCache};
@ -383,7 +383,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
hashes: HashPolicy<'_>,
) -> Result<LocalWheel, Error> {
// If the metadata is available in a remote cache, fetch it.
if let Some(wheel) = self.get_remote_wheel(dist, tags, hashes).await? {
if let Ok(Some(wheel)) = self.get_remote_wheel(dist, tags, hashes).await {
return Ok(wheel);
}
@ -558,7 +558,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
}
// If the metadata is available in a remote cache, fetch it.
if let Some(metadata) = self.get_remote_metadata(source, hashes).await? {
if let Ok(Some(metadata)) = self.get_remote_metadata(source, hashes).await {
return Ok(metadata);
}
@ -584,34 +584,31 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
/// Fetch a wheel from a remote cache, if available.
async fn get_remote_wheel(
&self,
dist: &SourceDist,
source: &SourceDist,
tags: &Tags,
hashes: HashPolicy<'_>,
) -> Result<Option<LocalWheel>, Error> {
let Some(index) = self
.resolver
.get_cached_distribution(dist, Some(tags), &self.client)
.get_cached_distribution(&BuildableSource::Dist(source), Some(tags), &self.client)
.await?
else {
return Ok(None);
};
let Some(entries) = index.get(dist.name()) else {
return Ok(None);
};
for (.., prioritized_dist) in entries.iter() {
for prioritized_dist in index.iter() {
let Some(compatible_dist) = prioritized_dist.get() else {
continue;
};
match compatible_dist {
CompatibleDist::InstalledDist(..) => {}
CompatibleDist::SourceDist { sdist, .. } => {
debug!("Found cached remote source distribution for: {dist}");
debug!("Found cached remote source distribution for: {source}");
let dist = SourceDist::Registry(sdist.clone());
return self.build_wheel_inner(&dist, tags, hashes).await.map(Some);
}
CompatibleDist::CompatibleWheel { wheel, .. }
| CompatibleDist::IncompatibleWheel { wheel, .. } => {
debug!("Found cached remote built distribution for: {dist}");
debug!("Found cached remote built distribution for: {source}");
let dist = BuiltDist::Registry(RegistryBuiltDist {
wheels: vec![wheel.clone()],
best_wheel_index: 0,
@ -630,30 +627,21 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
source: &BuildableSource<'_>,
hashes: HashPolicy<'_>,
) -> Result<Option<ArchiveMetadata>, Error> {
// TODO(charlie): If the distribution is unnamed, we should be able to infer the name
// from the list of available distributions in the index, since we expect exactly one
// package name per cache entry.
let BuildableSource::Dist(dist) = source else {
return Ok(None);
};
let Some(index) = self
.resolver
.get_cached_distribution(dist, None, &self.client)
.get_cached_distribution(source, None, &self.client)
.await?
else {
return Ok(None);
};
let Some(entries) = index.get(dist.name()) else {
return Ok(None);
};
for (.., prioritized_dist) in entries.iter() {
for prioritized_dist in index.iter() {
let Some(compatible_dist) = prioritized_dist.get() else {
continue;
};
match compatible_dist {
CompatibleDist::InstalledDist(..) => {}
CompatibleDist::SourceDist { sdist, .. } => {
debug!("Found cached remote source distribution for: {dist}");
debug!("Found cached remote source distribution for: {source}");
let dist = SourceDist::Registry(sdist.clone());
return self
.build_wheel_metadata_inner(&BuildableSource::Dist(&dist), hashes)
@ -662,7 +650,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
}
CompatibleDist::CompatibleWheel { wheel, .. }
| CompatibleDist::IncompatibleWheel { wheel, .. } => {
debug!("Found cached remote built distribution for: {dist}");
debug!("Found cached remote built distribution for: {source}");
let dist = BuiltDist::Registry(RegistryBuiltDist {
wheels: vec![wheel.clone()],
best_wheel_index: 0,

View File

@ -1,21 +1,25 @@
use std::borrow::Cow;
use std::collections::BTreeMap;
use std::collections::btree_map::Entry;
use std::path::Path;
use std::sync::Arc;
use blake2::Digest;
use rustc_hash::FxHashMap;
use tokio::sync::Mutex;
use tracing::instrument;
use tracing::{debug, instrument, warn};
use uv_auth::PyxTokenStore;
use uv_client::{MetadataFormat, VersionFiles};
use uv_cache_key::RepositoryUrl;
use uv_client::{MetadataFormat, SimpleIndexMetadata, VersionFiles};
use uv_configuration::BuildOptions;
use uv_distribution_filename::{DistFilename, SourceDistFilename, WheelFilename};
use uv_distribution_types::{
File, HashComparison, HashPolicy, IncompatibleSource, IncompatibleWheel, IndexFormat,
IndexMetadata, IndexUrl, Name, PrioritizedDist, RegistryBuiltWheel, RegistrySourceDist,
SourceDist, SourceDistCompatibility, WheelCompatibility,
BuildableSource, File, HashComparison, HashPolicy, IncompatibleSource, IncompatibleWheel,
IndexFormat, IndexMetadata, IndexUrl, PrioritizedDist, RegistryBuiltWheel, RegistrySourceDist,
SourceDist, SourceDistCompatibility, SourceUrl, WheelCompatibility,
};
use uv_git_types::GitHubRepository;
use uv_git_types::{GitOid, GitUrl};
use uv_normalize::PackageName;
use uv_pep440::Version;
use uv_pep508::VerbatimUrl;
@ -49,12 +53,12 @@ impl<'a, T: BuildContext> RemoteCacheResolver<'a, T> {
/// Return the cached Git index for the given distribution, if any.
pub(crate) async fn get_cached_distribution(
&self,
dist: &SourceDist,
source: &BuildableSource<'_>,
tags: Option<&Tags>,
client: &ManagedClient<'a>,
) -> Result<Option<GitIndex>, Error> {
// Fetch the entries for the given distribution.
let entries = self.get_or_fetch_index(dist, client).await?;
let entries = self.get_or_fetch_index(source, client).await?;
if entries.is_empty() {
return Ok(None);
}
@ -72,38 +76,81 @@ impl<'a, T: BuildContext> RemoteCacheResolver<'a, T> {
/// Fetch the remote Git index for the given distribution.
async fn get_or_fetch_index(
&self,
dist: &SourceDist,
source: &BuildableSource<'_>,
client: &ManagedClient<'a>,
) -> Result<Vec<GitIndexEntry>, Error> {
#[derive(Debug)]
struct BuildableGitSource<'a> {
git: &'a GitUrl,
subdirectory: Option<&'a Path>,
name: Option<&'a PackageName>,
}
let Some(workspace) = &self.workspace else {
return Ok(Vec::default());
};
let SourceDist::Git(dist) = dist else {
return Ok(Vec::default());
};
// TODO(charlie): Handle subdirectories.
if dist.subdirectory.is_some() {
let source = match source {
BuildableSource::Dist(SourceDist::Git(dist)) => BuildableGitSource {
git: &dist.git,
subdirectory: dist.subdirectory.as_deref(),
name: Some(&dist.name),
},
BuildableSource::Url(SourceUrl::Git(url)) => BuildableGitSource {
git: url.git,
subdirectory: url.subdirectory,
name: None,
},
_ => {
return Ok(Vec::default());
}
};
let Some(repo) = GitHubRepository::parse(dist.git.repository()) else {
let Some(precise) = self.build_context.git().get_precise(source.git) else {
return Ok(Vec::default());
};
let Some(precise) = self.build_context.git().get_precise(&dist.git) else {
return Ok(Vec::default());
// Determine the cache key for the Git source.
let cache_key = GitCacheKey {
repository: RepositoryUrl::new(source.git.repository()),
precise,
subdirectory: source.subdirectory,
};
// Store the index entries in a cache, to avoid redundant fetches.
let digest = cache_key.digest();
let index = IndexUrl::from(
VerbatimUrl::parse_url(format!(
"http://localhost:8000/v1/git/{workspace}/{}/{}/{precise}",
repo.owner, repo.repo
"http://localhost:8000/v1/git/{workspace}/{}/{}/{}",
&digest[..2],
&digest[2..4],
&digest[4..],
))
.unwrap(),
);
debug!("Using remote Git index URL: {}", index);
// Determine the package name.
let name = if let Some(name) = source.name {
Cow::Borrowed(name)
} else {
// Fetch the list of packages from the Simple API.
let SimpleIndexMetadata { projects } = client
.manual(|client, semaphore| client.fetch_simple_index(&index, semaphore))
.await?;
// Ensure that the index contains exactly one package.
let mut packages = projects.into_iter();
let Some(name) = packages.next() else {
debug!("Remote Git index at `{index}` contains no packages");
return Ok(Vec::default());
};
if packages.next().is_some() {
debug!("Remote Git index at `{index}` contains multiple packages");
return Ok(Vec::default());
}
Cow::Owned(name)
};
// Store the index entries in a cache, to avoid redundant fetches.
{
let cache = self.cache.lock().await;
if let Some(entries) = cache.get(&index) {
@ -118,8 +165,8 @@ impl<'a, T: BuildContext> RemoteCacheResolver<'a, T> {
};
let archives = client
.manual(|client, semaphore| {
client.package_metadata(
dist.name(),
client.simple_detail(
name.as_ref(),
Some(metadata.as_ref()),
self.build_context.capabilities(),
semaphore,
@ -137,6 +184,13 @@ impl<'a, T: BuildContext> RemoteCacheResolver<'a, T> {
let files = rkyv::deserialize::<VersionFiles, rkyv::rancor::Error>(&datum.files)
.expect("archived version files always deserializes");
for (filename, file) in files.all() {
if *filename.name() != *name {
warn!(
"Skipping file `{filename}` from remote Git index at `{index}` due to name mismatch (expected: `{name}`)"
);
continue;
}
entries.push(GitIndexEntry {
filename,
file,
@ -165,6 +219,9 @@ struct GitIndexEntry {
}
/// A set of [`PrioritizedDist`] from a Git index.
///
/// In practice, it's assumed that the [`GitIndex`] will only contain distributions for a single
/// package.
#[derive(Debug, Clone, Default)]
pub(crate) struct GitIndex(FxHashMap<PackageName, GitIndexDistributions>);
@ -177,7 +234,6 @@ impl GitIndex {
hasher: &HashStrategy,
build_options: &BuildOptions,
) -> Self {
// Collect compatible distributions.
let mut index = FxHashMap::<PackageName, GitIndexDistributions>::default();
for entry in entries {
let distributions = index.entry(entry.filename.name().clone()).or_default();
@ -193,9 +249,11 @@ impl GitIndex {
Self(index)
}
/// Get the [`GitIndexDistributions`] for the given package name.
pub(crate) fn get(&self, package_name: &PackageName) -> Option<&GitIndexDistributions> {
self.0.get(package_name)
/// Returns an [`Iterator`] over the distributions.
pub(crate) fn iter(&self) -> impl Iterator<Item = &PrioritizedDist> {
self.0
.iter()
.flat_map(|(.., distributions)| distributions.0.iter().map(|(.., dist)| dist))
}
}
@ -204,11 +262,6 @@ impl GitIndex {
pub(crate) struct GitIndexDistributions(BTreeMap<Version, PrioritizedDist>);
impl GitIndexDistributions {
/// Returns an [`Iterator`] over the distributions.
pub(crate) fn iter(&self) -> impl Iterator<Item = (&Version, &PrioritizedDist)> {
self.0.iter()
}
/// Add the given [`File`] to the [`GitIndexDistributions`] for the given package.
fn add_file(
&mut self,
@ -219,8 +272,7 @@ impl GitIndexDistributions {
build_options: &BuildOptions,
index: IndexUrl,
) {
// No `requires-python` here: for source distributions, we don't have that information;
// for wheels, we read it lazily only when selected.
// TODO(charlie): Incorporate `Requires-Python`, yanked status, etc.
match filename {
DistFilename::WheelFilename(filename) => {
let version = filename.version.clone();
@ -366,3 +418,39 @@ impl GitIndexCache {
self.0.insert(index, entries)
}
}
/// A cache key for a Git repository at a precise commit.
///
/// The key identifies a source by repository, commit, and (optionally) a subdirectory within
/// the repository. The subdirectory is borrowed for the lifetime `'a`.
#[derive(Debug, Clone, PartialEq, Eq)]
struct GitCacheKey<'a> {
/// The URL of the Git repository.
repository: RepositoryUrl,
/// The object ID the key is pinned to.
precise: GitOid,
/// The subdirectory within the repository, if any.
subdirectory: Option<&'a Path>,
}
impl GitCacheKey<'_> {
/// Compute the digest for the Git cache key.
fn digest(&self) -> String {
let mut hasher = blake2::Blake2b::<blake2::digest::consts::U32>::new();
hasher.update(self.repository.as_str().as_bytes());
hasher.update(b"/");
hasher.update(self.precise.as_str().as_bytes());
if let Some(subdirectory) = self
.subdirectory
.and_then(|subdirectory| subdirectory.to_str())
{
hasher.update(b"?subdirectory=");
hasher.update(subdirectory.as_bytes());
}
hex::encode(hasher.finalize())
}
}
impl std::fmt::Display for GitCacheKey<'_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}/{}", self.repository, self.precise.as_str())?;
if let Some(subdirectory) = &self.subdirectory {
write!(f, "?subdirectory={}", subdirectory.display())?;
}
Ok(())
}
}

50
hash_test.py Normal file
View File

@ -0,0 +1,50 @@
from hashlib import blake2b
from os import fspath
from typing import Optional, Union
# Anything accepted by `os.fspath`. The `os.PathLike[...]` members are string
# forward references, so they are not evaluated when this module is executed.
Pathish = Union[str, bytes, "os.PathLike[str]", "os.PathLike[bytes]"]


def git_cache_digest(
    repository: str, precise: str, subdirectory: Optional[Pathish] = None
) -> str:
    """
    Compute the Git cache-key digest, mirroring the Rust `GitCacheKey::digest`.

    The digest is BLAKE2b with a 32-byte (256-bit) output over these bytes, in order:

        repository + "/" + precise [+ "?subdirectory=" + subdirectory]

    Args:
        repository: The repository URL; hashed as UTF-8.
        precise: The commit ID; hashed as UTF-8.
        subdirectory: Optional path within the repository (`str`, `bytes`, or any
            `os.PathLike`). It is hashed only if it is representable as UTF-8,
            mirroring Rust's `Path::to_str() -> Option<&str>`. Otherwise it is
            skipped and the digest equals the one without a subdirectory.

    Returns:
        The digest as a lowercase hexadecimal string (64 characters).
    """
    hasher = blake2b(digest_size=32)

    # `repository` and `precise` are the equivalents of Rust `&str`: encode as UTF-8.
    hasher.update(repository.encode("utf-8"))
    hasher.update(b"/")
    hasher.update(precise.encode("utf-8"))

    if subdirectory is not None:
        # Normalize to either `str` or `bytes` (handles `os.PathLike`).
        path = fspath(subdirectory)

        encoded: Optional[bytes]
        try:
            if isinstance(path, bytes):
                # Validate only: valid UTF-8 round-trips to the same bytes.
                path.decode("utf-8")
                encoded = path
            else:
                # A `str` path can hold lone surrogates (e.g. from `os.fsdecode` on a
                # non-UTF-8 file name), which cannot be encoded as UTF-8.
                encoded = path.encode("utf-8")
        except (UnicodeDecodeError, UnicodeEncodeError):
            # Not representable as UTF-8: skip, like `Path::to_str()` returning `None`.
            encoded = None

        if encoded is not None:
            hasher.update(b"?subdirectory=")
            hasher.update(encoded)

    return hasher.hexdigest()
# Worked example: the anyio repository pinned to a single commit, with no
# subdirectory (the parameter's default). Prints the lowercase hex digest.
digest = git_cache_digest(
    "https://github.com/agronholm/anyio",
    "64b753b19c9a49e3ae395cde457cf82d51f7e999",
)
print(digest)