diff --git a/Cargo.lock b/Cargo.lock index c01316973..76100b932 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5833,9 +5833,11 @@ name = "uv-distribution" version = "0.0.1" dependencies = [ "anyhow", + "blake2", "either", "fs-err", "futures", + "hex", "indoc", "insta", "nanoid", @@ -5856,6 +5858,7 @@ dependencies = [ "uv-auth", "uv-cache", "uv-cache-info", + "uv-cache-key", "uv-client", "uv-configuration", "uv-distribution-filename", diff --git a/crates/uv-client/src/registry_client.rs b/crates/uv-client/src/registry_client.rs index e846ab640..43fdabb7c 100644 --- a/crates/uv-client/src/registry_client.rs +++ b/crates/uv-client/src/registry_client.rs @@ -704,7 +704,10 @@ impl RegistryClient { pub async fn fetch_simple_index( &self, index_url: &IndexUrl, + download_concurrency: &Semaphore, ) -> Result { + let _permit = download_concurrency.acquire().await; + // Format the URL for PyPI. let mut url = index_url.url().clone(); url.path_segments_mut() @@ -1306,15 +1309,10 @@ pub struct VersionSourceDist { #[rkyv(derive(Debug))] pub struct SimpleIndexMetadata { /// The list of project names available in the index. - projects: Vec, + pub projects: Vec, } impl SimpleIndexMetadata { - /// Iterate over the projects in the index. - pub fn iter(&self) -> impl Iterator { - self.projects.iter() - } - /// Create a [`SimpleIndexMetadata`] from a [`PypiSimpleIndex`]. fn from_pypi_index(index: PypiSimpleIndex) -> Self { Self { diff --git a/crates/uv-distribution/Cargo.toml b/crates/uv-distribution/Cargo.toml index 3112663a9..81134c1fc 100644 --- a/crates/uv-distribution/Cargo.toml +++ b/crates/uv-distribution/Cargo.toml @@ -19,6 +19,7 @@ workspace = true uv-auth = { workspace = true } uv-cache = { workspace = true } uv-cache-info = { workspace = true } +uv-cache-key = { workspace = true } uv-client = { workspace = true } uv-configuration = { workspace = true } uv-distribution-filename = { workspace = true } @@ -39,9 +40,11 @@ uv-types = { workspace = true } uv-workspace = { workspace = true } anyhow = { workspace = true } +blake2 = { workspace = true } either = { workspace = true } fs-err = { workspace = true } futures = { workspace = true } +hex = { workspace = true } nanoid = { workspace = true } owo-colors = { workspace = true } reqwest = { workspace = true } diff --git a/crates/uv-distribution/src/distribution_database.rs b/crates/uv-distribution/src/distribution_database.rs index 60b1c1d9f..23c910e91 100644 --- a/crates/uv-distribution/src/distribution_database.rs +++ b/crates/uv-distribution/src/distribution_database.rs @@ -10,7 +10,7 @@ use tempfile::TempDir; use tokio::io::{AsyncRead, AsyncSeekExt, ReadBuf}; use tokio::sync::Semaphore; use tokio_util::compat::FuturesAsyncReadCompatExt; -use tracing::{Instrument, info_span, instrument, warn, debug}; +use tracing::{Instrument, debug, info_span, instrument, warn}; use url::Url; use uv_cache::{ArchiveId, CacheBucket, CacheEntry, WheelCache}; @@ -383,7 +383,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> { hashes: HashPolicy<'_>, ) -> Result { // If the metadata is available in a remote cache, fetch it. - if let Some(wheel) = self.get_remote_wheel(dist, tags, hashes).await? { + if let Ok(Some(wheel)) = self.get_remote_wheel(dist, tags, hashes).await { return Ok(wheel); } @@ -558,7 +558,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> { } // If the metadata is available in a remote cache, fetch it. - if let Some(metadata) = self.get_remote_metadata(source, hashes).await? { + if let Ok(Some(metadata)) = self.get_remote_metadata(source, hashes).await { return Ok(metadata); } @@ -584,34 +584,31 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> { /// Fetch a wheel from a remote cache, if available. async fn get_remote_wheel( &self, - dist: &SourceDist, + source: &SourceDist, tags: &Tags, hashes: HashPolicy<'_>, ) -> Result, Error> { let Some(index) = self .resolver - .get_cached_distribution(dist, Some(tags), &self.client) + .get_cached_distribution(&BuildableSource::Dist(source), Some(tags), &self.client) .await? else { return Ok(None); }; - let Some(entries) = index.get(dist.name()) else { - return Ok(None); - }; - for (.., prioritized_dist) in entries.iter() { + for prioritized_dist in index.iter() { let Some(compatible_dist) = prioritized_dist.get() else { continue; }; match compatible_dist { CompatibleDist::InstalledDist(..) => {} CompatibleDist::SourceDist { sdist, .. } => { - debug!("Found cached remote source distribution for: {dist}"); + debug!("Found cached remote source distribution for: {source}"); let dist = SourceDist::Registry(sdist.clone()); return self.build_wheel_inner(&dist, tags, hashes).await.map(Some); } CompatibleDist::CompatibleWheel { wheel, .. } | CompatibleDist::IncompatibleWheel { wheel, .. } => { - debug!("Found cached remote built distribution for: {dist}"); + debug!("Found cached remote built distribution for: {source}"); let dist = BuiltDist::Registry(RegistryBuiltDist { wheels: vec![wheel.clone()], best_wheel_index: 0, @@ -630,30 +627,21 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> { source: &BuildableSource<'_>, hashes: HashPolicy<'_>, ) -> Result, Error> { - // TODO(charlie): If the distribution is unnamed, we should be able to infer the name - // from the list of available distributions in the index, since we expect exactly one - // package name per cache entry. - let BuildableSource::Dist(dist) = source else { - return Ok(None); - }; let Some(index) = self .resolver - .get_cached_distribution(dist, None, &self.client) + .get_cached_distribution(source, None, &self.client) .await? else { return Ok(None); }; - let Some(entries) = index.get(dist.name()) else { - return Ok(None); - }; - for (.., prioritized_dist) in entries.iter() { + for prioritized_dist in index.iter() { let Some(compatible_dist) = prioritized_dist.get() else { continue; }; match compatible_dist { CompatibleDist::InstalledDist(..) => {} CompatibleDist::SourceDist { sdist, .. } => { - debug!("Found cached remote source distribution for: {dist}"); + debug!("Found cached remote source distribution for: {source}"); let dist = SourceDist::Registry(sdist.clone()); return self .build_wheel_metadata_inner(&BuildableSource::Dist(&dist), hashes) @@ -662,7 +650,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> { } CompatibleDist::CompatibleWheel { wheel, .. } | CompatibleDist::IncompatibleWheel { wheel, .. } => { - debug!("Found cached remote built distribution for: {dist}"); + debug!("Found cached remote built distribution for: {source}"); let dist = BuiltDist::Registry(RegistryBuiltDist { wheels: vec![wheel.clone()], best_wheel_index: 0, diff --git a/crates/uv-distribution/src/remote.rs b/crates/uv-distribution/src/remote.rs index 82de31f01..7ac103678 100644 --- a/crates/uv-distribution/src/remote.rs +++ b/crates/uv-distribution/src/remote.rs @@ -1,21 +1,25 @@ +use std::borrow::Cow; use std::collections::BTreeMap; use std::collections::btree_map::Entry; +use std::path::Path; use std::sync::Arc; +use blake2::Digest; use rustc_hash::FxHashMap; use tokio::sync::Mutex; -use tracing::instrument; +use tracing::{debug, instrument, warn}; use uv_auth::PyxTokenStore; -use uv_client::{MetadataFormat, VersionFiles}; +use uv_cache_key::RepositoryUrl; +use uv_client::{MetadataFormat, SimpleIndexMetadata, VersionFiles}; use uv_configuration::BuildOptions; use uv_distribution_filename::{DistFilename, SourceDistFilename, WheelFilename}; use uv_distribution_types::{ - File, HashComparison, HashPolicy, IncompatibleSource, IncompatibleWheel, IndexFormat, - IndexMetadata, IndexUrl, Name, PrioritizedDist, RegistryBuiltWheel, RegistrySourceDist, - SourceDist, SourceDistCompatibility, WheelCompatibility, + BuildableSource, File, HashComparison, HashPolicy, IncompatibleSource, IncompatibleWheel, + IndexFormat, IndexMetadata, IndexUrl, PrioritizedDist, RegistryBuiltWheel, RegistrySourceDist, + SourceDist, SourceDistCompatibility, SourceUrl, WheelCompatibility, }; -use uv_git_types::GitHubRepository; +use uv_git_types::{GitOid, GitUrl}; use uv_normalize::PackageName; use uv_pep440::Version; use uv_pep508::VerbatimUrl; @@ -49,12 +53,12 @@ impl<'a, T: BuildContext> RemoteCacheResolver<'a, T> { /// Return the cached Git index for the given distribution, if any. pub(crate) async fn get_cached_distribution( &self, - dist: &SourceDist, + source: &BuildableSource<'_>, tags: Option<&Tags>, client: &ManagedClient<'a>, ) -> Result, Error> { // Fetch the entries for the given distribution. - let entries = self.get_or_fetch_index(dist, client).await?; + let entries = self.get_or_fetch_index(source, client).await?; if entries.is_empty() { return Ok(None); } @@ -72,38 +76,81 @@ impl<'a, T: BuildContext> RemoteCacheResolver<'a, T> { /// Fetch the remote Git index for the given distribution. async fn get_or_fetch_index( &self, - dist: &SourceDist, + source: &BuildableSource<'_>, client: &ManagedClient<'a>, ) -> Result, Error> { + #[derive(Debug)] + struct BuildableGitSource<'a> { + git: &'a GitUrl, + subdirectory: Option<&'a Path>, + name: Option<&'a PackageName>, + } + let Some(workspace) = &self.workspace else { return Ok(Vec::default()); }; - let SourceDist::Git(dist) = dist else { + let source = match source { + BuildableSource::Dist(SourceDist::Git(dist)) => BuildableGitSource { + git: &dist.git, + subdirectory: dist.subdirectory.as_deref(), + name: Some(&dist.name), + }, + BuildableSource::Url(SourceUrl::Git(url)) => BuildableGitSource { + git: url.git, + subdirectory: url.subdirectory, + name: None, + }, + _ => { + return Ok(Vec::default()); + } + }; + + let Some(precise) = self.build_context.git().get_precise(source.git) else { return Ok(Vec::default()); }; - // TODO(charlie): Handle subdirectories. - if dist.subdirectory.is_some() { - return Ok(Vec::default()); - } - - let Some(repo) = GitHubRepository::parse(dist.git.repository()) else { - return Ok(Vec::default()); + // Determine the cache key for the Git source. + let cache_key = GitCacheKey { + repository: RepositoryUrl::new(source.git.repository()), + precise, + subdirectory: source.subdirectory, }; - - let Some(precise) = self.build_context.git().get_precise(&dist.git) else { - return Ok(Vec::default()); - }; - - // Store the index entries in a cache, to avoid redundant fetches. + let digest = cache_key.digest(); let index = IndexUrl::from( VerbatimUrl::parse_url(format!( - "http://localhost:8000/v1/git/{workspace}/{}/{}/{precise}", - repo.owner, repo.repo + "http://localhost:8000/v1/git/{workspace}/{}/{}/{}", + &digest[..2], + &digest[2..4], + &digest[4..], )) .unwrap(), ); + debug!("Using remote Git index URL: {}", index); + + // Determine the package name. + let name = if let Some(name) = source.name { + Cow::Borrowed(name) + } else { + // Fetch the list of packages from the Simple API. + let SimpleIndexMetadata { projects } = client + .manual(|client, semaphore| client.fetch_simple_index(&index, semaphore)) + .await?; + + // Ensure that the index contains exactly one package. + let mut packages = projects.into_iter(); + let Some(name) = packages.next() else { + debug!("Remote Git index at `{index}` contains no packages"); + return Ok(Vec::default()); + }; + if packages.next().is_some() { + debug!("Remote Git index at `{index}` contains multiple packages"); + return Ok(Vec::default()); + } + Cow::Owned(name) + }; + + // Store the index entries in a cache, to avoid redundant fetches. { let cache = self.cache.lock().await; if let Some(entries) = cache.get(&index) { @@ -118,8 +165,8 @@ impl<'a, T: BuildContext> RemoteCacheResolver<'a, T> { }; let archives = client .manual(|client, semaphore| { - client.package_metadata( - dist.name(), + client.simple_detail( + name.as_ref(), Some(metadata.as_ref()), self.build_context.capabilities(), semaphore, @@ -137,6 +184,13 @@ impl<'a, T: BuildContext> RemoteCacheResolver<'a, T> { let files = rkyv::deserialize::(&datum.files) .expect("archived version files always deserializes"); for (filename, file) in files.all() { + if *filename.name() != *name { + warn!( + "Skipping file `{filename}` from remote Git index at `{index}` due to name mismatch (expected: `{name}`)" + ); + continue; + } + entries.push(GitIndexEntry { filename, file, @@ -165,6 +219,9 @@ struct GitIndexEntry { } /// A set of [`PrioritizedDist`] from a Git index. +/// +/// In practice, it's assumed that the [`GitIndex`] will only contain distributions for a single +/// package. #[derive(Debug, Clone, Default)] pub(crate) struct GitIndex(FxHashMap); @@ -177,7 +234,6 @@ impl GitIndex { hasher: &HashStrategy, build_options: &BuildOptions, ) -> Self { - // Collect compatible distributions. let mut index = FxHashMap::::default(); for entry in entries { let distributions = index.entry(entry.filename.name().clone()).or_default(); @@ -193,9 +249,11 @@ impl GitIndex { Self(index) } - /// Get the [`GitIndexDistributions`] for the given package name. - pub(crate) fn get(&self, package_name: &PackageName) -> Option<&GitIndexDistributions> { - self.0.get(package_name) + /// Returns an [`Iterator`] over the distributions. + pub(crate) fn iter(&self) -> impl Iterator { + self.0 + .iter() + .flat_map(|(.., distributions)| distributions.0.iter().map(|(.., dist)| dist)) } } @@ -204,11 +262,6 @@ impl GitIndex { pub(crate) struct GitIndexDistributions(BTreeMap); impl GitIndexDistributions { - /// Returns an [`Iterator`] over the distributions. - pub(crate) fn iter(&self) -> impl Iterator { - self.0.iter() - } - /// Add the given [`File`] to the [`GitIndexDistributions`] for the given package. fn add_file( &mut self, @@ -219,8 +272,7 @@ impl GitIndexDistributions { build_options: &BuildOptions, index: IndexUrl, ) { - // No `requires-python` here: for source distributions, we don't have that information; - // for wheels, we read it lazily only when selected. + // TODO(charlie): Incorporate `Requires-Python`, yanked status, etc. match filename { DistFilename::WheelFilename(filename) => { let version = filename.version.clone(); @@ -366,3 +418,39 @@ impl GitIndexCache { self.0.insert(index, entries) } } + +/// A cache key for a Git repository at a precise commit. +#[derive(Debug, Clone, PartialEq, Eq)] +struct GitCacheKey<'a> { + repository: RepositoryUrl, + precise: GitOid, + subdirectory: Option<&'a Path>, +} + +impl GitCacheKey<'_> { + /// Compute the digest for the Git cache key. + fn digest(&self) -> String { + let mut hasher = blake2::Blake2b::::new(); + hasher.update(self.repository.as_str().as_bytes()); + hasher.update(b"/"); + hasher.update(self.precise.as_str().as_bytes()); + if let Some(subdirectory) = self + .subdirectory + .and_then(|subdirectory| subdirectory.to_str()) + { + hasher.update(b"?subdirectory="); + hasher.update(subdirectory.as_bytes()); + } + hex::encode(hasher.finalize()) + } +} + +impl std::fmt::Display for GitCacheKey<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}/{}", self.repository, self.precise.as_str())?; + if let Some(subdirectory) = &self.subdirectory { + write!(f, "?subdirectory={}", subdirectory.display())?; + } + Ok(()) + } +} diff --git a/hash_test.py b/hash_test.py new file mode 100644 index 000000000..9f8288aba --- /dev/null +++ b/hash_test.py @@ -0,0 +1,50 @@ +from hashlib import blake2b +from os import fspath +from typing import Optional, Union + +Pathish = Union[str, bytes, "os.PathLike[str]", "os.PathLike[bytes]"] + +def git_cache_digest(repository: str, precise: str, subdirectory: Optional[Pathish] = None) -> str: + """ + Reproduces the Rust digest() exactly: + + - blake2b with 32-byte (256-bit) digest + - bytes fed in this order: + repository + "/" + precise [+ "?subdirectory=" + subdirectory] + - subdirectory is included only if it is representable as UTF-8 + (mirrors Rust Path::to_str() -> Option<&str>) + - hex output is lowercase + """ + h = blake2b(digest_size=32) + + # repository and precise are Rust &str equivalents: encode as UTF-8 + h.update(repository.encode("utf-8")) + h.update(b"/") + h.update(precise.encode("utf-8")) + + if subdirectory is not None: + # Normalize to either str or bytes using fspath (handles PathLike) + p = fspath(subdirectory) + + # Try to get a UTF-8 string like Path::to_str() + if isinstance(p, bytes): + try: + p_str = p.decode("utf-8") + except UnicodeDecodeError: + p_str = None + else: + # Already a str + p_str = p + + if p_str is not None: + h.update(b"?subdirectory=") + h.update(p_str.encode("utf-8")) + + return h.hexdigest() + +digest = git_cache_digest( + repository="https://github.com/agronholm/anyio", + precise="64b753b19c9a49e3ae395cde457cf82d51f7e999", + subdirectory=None +) +print(digest) # lowercase hex, identical to the Rust version diff --git a/server/v1/git/astral-sh/agronholm/anyio/64b753b19c9a49e3ae395cde457cf82d51f7e999/anyio/anyio-4.11.0.post24-py3-none-any.whl b/server/v1/git/astral-sh/9e/d1/54a640517fe5836f86a0844598bda6a7ad5c46321d0e2f4efa0362bf0b0b/anyio/anyio-4.11.0.post24-py3-none-any.whl similarity index 100% rename from server/v1/git/astral-sh/agronholm/anyio/64b753b19c9a49e3ae395cde457cf82d51f7e999/anyio/anyio-4.11.0.post24-py3-none-any.whl rename to server/v1/git/astral-sh/9e/d1/54a640517fe5836f86a0844598bda6a7ad5c46321d0e2f4efa0362bf0b0b/anyio/anyio-4.11.0.post24-py3-none-any.whl diff --git a/server/v1/git/astral-sh/agronholm/anyio/64b753b19c9a49e3ae395cde457cf82d51f7e999/anyio/anyio-4.11.0.post24.tar.gz b/server/v1/git/astral-sh/9e/d1/54a640517fe5836f86a0844598bda6a7ad5c46321d0e2f4efa0362bf0b0b/anyio/anyio-4.11.0.post24.tar.gz similarity index 100% rename from server/v1/git/astral-sh/agronholm/anyio/64b753b19c9a49e3ae395cde457cf82d51f7e999/anyio/anyio-4.11.0.post24.tar.gz rename to server/v1/git/astral-sh/9e/d1/54a640517fe5836f86a0844598bda6a7ad5c46321d0e2f4efa0362bf0b0b/anyio/anyio-4.11.0.post24.tar.gz diff --git a/server/v1/git/astral-sh/agronholm/anyio/64b753b19c9a49e3ae395cde457cf82d51f7e999/anyio/index.html b/server/v1/git/astral-sh/9e/d1/54a640517fe5836f86a0844598bda6a7ad5c46321d0e2f4efa0362bf0b0b/anyio/index.html similarity index 100% rename from server/v1/git/astral-sh/agronholm/anyio/64b753b19c9a49e3ae395cde457cf82d51f7e999/anyio/index.html rename to server/v1/git/astral-sh/9e/d1/54a640517fe5836f86a0844598bda6a7ad5c46321d0e2f4efa0362bf0b0b/anyio/index.html diff --git a/server/v1/git/astral-sh/agronholm/anyio/64b753b19c9a49e3ae395cde457cf82d51f7e999/index.html b/server/v1/git/astral-sh/9e/d1/54a640517fe5836f86a0844598bda6a7ad5c46321d0e2f4efa0362bf0b0b/index.html similarity index 100% rename from server/v1/git/astral-sh/agronholm/anyio/64b753b19c9a49e3ae395cde457cf82d51f7e999/index.html rename to server/v1/git/astral-sh/9e/d1/54a640517fe5836f86a0844598bda6a7ad5c46321d0e2f4efa0362bf0b0b/index.html