From 5ee4cf6ff5841e68a13928f6e3cc37463284ce78 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Mon, 20 Jan 2025 12:50:39 -0500 Subject: [PATCH] Fetch `pyproject.toml` from GitHub API (#10765) ## Summary When resolving Git metadata, we may be able to fetch the metadata from GitHub directly in some cases. This is _way_ faster, since we don't need to perform many Git operations and, in particular, don't need to clone the repo. This only works in the following cases: - The Git repository is public. Otherwise, I believe you need an access token, which we don't have. - The `pyproject.toml` has static metadata. - The `pyproject.toml` has no `tool.uv.sources`. Otherwise, we need to lower them... And, if there are any paths or workspace sources, that requires an install path (i.e., we need the content on-disk). - The project is in the repo root. If it's in a subdirectory, it could be a workspace member. And if it's a workspace member, there could be sources defined in the workspace root. But we can't know without fetching the workspace root -- and we need the workspace in order to find the root... Closes #10568. 
--- Cargo.lock | 1 + crates/uv-distribution/Cargo.toml | 1 + crates/uv-distribution/src/source/mod.rs | 218 ++++++++++++++++++++--- crates/uv-git/src/git.rs | 2 +- crates/uv-git/src/resolver.rs | 62 ++++++- 5 files changed, 254 insertions(+), 30 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c195423d8..35c76430b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4972,6 +4972,7 @@ dependencies = [ "thiserror 2.0.11", "tokio", "tokio-util", + "toml", "tracing", "url", "uv-cache", diff --git a/crates/uv-distribution/Cargo.toml b/crates/uv-distribution/Cargo.toml index b0fdefb96..06bd9e5c5 100644 --- a/crates/uv-distribution/Cargo.toml +++ b/crates/uv-distribution/Cargo.toml @@ -51,6 +51,7 @@ tempfile = { workspace = true } thiserror = { workspace = true } tokio = { workspace = true } tokio-util = { workspace = true, features = ["compat"] } +toml = { workspace = true } tracing = { workspace = true } url = { workspace = true } walkdir = { workspace = true } diff --git a/crates/uv-distribution/src/source/mod.rs b/crates/uv-distribution/src/source/mod.rs index f68b9a9bf..4a784fbf4 100644 --- a/crates/uv-distribution/src/source/mod.rs +++ b/crates/uv-distribution/src/source/mod.rs @@ -22,7 +22,7 @@ use crate::source::revision::Revision; use crate::{Reporter, RequiresDist}; use fs_err::tokio as fs; use futures::{FutureExt, TryStreamExt}; -use reqwest::Response; +use reqwest::{Response, StatusCode}; use tokio_util::compat::FuturesAsyncReadCompatExt; use tracing::{debug, info_span, instrument, warn, Instrument}; use url::Url; @@ -40,12 +40,14 @@ use uv_distribution_types::{ }; use uv_extract::hash::Hasher; use uv_fs::{rename_with_retry, write_atomic, LockedFile}; +use uv_git::{GitHubRepository, GitSha}; use uv_metadata::read_archive_metadata; use uv_normalize::PackageName; use uv_pep440::{release_specifiers_to_ranges, Version}; use uv_platform_tags::Tags; use uv_pypi_types::{HashAlgorithm, HashDigest, Metadata12, RequiresTxt, ResolutionMetadata}; use 
uv_types::{BuildContext, BuildStack, SourceBuildTrait}; +use uv_workspace::pyproject::ToolUvSources; use zip::ZipArchive; mod built_wheel_metadata; @@ -1496,6 +1498,34 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> { return Err(Error::HashesNotSupportedGit(source.to_string())); } + // If this is a GitHub URL, attempt to resolve to a precise commit using the GitHub API. + if let Some(precise) = self + .build_context + .git() + .github_fast_path( + resource.git, + client.unmanaged.uncached_client(resource.url).clone(), + ) + .await? + { + // There's no need to check the cache, since we can't use cached metadata if there are + // sources, and we can't know if there are sources without fetching the + // `pyproject.toml`. + // + // For the same reason, there's no need to write to the cache, since we won't be able to + // use it on subsequent runs. + if let Some(metadata) = self + .github_metadata(precise, source, resource, client) + .await? + { + debug!("Found static metadata via GitHub fast path for: {source}"); + return Ok(ArchiveMetadata { + metadata: Metadata::from_metadata23(metadata), + hashes: vec![], + }); + } + } + // Fetch the Git repository. 
let fetch = self .build_context @@ -1698,38 +1728,139 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> { source: &BuildableSource<'_>, client: &ManagedClient<'_>, ) -> Result<(), Error> { - match source { - BuildableSource::Dist(SourceDist::Git(source)) => { - self.build_context - .git() - .fetch( - &source.git, - client.unmanaged.uncached_client(&source.url).clone(), - self.build_context.cache().bucket(CacheBucket::Git), - self.reporter - .clone() - .map(|reporter| reporter.into_git_reporter()), - ) - .await?; + let git = match source { + BuildableSource::Dist(SourceDist::Git(source)) => &*source.git, + BuildableSource::Url(SourceUrl::Git(source)) => source.git, + _ => { + return Ok(()); } - BuildableSource::Url(SourceUrl::Git(source)) => { - self.build_context - .git() - .fetch( - source.git, - client.unmanaged.uncached_client(source.url).clone(), - self.build_context.cache().bucket(CacheBucket::Git), - self.reporter - .clone() - .map(|reporter| reporter.into_git_reporter()), - ) - .await?; - } - _ => {} + }; + + // If this is a GitHub URL, attempt to resolve to a precise commit using the GitHub API. + if self + .build_context + .git() + .github_fast_path( + git, + client.unmanaged.uncached_client(git.repository()).clone(), + ) + .await? + .is_some() + { + debug!("Resolved to precise commit via GitHub fast path: {source}"); + return Ok(()); } + + // Otherwise, fetch the Git repository. + self.build_context + .git() + .fetch( + git, + client.unmanaged.uncached_client(git.repository()).clone(), + self.build_context.cache().bucket(CacheBucket::Git), + self.reporter + .clone() + .map(|reporter| reporter.into_git_reporter()), + ) + .await?; + Ok(()) } + /// Fetch static [`ResolutionMetadata`] from a GitHub repository, if possible. + /// + /// Attempts to fetch the `pyproject.toml` from the resolved commit using the GitHub API. 
+ async fn github_metadata( + &self, + commit: GitSha, + source: &BuildableSource<'_>, + resource: &GitSourceUrl<'_>, + client: &ManagedClient<'_>, + ) -> Result<Option<ResolutionMetadata>, Error> { + let GitSourceUrl { + git, subdirectory, .. + } = resource; + + // The fast path isn't available for subdirectories. If a `pyproject.toml` is in a + // subdirectory, it could be part of a workspace; and if it's part of a workspace, it could + // have `tool.uv.sources` entries that it inherits from the workspace root. + if subdirectory.is_some() { + return Ok(None); + } + + let Some(GitHubRepository { owner, repo }) = GitHubRepository::parse(git.repository()) + else { + return Ok(None); + }; + + // Fetch the `pyproject.toml` from the resolved commit. + let url = + format!("https://raw.githubusercontent.com/{owner}/{repo}/{commit}/pyproject.toml"); + + debug!("Attempting to fetch `pyproject.toml` from: {url}"); + + let content = client + .managed(|client| async { + let response = client + .uncached_client(git.repository()) + .get(&url) + .send() + .await?; + + // If the `pyproject.toml` does not exist, the GitHub API will return a 404. + if response.status() == StatusCode::NOT_FOUND { + return Ok::<Option<String>, Error>(None); + } + response.error_for_status_ref()?; + + let content = response.text().await?; + Ok::<Option<String>, Error>(Some(content)) + }) + .await?; + + let Some(content) = content else { + debug!("GitHub API returned a 404 for: {url}"); + return Ok(None); + }; + + // Parse the metadata. 
+ let metadata = match ResolutionMetadata::parse_pyproject_toml(&content, source.version()) { + Ok(metadata) => metadata, + Err( + uv_pypi_types::MetadataError::Pep508Error(_) + | uv_pypi_types::MetadataError::DynamicField(_) + | uv_pypi_types::MetadataError::FieldNotFound(_) + | uv_pypi_types::MetadataError::PoetrySyntax, + ) => { + debug!("Failed to extract static metadata from GitHub API for: {url}"); + return Ok(None); + } + Err(err) => return Err(err.into()), + }; + + // Determine whether the project has `tool.uv.sources`. If the project has sources, it must + // be lowered, which requires access to the workspace. For example, it could have workspace + // members that need to be translated to concrete paths on disk. + // + // TODO(charlie): We could still use the `pyproject.toml` if the sources are all `git` or + // `url` sources; this is only applicable to `workspace` and `path` sources. It's awkward, + // though, because we'd need to pass a path into the lowering routine, and that path would + // be incorrect (we'd just be relying on it not being used). + match has_sources(&content) { + Ok(false) => {} + Ok(true) => { + debug!("Skipping GitHub fast path; `pyproject.toml` has sources: {url}"); + return Ok(None); + } + Err(err) => { + debug!("Failed to parse `tool.uv.sources` from GitHub API for: {url} ({err})"); + return Ok(None); + } + } + + Ok(Some(metadata)) + } + /// Heal a [`Revision`] for a local archive. async fn heal_archive_revision( &self, @@ -2341,6 +2472,37 @@ impl StaticMetadata { } } +/// Returns `true` if a `pyproject.toml` has `tool.uv.sources`. 
+fn has_sources(content: &str) -> Result<bool, toml::de::Error> { + #[derive(serde::Deserialize)] + struct PyProjectToml { + tool: Option<Tool>, + } + + #[derive(serde::Deserialize)] + struct Tool { + uv: Option<ToolUv>, + } + + #[derive(serde::Deserialize)] + struct ToolUv { + sources: Option<ToolUvSources>, + } + + let PyProjectToml { tool } = toml::from_str(content)?; + if let Some(tool) = tool { + if let Some(uv) = tool.uv { + if let Some(sources) = uv.sources { + if !sources.inner().is_empty() { + return Ok(true); + } + } + } + } + + Ok(false) +} + /// Validate that the source distribution matches the built metadata. fn validate_metadata( source: &BuildableSource<'_>, diff --git a/crates/uv-git/src/git.rs b/crates/uv-git/src/git.rs index 0a539dc4b..fae382cca 100644 --- a/crates/uv-git/src/git.rs +++ b/crates/uv-git/src/git.rs @@ -102,7 +102,7 @@ impl GitReference { } /// Converts the [`GitReference`] to a `str` that can be used as a revision. - pub(crate) fn as_rev(&self) -> &str { + pub fn as_rev(&self) -> &str { match self { Self::Tag(rev) => rev, Self::Branch(rev) => rev, diff --git a/crates/uv-git/src/resolver.rs b/crates/uv-git/src/resolver.rs index aff504b05..5539ce04d 100644 --- a/crates/uv-git/src/resolver.rs +++ b/crates/uv-git/src/resolver.rs @@ -1,5 +1,6 @@ use std::borrow::Cow; use std::path::PathBuf; +use std::str::FromStr; use std::sync::Arc; use tracing::debug; @@ -11,7 +12,7 @@ use reqwest_middleware::ClientWithMiddleware; use uv_cache_key::{cache_digest, RepositoryUrl}; use uv_fs::LockedFile; -use crate::{Fetch, GitReference, GitSha, GitSource, GitUrl, Reporter}; +use crate::{Fetch, GitHubRepository, GitReference, GitSha, GitSource, GitUrl, Reporter}; #[derive(Debug, thiserror::Error)] pub enum GitResolverError { @@ -21,6 +22,10 @@ pub enum GitResolverError { Join(#[from] tokio::task::JoinError), #[error("Git operation failed")] Git(#[source] anyhow::Error), + #[error(transparent)] + Reqwest(#[from] reqwest::Error), + #[error(transparent)] + ReqwestMiddleware(#[from] reqwest_middleware::Error), 
} /// A resolver for Git repositories. @@ -38,6 +43,61 @@ impl GitResolver { self.0.get(reference) } + /// Resolve a Git URL to a specific commit without performing any Git operations. + /// + /// Returns a [`GitSha`] if the URL has already been resolved (i.e., is available in the cache), + /// or if it can be fetched via the GitHub API. Otherwise, returns `None`. + pub async fn github_fast_path( + &self, + url: &GitUrl, + client: ClientWithMiddleware, + ) -> Result<Option<GitSha>, GitResolverError> { + let reference = RepositoryReference::from(url); + + // If we know the precise commit already, return it. + if let Some(precise) = self.get(&reference) { + return Ok(Some(*precise)); + } + + // If the URL is a GitHub URL, attempt to resolve it via the GitHub API. + let Some(GitHubRepository { owner, repo }) = GitHubRepository::parse(url.repository()) + else { + return Ok(None); + }; + + // Determine the Git reference. + let rev = url.reference().as_rev(); + + let url = format!("https://api.github.com/repos/{owner}/{repo}/commits/{rev}"); + + debug!("Attempting GitHub fast path for: {url}"); + let mut request = client.get(&url); + request = request.header("Accept", "application/vnd.github.3.sha"); + request = request.header("User-Agent", "uv"); + + let response = request.send().await?; + if !response.status().is_success() { + // Returns a 404 if the repository does not exist, and a 422 if GitHub is unable to + // resolve the requested rev. + debug!( + "GitHub API request failed for: {url} ({})", + response.status() + ); + return Ok(None); + } + + // Parse the response as a Git SHA. + let precise = response.text().await?; + let precise = + GitSha::from_str(&precise).map_err(|err| GitResolverError::Git(err.into()))?; + + // Insert the resolved URL into the in-memory cache. This ensures that subsequent fetches + // resolve to the same precise commit. + self.insert(reference, precise); + + Ok(Some(precise)) + } + /// Fetch a remote Git repository. pub async fn fetch( &self,