From a99e5e00f22b18ea3194c076e173da14bf481c00 Mon Sep 17 00:00:00 2001 From: konsti Date: Sun, 14 Jan 2024 18:15:24 +0100 Subject: [PATCH] Use absolute urls in `distribution_type::File` (#917) Previously, the url on file could either be a relative or an absolute url, depending on the index, and we would finalize it lazily. Now we finalize the url when converting `pypi_types::File` to `distribution_types::File`. This change is required to make the hashes on `File` optional (https://github.com/astral-sh/puffin/pull/910), which are currently the only unique field usable for caching. --- crates/distribution-types/src/file.rs | 25 +++++++++++---- crates/puffin-client/src/registry_client.rs | 32 ++++++++++++------- .../src/distribution_database.rs | 7 +--- crates/puffin-distribution/src/source/mod.rs | 18 ++--------- 4 files changed, 41 insertions(+), 41 deletions(-) diff --git a/crates/distribution-types/src/file.rs b/crates/distribution-types/src/file.rs index 69ca46b9d..b616ce059 100644 --- a/crates/distribution-types/src/file.rs +++ b/crates/distribution-types/src/file.rs @@ -1,8 +1,19 @@ use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; +use thiserror::Error; +use url::Url; use pep440_rs::{VersionSpecifiers, VersionSpecifiersParseError}; -use pypi_types::{DistInfoMetadata, Hashes, Yanked}; +use pypi_types::{BaseUrl, DistInfoMetadata, Hashes, Yanked}; + +/// Error converting [`pypi_types::File`] to [`distribution_type::File`]. +#[derive(Debug, Error)] +pub enum FileConversionError { + #[error("Invalid 'requires-python' value")] + VersionSpecifiersParseError(#[from] VersionSpecifiersParseError), + #[error("Failed to parse URL: {0}")] + Url(String, #[source] url::ParseError), +} /// Internal analog to [`pypi_types::File`]. #[derive(Debug, Clone, Serialize, Deserialize)] @@ -13,15 +24,13 @@ pub struct File { pub requires_python: Option, pub size: Option, pub upload_time: Option>, - pub url: String, + pub url: Url, pub yanked: Option, } -impl TryFrom for File { - type Error = VersionSpecifiersParseError; - +impl File { /// `TryFrom` instead of `From` to filter out files with invalid requires python version specifiers - fn try_from(file: pypi_types::File) -> Result { + pub fn try_from(file: pypi_types::File, base: &BaseUrl) -> Result { Ok(Self { dist_info_metadata: file.dist_info_metadata, filename: file.filename, @@ -29,7 +38,9 @@ impl TryFrom for File { requires_python: file.requires_python.transpose()?, size: file.size, upload_time: file.upload_time, - url: file.url, + url: base + .join_relative(&file.url) + .map_err(|err| FileConversionError::Url(file.url.clone(), err))?, yanked: file.yanked, }) } diff --git a/crates/puffin-client/src/registry_client.rs b/crates/puffin-client/src/registry_client.rs index 5ffb2c25f..9b30c19cc 100644 --- a/crates/puffin-client/src/registry_client.rs +++ b/crates/puffin-client/src/registry_client.rs @@ -171,15 +171,16 @@ impl RegistryClient { let bytes = response.bytes().await?; let data: SimpleJson = serde_json::from_slice(bytes.as_ref()) .map_err(|err| Error::from_json_err(err, url.clone()))?; - let metadata = SimpleMetadata::from_files(data.files, package_name); let base = BaseUrl::from(url.clone()); + let metadata = + SimpleMetadata::from_files(data.files, package_name, &base); Ok((base, metadata)) } MediaType::Html => { let text = response.text().await?; let SimpleHtml { base, files } = SimpleHtml::parse(&text, &url) .map_err(|err| Error::from_html_err(err, url.clone()))?; - let metadata = SimpleMetadata::from_files(files, package_name); + let metadata = SimpleMetadata::from_files(files, package_name, &base); Ok((base, metadata)) } } @@ -217,7 +218,7 @@ impl RegistryClient { pub async fn wheel_metadata(&self, built_dist: &BuiltDist) -> Result { let metadata = match &built_dist { BuiltDist::Registry(wheel) => { - self.wheel_metadata_registry(&wheel.index, &wheel.base, &wheel.file) + self.wheel_metadata_registry(&wheel.index, &wheel.file) .await? } BuiltDist::DirectUrl(wheel) => { @@ -248,7 +249,6 @@ impl RegistryClient { async fn wheel_metadata_registry( &self, index: &IndexUrl, - base: &BaseUrl, file: &File, ) -> Result { if self.index_urls.no_index() { @@ -256,14 +256,13 @@ impl RegistryClient { } // If the metadata file is available at its own url (PEP 658), download it from there. - let url = base.join_relative(&file.url)?; let filename = WheelFilename::from_str(&file.filename)?; if file .dist_info_metadata .as_ref() .is_some_and(pypi_types::DistInfoMetadata::is_available) { - let url = Url::parse(&format!("{url}.metadata"))?; + let url = Url::parse(&format!("{}.metadata", file.url))?; let cache_entry = self.cache.entry( CacheBucket::Wheels, @@ -287,7 +286,7 @@ impl RegistryClient { // If we lack PEP 658 support, try using HTTP range requests to read only the // `.dist-info/METADATA` file from the zip, and if that also fails, download the whole wheel // into the cache and read from there - self.wheel_metadata_no_pep658(&filename, &url, WheelCache::Index(index)) + self.wheel_metadata_no_pep658(&filename, &file.url, WheelCache::Index(index)) .await } } @@ -452,7 +451,11 @@ impl SimpleMetadata { self.0.iter() } - fn from_files(files: Vec, package_name: &PackageName) -> Self { + fn from_files( + files: Vec, + package_name: &PackageName, + base: &BaseUrl, + ) -> Self { let mut metadata = Self::default(); // Group the distributions by version and kind @@ -465,7 +468,7 @@ impl SimpleMetadata { DistFilename::WheelFilename(ref inner) => &inner.version, }; - let file = match file.try_into() { + let file = match File::try_from(file, base) { Ok(file) => file, Err(err) => { // Ignore files with unparseable version specifiers. @@ -526,9 +529,10 @@ impl MediaType { #[cfg(test)] mod tests { use std::str::FromStr; + use url::Url; use puffin_normalize::PackageName; - use pypi_types::SimpleJson; + use pypi_types::{BaseUrl, SimpleJson}; use crate::SimpleMetadata; @@ -568,8 +572,12 @@ mod tests { } "#; let data: SimpleJson = serde_json::from_str(response).unwrap(); - let simple_metadata = - SimpleMetadata::from_files(data.files, &PackageName::from_str("pyflyby").unwrap()); + let base = BaseUrl::from(Url::from_str("https://pypi.org/simple/pyflyby/").unwrap()); + let simple_metadata = SimpleMetadata::from_files( + data.files, + &PackageName::from_str("pyflyby").unwrap(), + &base, + ); let versions: Vec = simple_metadata .iter() .map(|(version, _)| version.to_string()) diff --git a/crates/puffin-distribution/src/distribution_database.rs b/crates/puffin-distribution/src/distribution_database.rs index 3a799d239..c9ef5c5f6 100644 --- a/crates/puffin-distribution/src/distribution_database.rs +++ b/crates/puffin-distribution/src/distribution_database.rs @@ -108,11 +108,6 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context> ) -> Result { match &dist { Dist::Built(BuiltDist::Registry(wheel)) => { - let url = wheel - .base - .join_relative(&wheel.file.url) - .map_err(|err| DistributionDatabaseError::Url(wheel.file.url.clone(), err))?; - // Download and unzip on the same tokio task. // // In all wheels we've seen so far, unzipping while downloading is @@ -128,7 +123,7 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context> // for downloading and unzipping (with a buffer in between) and switch // to rayon if this buffer grows large by the time the file is fully // downloaded. - let reader = self.client.stream_external(&url).await?; + let reader = self.client.stream_external(&wheel.file.url).await?; // Download and unzip the wheel to a temporary directory. let temp_dir = tempfile::tempdir_in(self.cache.root())?; diff --git a/crates/puffin-distribution/src/source/mod.rs b/crates/puffin-distribution/src/source/mod.rs index ea6e846d3..eed88d340 100644 --- a/crates/puffin-distribution/src/source/mod.rs +++ b/crates/puffin-distribution/src/source/mod.rs @@ -98,13 +98,6 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { .await? } SourceDist::Registry(registry_source_dist) => { - let url = registry_source_dist - .base - .join_relative(®istry_source_dist.file.url) - .map_err(|err| { - SourceDistError::UrlParse(registry_source_dist.file.url.clone(), err) - })?; - // For registry source distributions, shard by package, then by SHA. // Ex) `pypi/requests/a673187abc19fe6c` let cache_shard = self.build_context.cache().shard( @@ -117,7 +110,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { self.url( source_dist, ®istry_source_dist.file.filename, - &url, + ®istry_source_dist.file.url, &cache_shard, None, ) @@ -161,13 +154,6 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { .await? } SourceDist::Registry(registry_source_dist) => { - let url = registry_source_dist - .base - .join_relative(®istry_source_dist.file.url) - .map_err(|err| { - SourceDistError::UrlParse(registry_source_dist.file.url.clone(), err) - })?; - // For registry source distributions, shard by package, then by SHA. // Ex) `pypi/requests/a673187abc19fe6c` let cache_shard = self.build_context.cache().shard( @@ -180,7 +166,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { self.url_metadata( source_dist, ®istry_source_dist.file.filename, - &url, + ®istry_source_dist.file.url, &cache_shard, None, )