diff --git a/crates/distribution-types/src/base_url.rs b/crates/distribution-types/src/base_url.rs new file mode 100644 index 000000000..4701bce26 --- /dev/null +++ b/crates/distribution-types/src/base_url.rs @@ -0,0 +1,34 @@ +use serde::{Deserialize, Serialize}; +use url::Url; + +#[derive(Debug, Clone, Hash, Eq, PartialEq, Serialize, Deserialize)] +pub struct BaseUrl(Url); + +impl BaseUrl { + /// Parse the given URL. If it's relative, join it to the current [`BaseUrl`]. Allows for + /// parsing URLs that may be absolute or relative, with a known base URL. + pub fn join_relative(&self, url: &str) -> Result<Url, url::ParseError> { + match Url::parse(url) { + Ok(url) => Ok(url), + Err(err) => { + if err == url::ParseError::RelativeUrlWithoutBase { + self.0.join(url) + } else { + Err(err) + } + } + } + } +} + +impl From<Url> for BaseUrl { + fn from(url: Url) -> Self { + Self(url) + } +} + +impl std::fmt::Display for BaseUrl { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} diff --git a/crates/distribution-types/src/index_url.rs b/crates/distribution-types/src/index_url.rs index 95a0d828d..a6a0dcc2d 100644 --- a/crates/distribution-types/src/index_url.rs +++ b/crates/distribution-types/src/index_url.rs @@ -3,12 +3,13 @@ use std::ops::Deref; use std::str::FromStr; use once_cell::sync::Lazy; +use serde::{Deserialize, Serialize}; use url::Url; static PYPI_URL: Lazy<Url> = Lazy::new(|| Url::parse("https://pypi.org/simple").unwrap()); -/// The url of an index, newtype'd to avoid mixing it with file urls -#[derive(Debug, Clone, Hash, Eq, PartialEq)] +/// The url of an index, newtype'd to avoid mixing it with file urls. 
+#[derive(Debug, Clone, Hash, Eq, PartialEq, Serialize, Deserialize)] pub enum IndexUrl { Pypi, Url(Url), diff --git a/crates/distribution-types/src/lib.rs b/crates/distribution-types/src/lib.rs index c8dbf3d33..64e7f167c 100644 --- a/crates/distribution-types/src/lib.rs +++ b/crates/distribution-types/src/lib.rs @@ -47,6 +47,7 @@ use distribution_filename::WheelFilename; use pep440_rs::Version; use pep508_rs::VerbatimUrl; use puffin_normalize::PackageName; +use pypi_types::BaseUrl; use requirements_txt::EditableRequirement; pub use crate::any::*; @@ -152,6 +153,7 @@ pub struct RegistryBuiltDist { pub version: Version, pub file: File, pub index: IndexUrl, + pub base: BaseUrl, } /// A built distribution (wheel) that exists at an arbitrary URL. @@ -178,6 +180,7 @@ pub struct RegistrySourceDist { pub version: Version, pub file: File, pub index: IndexUrl, + pub base: BaseUrl, } /// A source distribution that exists at an arbitrary URL. @@ -207,7 +210,13 @@ pub struct PathSourceDist { impl Dist { /// Create a [`Dist`] for a registry-based distribution. 
- pub fn from_registry(name: PackageName, version: Version, file: File, index: IndexUrl) -> Self { + pub fn from_registry( + name: PackageName, + version: Version, + file: File, + index: IndexUrl, + base: BaseUrl, + ) -> Self { if Path::new(&file.filename) .extension() .is_some_and(|ext| ext.eq_ignore_ascii_case("whl")) @@ -217,6 +226,7 @@ impl Dist { version, file, index, + base, })) } else { Self::Source(SourceDist::Registry(RegistrySourceDist { @@ -224,6 +234,7 @@ impl Dist { version, file, index, + base, })) } } diff --git a/crates/puffin-client/src/html.rs b/crates/puffin-client/src/html.rs index 9c8916808..d41236979 100644 --- a/crates/puffin-client/src/html.rs +++ b/crates/puffin-client/src/html.rs @@ -4,10 +4,12 @@ use tl::HTMLTag; use url::Url; use pep440_rs::VersionSpecifiers; -use pypi_types::{DistInfoMetadata, File, Hashes, Yanked}; +use pypi_types::{BaseUrl, DistInfoMetadata, File, Hashes, Yanked}; #[derive(Debug, Clone)] pub(crate) struct SimpleHtml { + /// The [`BaseUrl`] to which all relative URLs should be resolved. + pub(crate) base: BaseUrl, /// The list of [`File`]s available for download. pub(crate) files: Vec, } @@ -20,16 +22,17 @@ impl SimpleHtml { // Parse the first `` tag, if any, to determine the base URL to which all // relative URLs should be resolved. The HTML spec requires that the `` tag // appear before other tags with attribute values of URLs. - let base = dom - .nodes() - .iter() - .filter_map(|node| node.as_tag()) - .take_while(|tag| !matches!(tag.name().as_bytes(), b"a" | b"link")) - .find(|tag| tag.name().as_bytes() == b"base") - .map(|base| Self::parse_base(base)) - .transpose()? - .flatten() - .unwrap_or_else(|| url.clone()); + let base = BaseUrl::from( + dom.nodes() + .iter() + .filter_map(|node| node.as_tag()) + .take_while(|tag| !matches!(tag.name().as_bytes(), b"a" | b"link")) + .find(|tag| tag.name().as_bytes() == b"base") + .map(|base| Self::parse_base(base)) + .transpose()? 
+ .flatten() + .unwrap_or_else(|| url.clone()), + ); // Parse each `` tag, to extract the filename, hash, and URL. let files: Vec = dom @@ -37,10 +40,10 @@ impl SimpleHtml { .iter() .filter_map(|node| node.as_tag()) .filter(|link| link.name().as_bytes() == b"a") - .map(|link| Self::parse_anchor(link, &base)) + .map(|link| Self::parse_anchor(link)) .collect::, _>>()?; - Ok(Self { files }) + Ok(Self { base, files }) } /// Parse the `href` from a `` tag. @@ -54,25 +57,25 @@ impl SimpleHtml { } /// Parse the hash from a fragment, as in: `sha256=6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61` - fn parse_hash(fragment: &str, url: &Url) -> Result { + fn parse_hash(fragment: &str) -> Result { let mut parts = fragment.split('='); // Extract the key and value. let name = parts .next() - .ok_or_else(|| Error::FragmentParse(url.clone()))?; + .ok_or_else(|| Error::FragmentParse(fragment.to_string()))?; let value = parts .next() - .ok_or_else(|| Error::FragmentParse(url.clone()))?; + .ok_or_else(|| Error::FragmentParse(fragment.to_string()))?; // Ensure there are no more parts. if parts.next().is_some() { - return Err(Error::FragmentParse(url.clone())); + return Err(Error::FragmentParse(fragment.to_string())); } // TODO(charlie): Support all hash algorithms. if name != "sha256" { - return Err(Error::UnsupportedHashAlgorithm(url.clone())); + return Err(Error::UnsupportedHashAlgorithm(fragment.to_string())); } let sha256 = std::str::from_utf8(value.as_bytes())?; @@ -81,31 +84,30 @@ impl SimpleHtml { } /// Parse a [`File`] from an `` tag. - fn parse_anchor(link: &HTMLTag, base: &Url) -> Result { + fn parse_anchor(link: &HTMLTag) -> Result { // Extract the href. 
let href = link .attributes() .get("href") .flatten() - .ok_or_else(|| Error::MissingHref(base.clone()))?; + .filter(|bytes| !bytes.as_bytes().is_empty()) + .ok_or(Error::MissingHref)?; let href = std::str::from_utf8(href.as_bytes())?; - let url = base - .join(href) - .map_err(|err| Error::UrlParse(href.to_string(), err))?; + + // Split the base and the fragment. + let (path, fragment) = href + .split_once('#') + .ok_or_else(|| Error::MissingHash(href.to_string()))?; // Extract the filename from the body text, which MUST match that of // the final path component of the URL. - let filename = url - .path_segments() - .and_then(|segments| segments.last()) - .ok_or_else(|| Error::MissingFilename(url.clone()))?; + let filename = path + .split('/') + .last() + .ok_or_else(|| Error::MissingFilename(href.to_string()))?; // Extract the hash, which should be in the fragment. - let hashes = url - .fragment() - .map(|fragment| Self::parse_hash(fragment, &url)) - .transpose()? - .ok_or_else(|| Error::MissingHash(url.clone()))?; + let hashes = Self::parse_hash(fragment)?; // Extract the `requires-python` field, which should be set on the // `data-requires-python` attribute. @@ -131,7 +133,7 @@ impl SimpleHtml { match dist_info_metadata.as_ref() { "true" => Some(DistInfoMetadata::Bool(true)), "false" => Some(DistInfoMetadata::Bool(false)), - fragment => Some(DistInfoMetadata::Hashes(Self::parse_hash(fragment, &url)?)), + fragment => Some(DistInfoMetadata::Hashes(Self::parse_hash(fragment)?)), } } else { None @@ -153,8 +155,7 @@ impl SimpleHtml { requires_python, hashes, filename: filename.to_string(), - // TODO(charlie): Store serialized URLs. 
- url: url.to_string(), + url: href.to_string(), size: None, upload_time: None, }) @@ -172,20 +173,20 @@ pub enum Error { #[error(transparent)] HtmlParse(#[from] tl::ParseError), - #[error("Missing href attribute on URL: {0}")] - MissingHref(Url), + #[error("Missing href attribute on anchor link")] + MissingHref, #[error("Expected distribution filename as last path component of URL: {0}")] - MissingFilename(Url), + MissingFilename(String), #[error("Missing hash attribute on URL: {0}")] - MissingHash(Url), + MissingHash(String), #[error("Unexpected fragment (expected `#sha256=...`) on URL: {0}")] - FragmentParse(Url), + FragmentParse(String), #[error("Unsupported hash algorithm (expected `sha256`) on: {0}")] - UnsupportedHashAlgorithm(Url), + UnsupportedHashAlgorithm(String), #[error("Invalid `requires-python` specifier: {0}")] Pep440(#[source] pep440_rs::Pep440Error), @@ -211,6 +212,23 @@ mod tests { let result = SimpleHtml::parse(text, &base).unwrap(); insta::assert_debug_snapshot!(result, @r###" SimpleHtml { + base: BaseUrl( + Url { + scheme: "https", + cannot_be_a_base: false, + username: "", + password: None, + host: Some( + Domain( + "download.pytorch.org", + ), + ), + port: None, + path: "/whl/jinja2/", + query: None, + fragment: None, + }, + ), files: [ File { dist_info_metadata: None, @@ -221,7 +239,7 @@ mod tests { requires_python: None, size: None, upload_time: None, - url: "https://download.pytorch.org/whl/Jinja2-3.1.2-py3-none-any.whl#sha256=6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61", + url: "/whl/Jinja2-3.1.2-py3-none-any.whl#sha256=6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61", yanked: None, }, ], @@ -248,6 +266,23 @@ mod tests { let result = SimpleHtml::parse(text, &base).unwrap(); insta::assert_debug_snapshot!(result, @r###" SimpleHtml { + base: BaseUrl( + Url { + scheme: "https", + cannot_be_a_base: false, + username: "", + password: None, + host: Some( + Domain( + "index.python.org", + ), + ), + port: 
None, + path: "/", + query: None, + fragment: None, + }, + ), files: [ File { dist_info_metadata: None, @@ -258,7 +293,7 @@ mod tests { requires_python: None, size: None, upload_time: None, - url: "https://index.python.org/whl/Jinja2-3.1.2-py3-none-any.whl#sha256=6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61", + url: "/whl/Jinja2-3.1.2-py3-none-any.whl#sha256=6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61", yanked: None, }, ], @@ -280,7 +315,7 @@ mod tests { "#; let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap(); let result = SimpleHtml::parse(text, &base).unwrap_err(); - insta::assert_display_snapshot!(result, @"Missing href attribute on URL: https://download.pytorch.org/whl/jinja2/"); + insta::assert_display_snapshot!(result, @"Missing href attribute on anchor link"); } #[test] @@ -297,7 +332,7 @@ mod tests { "#; let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap(); let result = SimpleHtml::parse(text, &base).unwrap_err(); - insta::assert_display_snapshot!(result, @"Missing hash attribute on URL: https://download.pytorch.org/whl/jinja2/"); + insta::assert_display_snapshot!(result, @"Missing href attribute on anchor link"); } #[test] @@ -314,7 +349,7 @@ mod tests { "#; let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap(); let result = SimpleHtml::parse(text, &base).unwrap_err(); - insta::assert_display_snapshot!(result, @"Missing hash attribute on URL: https://download.pytorch.org/whl/Jinja2-3.1.2-py3-none-any.whl"); + insta::assert_display_snapshot!(result, @"Missing hash attribute on URL: /whl/Jinja2-3.1.2-py3-none-any.whl"); } #[test] @@ -331,7 +366,7 @@ mod tests { "#; let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap(); let result = SimpleHtml::parse(text, &base).unwrap_err(); - insta::assert_display_snapshot!(result, @"Unexpected fragment (expected `#sha256=...`) on URL: 
https://download.pytorch.org/whl/Jinja2-3.1.2-py3-none-any.whl#sha256"); + insta::assert_display_snapshot!(result, @"Unexpected fragment (expected `#sha256=...`) on URL: sha256"); } #[test] @@ -348,6 +383,6 @@ mod tests { "#; let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap(); let result = SimpleHtml::parse(text, &base).unwrap_err(); - insta::assert_display_snapshot!(result, @"Unsupported hash algorithm (expected `sha256`) on: https://download.pytorch.org/whl/Jinja2-3.1.2-py3-none-any.whl#sha512=6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"); + insta::assert_display_snapshot!(result, @"Unsupported hash algorithm (expected `sha256`) on: sha512=6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"); } } diff --git a/crates/puffin-client/src/lib.rs b/crates/puffin-client/src/lib.rs index 32d5e1120..ea925a8fb 100644 --- a/crates/puffin-client/src/lib.rs +++ b/crates/puffin-client/src/lib.rs @@ -1,7 +1,7 @@ pub use cached_client::{CachedClient, CachedClientError, DataWithCachePolicy}; pub use error::Error; pub use registry_client::{ - read_metadata_async, RegistryClient, RegistryClientBuilder, SimpleMetadata, + read_metadata_async, RegistryClient, RegistryClientBuilder, SimpleMetadata, VersionFiles, }; mod cached_client; diff --git a/crates/puffin-client/src/registry_client.rs b/crates/puffin-client/src/registry_client.rs index 84c2613b2..8dd75aeb5 100644 --- a/crates/puffin-client/src/registry_client.rs +++ b/crates/puffin-client/src/registry_client.rs @@ -22,7 +22,7 @@ use install_wheel_rs::find_dist_info; use pep440_rs::Version; use puffin_cache::{Cache, CacheBucket, WheelCache}; use puffin_normalize::PackageName; -use pypi_types::{Metadata21, SimpleJson}; +use pypi_types::{BaseUrl, Metadata21, SimpleJson}; use crate::html::SimpleHtml; use crate::remote_metadata::wheel_metadata_from_remote_zip; @@ -104,6 +104,7 @@ impl RegistryClientBuilder { // TODO(konstin): Clean up the clients once we moved everything to 
common caching. #[derive(Debug, Clone)] pub struct RegistryClient { + /// The index URLs to use for fetching packages. pub(crate) index_urls: IndexUrls, pub(crate) client: CachedClient, /// Don't use this client, it only exists because `async_http_range_reader` needs @@ -126,7 +127,7 @@ impl RegistryClient { pub async fn simple( &self, package_name: &PackageName, - ) -> Result<(IndexUrl, SimpleMetadata), Error> { + ) -> Result<(IndexUrl, BaseUrl, SimpleMetadata), Error> { if self.index_urls.no_index() { return Err(Error::NoIndex(package_name.as_ref().to_string())); } @@ -173,15 +174,16 @@ impl RegistryClient { let bytes = response.bytes().await?; let data: SimpleJson = serde_json::from_slice(bytes.as_ref()) .map_err(|err| Error::from_json_err(err, url.clone()))?; - let metadata = SimpleMetadata::from_files(package_name, data.files); - Ok(metadata) + let metadata = SimpleMetadata::from_files(data.files, package_name); + let base = BaseUrl::from(url.clone()); + Ok((base, metadata)) } MediaType::Html => { let text = response.text().await?; - let data = SimpleHtml::parse(&text, &url) + let SimpleHtml { base, files } = SimpleHtml::parse(&text, &url) .map_err(|err| Error::from_html_err(err, url.clone()))?; - let metadata = SimpleMetadata::from_files(package_name, data.files); - Ok(metadata) + let metadata = SimpleMetadata::from_files(files, package_name); + Ok((base, metadata)) } } }; @@ -191,20 +193,16 @@ impl RegistryClient { .await; // Fetch from the index. 
- match result { - Ok(simple_metadata) => { - return Ok((index.clone(), simple_metadata)); - } + return match result { + Ok((base, metadata)) => Ok((index.clone(), base, metadata)), Err(CachedClientError::Client(Error::RequestError(err))) => { if err.status() == Some(StatusCode::NOT_FOUND) { continue; } - return Err(err.into()); + Err(err.into()) } - Err(err) => { - return Err(err.into()); - } - } + Err(err) => Err(err.into()), + }; } Err(Error::PackageNotFound(package_name.to_string())) @@ -219,7 +217,7 @@ impl RegistryClient { pub async fn wheel_metadata(&self, built_dist: &BuiltDist) -> Result { let metadata = match &built_dist { BuiltDist::Registry(wheel) => { - self.wheel_metadata_registry(wheel.index.clone(), wheel.file.clone()) + self.wheel_metadata_registry(&wheel.index, &wheel.base, &wheel.file) .await? } BuiltDist::DirectUrl(wheel) => { @@ -249,26 +247,27 @@ impl RegistryClient { /// Fetch the metadata from a wheel file. async fn wheel_metadata_registry( &self, - index: IndexUrl, - file: File, + index: &IndexUrl, + base: &BaseUrl, + file: &File, ) -> Result { if self.index_urls.no_index() { - return Err(Error::NoIndex(file.filename)); + return Err(Error::NoIndex(file.filename.clone())); } - // If the metadata file is available at its own url (PEP 658), download it from there - let url = Url::parse(&file.url)?; + // If the metadata file is available at its own url (PEP 658), download it from there. 
+ let url = base.join_relative(&file.url)?; let filename = WheelFilename::from_str(&file.filename)?; if file .dist_info_metadata .as_ref() .is_some_and(pypi_types::DistInfoMetadata::is_available) { - let url = Url::parse(&format!("{}.metadata", file.url))?; + let url = Url::parse(&format!("{}.metadata", url))?; let cache_entry = self.cache.entry( CacheBucket::Wheels, - WheelCache::Index(&index).remote_wheel_dir(filename.name.as_ref()), + WheelCache::Index(index).remote_wheel_dir(filename.name.as_ref()), format!("{}.msgpack", filename.stem()), ); @@ -285,7 +284,7 @@ impl RegistryClient { // If we lack PEP 658 support, try using HTTP range requests to read only the // `.dist-info/METADATA` file from the zip, and if that also fails, download the whole wheel // into the cache and read from there - self.wheel_metadata_no_pep658(&filename, &url, WheelCache::Index(&index)) + self.wheel_metadata_no_pep658(&filename, &url, WheelCache::Index(index)) .await } } @@ -446,7 +445,7 @@ impl SimpleMetadata { self.0.iter() } - fn from_files(package_name: &PackageName, files: Vec) -> Self { + fn from_files(files: Vec, package_name: &PackageName) -> Self { let mut metadata = Self::default(); // Group the distributions by version and kind @@ -504,6 +503,7 @@ impl MediaType { /// Return the `Accept` header value for all supported media types. 
#[inline] const fn accepts() -> &'static str { - "application/vnd.pypi.simple.v1+json, application/vnd.pypi.simple.v1+html;q=0.2, text/html" + // See: https://peps.python.org/pep-0691/#version-format-selection + "application/vnd.pypi.simple.v1+json, application/vnd.pypi.simple.v1+html;q=0.2, text/html;q=0.01" } } diff --git a/crates/puffin-distribution/src/distribution_database.rs b/crates/puffin-distribution/src/distribution_database.rs index 26c5b62a1..878996afd 100644 --- a/crates/puffin-distribution/src/distribution_database.rs +++ b/crates/puffin-distribution/src/distribution_database.rs @@ -30,7 +30,7 @@ use crate::{ #[derive(Debug, Error)] pub enum DistributionDatabaseError { - #[error("Failed to parse '{0}' as url")] + #[error("Failed to parse URL: {0}")] Url(String, #[source] url::ParseError), #[error(transparent)] WheelFilename(#[from] WheelFilenameError), @@ -108,9 +108,11 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context> match &dist { Dist::Built(BuiltDist::Registry(wheel)) => { // Fetch the wheel. - let url = Url::parse(&wheel.file.url).map_err(|err| { - DistributionDatabaseError::Url(wheel.file.url.to_string(), err) - })?; + let url = wheel + .base + .join_relative(&wheel.file.url) + .map_err(|err| DistributionDatabaseError::Url(wheel.file.url.clone(), err))?; + let wheel_filename = WheelFilename::from_str(&wheel.file.filename)?; let reader = self.client.stream_external(&url).await?; diff --git a/crates/puffin-distribution/src/source_dist.rs b/crates/puffin-distribution/src/source_dist.rs index ef82452df..76ebc2640 100644 --- a/crates/puffin-distribution/src/source_dist.rs +++ b/crates/puffin-distribution/src/source_dist.rs @@ -231,9 +231,12 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { .await? 
} SourceDist::Registry(registry_source_dist) => { - let url = Url::parse(&registry_source_dist.file.url).map_err(|err| { - SourceDistError::UrlParse(registry_source_dist.file.url.clone(), err) - })?; + let url = registry_source_dist + .base + .join_relative(&registry_source_dist.file.url) + .map_err(|err| { + SourceDistError::UrlParse(registry_source_dist.file.url.clone(), err) + })?; // For registry source distributions, shard by package, then by SHA. // Ex) `pypi/requests/a673187abc19fe6c` diff --git a/crates/puffin-resolver/src/candidate_selector.rs b/crates/puffin-resolver/src/candidate_selector.rs index ee2cda1d5..1e2759eb5 100644 --- a/crates/puffin-resolver/src/candidate_selector.rs +++ b/crates/puffin-resolver/src/candidate_selector.rs @@ -4,6 +4,7 @@ use rustc_hash::FxHashMap; use distribution_types::{Dist, DistributionMetadata, IndexUrl, Name}; use pep508_rs::{Requirement, VersionOrUrl}; use puffin_normalize::PackageName; +use pypi_types::BaseUrl; use crate::file::DistFile; use crate::prerelease_mode::PreReleaseStrategy; @@ -145,7 +146,7 @@ impl CandidateSelector { } /// Select the first-matching [`Candidate`] from a set of candidate versions and files, - /// preferring wheels over sdists. + /// preferring wheels over source distributions. fn select_candidate<'a>( versions: impl Iterator)>, package_name: &'a PackageName, @@ -242,12 +243,13 @@ impl<'a> Candidate<'a> { } /// Return the [`Dist`] to use when resolving the candidate. 
- pub(crate) fn into_distribution(self, index: IndexUrl) -> Dist { + pub(crate) fn into_distribution(self, index: IndexUrl, base: BaseUrl) -> Dist { Dist::from_registry( self.name().clone(), self.version().clone().into(), self.resolve().clone().into(), index, + base, ) } } diff --git a/crates/puffin-resolver/src/error.rs b/crates/puffin-resolver/src/error.rs index 2b46c8b59..2fd288fe5 100644 --- a/crates/puffin-resolver/src/error.rs +++ b/crates/puffin-resolver/src/error.rs @@ -12,6 +12,7 @@ use pep508_rs::Requirement; use puffin_distribution::DistributionDatabaseError; use puffin_normalize::PackageName; use puffin_traits::OnceMap; +use pypi_types::BaseUrl; use crate::pubgrub::{PubGrubPackage, PubGrubReportFormatter, PubGrubVersion}; use crate::version_map::VersionMap; @@ -144,12 +145,12 @@ impl NoSolutionError { /// Only packages used in the error's derivation tree will be retrieved. pub(crate) fn update_available_versions( mut self, - package_versions: &OnceMap, + package_versions: &OnceMap, ) -> Self { for package in self.derivation_tree.packages() { if let PubGrubPackage::Package(name, ..) 
= package { if let Some(entry) = package_versions.get(name) { - let (_, version_map) = entry.value(); + let (_, _, version_map) = entry.value(); self.available_versions.insert( package.clone(), version_map diff --git a/crates/puffin-resolver/src/finder.rs b/crates/puffin-resolver/src/finder.rs index 5c4219014..b0c409a13 100644 --- a/crates/puffin-resolver/src/finder.rs +++ b/crates/puffin-resolver/src/finder.rs @@ -6,7 +6,7 @@ use anyhow::Result; use futures::{stream, Stream, StreamExt, TryStreamExt}; use rustc_hash::FxHashMap; -use distribution_types::{Dist, IndexUrl, Resolution}; +use distribution_types::{Dist, File, Resolution}; use pep440_rs::Version; use pep508_rs::{Requirement, VersionOrUrl}; use platform_tags::{TagPriority, Tags}; @@ -53,12 +53,18 @@ impl<'a> DistFinder<'a> { match requirement.version_or_url.as_ref() { None | Some(VersionOrUrl::VersionSpecifier(_)) => { // Query the index(es) (cached) to get the URLs for the available files. - let (index, metadata) = self.client.simple(&requirement.name).await?; + let (index, base, metadata) = self.client.simple(&requirement.name).await?; // Pick a version that satisfies the requirement. - let Some(distribution) = self.select(requirement, &index, metadata) else { + let Some(ParsedFile { + name, + version, + file, + }) = self.select(requirement, metadata) + else { return Err(ResolveError::NotFound(requirement.clone())); }; + let distribution = Dist::from_registry(name, version, file, index, base); if let Some(reporter) = self.reporter.as_ref() { reporter.on_progress(&distribution); @@ -103,15 +109,10 @@ impl<'a> DistFinder<'a> { } /// select a version that satisfies the requirement, preferring wheels to source distributions. 
- fn select( - &self, - requirement: &Requirement, - index: &IndexUrl, - metadata: SimpleMetadata, - ) -> Option { + fn select(&self, requirement: &Requirement, metadata: SimpleMetadata) -> Option { let mut best_version: Option = None; - let mut best_wheel: Option<(Dist, TagPriority)> = None; - let mut best_sdist: Option = None; + let mut best_wheel: Option<(ParsedFile, TagPriority)> = None; + let mut best_sdist: Option = None; for (version, files) in metadata.into_iter().rev() { // If we iterated past the first-compatible version, break. @@ -151,7 +152,11 @@ impl<'a> DistFinder<'a> { .map_or(true, |(.., existing)| priority > *existing) { best_wheel = Some(( - Dist::from_registry(wheel.name, wheel.version, file, index.clone()), + ParsedFile { + name: wheel.name, + version: wheel.version, + file, + }, priority, )); } @@ -177,12 +182,11 @@ impl<'a> DistFinder<'a> { } best_version = Some(sdist.version.clone()); - best_sdist = Some(Dist::from_registry( - sdist.name, - sdist.version, + best_sdist = Some(ParsedFile { + name: sdist.name, + version: sdist.version, file, - index.clone(), - )); + }); } } } @@ -191,6 +195,16 @@ impl<'a> DistFinder<'a> { } } +#[derive(Debug)] +struct ParsedFile { + /// The [`PackageName`] extracted from the [`File`]. + name: PackageName, + /// The version extracted from the [`File`]. + version: Version, + /// The underlying [`File`]. + file: File, +} + pub trait Reporter: Send + Sync { /// Callback to invoke when a package is resolved to a specific distribution. 
fn on_progress(&self, dist: &Dist); diff --git a/crates/puffin-resolver/src/pins.rs b/crates/puffin-resolver/src/pins.rs index 7d595b9d5..87da6e959 100644 --- a/crates/puffin-resolver/src/pins.rs +++ b/crates/puffin-resolver/src/pins.rs @@ -2,6 +2,7 @@ use rustc_hash::FxHashMap; use distribution_types::{File, IndexUrl}; use puffin_normalize::PackageName; +use pypi_types::BaseUrl; use crate::candidate_selector::Candidate; @@ -10,14 +11,20 @@ use crate::candidate_selector::Candidate; /// For example, given `Flask==3.0.0`, the [`FilePins`] would contain a mapping from `Flask` to /// `3.0.0` to the specific wheel or source distribution archive that was pinned for that version. #[derive(Debug, Default)] -pub(crate) struct FilePins(FxHashMap>); +pub(crate) struct FilePins( + FxHashMap>, +); impl FilePins { /// Pin a candidate package. - pub(crate) fn insert(&mut self, candidate: &Candidate, index: &IndexUrl) { + pub(crate) fn insert(&mut self, candidate: &Candidate, index: &IndexUrl, base: &BaseUrl) { self.0.entry(candidate.name().clone()).or_default().insert( candidate.version().clone().into(), - (index.clone(), candidate.install().clone().into()), + ( + index.clone(), + base.clone(), + candidate.install().clone().into(), + ), ); } @@ -26,7 +33,7 @@ impl FilePins { &self, name: &PackageName, version: &pep440_rs::Version, - ) -> Option<&(IndexUrl, File)> { + ) -> Option<&(IndexUrl, BaseUrl, File)> { self.0.get(name)?.get(version) } } diff --git a/crates/puffin-resolver/src/resolution.rs b/crates/puffin-resolver/src/resolution.rs index 6e1b23fb2..e9c4fb3a8 100644 --- a/crates/puffin-resolver/src/resolution.rs +++ b/crates/puffin-resolver/src/resolution.rs @@ -55,12 +55,12 @@ impl ResolutionGraph { match package { PubGrubPackage::Package(package_name, None, None) => { let version = Version::from(version.clone()); - let (index, file) = pins + let (index, base, file) = pins .get(package_name, &version) .expect("Every package should be pinned") .clone(); let pinned_package = 
- Dist::from_registry(package_name.clone(), version, file, index); + Dist::from_registry(package_name.clone(), version, file, index, base); let index = petgraph.add_node(pinned_package); inverse.insert(package_name, index); @@ -89,12 +89,12 @@ impl ResolutionGraph { if !metadata.provides_extras.contains(extra) { let version = Version::from(version.clone()); - let (index, file) = pins + let (index, base, file) = pins .get(package_name, &version) .expect("Every package should be pinned") .clone(); let pinned_package = - Dist::from_registry(package_name.clone(), version, file, index); + Dist::from_registry(package_name.clone(), version, file, index, base); diagnostics.push(Diagnostic::MissingExtra { dist: pinned_package, diff --git a/crates/puffin-resolver/src/resolver.rs b/crates/puffin-resolver/src/resolver.rs index cf0b28b06..96927ffe2 100644 --- a/crates/puffin-resolver/src/resolver.rs +++ b/crates/puffin-resolver/src/resolver.rs @@ -28,7 +28,7 @@ use puffin_client::RegistryClient; use puffin_distribution::{DistributionDatabase, DistributionDatabaseError}; use puffin_normalize::PackageName; use puffin_traits::{BuildContext, OnceMap}; -use pypi_types::Metadata21; +use pypi_types::{BaseUrl, Metadata21}; use crate::candidate_selector::CandidateSelector; use crate::error::ResolveError; @@ -44,7 +44,7 @@ use crate::version_map::VersionMap; use crate::yanks::AllowedYanks; use crate::ResolutionOptions; -type VersionMapResponse = Result<(IndexUrl, VersionMap), puffin_client::Error>; +type VersionMapResponse = Result<(IndexUrl, BaseUrl, VersionMap), puffin_client::Error>; type WheelMetadataResponse = Result<(Metadata21, Option), DistributionDatabaseError>; pub trait ResolverProvider: Send + Sync { @@ -113,9 +113,10 @@ impl<'a, Context: BuildContext + Send + Sync> ResolverProvider Box::pin( self.client .simple(package_name) - .map_ok(move |(index, metadata)| { + .map_ok(move |(index, base, metadata)| { ( index, + base, VersionMap::from_metadata( metadata, package_name, @@ 
-479,7 +480,7 @@ impl<'a, Provider: ResolverProvider> Resolver<'a, Provider> { let Some(entry) = self.index.packages.get(package_name) else { continue; }; - let (index, version_map) = entry.value(); + let (index, base, version_map) = entry.value(); // Try to find a compatible version. If there aren't any compatible versions, // short-circuit and return `None`. @@ -490,7 +491,7 @@ impl<'a, Provider: ResolverProvider> Resolver<'a, Provider> { // Emit a request to fetch the metadata for this version. if self.index.distributions.register(&candidate.package_id()) { - let distribution = candidate.into_distribution(index.clone()); + let distribution = candidate.into_distribution(index.clone(), base.clone()); request_sink.unbounded_send(Request::Dist(distribution))?; } } @@ -553,7 +554,7 @@ impl<'a, Provider: ResolverProvider> Resolver<'a, Provider> { PubGrubPackage::Package(package_name, extra, None) => { // Wait for the metadata to be available. let entry = self.index.packages.wait(package_name).await; - let (index, version_map) = entry.value(); + let (index, base, version_map) = entry.value(); if let Some(extra) = extra { debug!( @@ -588,13 +589,13 @@ impl<'a, Provider: ResolverProvider> Resolver<'a, Provider> { // We want to return a package pinned to a specific version; but we _also_ want to // store the exact file that we selected to satisfy that version. - pins.insert(&candidate, index); + pins.insert(&candidate, index, base); let version = candidate.version().clone(); // Emit a request to fetch the metadata for this version. if self.index.distributions.register(&candidate.package_id()) { - let distribution = candidate.into_distribution(index.clone()); + let distribution = candidate.into_distribution(index.clone(), base.clone()); request_sink.unbounded_send(Request::Dist(distribution))?; } @@ -698,9 +699,11 @@ impl<'a, Provider: ResolverProvider> Resolver<'a, Provider> { while let Some(response) = response_stream.next().await { match response? 
{ - Response::Package(package_name, index, version_map) => { + Response::Package(package_name, index, base, version_map) => { trace!("Received package metadata for: {package_name}"); - self.index.packages.done(package_name, (index, version_map)); + self.index + .packages + .done(package_name, (index, base, version_map)); } Response::Dist(Dist::Built(distribution), metadata, ..) => { trace!("Received built distribution metadata for: {distribution}"); @@ -738,12 +741,12 @@ impl<'a, Provider: ResolverProvider> Resolver<'a, Provider> { match request { // Fetch package metadata from the registry. Request::Package(package_name) => { - let (index, metadata) = self + let (index, base, metadata) = self .provider .get_version_map(&package_name) .await .map_err(ResolveError::Client)?; - Ok(Response::Package(package_name, index, metadata)) + Ok(Response::Package(package_name, index, base, metadata)) } Request::Dist(dist) => { @@ -848,7 +851,7 @@ enum Request { #[allow(clippy::large_enum_variant)] enum Response { /// The returned metadata for a package hosted on a registry. - Package(PackageName, IndexUrl, VersionMap), + Package(PackageName, IndexUrl, BaseUrl, VersionMap), /// The returned metadata for a distribution. Dist(Dist, Metadata21, Option), } @@ -858,7 +861,7 @@ enum Response { pub(crate) struct Index { /// A map from package name to the metadata for that package and the index where the metadata /// came from. - pub(crate) packages: OnceMap, + pub(crate) packages: OnceMap, /// A map from distribution SHA to metadata for that distribution. pub(crate) distributions: OnceMap, diff --git a/crates/pypi-types/src/base_url.rs b/crates/pypi-types/src/base_url.rs new file mode 100644 index 000000000..4701bce26 --- /dev/null +++ b/crates/pypi-types/src/base_url.rs @@ -0,0 +1,34 @@ +use serde::{Deserialize, Serialize}; +use url::Url; + +#[derive(Debug, Clone, Hash, Eq, PartialEq, Serialize, Deserialize)] +pub struct BaseUrl(Url); + +impl BaseUrl { + /// Parse the given URL. 
If it's relative, join it to the current [`BaseUrl`]. Allows for + /// parsing URLs that may be absolute or relative, with a known base URL. + pub fn join_relative(&self, url: &str) -> Result<Url, url::ParseError> { + match Url::parse(url) { + Ok(url) => Ok(url), + Err(err) => { + if err == url::ParseError::RelativeUrlWithoutBase { + self.0.join(url) + } else { + Err(err) + } + } + } + } +} + +impl From<Url> for BaseUrl { + fn from(url: Url) -> Self { + Self(url) + } +} + +impl std::fmt::Display for BaseUrl { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} diff --git a/crates/pypi-types/src/lib.rs b/crates/pypi-types/src/lib.rs index 9d165c49f..7db6e0b2e 100644 --- a/crates/pypi-types/src/lib.rs +++ b/crates/pypi-types/src/lib.rs @@ -1,8 +1,10 @@ +pub use base_url::*; pub use direct_url::*; pub use lenient_requirement::*; pub use metadata::*; pub use simple_json::*; +mod base_url; mod direct_url; mod lenient_requirement; mod metadata; diff --git a/crates/pypi-types/src/simple_json.rs b/crates/pypi-types/src/simple_json.rs index acc76126c..6c932900e 100644 --- a/crates/pypi-types/src/simple_json.rs +++ b/crates/pypi-types/src/simple_json.rs @@ -1,8 +1,10 @@ -use chrono::{DateTime, Utc}; -use pep440_rs::VersionSpecifiers; -use serde::{de, Deserialize, Deserializer, Serialize}; use std::str::FromStr; +use chrono::{DateTime, Utc}; +use serde::{de, Deserialize, Deserializer, Serialize}; + +use pep440_rs::VersionSpecifiers; + use crate::lenient_requirement::LenientVersionSpecifiers; #[derive(Debug, Clone, Deserialize)] @@ -10,13 +12,13 @@ pub struct SimpleJson { pub files: Vec<File>, } -/// A single (remote) file belonging to a package, generally either a wheel or a source dist. +/// A single (remote) file belonging to a package, either a wheel or a source distribution. /// /// #[derive(Debug, Clone, Deserialize)] #[serde(rename_all = "kebab-case")] pub struct File { - // Not PEP 691 compliant alias used by pypi + // Non-PEP 691-compliant alias used by PyPI. 
#[serde(alias = "data_dist_info_metadata")] pub dist_info_metadata: Option<DistInfoMetadata>, pub filename: String,