From d833910a5d210d72690ed25b03f4211607585f07 Mon Sep 17 00:00:00 2001 From: Ibraheem Ahmed Date: Wed, 10 Jul 2024 05:16:30 -0400 Subject: [PATCH] Avoid reparsing wheel URLs (#4947) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary We currently store wheel URLs in an unparsed state because we don't have a stable parsed representation to use with rykv. Unfortunately this means we end up reparsing unnecessarily in a lot of places, especially when constructing a `Lock`. This PR adds a `UrlString` type that lets us avoid reparsing without losing the validity of the `Url`. ## Test Plan Shaves off another ~10 ms from https://github.com/astral-sh/uv/issues/4860. ``` ➜ transformers hyperfine "../../uv/target/profiling/uv lock" "../../uv/target/profiling/baseline lock" --warmup 3 Benchmark 1: ../../uv/target/profiling/uv lock Time (mean ± σ): 120.9 ms ± 2.5 ms [User: 126.0 ms, System: 80.6 ms] Range (min … max): 116.8 ms … 125.7 ms 23 runs Benchmark 2: ../../uv/target/profiling/baseline lock Time (mean ± σ): 129.9 ms ± 4.2 ms [User: 127.1 ms, System: 86.1 ms] Range (min … max): 123.4 ms … 141.2 ms 23 runs Summary ../../uv/target/profiling/uv lock ran 1.07 ± 0.04 times faster than ../../uv/target/profiling/baseline lock ``` --- crates/distribution-types/src/file.rs | 84 +++++++++++++++---- crates/uv-client/src/registry_client.rs | 2 +- .../src/distribution_database.rs | 4 +- crates/uv-distribution/src/source/mod.rs | 8 +- crates/uv-resolver/src/lock.rs | 16 ++-- ...r__lock__tests__hash_optional_missing.snap | 14 +--- 6 files changed, 84 insertions(+), 44 deletions(-) diff --git a/crates/distribution-types/src/file.rs b/crates/distribution-types/src/file.rs index 0516ddf4f..2674ef114 100644 --- a/crates/distribution-types/src/file.rs +++ b/crates/distribution-types/src/file.rs @@ -1,11 +1,11 @@ -use std::fmt::{Display, Formatter}; +use std::fmt::{self, Display, Formatter}; use std::path::PathBuf; +use std::str::FromStr; use serde::{Deserialize, Serialize}; use url::Url; use pep440_rs::{VersionSpecifiers, VersionSpecifiersParseError}; -use pep508_rs::split_scheme; use pypi_types::{CoreMetadata, HashDigest, Yanked}; /// Error converting [`pypi_types::File`] to [`distribution_type::File`]. @@ -56,10 +56,9 @@ impl File { .map_err(|err| FileConversionError::RequiresPython(err.line().clone(), err))?, size: file.size, upload_time_utc_ms: file.upload_time.map(|dt| dt.timestamp_millis()), - url: if split_scheme(&file.url).is_some() { - FileLocation::AbsoluteUrl(file.url) - } else { - FileLocation::RelativeUrl(base.to_string(), file.url) + url: match Url::parse(&file.url) { + Ok(url) => FileLocation::AbsoluteUrl(url.into()), + Err(_) => FileLocation::RelativeUrl(base.to_string(), file.url), }, yanked: file.yanked, }) @@ -76,7 +75,7 @@ pub enum FileLocation { /// URL relative to the base URL. RelativeUrl(String, String), /// Absolute URL. - AbsoluteUrl(String), + AbsoluteUrl(UrlString), /// Absolute path to a file. Path(#[with(rkyv::with::AsString)] PathBuf), } @@ -107,13 +106,7 @@ impl FileLocation { })?; Ok(joined) } - FileLocation::AbsoluteUrl(ref absolute) => { - let url = Url::parse(absolute).map_err(|err| ToUrlError::InvalidAbsolute { - absolute: absolute.clone(), - err, - })?; - Ok(url) - } + FileLocation::AbsoluteUrl(ref absolute) => Ok(absolute.to_url()), FileLocation::Path(ref path) => { let path = path .to_str() @@ -125,18 +118,79 @@ impl FileLocation { } } } + + /// Convert this location to a URL. + /// + /// This method is identical to [`FileLocation::to_url`] except it avoids parsing absolute URLs + /// as they are already guaranteed to be valid. + pub fn to_url_string(&self) -> Result { + match *self { + FileLocation::AbsoluteUrl(ref absolute) => Ok(absolute.clone()), + _ => Ok(self.to_url()?.into()), + } + } } impl Display for FileLocation { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { Self::RelativeUrl(_base, url) => Display::fmt(&url, f), - Self::AbsoluteUrl(url) => Display::fmt(&url, f), + Self::AbsoluteUrl(url) => Display::fmt(&url.0, f), Self::Path(path) => Display::fmt(&path.display(), f), } } } +/// A [`Url`] represented as a `String`. +/// +/// This type is guaranteed to be a valid URL but avoids being parsed into the [`Url`] type. +#[derive( + Debug, + Clone, + Serialize, + Deserialize, + rkyv::Archive, + rkyv::Deserialize, + rkyv::Serialize, + PartialEq, + Eq, +)] +#[archive(check_bytes)] +#[archive_attr(derive(Debug))] +pub struct UrlString(String); + +impl UrlString { + /// Converts a [`UrlString`] to a [`Url`]. + pub fn to_url(&self) -> Url { + // This conversion can never fail as the only way to construct a `UrlString` is from a `Url`. + Url::from_str(&self.0).unwrap() + } +} + +impl AsRef for UrlString { + fn as_ref(&self) -> &str { + &self.0 + } +} + +impl From for UrlString { + fn from(value: Url) -> Self { + UrlString(value.to_string()) + } +} + +impl From for String { + fn from(value: UrlString) -> Self { + value.0 + } +} + +impl fmt::Display for UrlString { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&self.0, f) + } +} + /// An error that occurs when a `FileLocation` is not a valid URL. #[derive(Clone, Debug, Eq, PartialEq, thiserror::Error)] pub enum ToUrlError { diff --git a/crates/uv-client/src/registry_client.rs b/crates/uv-client/src/registry_client.rs index 29cd70eb0..d0b51ec0d 100644 --- a/crates/uv-client/src/registry_client.rs +++ b/crates/uv-client/src/registry_client.rs @@ -421,7 +421,7 @@ impl RegistryClient { } } FileLocation::AbsoluteUrl(url) => { - let url = Url::parse(url).map_err(ErrorKind::UrlParse)?; + let url = url.to_url(); if url.scheme() == "file" { let path = url .to_file_path() diff --git a/crates/uv-distribution/src/distribution_database.rs b/crates/uv-distribution/src/distribution_database.rs index 710a36d8f..0ba87e723 100644 --- a/crates/uv-distribution/src/distribution_database.rs +++ b/crates/uv-distribution/src/distribution_database.rs @@ -160,9 +160,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> { FileLocation::RelativeUrl(base, url) => { pypi_types::base_url_join_relative(base, url)? } - FileLocation::AbsoluteUrl(url) => { - Url::parse(url).map_err(|err| Error::Url(url.clone(), err))? - } + FileLocation::AbsoluteUrl(url) => url.to_url(), FileLocation::Path(path) => { let cache_entry = self.build_context.cache().entry( CacheBucket::Wheels, diff --git a/crates/uv-distribution/src/source/mod.rs b/crates/uv-distribution/src/source/mod.rs index 802931dcf..92d3ece68 100644 --- a/crates/uv-distribution/src/source/mod.rs +++ b/crates/uv-distribution/src/source/mod.rs @@ -102,9 +102,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> { FileLocation::RelativeUrl(base, url) => { pypi_types::base_url_join_relative(base, url)? } - FileLocation::AbsoluteUrl(url) => { - Url::parse(url).map_err(|err| Error::Url(url.clone(), err))? - } + FileLocation::AbsoluteUrl(url) => url.to_url(), FileLocation::Path(path) => { let url = Url::from_file_path(path) .map_err(|()| Error::RelativePath(path.clone()))?; @@ -280,9 +278,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> { FileLocation::RelativeUrl(base, url) => { pypi_types::base_url_join_relative(base, url)? } - FileLocation::AbsoluteUrl(url) => { - Url::parse(url).map_err(|err| Error::Url(url.clone(), err))? - } + FileLocation::AbsoluteUrl(url) => url.to_url(), FileLocation::Path(path) => { let url = Url::from_file_path(path) .map_err(|()| Error::RelativePath(path.clone()))?; diff --git a/crates/uv-resolver/src/lock.rs b/crates/uv-resolver/src/lock.rs index eccbc5cbc..6d207acdf 100644 --- a/crates/uv-resolver/src/lock.rs +++ b/crates/uv-resolver/src/lock.rs @@ -17,7 +17,7 @@ use distribution_filename::WheelFilename; use distribution_types::{ BuiltDist, DirectUrlBuiltDist, DirectUrlSourceDist, DirectorySourceDist, Dist, FileLocation, GitSourceDist, IndexUrl, PathBuiltDist, PathSourceDist, RegistryBuiltDist, RegistryBuiltWheel, - RegistrySourceDist, RemoteSource, Resolution, ResolvedDist, ToUrlError, + RegistrySourceDist, RemoteSource, Resolution, ResolvedDist, ToUrlError, UrlString, }; use pep440_rs::{Version, VersionSpecifiers}; use pep508_rs::{ @@ -716,7 +716,7 @@ impl Distribution { requires_python: None, size: sdist.size(), upload_time_utc_ms: None, - url: FileLocation::AbsoluteUrl(file_url.to_string()), + url: FileLocation::AbsoluteUrl(file_url.clone().into()), yanked: None, }); let index = IndexUrl::Url(VerbatimUrl::from_url(url.clone())); @@ -1705,7 +1705,7 @@ struct Wheel { /// against was found. The location does not need to exist in the future, /// so this should be treated as only a hint to where to look and/or /// recording where the wheel file originally came from. - url: Url, + url: UrlString, /// A hash of the built distribution. /// /// This is only present for wheels that come from registries and direct @@ -1773,7 +1773,7 @@ impl Wheel { let url = wheel .file .url - .to_url() + .to_url_string() .map_err(LockErrorKind::InvalidFileUrl) .map_err(LockError::from)?; let hash = wheel.file.hashes.first().cloned().map(Hash::from); @@ -1788,7 +1788,7 @@ impl Wheel { fn from_direct_dist(direct_dist: &DirectUrlBuiltDist, hashes: &[HashDigest]) -> Wheel { Wheel { - url: direct_dist.url.to_url(), + url: direct_dist.url.to_url().into(), hash: hashes.first().cloned().map(Hash::from), size: None, filename: direct_dist.filename.clone(), @@ -1797,7 +1797,7 @@ impl Wheel { fn from_path_dist(path_dist: &PathBuiltDist, hashes: &[HashDigest]) -> Wheel { Wheel { - url: path_dist.url.to_url(), + url: path_dist.url.to_url().into(), hash: hashes.first().cloned().map(Hash::from), size: None, filename: path_dist.filename.clone(), @@ -1813,7 +1813,7 @@ impl Wheel { requires_python: None, size: self.size, upload_time_utc_ms: None, - url: FileLocation::AbsoluteUrl(self.url.to_string()), + url: FileLocation::AbsoluteUrl(self.url.clone()), yanked: None, }); let index = IndexUrl::Url(VerbatimUrl::from_url(url.clone())); @@ -1872,7 +1872,7 @@ impl TryFrom for Wheel { .map_err(|err| format!("failed to parse `{filename}` as wheel filename: {err}"))?; Ok(Wheel { - url: wire.url, + url: wire.url.into(), hash: wire.hash, size: wire.size, filename, diff --git a/crates/uv-resolver/src/snapshots/uv_resolver__lock__tests__hash_optional_missing.snap b/crates/uv-resolver/src/snapshots/uv_resolver__lock__tests__hash_optional_missing.snap index 38a0ff753..946a8e0fd 100644 --- a/crates/uv-resolver/src/snapshots/uv_resolver__lock__tests__hash_optional_missing.snap +++ b/crates/uv-resolver/src/snapshots/uv_resolver__lock__tests__hash_optional_missing.snap @@ -19,17 +19,9 @@ Ok( sdist: None, wheels: [ Wheel { - url: Url { - scheme: "file", - cannot_be_a_base: false, - username: "", - password: None, - host: None, - port: None, - path: "/foo/bar/anyio-4.3.0-py3-none-any.whl", - query: None, - fragment: None, - }, + url: UrlString( + "file:///foo/bar/anyio-4.3.0-py3-none-any.whl", + ), hash: Some( Hash( HashDigest {