From 26f84e5699abe6e5076e1587f5504cfd71d1e920 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Fri, 31 Jan 2025 15:45:48 -0500 Subject: [PATCH] Percent-decode URLs in canonical comparisons (#11088) ## Summary This PR adds an additional normalization step to `CanonicalUrl` whereby we now percent-decode the path, to ensure that (e.g.) `torch-2.5.1%2Bcpu.cxx11.abi-cp39-cp39-linux_x86_64.whl` and `torch-2.5.1+cpu.cxx11.abi-cp39-cp39-linux_x86_64.whl` are considered equal. Further, when generating the "reinstall" report, we use the canonical URL rather than the verbatim URL. In making this change, I also learned that we don't apply any of the normalization passes to `file://` URLs. I inadvertently removed it in https://github.com/astral-sh/uv/pull/3010/commits/93d606aba20c39149390a9191879e82194ea4e91, since setting the password or URL on ` file://` URL errors -- but now suppress those errors anyway. Closes https://github.com/astral-sh/uv/issues/11082. ## Test Plan - Downloaded a [PyTorch wheel](https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.5.1%2Bcpu.cxx11.abi-cp39-cp39-linux_x86_64.whl) - `python3.9 -m pip install torch-2.5.1+cpu.cxx11.abi-cp39-cp39-linux_x86_64.whl --platform linux_x86_64 --target foo --no-deps` - `cargo run pip install torch-2.5.1+cpu.cxx11.abi-cp39-cp39-linux_x86_64.whl --python-platform linux --python-version 3.9 --target foo --no-deps` - Verified that the package had the `~` symbol for the reinstall. --- Cargo.lock | 2 + crates/uv-cache-key/Cargo.toml | 2 + crates/uv-cache-key/src/canonical_url.rs | 55 +++++++++++++++++++++--- crates/uv-distribution-types/src/any.rs | 54 ++++++++++++++++++----- 4 files changed, 97 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4270aa5f3..12eeb4e27 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4750,8 +4750,10 @@ name = "uv-cache-key" version = "0.0.1" dependencies = [ "hex", + "memchr", "seahash", "url", + "urlencoding", ] [[package]] diff --git a/crates/uv-cache-key/Cargo.toml b/crates/uv-cache-key/Cargo.toml index 7816c1d13..fe8f915d6 100644 --- a/crates/uv-cache-key/Cargo.toml +++ b/crates/uv-cache-key/Cargo.toml @@ -18,5 +18,7 @@ workspace = true [dependencies] hex = { workspace = true } +memchr = { workspace = true } seahash = { workspace = true } url = { workspace = true } +urlencoding = { workspace = true } diff --git a/crates/uv-cache-key/src/canonical_url.rs b/crates/uv-cache-key/src/canonical_url.rs index b46d591bb..5da0e01f7 100644 --- a/crates/uv-cache-key/src/canonical_url.rs +++ b/crates/uv-cache-key/src/canonical_url.rs @@ -1,3 +1,4 @@ +use std::borrow::Cow; use std::fmt::{Debug, Formatter}; use std::hash::{Hash, Hasher}; use std::ops::Deref; @@ -26,11 +27,6 @@ impl CanonicalUrl { return Self(url); } - // If the URL has no host, then it's not a valid URL anyway. - if !url.has_host() { - return Self(url); - } - // Strip credentials. let _ = url.set_password(None); let _ = url.set_username(""); @@ -76,6 +72,23 @@ impl CanonicalUrl { } } + // Decode any percent-encoded characters in the path. + if memchr::memchr(b'%', url.path().as_bytes()).is_some() { + let decoded = url + .path_segments() + .unwrap() + .map(|segment| { + urlencoding::decode(segment) + .unwrap_or(Cow::Borrowed(segment)) + .into_owned() + }) + .collect::>(); + + let mut path_segments = url.path_segments_mut().unwrap(); + path_segments.clear(); + path_segments.extend(decoded); + } + Self(url) } @@ -276,6 +289,38 @@ mod tests { CanonicalUrl::parse("git+https:://github.com/pypa/sample-namespace-packages.git")?, ); + // Two URLs should _not_ be considered equal based on percent-decoding slashes. + assert_ne!( + CanonicalUrl::parse("https://github.com/pypa/sample%2Fnamespace%2Fpackages")?, + CanonicalUrl::parse("https://github.com/pypa/sample/namespace/packages")?, + ); + + // Two URLs should be considered equal regardless of percent-encoding. + assert_eq!( + CanonicalUrl::parse("https://github.com/pypa/sample%2Bnamespace%2Bpackages")?, + CanonicalUrl::parse("https://github.com/pypa/sample+namespace+packages")?, + ); + + // Two URLs should _not_ be considered equal based on percent-decoding slashes. + assert_ne!( + CanonicalUrl::parse( + "file:///home/ferris/my_project%2Fmy_project-0.1.0-py3-none-any.whl" + )?, + CanonicalUrl::parse( + "file:///home/ferris/my_project/my_project-0.1.0-py3-none-any.whl" + )?, + ); + + // Two URLs should be considered equal regardless of percent-encoding. + assert_eq!( + CanonicalUrl::parse( + "file:///home/ferris/my_project/my_project-0.1.0+foo-py3-none-any.whl" + )?, + CanonicalUrl::parse( + "file:///home/ferris/my_project/my_project-0.1.0%2Bfoo-py3-none-any.whl" + )?, + ); + Ok(()) } diff --git a/crates/uv-distribution-types/src/any.rs b/crates/uv-distribution-types/src/any.rs index 1f39535a7..6a29a0ff8 100644 --- a/crates/uv-distribution-types/src/any.rs +++ b/crates/uv-distribution-types/src/any.rs @@ -1,6 +1,8 @@ use std::hash::Hash; +use uv_cache_key::CanonicalUrl; use uv_normalize::PackageName; +use uv_pep440::Version; use crate::cached::CachedDist; use crate::installed::InstalledDist; @@ -8,19 +10,29 @@ use crate::{InstalledMetadata, InstalledVersion, Name}; /// A distribution which is either installable, is a wheel in our cache or is already installed. /// -/// Note equality and hash operations are only based on the name and version, not the kind. +/// Note equality and hash operations are only based on the name and canonical version, not the +/// kind. #[derive(Debug, Clone, Eq)] #[allow(clippy::large_enum_variant)] pub enum LocalDist { - Cached(CachedDist), - Installed(InstalledDist), + Cached(CachedDist, CanonicalVersion), + Installed(InstalledDist, CanonicalVersion), +} + +impl LocalDist { + fn canonical_version(&self) -> &CanonicalVersion { + match self { + Self::Cached(_, version) => version, + Self::Installed(_, version) => version, + } + } } impl Name for LocalDist { fn name(&self) -> &PackageName { match self { - Self::Cached(dist) => dist.name(), - Self::Installed(dist) => dist.name(), + Self::Cached(dist, _) => dist.name(), + Self::Installed(dist, _) => dist.name(), } } } @@ -28,33 +40,53 @@ impl Name for LocalDist { impl InstalledMetadata for LocalDist { fn installed_version(&self) -> InstalledVersion { match self { - Self::Cached(dist) => dist.installed_version(), - Self::Installed(dist) => dist.installed_version(), + Self::Cached(dist, _) => dist.installed_version(), + Self::Installed(dist, _) => dist.installed_version(), } } } impl From for LocalDist { fn from(dist: CachedDist) -> Self { - Self::Cached(dist) + let version = CanonicalVersion::from(dist.installed_version()); + Self::Cached(dist, version) } } impl From for LocalDist { fn from(dist: InstalledDist) -> Self { - Self::Installed(dist) + let version = CanonicalVersion::from(dist.installed_version()); + Self::Installed(dist, version) } } impl Hash for LocalDist { fn hash(&self, state: &mut H) { self.name().hash(state); - self.installed_version().hash(state); + self.canonical_version().hash(state); } } impl PartialEq for LocalDist { fn eq(&self, other: &Self) -> bool { - self.name() == other.name() && self.installed_version() == other.installed_version() + self.name() == other.name() && self.canonical_version() == other.canonical_version() + } +} + +/// Like [`InstalledVersion`], but with [`CanonicalUrl`] to ensure robust URL comparisons. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum CanonicalVersion { + Version(Version), + Url(CanonicalUrl, Version), +} + +impl From> for CanonicalVersion { + fn from(installed_version: InstalledVersion<'_>) -> Self { + match installed_version { + InstalledVersion::Version(version) => Self::Version(version.clone()), + InstalledVersion::Url(url, version) => { + Self::Url(CanonicalUrl::new(url), version.clone()) + } + } } }