From f0841cdb6ec08f387b4589b0b6b3bd125c0329ab Mon Sep 17 00:00:00 2001 From: konsti Date: Mon, 20 Nov 2023 17:26:36 +0100 Subject: [PATCH] Wheel metadata refactor (#462) A consistent cache structure for remote wheel metadata: * `/pypi/foo-1.0.0-py3-none-any.json` * `//foo-1.0.0-py3-none-any.json` * `/url//foo-1.0.0-py3-none-any.json` The source dist caching will use a similar structure (#468). --- Cargo.lock | 7 ++++ crates/distribution-types/Cargo.toml | 1 + crates/distribution-types/src/cached.rs | 2 +- crates/distribution-types/src/lib.rs | 24 +++++++---- crates/puffin-cache/Cargo.toml | 3 ++ crates/puffin-cache/src/lib.rs | 1 + crates/puffin-cache/src/metadata.rs | 38 +++++++++++++++++ crates/puffin-client/Cargo.toml | 2 + crates/puffin-client/src/client.rs | 26 +++++++----- crates/puffin-client/src/remote_metadata.rs | 3 -- crates/puffin-client/tests/remote_metadata.rs | 7 +++- crates/puffin-dev/src/wheel_metadata.rs | 5 ++- crates/puffin-distribution/src/fetcher.rs | 4 +- crates/puffin-resolver/src/resolution.rs | 2 +- crates/puffin-resolver/src/resolver.rs | 41 +++++++++++-------- crates/pypi-types/src/index_url.rs | 27 ++++++++++-- crates/pypi-types/src/lib.rs | 2 +- scripts/benchmarks/requirements/all-kinds.in | 5 +++ 18 files changed, 153 insertions(+), 47 deletions(-) create mode 100644 crates/puffin-cache/src/metadata.rs create mode 100644 scripts/benchmarks/requirements/all-kinds.in diff --git a/Cargo.lock b/Cargo.lock index f2d29ba6a..ada6967cc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -874,6 +874,7 @@ name = "distribution-types" version = "0.0.1" dependencies = [ "anyhow", + "distribution-filename", "fs-err", "pep440_rs 0.3.12", "puffin-cache", @@ -2371,8 +2372,10 @@ version = "0.0.1" dependencies = [ "clap", "directories", + "distribution-filename", "fs-err", "hex", + "pypi-types", "seahash", "tempfile", "url", @@ -2447,6 +2450,7 @@ dependencies = [ "http-cache-reqwest", "http-cache-semantics", "install-wheel-rs", + "puffin-cache", "puffin-normalize", "pypi-types", "reqwest", @@ -2454,6 +2458,7 @@ dependencies = [ "reqwest-retry", "serde", "serde_json", + "sha2", "tempfile", "thiserror", "tokio", @@ -2618,8 +2623,10 @@ dependencies = [ name = "puffin-macros" version = "0.0.1" dependencies = [ + "colored", "fxhash", "once_cell", + "tracing", ] [[package]] diff --git a/crates/distribution-types/Cargo.toml b/crates/distribution-types/Cargo.toml index 3f36eda97..eaa7a8af6 100644 --- a/crates/distribution-types/Cargo.toml +++ b/crates/distribution-types/Cargo.toml @@ -10,6 +10,7 @@ authors = { workspace = true } license = { workspace = true } [dependencies] +distribution-filename = { path = "../distribution-filename" } pep440_rs = { path = "../pep440-rs" } puffin-cache = { path = "../puffin-cache" } puffin-git = { path = "../puffin-git" } diff --git a/crates/distribution-types/src/cached.rs b/crates/distribution-types/src/cached.rs index 5f81941a7..9209db7e8 100644 --- a/crates/distribution-types/src/cached.rs +++ b/crates/distribution-types/src/cached.rs @@ -80,7 +80,7 @@ impl CachedDist { path, }), Dist::Built(BuiltDist::DirectUrl(dist)) => Self::Url(CachedDirectUrlDist { - name: dist.name, + name: dist.filename.name, url: dist.url, path, }), diff --git a/crates/distribution-types/src/lib.rs b/crates/distribution-types/src/lib.rs index cadbb14f7..486f058de 100644 --- a/crates/distribution-types/src/lib.rs +++ b/crates/distribution-types/src/lib.rs @@ -1,6 +1,8 @@ use std::path::Path; +use std::str::FromStr; use anyhow::{Context, Result}; +use distribution_filename::WheelFilename; use url::Url; use pep440_rs::Version; @@ -68,7 +70,9 @@ pub struct RegistryBuiltDist { /// A built distribution (wheel) that exists at an arbitrary URL. #[derive(Debug, Clone)] pub struct DirectUrlBuiltDist { - pub name: PackageName, + /// We require that wheel urls end in the full wheel filename, e.g. + /// `https://example.org/packages/flask-3.0.0-py3-none-any.whl` + pub filename: WheelFilename, pub url: Url, } @@ -84,6 +88,8 @@ pub struct RegistrySourceDist { /// A source distribution that exists at an arbitrary URL. #[derive(Debug, Clone)] pub struct DirectUrlSourceDist { + /// Unlike [`DirectUrlBuiltDist`], we can't require a full filename with a version here, people + /// like using e.g. `foo @ https://github.com/org/repo/archive/master.zip` pub name: PackageName, pub url: Url, } @@ -120,13 +126,15 @@ impl Dist { /// Create a [`Dist`] for a URL-based distribution. pub fn from_url(name: PackageName, url: Url) -> Self { + // The part after the last slash + let filename = url + .path() + .rsplit_once('/') + .map_or(url.path(), |(_path, filename)| filename); if url.scheme().starts_with("git+") { Self::Source(SourceDist::Git(GitSourceDist { name, url })) - } else if Path::new(url.path()) - .extension() - .is_some_and(|ext| ext.eq_ignore_ascii_case("whl")) - { - Self::Built(BuiltDist::DirectUrl(DirectUrlBuiltDist { name, url })) + } else if let Ok(filename) = WheelFilename::from_str(filename) { + Self::Built(BuiltDist::DirectUrl(DirectUrlBuiltDist { filename, url })) } else { Self::Source(SourceDist::DirectUrl(DirectUrlSourceDist { name, url })) } @@ -145,7 +153,7 @@ impl Dist { match self { Self::Built(built) => Self::Built(match built { BuiltDist::DirectUrl(dist) => BuiltDist::DirectUrl(DirectUrlBuiltDist { - name: dist.name, + filename: dist.filename, url, }), dist @ BuiltDist::Registry(_) => dist, @@ -197,7 +205,7 @@ impl Metadata for RegistryBuiltDist { impl Metadata for DirectUrlBuiltDist { fn name(&self) -> &PackageName { - &self.name + &self.filename.name } fn version_or_url(&self) -> VersionOrUrl { diff --git a/crates/puffin-cache/Cargo.toml b/crates/puffin-cache/Cargo.toml index 3591ec027..ff511b154 100644 --- a/crates/puffin-cache/Cargo.toml +++ b/crates/puffin-cache/Cargo.toml @@ -11,6 +11,9 @@ authors = { workspace = true } license = { workspace = true } [dependencies] +distribution-filename = { path = "../distribution-filename" } +pypi-types = { path = "../pypi-types" } + clap = { workspace = true, features = ["derive"], optional = true } directories = { workspace = true } fs-err = { workspace = true } diff --git a/crates/puffin-cache/src/lib.rs b/crates/puffin-cache/src/lib.rs index 000f87eec..77ecbf56e 100644 --- a/crates/puffin-cache/src/lib.rs +++ b/crates/puffin-cache/src/lib.rs @@ -11,6 +11,7 @@ mod cache_key; mod canonical_url; mod cli; mod digest; +pub mod metadata; /// A trait for types that can be hashed in a stable way across versions and platforms. pub trait StableHash { diff --git a/crates/puffin-cache/src/metadata.rs b/crates/puffin-cache/src/metadata.rs new file mode 100644 index 000000000..9a2ed9fc6 --- /dev/null +++ b/crates/puffin-cache/src/metadata.rs @@ -0,0 +1,38 @@ +use std::path::{Path, PathBuf}; + +use url::Url; + +use pypi_types::IndexUrl; + +use crate::{digest, CanonicalUrl}; + +const WHEEL_METADATA_CACHE: &str = "wheel-metadata-v0"; + +/// Cache wheel metadata. +/// +/// Wheel metadata can come from a remote wheel or from building a source +/// distribution. For a remote wheel, we try the following ways to fetch the metadata: +/// 1. From a [PEP 658](https://peps.python.org/pep-0658/) data-dist-info-metadata url +/// 2. From a remote wheel by partial zip reading +/// 3. From a (temp) download of a remote wheel (this is a fallback, the webserver should support range requests) +pub enum WheelMetadataCache { + Index(IndexUrl), + Url, +} + +impl WheelMetadataCache { + /// Cache structure: + /// * `/pypi/foo-1.0.0-py3-none-any.json` + /// * `//foo-1.0.0-py3-none-any.json` + /// * `/url//foo-1.0.0-py3-none-any.json` + pub fn cache_dir(&self, cache: &Path, url: &Url) -> PathBuf { + let cache_root = cache.join(WHEEL_METADATA_CACHE); + match self { + WheelMetadataCache::Index(IndexUrl::Pypi) => cache_root.join("pypi"), + WheelMetadataCache::Index(url) => cache_root + .join("index") + .join(digest(&CanonicalUrl::new(url))), + WheelMetadataCache::Url => cache_root.join("url").join(digest(&CanonicalUrl::new(url))), + } + } +} diff --git a/crates/puffin-client/Cargo.toml b/crates/puffin-client/Cargo.toml index b44c6ebd0..0593569a7 100644 --- a/crates/puffin-client/Cargo.toml +++ b/crates/puffin-client/Cargo.toml @@ -6,6 +6,7 @@ edition = "2021" [dependencies] distribution-filename = { path = "../distribution-filename" } install-wheel-rs = { path = "../install-wheel-rs" } +puffin-cache = { path = "../puffin-cache" } puffin-normalize = { path = "../puffin-normalize" } pypi-types = { path = "../pypi-types" } @@ -21,6 +22,7 @@ reqwest-middleware = { workspace = true } reqwest-retry = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } +sha2 = { workspace = true } thiserror = { workspace = true } tempfile = { workspace = true } tokio = { workspace = true, features = ["fs"] } diff --git a/crates/puffin-client/src/client.rs b/crates/puffin-client/src/client.rs index cf56e74a6..497e914fe 100644 --- a/crates/puffin-client/src/client.rs +++ b/crates/puffin-client/src/client.rs @@ -18,14 +18,13 @@ use url::Url; use distribution_filename::WheelFilename; use install_wheel_rs::find_dist_info; +use puffin_cache::metadata::WheelMetadataCache; use puffin_normalize::PackageName; use pypi_types::{File, IndexUrl, Metadata21, SimpleJson}; use crate::cached_client::CachedClient; use crate::error::Error; -use crate::remote_metadata::{ - wheel_metadata_from_remote_zip, WHEEL_METADATA_FROM_INDEX, WHEEL_METADATA_FROM_ZIP_CACHE, -}; +use crate::remote_metadata::wheel_metadata_from_remote_zip; /// A builder for an [`RegistryClient`]. #[derive(Debug, Clone)] @@ -41,7 +40,7 @@ pub struct RegistryClientBuilder { impl RegistryClientBuilder { pub fn new(cache: impl Into) -> Self { Self { - index: IndexUrl::from(Url::parse("https://pypi.org/simple").unwrap()), + index: IndexUrl::Pypi, extra_index: vec![], no_index: false, proxy: Url::parse("https://pypi-metadata.ruff.rs").unwrap(), @@ -199,7 +198,7 @@ impl RegistryClient { } /// Fetch the metadata from a wheel file. - pub async fn wheel_metadata(&self, file: File) -> Result { + pub async fn wheel_metadata(&self, index: IndexUrl, file: File) -> Result { if self.no_index { return Err(Error::NoIndex(file.filename)); } @@ -209,11 +208,12 @@ impl RegistryClient { let filename = WheelFilename::from_str(&file.filename)?; if file .dist_info_metadata - .is_some_and(|dist_info_metadata| dist_info_metadata.is_available()) + .as_ref() + .is_some_and(pypi_types::Metadata::is_available) { let url = Url::parse(&format!("{}.metadata", file.url))?; - let cache_dir = self.cache.join(WHEEL_METADATA_FROM_INDEX).join("pypi"); + let cache_dir = WheelMetadataCache::Index(index).cache_dir(&self.cache, &url); let cache_file = format!("{}.json", filename.stem()); let response_callback = |response: Response| async { @@ -228,17 +228,23 @@ impl RegistryClient { // `.dist-info/METADATA` file from the zip, and if that also fails, download the whole wheel // into the cache and read from there } else { - self.wheel_metadata_no_index(&filename, &url).await + self.wheel_metadata_no_pep658(&filename, &url, WheelMetadataCache::Index(index)) + .await } } /// Get the wheel metadata if it isn't available in an index through PEP 658 - pub async fn wheel_metadata_no_index( + pub async fn wheel_metadata_no_pep658( &self, filename: &WheelFilename, url: &Url, + cache_shard: WheelMetadataCache, ) -> Result { - let cache_dir = self.cache.join(WHEEL_METADATA_FROM_ZIP_CACHE).join("pypi"); + if self.no_index { + return Err(Error::NoIndex(url.to_string())); + } + + let cache_dir = cache_shard.cache_dir(&self.cache, url); let cache_file = format!("{}.json", filename.stem()); // This response callback is special, we actually make a number of subsequent requests to diff --git a/crates/puffin-client/src/remote_metadata.rs b/crates/puffin-client/src/remote_metadata.rs index 5c85e4377..b21be3411 100644 --- a/crates/puffin-client/src/remote_metadata.rs +++ b/crates/puffin-client/src/remote_metadata.rs @@ -7,9 +7,6 @@ use install_wheel_rs::find_dist_info; use crate::Error; -pub(crate) const WHEEL_METADATA_FROM_INDEX: &str = "wheel-metadat-index-v0"; -pub(crate) const WHEEL_METADATA_FROM_ZIP_CACHE: &str = "wheel-metadata-remote-v0"; - /// Read the `.dist-info/METADATA` file from a async remote zip reader, so we avoid downloading the /// entire wheel just for the one file. /// diff --git a/crates/puffin-client/tests/remote_metadata.rs b/crates/puffin-client/tests/remote_metadata.rs index 0e5061282..1bbf1b1db 100644 --- a/crates/puffin-client/tests/remote_metadata.rs +++ b/crates/puffin-client/tests/remote_metadata.rs @@ -5,6 +5,7 @@ use tempfile::tempdir; use url::Url; use distribution_filename::WheelFilename; +use puffin_cache::metadata::WheelMetadataCache; use puffin_client::RegistryClientBuilder; #[tokio::test] @@ -17,7 +18,11 @@ async fn remote_metadata_with_and_without_cache() -> Result<()> { let url = "https://files.pythonhosted.org/packages/00/e5/f12a80907d0884e6dff9c16d0c0114d81b8cd07dc3ae54c5e962cc83037e/tqdm-4.66.1-py3-none-any.whl"; let filename = WheelFilename::from_str(url.rsplit_once('/').unwrap().1).unwrap(); let metadata = client - .wheel_metadata_no_index(&filename, &Url::parse(url).unwrap()) + .wheel_metadata_no_pep658( + &filename, + &Url::parse(url).unwrap(), + WheelMetadataCache::Url, + ) .await .unwrap(); assert_eq!(metadata.summary.unwrap(), "Fast, Extensible Progress Meter"); diff --git a/crates/puffin-dev/src/wheel_metadata.rs b/crates/puffin-dev/src/wheel_metadata.rs index 66c491a5e..f35399467 100644 --- a/crates/puffin-dev/src/wheel_metadata.rs +++ b/crates/puffin-dev/src/wheel_metadata.rs @@ -5,6 +5,7 @@ use url::Url; use anyhow::Result; use distribution_filename::WheelFilename; +use puffin_cache::metadata::WheelMetadataCache; use puffin_cache::{CacheArgs, CacheDir}; use puffin_client::RegistryClientBuilder; @@ -28,7 +29,9 @@ pub(crate) async fn wheel_metadata(args: WheelMetadataArgs) -> Result<()> { .1, )?; - let metadata = client.wheel_metadata_no_index(&filename, &args.url).await?; + let metadata = client + .wheel_metadata_no_pep658(&filename, &args.url, WheelMetadataCache::Url) + .await?; println!("{metadata:?}"); Ok(()) } diff --git a/crates/puffin-distribution/src/fetcher.rs b/crates/puffin-distribution/src/fetcher.rs index c38909608..ccf703db3 100644 --- a/crates/puffin-distribution/src/fetcher.rs +++ b/crates/puffin-distribution/src/fetcher.rs @@ -72,7 +72,9 @@ impl<'a> Fetcher<'a> { match dist { // Fetch the metadata directly from the registry. Dist::Built(BuiltDist::Registry(wheel)) => { - let metadata = client.wheel_metadata(wheel.file.clone()).await?; + let metadata = client + .wheel_metadata(wheel.index.clone(), wheel.file.clone()) + .await?; Ok(metadata) } // Fetch the distribution, then read the metadata (for built distributions), or build diff --git a/crates/puffin-resolver/src/resolution.rs b/crates/puffin-resolver/src/resolution.rs index 7fac38dc2..29b75b855 100644 --- a/crates/puffin-resolver/src/resolution.rs +++ b/crates/puffin-resolver/src/resolution.rs @@ -157,7 +157,7 @@ impl Graph { marker: None, }, Dist::Built(BuiltDist::DirectUrl(wheel)) => Requirement { - name: wheel.name.clone(), + name: wheel.filename.name.clone(), extras: None, version_or_url: Some(VersionOrUrl::Url(wheel.url.clone())), marker: None, diff --git a/crates/puffin-resolver/src/resolver.rs b/crates/puffin-resolver/src/resolver.rs index 205b82737..6b90a9a04 100644 --- a/crates/puffin-resolver/src/resolver.rs +++ b/crates/puffin-resolver/src/resolver.rs @@ -21,6 +21,7 @@ use distribution_filename::WheelFilename; use distribution_types::{BuiltDist, Dist, Identifier, Metadata, SourceDist, VersionOrUrl}; use pep508_rs::{MarkerEnvironment, Requirement}; use platform_tags::Tags; +use puffin_cache::metadata::WheelMetadataCache; use puffin_cache::CanonicalUrl; use puffin_client::RegistryClient; use puffin_distribution::Fetcher; @@ -300,8 +301,8 @@ impl<'a, Context: BuildContext + Send + Sync> Resolver<'a, Context> { PubGrubPackage::Package(package_name, _extra, Some(url)) => { // Emit a request to fetch the metadata for this distribution. if in_flight.insert_url(url) { - priorities.add(package_name.clone()); let distribution = Dist::from_url(package_name.clone(), url.clone()); + priorities.add(distribution.name().clone()); request_sink.unbounded_send(Request::Dist(distribution))?; } } @@ -605,26 +606,32 @@ impl<'a, Context: BuildContext + Send + Sync> Resolver<'a, Context> { .await } - // Fetch wheel metadata from the registry if possible. This is a fast-path to avoid - // reading from the cache in the common case: we cache wheel metadata in the HTTP - // cache, rather than downloading the wheel itself. - Request::Dist(Dist::Built(BuiltDist::Registry(wheel))) => { - let metadata = self - .client - .wheel_metadata(wheel.file.clone()) - .map_err(ResolveError::Client) - .await?; - if metadata.name != *wheel.name() { + // Fetch wheel metadata. + Request::Dist(Dist::Built(distribution)) => { + let metadata = match &distribution { + BuiltDist::Registry(wheel) => { + self.client + .wheel_metadata(wheel.index.clone(), wheel.file.clone()) + .await? + } + BuiltDist::DirectUrl(wheel) => { + self.client + .wheel_metadata_no_pep658( + &wheel.filename, + &wheel.url, + WheelMetadataCache::Url, + ) + .await? + } + }; + + if metadata.name != *distribution.name() { return Err(ResolveError::NameMismatch { metadata: metadata.name, - given: wheel.name().clone(), + given: distribution.name().clone(), }); } - Ok(Response::Dist( - Dist::Built(BuiltDist::Registry(wheel)), - metadata, - None, - )) + Ok(Response::Dist(Dist::Built(distribution), metadata, None)) } // Fetch distribution metadata. diff --git a/crates/pypi-types/src/index_url.rs b/crates/pypi-types/src/index_url.rs index fa5c3f217..4edfcbd8b 100644 --- a/crates/pypi-types/src/index_url.rs +++ b/crates/pypi-types/src/index_url.rs @@ -1,17 +1,38 @@ +use once_cell::sync::Lazy; +use std::ops::Deref; use url::Url; +static PYPI_URL: Lazy = Lazy::new(|| Url::parse("https://pypi.org/simple").unwrap()); + /// The url of an index, newtype'd to avoid mixing it with file urls #[derive(Debug, Clone, Hash, Eq, PartialEq)] -pub struct IndexUrl(Url); +pub enum IndexUrl { + Pypi, + Url(Url), +} impl From for IndexUrl { fn from(url: Url) -> Self { - Self(url) + Self::Url(url) } } impl From for Url { fn from(index: IndexUrl) -> Self { - index.0 + match index { + IndexUrl::Pypi => PYPI_URL.clone(), + IndexUrl::Url(url) => url, + } + } +} + +impl Deref for IndexUrl { + type Target = Url; + + fn deref(&self) -> &Self::Target { + match &self { + IndexUrl::Pypi => &PYPI_URL, + IndexUrl::Url(url) => url, + } } } diff --git a/crates/pypi-types/src/lib.rs b/crates/pypi-types/src/lib.rs index e8c5623c0..6b9c271d3 100644 --- a/crates/pypi-types/src/lib.rs +++ b/crates/pypi-types/src/lib.rs @@ -2,7 +2,7 @@ pub use direct_url::{ArchiveInfo, DirectUrl, VcsInfo, VcsKind}; pub use index_url::IndexUrl; pub use lenient_requirement::LenientVersionSpecifiers; pub use metadata::{Error, Metadata21}; -pub use simple_json::{File, SimpleJson, Yanked}; +pub use simple_json::{File, Metadata, SimpleJson, Yanked}; mod direct_url; mod index_url; diff --git a/scripts/benchmarks/requirements/all-kinds.in b/scripts/benchmarks/requirements/all-kinds.in new file mode 100644 index 000000000..9e233dae6 --- /dev/null +++ b/scripts/benchmarks/requirements/all-kinds.in @@ -0,0 +1,5 @@ +# All kinds of dependencies +flask @ https://files.pythonhosted.org/packages/36/42/015c23096649b908c809c69388a805a571a3bea44362fe87e33fc3afa01f/flask-3.0.0-py3-none-any.whl +django_allauth==0.51.0 +pandas +pydantic-extra-types @ git+https://github.com/pydantic/pydantic-extra-types.git