Wheel metadata refactor (#462)

A consistent cache structure for remote wheel metadata:

 * `<wheel metadata cache>/pypi/foo-1.0.0-py3-none-any.json`
* `<wheel metadata
cache>/<digest(index-url)>/foo-1.0.0-py3-none-any.json`
* `<wheel metadata cache>/url/<digest(url)>/foo-1.0.0-py3-none-any.json`

The source dist caching will use a similar structure (#468).
This commit is contained in:
konsti 2023-11-20 17:26:36 +01:00 committed by GitHub
parent d3e9e1783f
commit f0841cdb6e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
18 changed files with 153 additions and 47 deletions

7
Cargo.lock generated
View File

@ -874,6 +874,7 @@ name = "distribution-types"
version = "0.0.1" version = "0.0.1"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"distribution-filename",
"fs-err", "fs-err",
"pep440_rs 0.3.12", "pep440_rs 0.3.12",
"puffin-cache", "puffin-cache",
@ -2371,8 +2372,10 @@ version = "0.0.1"
dependencies = [ dependencies = [
"clap", "clap",
"directories", "directories",
"distribution-filename",
"fs-err", "fs-err",
"hex", "hex",
"pypi-types",
"seahash", "seahash",
"tempfile", "tempfile",
"url", "url",
@ -2447,6 +2450,7 @@ dependencies = [
"http-cache-reqwest", "http-cache-reqwest",
"http-cache-semantics", "http-cache-semantics",
"install-wheel-rs", "install-wheel-rs",
"puffin-cache",
"puffin-normalize", "puffin-normalize",
"pypi-types", "pypi-types",
"reqwest", "reqwest",
@ -2454,6 +2458,7 @@ dependencies = [
"reqwest-retry", "reqwest-retry",
"serde", "serde",
"serde_json", "serde_json",
"sha2",
"tempfile", "tempfile",
"thiserror", "thiserror",
"tokio", "tokio",
@ -2618,8 +2623,10 @@ dependencies = [
name = "puffin-macros" name = "puffin-macros"
version = "0.0.1" version = "0.0.1"
dependencies = [ dependencies = [
"colored",
"fxhash", "fxhash",
"once_cell", "once_cell",
"tracing",
] ]
[[package]] [[package]]

View File

@ -10,6 +10,7 @@ authors = { workspace = true }
license = { workspace = true } license = { workspace = true }
[dependencies] [dependencies]
distribution-filename = { path = "../distribution-filename" }
pep440_rs = { path = "../pep440-rs" } pep440_rs = { path = "../pep440-rs" }
puffin-cache = { path = "../puffin-cache" } puffin-cache = { path = "../puffin-cache" }
puffin-git = { path = "../puffin-git" } puffin-git = { path = "../puffin-git" }

View File

@ -80,7 +80,7 @@ impl CachedDist {
path, path,
}), }),
Dist::Built(BuiltDist::DirectUrl(dist)) => Self::Url(CachedDirectUrlDist { Dist::Built(BuiltDist::DirectUrl(dist)) => Self::Url(CachedDirectUrlDist {
name: dist.name, name: dist.filename.name,
url: dist.url, url: dist.url,
path, path,
}), }),

View File

@ -1,6 +1,8 @@
use std::path::Path; use std::path::Path;
use std::str::FromStr;
use anyhow::{Context, Result}; use anyhow::{Context, Result};
use distribution_filename::WheelFilename;
use url::Url; use url::Url;
use pep440_rs::Version; use pep440_rs::Version;
@ -68,7 +70,9 @@ pub struct RegistryBuiltDist {
/// A built distribution (wheel) that exists at an arbitrary URL. /// A built distribution (wheel) that exists at an arbitrary URL.
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct DirectUrlBuiltDist { pub struct DirectUrlBuiltDist {
pub name: PackageName, /// We require that wheel urls end in the full wheel filename, e.g.
/// `https://example.org/packages/flask-3.0.0-py3-none-any.whl`
pub filename: WheelFilename,
pub url: Url, pub url: Url,
} }
@ -84,6 +88,8 @@ pub struct RegistrySourceDist {
/// A source distribution that exists at an arbitrary URL. /// A source distribution that exists at an arbitrary URL.
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct DirectUrlSourceDist { pub struct DirectUrlSourceDist {
/// Unlike [`DirectUrlBuiltDist`], we can't require a full filename with a version here, people
/// like using e.g. `foo @ https://github.com/org/repo/archive/master.zip`
pub name: PackageName, pub name: PackageName,
pub url: Url, pub url: Url,
} }
@ -120,13 +126,15 @@ impl Dist {
/// Create a [`Dist`] for a URL-based distribution. /// Create a [`Dist`] for a URL-based distribution.
pub fn from_url(name: PackageName, url: Url) -> Self { pub fn from_url(name: PackageName, url: Url) -> Self {
// The part after the last slash
let filename = url
.path()
.rsplit_once('/')
.map_or(url.path(), |(_path, filename)| filename);
if url.scheme().starts_with("git+") { if url.scheme().starts_with("git+") {
Self::Source(SourceDist::Git(GitSourceDist { name, url })) Self::Source(SourceDist::Git(GitSourceDist { name, url }))
} else if Path::new(url.path()) } else if let Ok(filename) = WheelFilename::from_str(filename) {
.extension() Self::Built(BuiltDist::DirectUrl(DirectUrlBuiltDist { filename, url }))
.is_some_and(|ext| ext.eq_ignore_ascii_case("whl"))
{
Self::Built(BuiltDist::DirectUrl(DirectUrlBuiltDist { name, url }))
} else { } else {
Self::Source(SourceDist::DirectUrl(DirectUrlSourceDist { name, url })) Self::Source(SourceDist::DirectUrl(DirectUrlSourceDist { name, url }))
} }
@ -145,7 +153,7 @@ impl Dist {
match self { match self {
Self::Built(built) => Self::Built(match built { Self::Built(built) => Self::Built(match built {
BuiltDist::DirectUrl(dist) => BuiltDist::DirectUrl(DirectUrlBuiltDist { BuiltDist::DirectUrl(dist) => BuiltDist::DirectUrl(DirectUrlBuiltDist {
name: dist.name, filename: dist.filename,
url, url,
}), }),
dist @ BuiltDist::Registry(_) => dist, dist @ BuiltDist::Registry(_) => dist,
@ -197,7 +205,7 @@ impl Metadata for RegistryBuiltDist {
impl Metadata for DirectUrlBuiltDist { impl Metadata for DirectUrlBuiltDist {
fn name(&self) -> &PackageName { fn name(&self) -> &PackageName {
&self.name &self.filename.name
} }
fn version_or_url(&self) -> VersionOrUrl { fn version_or_url(&self) -> VersionOrUrl {

View File

@ -11,6 +11,9 @@ authors = { workspace = true }
license = { workspace = true } license = { workspace = true }
[dependencies] [dependencies]
distribution-filename = { path = "../distribution-filename" }
pypi-types = { path = "../pypi-types" }
clap = { workspace = true, features = ["derive"], optional = true } clap = { workspace = true, features = ["derive"], optional = true }
directories = { workspace = true } directories = { workspace = true }
fs-err = { workspace = true } fs-err = { workspace = true }

View File

@ -11,6 +11,7 @@ mod cache_key;
mod canonical_url; mod canonical_url;
mod cli; mod cli;
mod digest; mod digest;
pub mod metadata;
/// A trait for types that can be hashed in a stable way across versions and platforms. /// A trait for types that can be hashed in a stable way across versions and platforms.
pub trait StableHash { pub trait StableHash {

View File

@ -0,0 +1,38 @@
use std::path::{Path, PathBuf};
use url::Url;
use pypi_types::IndexUrl;
use crate::{digest, CanonicalUrl};
const WHEEL_METADATA_CACHE: &str = "wheel-metadata-v0";
/// Cache wheel metadata.
///
/// Wheel metadata can come from a remote wheel or from building a source
/// distribution. For a remote wheel, we try the following ways to fetch the metadata:
/// 1. From a [PEP 658](https://peps.python.org/pep-0658/) data-dist-info-metadata url
/// 2. From a remote wheel by partial zip reading
/// 3. From a (temp) download of a remote wheel (this is a fallback, the webserver should support range requests)
pub enum WheelMetadataCache {
Index(IndexUrl),
Url,
}
impl WheelMetadataCache {
/// Cache structure:
/// * `<wheel metadata cache>/pypi/foo-1.0.0-py3-none-any.json`
/// * `<wheel metadata cache>/<digest(index-url)>/foo-1.0.0-py3-none-any.json`
/// * `<wheel metadata cache>/url/<digest(url)>/foo-1.0.0-py3-none-any.json`
pub fn cache_dir(&self, cache: &Path, url: &Url) -> PathBuf {
let cache_root = cache.join(WHEEL_METADATA_CACHE);
match self {
WheelMetadataCache::Index(IndexUrl::Pypi) => cache_root.join("pypi"),
WheelMetadataCache::Index(url) => cache_root
.join("index")
.join(digest(&CanonicalUrl::new(url))),
WheelMetadataCache::Url => cache_root.join("url").join(digest(&CanonicalUrl::new(url))),
}
}
}

View File

@ -6,6 +6,7 @@ edition = "2021"
[dependencies] [dependencies]
distribution-filename = { path = "../distribution-filename" } distribution-filename = { path = "../distribution-filename" }
install-wheel-rs = { path = "../install-wheel-rs" } install-wheel-rs = { path = "../install-wheel-rs" }
puffin-cache = { path = "../puffin-cache" }
puffin-normalize = { path = "../puffin-normalize" } puffin-normalize = { path = "../puffin-normalize" }
pypi-types = { path = "../pypi-types" } pypi-types = { path = "../pypi-types" }
@ -21,6 +22,7 @@ reqwest-middleware = { workspace = true }
reqwest-retry = { workspace = true } reqwest-retry = { workspace = true }
serde = { workspace = true } serde = { workspace = true }
serde_json = { workspace = true } serde_json = { workspace = true }
sha2 = { workspace = true }
thiserror = { workspace = true } thiserror = { workspace = true }
tempfile = { workspace = true } tempfile = { workspace = true }
tokio = { workspace = true, features = ["fs"] } tokio = { workspace = true, features = ["fs"] }

View File

@ -18,14 +18,13 @@ use url::Url;
use distribution_filename::WheelFilename; use distribution_filename::WheelFilename;
use install_wheel_rs::find_dist_info; use install_wheel_rs::find_dist_info;
use puffin_cache::metadata::WheelMetadataCache;
use puffin_normalize::PackageName; use puffin_normalize::PackageName;
use pypi_types::{File, IndexUrl, Metadata21, SimpleJson}; use pypi_types::{File, IndexUrl, Metadata21, SimpleJson};
use crate::cached_client::CachedClient; use crate::cached_client::CachedClient;
use crate::error::Error; use crate::error::Error;
use crate::remote_metadata::{ use crate::remote_metadata::wheel_metadata_from_remote_zip;
wheel_metadata_from_remote_zip, WHEEL_METADATA_FROM_INDEX, WHEEL_METADATA_FROM_ZIP_CACHE,
};
/// A builder for an [`RegistryClient`]. /// A builder for an [`RegistryClient`].
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
@ -41,7 +40,7 @@ pub struct RegistryClientBuilder {
impl RegistryClientBuilder { impl RegistryClientBuilder {
pub fn new(cache: impl Into<PathBuf>) -> Self { pub fn new(cache: impl Into<PathBuf>) -> Self {
Self { Self {
index: IndexUrl::from(Url::parse("https://pypi.org/simple").unwrap()), index: IndexUrl::Pypi,
extra_index: vec![], extra_index: vec![],
no_index: false, no_index: false,
proxy: Url::parse("https://pypi-metadata.ruff.rs").unwrap(), proxy: Url::parse("https://pypi-metadata.ruff.rs").unwrap(),
@ -199,7 +198,7 @@ impl RegistryClient {
} }
/// Fetch the metadata from a wheel file. /// Fetch the metadata from a wheel file.
pub async fn wheel_metadata(&self, file: File) -> Result<Metadata21, Error> { pub async fn wheel_metadata(&self, index: IndexUrl, file: File) -> Result<Metadata21, Error> {
if self.no_index { if self.no_index {
return Err(Error::NoIndex(file.filename)); return Err(Error::NoIndex(file.filename));
} }
@ -209,11 +208,12 @@ impl RegistryClient {
let filename = WheelFilename::from_str(&file.filename)?; let filename = WheelFilename::from_str(&file.filename)?;
if file if file
.dist_info_metadata .dist_info_metadata
.is_some_and(|dist_info_metadata| dist_info_metadata.is_available()) .as_ref()
.is_some_and(pypi_types::Metadata::is_available)
{ {
let url = Url::parse(&format!("{}.metadata", file.url))?; let url = Url::parse(&format!("{}.metadata", file.url))?;
let cache_dir = self.cache.join(WHEEL_METADATA_FROM_INDEX).join("pypi"); let cache_dir = WheelMetadataCache::Index(index).cache_dir(&self.cache, &url);
let cache_file = format!("{}.json", filename.stem()); let cache_file = format!("{}.json", filename.stem());
let response_callback = |response: Response| async { let response_callback = |response: Response| async {
@ -228,17 +228,23 @@ impl RegistryClient {
// `.dist-info/METADATA` file from the zip, and if that also fails, download the whole wheel // `.dist-info/METADATA` file from the zip, and if that also fails, download the whole wheel
// into the cache and read from there // into the cache and read from there
} else { } else {
self.wheel_metadata_no_index(&filename, &url).await self.wheel_metadata_no_pep658(&filename, &url, WheelMetadataCache::Index(index))
.await
} }
} }
/// Get the wheel metadata if it isn't available in an index through PEP 658 /// Get the wheel metadata if it isn't available in an index through PEP 658
pub async fn wheel_metadata_no_index( pub async fn wheel_metadata_no_pep658(
&self, &self,
filename: &WheelFilename, filename: &WheelFilename,
url: &Url, url: &Url,
cache_shard: WheelMetadataCache,
) -> Result<Metadata21, Error> { ) -> Result<Metadata21, Error> {
let cache_dir = self.cache.join(WHEEL_METADATA_FROM_ZIP_CACHE).join("pypi"); if self.no_index {
return Err(Error::NoIndex(url.to_string()));
}
let cache_dir = cache_shard.cache_dir(&self.cache, url);
let cache_file = format!("{}.json", filename.stem()); let cache_file = format!("{}.json", filename.stem());
// This response callback is special, we actually make a number of subsequent requests to // This response callback is special, we actually make a number of subsequent requests to

View File

@ -7,9 +7,6 @@ use install_wheel_rs::find_dist_info;
use crate::Error; use crate::Error;
pub(crate) const WHEEL_METADATA_FROM_INDEX: &str = "wheel-metadat-index-v0";
pub(crate) const WHEEL_METADATA_FROM_ZIP_CACHE: &str = "wheel-metadata-remote-v0";
/// Read the `.dist-info/METADATA` file from a async remote zip reader, so we avoid downloading the /// Read the `.dist-info/METADATA` file from a async remote zip reader, so we avoid downloading the
/// entire wheel just for the one file. /// entire wheel just for the one file.
/// ///

View File

@ -5,6 +5,7 @@ use tempfile::tempdir;
use url::Url; use url::Url;
use distribution_filename::WheelFilename; use distribution_filename::WheelFilename;
use puffin_cache::metadata::WheelMetadataCache;
use puffin_client::RegistryClientBuilder; use puffin_client::RegistryClientBuilder;
#[tokio::test] #[tokio::test]
@ -17,7 +18,11 @@ async fn remote_metadata_with_and_without_cache() -> Result<()> {
let url = "https://files.pythonhosted.org/packages/00/e5/f12a80907d0884e6dff9c16d0c0114d81b8cd07dc3ae54c5e962cc83037e/tqdm-4.66.1-py3-none-any.whl"; let url = "https://files.pythonhosted.org/packages/00/e5/f12a80907d0884e6dff9c16d0c0114d81b8cd07dc3ae54c5e962cc83037e/tqdm-4.66.1-py3-none-any.whl";
let filename = WheelFilename::from_str(url.rsplit_once('/').unwrap().1).unwrap(); let filename = WheelFilename::from_str(url.rsplit_once('/').unwrap().1).unwrap();
let metadata = client let metadata = client
.wheel_metadata_no_index(&filename, &Url::parse(url).unwrap()) .wheel_metadata_no_pep658(
&filename,
&Url::parse(url).unwrap(),
WheelMetadataCache::Url,
)
.await .await
.unwrap(); .unwrap();
assert_eq!(metadata.summary.unwrap(), "Fast, Extensible Progress Meter"); assert_eq!(metadata.summary.unwrap(), "Fast, Extensible Progress Meter");

View File

@ -5,6 +5,7 @@ use url::Url;
use anyhow::Result; use anyhow::Result;
use distribution_filename::WheelFilename; use distribution_filename::WheelFilename;
use puffin_cache::metadata::WheelMetadataCache;
use puffin_cache::{CacheArgs, CacheDir}; use puffin_cache::{CacheArgs, CacheDir};
use puffin_client::RegistryClientBuilder; use puffin_client::RegistryClientBuilder;
@ -28,7 +29,9 @@ pub(crate) async fn wheel_metadata(args: WheelMetadataArgs) -> Result<()> {
.1, .1,
)?; )?;
let metadata = client.wheel_metadata_no_index(&filename, &args.url).await?; let metadata = client
.wheel_metadata_no_pep658(&filename, &args.url, WheelMetadataCache::Url)
.await?;
println!("{metadata:?}"); println!("{metadata:?}");
Ok(()) Ok(())
} }

View File

@ -72,7 +72,9 @@ impl<'a> Fetcher<'a> {
match dist { match dist {
// Fetch the metadata directly from the registry. // Fetch the metadata directly from the registry.
Dist::Built(BuiltDist::Registry(wheel)) => { Dist::Built(BuiltDist::Registry(wheel)) => {
let metadata = client.wheel_metadata(wheel.file.clone()).await?; let metadata = client
.wheel_metadata(wheel.index.clone(), wheel.file.clone())
.await?;
Ok(metadata) Ok(metadata)
} }
// Fetch the distribution, then read the metadata (for built distributions), or build // Fetch the distribution, then read the metadata (for built distributions), or build

View File

@ -157,7 +157,7 @@ impl Graph {
marker: None, marker: None,
}, },
Dist::Built(BuiltDist::DirectUrl(wheel)) => Requirement { Dist::Built(BuiltDist::DirectUrl(wheel)) => Requirement {
name: wheel.name.clone(), name: wheel.filename.name.clone(),
extras: None, extras: None,
version_or_url: Some(VersionOrUrl::Url(wheel.url.clone())), version_or_url: Some(VersionOrUrl::Url(wheel.url.clone())),
marker: None, marker: None,

View File

@ -21,6 +21,7 @@ use distribution_filename::WheelFilename;
use distribution_types::{BuiltDist, Dist, Identifier, Metadata, SourceDist, VersionOrUrl}; use distribution_types::{BuiltDist, Dist, Identifier, Metadata, SourceDist, VersionOrUrl};
use pep508_rs::{MarkerEnvironment, Requirement}; use pep508_rs::{MarkerEnvironment, Requirement};
use platform_tags::Tags; use platform_tags::Tags;
use puffin_cache::metadata::WheelMetadataCache;
use puffin_cache::CanonicalUrl; use puffin_cache::CanonicalUrl;
use puffin_client::RegistryClient; use puffin_client::RegistryClient;
use puffin_distribution::Fetcher; use puffin_distribution::Fetcher;
@ -300,8 +301,8 @@ impl<'a, Context: BuildContext + Send + Sync> Resolver<'a, Context> {
PubGrubPackage::Package(package_name, _extra, Some(url)) => { PubGrubPackage::Package(package_name, _extra, Some(url)) => {
// Emit a request to fetch the metadata for this distribution. // Emit a request to fetch the metadata for this distribution.
if in_flight.insert_url(url) { if in_flight.insert_url(url) {
priorities.add(package_name.clone());
let distribution = Dist::from_url(package_name.clone(), url.clone()); let distribution = Dist::from_url(package_name.clone(), url.clone());
priorities.add(distribution.name().clone());
request_sink.unbounded_send(Request::Dist(distribution))?; request_sink.unbounded_send(Request::Dist(distribution))?;
} }
} }
@ -605,26 +606,32 @@ impl<'a, Context: BuildContext + Send + Sync> Resolver<'a, Context> {
.await .await
} }
// Fetch wheel metadata from the registry if possible. This is a fast-path to avoid // Fetch wheel metadata.
// reading from the cache in the common case: we cache wheel metadata in the HTTP Request::Dist(Dist::Built(distribution)) => {
// cache, rather than downloading the wheel itself. let metadata = match &distribution {
Request::Dist(Dist::Built(BuiltDist::Registry(wheel))) => { BuiltDist::Registry(wheel) => {
let metadata = self self.client
.client .wheel_metadata(wheel.index.clone(), wheel.file.clone())
.wheel_metadata(wheel.file.clone()) .await?
.map_err(ResolveError::Client) }
.await?; BuiltDist::DirectUrl(wheel) => {
if metadata.name != *wheel.name() { self.client
.wheel_metadata_no_pep658(
&wheel.filename,
&wheel.url,
WheelMetadataCache::Url,
)
.await?
}
};
if metadata.name != *distribution.name() {
return Err(ResolveError::NameMismatch { return Err(ResolveError::NameMismatch {
metadata: metadata.name, metadata: metadata.name,
given: wheel.name().clone(), given: distribution.name().clone(),
}); });
} }
Ok(Response::Dist( Ok(Response::Dist(Dist::Built(distribution), metadata, None))
Dist::Built(BuiltDist::Registry(wheel)),
metadata,
None,
))
} }
// Fetch distribution metadata. // Fetch distribution metadata.

View File

@ -1,17 +1,38 @@
use once_cell::sync::Lazy;
use std::ops::Deref;
use url::Url; use url::Url;
static PYPI_URL: Lazy<Url> = Lazy::new(|| Url::parse("https://pypi.org/simple").unwrap());
/// The url of an index, newtype'd to avoid mixing it with file urls /// The url of an index, newtype'd to avoid mixing it with file urls
#[derive(Debug, Clone, Hash, Eq, PartialEq)] #[derive(Debug, Clone, Hash, Eq, PartialEq)]
pub struct IndexUrl(Url); pub enum IndexUrl {
Pypi,
Url(Url),
}
impl From<Url> for IndexUrl { impl From<Url> for IndexUrl {
fn from(url: Url) -> Self { fn from(url: Url) -> Self {
Self(url) Self::Url(url)
} }
} }
impl From<IndexUrl> for Url { impl From<IndexUrl> for Url {
fn from(index: IndexUrl) -> Self { fn from(index: IndexUrl) -> Self {
index.0 match index {
IndexUrl::Pypi => PYPI_URL.clone(),
IndexUrl::Url(url) => url,
}
}
}
impl Deref for IndexUrl {
type Target = Url;
fn deref(&self) -> &Self::Target {
match &self {
IndexUrl::Pypi => &PYPI_URL,
IndexUrl::Url(url) => url,
}
} }
} }

View File

@ -2,7 +2,7 @@ pub use direct_url::{ArchiveInfo, DirectUrl, VcsInfo, VcsKind};
pub use index_url::IndexUrl; pub use index_url::IndexUrl;
pub use lenient_requirement::LenientVersionSpecifiers; pub use lenient_requirement::LenientVersionSpecifiers;
pub use metadata::{Error, Metadata21}; pub use metadata::{Error, Metadata21};
pub use simple_json::{File, SimpleJson, Yanked}; pub use simple_json::{File, Metadata, SimpleJson, Yanked};
mod direct_url; mod direct_url;
mod index_url; mod index_url;

View File

@ -0,0 +1,5 @@
# All kinds of dependencies
flask @ https://files.pythonhosted.org/packages/36/42/015c23096649b908c809c69388a805a571a3bea44362fe87e33fc3afa01f/flask-3.0.0-py3-none-any.whl
django_allauth==0.51.0
pandas
pydantic-extra-types @ git+https://github.com/pydantic/pydantic-extra-types.git