Move simple index queries to `CachedClient` (#504)

Replaces the usage of `http-cache-reqwest` for simple index queries with
our custom cached client, removing `http-cache-reqwest` altogether.

The new cache paths are `<cache>/simple-v0/<index>/<package_name>.json`.
I could not test with a non-pypi index since i'm not aware of any other
json indices (jax and torch are both html indices).

In a future step, we can transform the response to be a
`HashMap<Version, {source_dists: Vec<(SourceDistFilename, File)>,
wheels: Vec<(WheeFilename, File)>}` (independent of python version, this
cache is used by all environments together). This should speed up cache
deserialization a bit, since we don't need to try source dist and wheel
anymore and drop incompatible dists, and it should make building the
`VersionMap` simpler. We can speed this up even further by splitting
into a version lists and the info for each version. I'm mentioning this
because deserialization was a major bottleneck in the rust part of the
old python prototype.

Fixes #481
This commit is contained in:
konsti 2023-11-28 01:11:03 +01:00 committed by GitHub
parent 1142a14f4d
commit 8855f44b5f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 74 additions and 126 deletions

58
Cargo.lock generated
View File

@ -232,7 +232,7 @@ dependencies = [
"futures", "futures",
"http-content-range", "http-content-range",
"itertools 0.11.0", "itertools 0.11.0",
"memmap2 0.9.0", "memmap2",
"reqwest", "reqwest",
"thiserror", "thiserror",
"tokio", "tokio",
@ -308,15 +308,6 @@ dependencies = [
"platform-tags", "platform-tags",
] ]
[[package]]
name = "bincode"
version = "1.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad"
dependencies = [
"serde",
]
[[package]] [[package]]
name = "bisection" name = "bisection"
version = "0.1.0" version = "0.1.0"
@ -410,8 +401,6 @@ dependencies = [
"either", "either",
"futures", "futures",
"hex", "hex",
"libc",
"memmap2 0.5.10",
"miette 5.10.0 (registry+https://github.com/rust-lang/crates.io-index)", "miette 5.10.0 (registry+https://github.com/rust-lang/crates.io-index)",
"reflink-copy", "reflink-copy",
"serde", "serde",
@ -1298,40 +1287,6 @@ dependencies = [
"pin-project-lite", "pin-project-lite",
] ]
[[package]]
name = "http-cache"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57bae69908277d47122334bf6009e514cd3d24e65d0bf2b56f1b45db6ad8864d"
dependencies = [
"async-trait",
"bincode",
"cacache",
"http",
"http-cache-semantics",
"httpdate",
"serde",
"url",
]
[[package]]
name = "http-cache-reqwest"
version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee5a40915f9da8f3feef6f942f9aace74393803c47bce6870f19cd2b5123f27a"
dependencies = [
"anyhow",
"async-trait",
"http",
"http-cache",
"http-cache-semantics",
"reqwest",
"reqwest-middleware",
"serde",
"task-local-extensions",
"url",
]
[[package]] [[package]]
name = "http-cache-semantics" name = "http-cache-semantics"
version = "1.0.1" version = "1.0.1"
@ -1779,15 +1734,6 @@ version = "2.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
[[package]]
name = "memmap2"
version = "0.5.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327"
dependencies = [
"libc",
]
[[package]] [[package]]
name = "memmap2" name = "memmap2"
version = "0.9.0" version = "0.9.0"
@ -2450,7 +2396,6 @@ dependencies = [
"fs-err", "fs-err",
"futures", "futures",
"http", "http",
"http-cache-reqwest",
"http-cache-semantics", "http-cache-semantics",
"install-wheel-rs", "install-wheel-rs",
"puffin-cache", "puffin-cache",
@ -2620,7 +2565,6 @@ dependencies = [
name = "puffin-interpreter" name = "puffin-interpreter"
version = "0.0.1" version = "0.0.1"
dependencies = [ dependencies = [
"anyhow",
"cacache", "cacache",
"fs-err", "fs-err",
"pep440_rs 0.3.12", "pep440_rs 0.3.12",

View File

@ -40,7 +40,6 @@ glob = { version = "0.3.1" }
goblin = { version = "0.7.1" } goblin = { version = "0.7.1" }
hex = { version = "0.4.3" } hex = { version = "0.4.3" }
http = { version = "0.2.9" } http = { version = "0.2.9" }
http-cache-reqwest = { version = "0.12.0" }
http-cache-semantics = { version = "1.0.1" } http-cache-semantics = { version = "1.0.1" }
indicatif = { version = "0.17.7" } indicatif = { version = "0.17.7" }
indoc = { version = "2.0.4" } indoc = { version = "2.0.4" }

View File

@ -16,7 +16,6 @@ async_zip = { workspace = true, features = ["tokio"] }
futures = { workspace = true } futures = { workspace = true }
fs-err = { workspace = true, features = ["tokio"] } fs-err = { workspace = true, features = ["tokio"] }
http = { workspace = true } http = { workspace = true }
http-cache-reqwest = { workspace = true }
http-cache-semantics = { workspace = true } http-cache-semantics = { workspace = true }
reqwest = { workspace = true } reqwest = { workspace = true }
reqwest-middleware = { workspace = true } reqwest-middleware = { workspace = true }

View File

@ -8,7 +8,7 @@ use reqwest_middleware::ClientWithMiddleware;
use serde::de::DeserializeOwned; use serde::de::DeserializeOwned;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use tempfile::NamedTempFile; use tempfile::NamedTempFile;
use tracing::{trace, warn}; use tracing::{debug, trace, warn};
/// Either a cached client error or a (user specified) error from the callback /// Either a cached client error or a (user specified) error from the callback
pub enum CachedClientError<CallbackError> { pub enum CachedClientError<CallbackError> {
@ -93,23 +93,23 @@ impl CachedClient {
&self, &self,
req: Request, req: Request,
cache_dir: &Path, cache_dir: &Path,
filename: &str, cache_file: &str,
response_callback: Callback, response_callback: Callback,
) -> Result<Payload, CachedClientError<CallBackError>> ) -> Result<Payload, CachedClientError<CallBackError>>
where where
Callback: FnOnce(Response) -> CallbackReturn, Callback: FnOnce(Response) -> CallbackReturn,
CallbackReturn: Future<Output = Result<Payload, CallBackError>>, CallbackReturn: Future<Output = Result<Payload, CallBackError>>,
{ {
let cache_file = cache_dir.join(filename); let cache_path = cache_dir.join(cache_file);
let cached = if let Ok(cached) = fs_err::tokio::read(&cache_file).await { let cached = if let Ok(cached) = fs_err::tokio::read(&cache_path).await {
match serde_json::from_slice::<DataWithCachePolicy<Payload>>(&cached) { match serde_json::from_slice::<DataWithCachePolicy<Payload>>(&cached) {
Ok(data) => Some(data), Ok(data) => Some(data),
Err(err) => { Err(err) => {
warn!( warn!(
"Broken cache entry at {}, removing: {err}", "Broken cache entry at {}, removing: {err}",
cache_file.display() cache_path.display()
); );
let _ = fs_err::tokio::remove_file(&cache_file).await; let _ = fs_err::tokio::remove_file(&cache_path).await;
None None
} }
} }
@ -129,7 +129,7 @@ impl CachedClient {
) )
.await .await
.map_err(crate::Error::from)?; .map_err(crate::Error::from)?;
temp_file.persist(cache_file).map_err(crate::Error::from)?; temp_file.persist(cache_path).map_err(crate::Error::from)?;
Ok(data_with_cache_policy.data) Ok(data_with_cache_policy.data)
} }
CachedResponse::ModifiedOrNew(res, cache_policy) => { CachedResponse::ModifiedOrNew(res, cache_policy) => {
@ -148,7 +148,7 @@ impl CachedClient {
) )
.await .await
.map_err(crate::Error::from)?; .map_err(crate::Error::from)?;
temp_file.persist(cache_file).map_err(crate::Error::from)?; temp_file.persist(cache_path).map_err(crate::Error::from)?;
Ok(data_with_cache_policy.data) Ok(data_with_cache_policy.data)
} else { } else {
Ok(data) Ok(data)
@ -169,26 +169,24 @@ impl CachedClient {
req.try_clone() req.try_clone()
.expect("You can't use streaming request bodies with this function"), .expect("You can't use streaming request bodies with this function"),
)?; )?;
let url = req.url().clone();
let cached_response = if let Some(cached) = cached { let cached_response = if let Some(cached) = cached {
match cached match cached
.cache_policy .cache_policy
.before_request(&converted_req, SystemTime::now()) .before_request(&converted_req, SystemTime::now())
{ {
BeforeRequest::Fresh(_) => { BeforeRequest::Fresh(_) => {
trace!("Fresh cache for {}", req.url()); debug!("Found fresh response for: {url}");
CachedResponse::FreshCache(cached.data) CachedResponse::FreshCache(cached.data)
} }
BeforeRequest::Stale { request, matches } => { BeforeRequest::Stale { request, matches } => {
if !matches { if !matches {
// This should not happen // This shouldn't happen; if it does, we'll override the cache.
warn!( warn!("Cached request doesn't match current request for: {url}");
"Cached request doesn't match current request for {}",
req.url(),
);
// This will override the bogus cache
return self.fresh_request(req, converted_req).await; return self.fresh_request(req, converted_req).await;
} }
trace!("Revalidation request for {}", req.url());
debug!("Sending revalidation request for: {url}");
for header in &request.headers { for header in &request.headers {
req.headers_mut().insert(header.0.clone(), header.1.clone()); req.headers_mut().insert(header.0.clone(), header.1.clone());
converted_req converted_req
@ -211,12 +209,14 @@ impl CachedClient {
); );
match after_response { match after_response {
AfterResponse::NotModified(new_policy, _parts) => { AfterResponse::NotModified(new_policy, _parts) => {
debug!("Found not-modified response for: {url}");
CachedResponse::NotModified(DataWithCachePolicy { CachedResponse::NotModified(DataWithCachePolicy {
data: cached.data, data: cached.data,
cache_policy: new_policy, cache_policy: new_policy,
}) })
} }
AfterResponse::Modified(new_policy, _parts) => { AfterResponse::Modified(new_policy, _parts) => {
debug!("Found modified response for: {url}");
CachedResponse::ModifiedOrNew( CachedResponse::ModifiedOrNew(
res, res,
new_policy.is_storable().then_some(new_policy), new_policy.is_storable().then_some(new_policy),
@ -226,6 +226,7 @@ impl CachedClient {
} }
} }
} else { } else {
debug!("Not cached {url}");
// No reusable cache // No reusable cache
self.fresh_request(req, converted_req).await? self.fresh_request(req, converted_req).await?
}; };

View File

@ -1,8 +1,8 @@
pub use cached_client::{CachedClient, CachedClientError, DataWithCachePolicy}; pub use cached_client::{CachedClient, CachedClientError, DataWithCachePolicy};
pub use client::{RegistryClient, RegistryClientBuilder};
pub use error::Error; pub use error::Error;
pub use registry_client::{RegistryClient, RegistryClientBuilder};
mod cached_client; mod cached_client;
mod client;
mod error; mod error;
mod registry_client;
mod remote_metadata; mod remote_metadata;

View File

@ -5,9 +5,7 @@ use std::str::FromStr;
use async_http_range_reader::{AsyncHttpRangeReader, AsyncHttpRangeReaderError}; use async_http_range_reader::{AsyncHttpRangeReader, AsyncHttpRangeReaderError};
use async_zip::tokio::read::seek::ZipFileReader; use async_zip::tokio::read::seek::ZipFileReader;
use futures::TryStreamExt; use futures::TryStreamExt;
use http_cache_reqwest::{CACacheManager, Cache, CacheMode, HttpCache, HttpCacheOptions};
use reqwest::{Client, ClientBuilder, Response, StatusCode}; use reqwest::{Client, ClientBuilder, Response, StatusCode};
use reqwest_middleware::ClientWithMiddleware;
use reqwest_retry::policies::ExponentialBackoff; use reqwest_retry::policies::ExponentialBackoff;
use reqwest_retry::RetryTransientMiddleware; use reqwest_retry::RetryTransientMiddleware;
use tempfile::tempfile; use tempfile::tempfile;
@ -19,13 +17,15 @@ use url::Url;
use distribution_filename::WheelFilename; use distribution_filename::WheelFilename;
use distribution_types::{BuiltDist, Metadata}; use distribution_types::{BuiltDist, Metadata};
use install_wheel_rs::find_dist_info; use install_wheel_rs::find_dist_info;
use puffin_cache::WheelMetadataCache; use puffin_cache::{digest, CanonicalUrl, WheelMetadataCache};
use puffin_normalize::PackageName; use puffin_normalize::PackageName;
use pypi_types::{File, IndexUrl, Metadata21, SimpleJson}; use pypi_types::{File, IndexUrl, Metadata21, SimpleJson};
use crate::remote_metadata::wheel_metadata_from_remote_zip; use crate::remote_metadata::wheel_metadata_from_remote_zip;
use crate::{CachedClient, CachedClientError, Error}; use crate::{CachedClient, CachedClientError, Error};
const SIMPLE_CACHE: &str = "simple-v0";
/// A builder for an [`RegistryClient`]. /// A builder for an [`RegistryClient`].
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct RegistryClientBuilder { pub struct RegistryClientBuilder {
@ -100,34 +100,18 @@ impl RegistryClientBuilder {
let retry_policy = ExponentialBackoff::builder().build_with_max_retries(self.retries); let retry_policy = ExponentialBackoff::builder().build_with_max_retries(self.retries);
let retry_strategy = RetryTransientMiddleware::new_with_policy(retry_policy); let retry_strategy = RetryTransientMiddleware::new_with_policy(retry_policy);
let mut client_builder =
reqwest_middleware::ClientBuilder::new(client_raw.clone()).with(retry_strategy);
client_builder = client_builder.with(Cache(HttpCache {
mode: CacheMode::Default,
manager: CACacheManager {
path: self.cache.clone(),
},
options: HttpCacheOptions::default(),
}));
let retry_policy = ExponentialBackoff::builder().build_with_max_retries(self.retries);
let retry_strategy = RetryTransientMiddleware::new_with_policy(retry_policy);
let uncached_client = reqwest_middleware::ClientBuilder::new(client_raw.clone()) let uncached_client = reqwest_middleware::ClientBuilder::new(client_raw.clone())
.with(retry_strategy) .with(retry_strategy)
.build(); .build();
let cached_client = CachedClient::new(uncached_client.clone()); let client = CachedClient::new(uncached_client.clone());
RegistryClient { RegistryClient {
index: self.index, index: self.index,
extra_index: self.extra_index, extra_index: self.extra_index,
no_index: self.no_index, no_index: self.no_index,
client: client_builder.build(),
client_raw: client_raw.clone(), client_raw: client_raw.clone(),
uncached_client,
cache: self.cache, cache: self.cache,
cached_client, client,
} }
} }
} }
@ -140,20 +124,24 @@ pub struct RegistryClient {
pub(crate) extra_index: Vec<IndexUrl>, pub(crate) extra_index: Vec<IndexUrl>,
/// Ignore the package index, instead relying on local archives and caches. /// Ignore the package index, instead relying on local archives and caches.
pub(crate) no_index: bool, pub(crate) no_index: bool,
pub(crate) client: ClientWithMiddleware, pub(crate) client: CachedClient,
pub(crate) uncached_client: ClientWithMiddleware, /// Don't use this client, it only exists because `async_http_range_reader` needs
/// [`reqwest::Client] instead of [`reqwest_middleware::Client`]
pub(crate) client_raw: Client, pub(crate) client_raw: Client,
pub(crate) cached_client: CachedClient,
/// Used for the remote wheel METADATA cache /// Used for the remote wheel METADATA cache
pub(crate) cache: PathBuf, pub(crate) cache: PathBuf,
} }
impl RegistryClient { impl RegistryClient {
pub fn cached_client(&self) -> &CachedClient { pub fn cached_client(&self) -> &CachedClient {
&self.cached_client &self.client
} }
/// Fetch a package from the `PyPI` simple API. /// Fetch a package from the `PyPI` simple API.
///
/// "simple" here refers to [PEP 503 Simple Repository API](https://peps.python.org/pep-0503/)
/// and [PEP 691 JSON-based Simple API for Python Package Indexes](https://peps.python.org/pep-0691/),
/// which the pypi json api approximately implements.
pub async fn simple(&self, package_name: PackageName) -> Result<(IndexUrl, SimpleJson), Error> { pub async fn simple(&self, package_name: PackageName) -> Result<(IndexUrl, SimpleJson), Error> {
if self.no_index { if self.no_index {
return Err(Error::NoIndex(package_name.as_ref().to_string())); return Err(Error::NoIndex(package_name.as_ref().to_string()));
@ -172,37 +160,54 @@ impl RegistryClient {
url url
); );
let cache_dir = self.cache.join(SIMPLE_CACHE).join(match index {
IndexUrl::Pypi => "pypi".to_string(),
IndexUrl::Url(url) => digest(&CanonicalUrl::new(url)),
});
let cache_file = format!("{}.json", package_name.as_ref());
let simple_request = self
.client
.uncached()
.get(url.clone())
.header("Accept-Encoding", "gzip")
.build()?;
let parse_simple_response = |response: Response| async {
let bytes = response.bytes().await?;
let data: SimpleJson = serde_json::from_slice(bytes.as_ref())
.map_err(|err| Error::from_json_err(err, url))?;
Ok(data)
};
let result = self
.client
.get_cached_with_callback(
simple_request,
&cache_dir,
&cache_file,
parse_simple_response,
)
.await;
// Fetch from the index. // Fetch from the index.
match self.simple_impl(&url).await { match result {
Ok(text) => { Ok(simple_json) => {
let data = serde_json::from_str(&text) return Ok((index.clone(), simple_json));
.map_err(move |e| Error::from_json_err(e, url))?;
return Ok((index.clone(), data));
} }
Err(err) => { Err(CachedClientError::Client(Error::RequestError(err))) => {
if err.status() == Some(StatusCode::NOT_FOUND) { if err.status() == Some(StatusCode::NOT_FOUND) {
continue; continue;
} }
return Err(err.into()); return Err(err.into());
} }
Err(err) => {
return Err(err.into());
}
} }
} }
Err(Error::PackageNotFound(package_name.as_ref().to_string())) Err(Error::PackageNotFound(package_name.as_ref().to_string()))
} }
async fn simple_impl(&self, url: &Url) -> Result<String, reqwest_middleware::Error> {
Ok(self
.client
.get(url.clone())
.header("Accept-Encoding", "gzip")
.send()
.await?
.error_for_status()?
.text()
.await?)
}
/// Fetch the metadata for a remote wheel file. /// Fetch the metadata for a remote wheel file.
/// ///
/// For a remote wheel, we try the following ways to fetch the metadata: /// For a remote wheel, we try the following ways to fetch the metadata:
@ -269,9 +274,9 @@ impl RegistryClient {
Metadata21::parse(response.bytes().await?.as_ref()) Metadata21::parse(response.bytes().await?.as_ref())
.map_err(|err| Error::MetadataParseError(filename, url.to_string(), err)) .map_err(|err| Error::MetadataParseError(filename, url.to_string(), err))
}; };
let req = self.client_raw.get(url.clone()).build()?; let req = self.client.uncached().get(url.clone()).build()?;
Ok(self Ok(self
.cached_client .client
.get_cached_with_callback(req, &cache_dir, &cache_file, response_callback) .get_cached_with_callback(req, &cache_dir, &cache_file, response_callback)
.await?) .await?)
} else { } else {
@ -309,9 +314,9 @@ impl RegistryClient {
Ok(metadata) Ok(metadata)
}; };
let req = self.client_raw.head(url.clone()).build()?; let req = self.client.uncached().head(url.clone()).build()?;
let result = self let result = self
.cached_client .client
.get_cached_with_callback( .get_cached_with_callback(
req, req,
&cache_dir, &cache_dir,
@ -392,7 +397,8 @@ impl RegistryClient {
} }
Ok(Box::new( Ok(Box::new(
self.uncached_client self.client
.uncached()
.get(url.to_string()) .get(url.to_string())
.send() .send()
.await? .await?

View File

@ -17,7 +17,6 @@ pep440_rs = { path = "../pep440-rs" }
pep508_rs = { path = "../pep508-rs", features = ["serde"] } pep508_rs = { path = "../pep508-rs", features = ["serde"] }
platform-host = { path = "../platform-host" } platform-host = { path = "../platform-host" }
anyhow = { workspace = true }
cacache = { workspace = true } cacache = { workspace = true }
fs-err = { workspace = true, features = ["tokio"] } fs-err = { workspace = true, features = ["tokio"] }
serde_json = { workspace = true } serde_json = { workspace = true }