From 09f5884f28b5da517d8bb82234da41466efab8eb Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Tue, 23 Jan 2024 16:22:21 -0500 Subject: [PATCH] Avoid revalidating immutable HTTP responses (#1069) ## Summary If you send a revalidation request to a resource that returns an `immutable` directive, the server apparently returns a 200 instead of a 304? In other words, the server can ignore the revalidation request. This PR adds handling on top of the HTTP cache semantics to respect immutable resources, which is especially useful since all PyPI files are immutable. --- crates/puffin-client/src/cache_headers.rs | 56 +++++++++++++++++++++++ crates/puffin-client/src/cached_client.rs | 31 +++++++++++-- crates/puffin-client/src/lib.rs | 1 + 3 files changed, 84 insertions(+), 4 deletions(-) create mode 100644 crates/puffin-client/src/cache_headers.rs diff --git a/crates/puffin-client/src/cache_headers.rs b/crates/puffin-client/src/cache_headers.rs new file mode 100644 index 000000000..3728a6ed6 --- /dev/null +++ b/crates/puffin-client/src/cache_headers.rs @@ -0,0 +1,56 @@ +use http::HeaderValue; +use rustc_hash::FxHashMap; +use std::collections::hash_map::Entry; + +/// Cache headers from an HTTP response. +#[derive(Debug, Default)] +pub(crate) struct CacheHeaders(FxHashMap, Option>>); + +impl CacheHeaders { + /// Parse the `Cache-Control` header from an HTTP response. + /// + /// See: + pub(crate) fn from_response<'a>( + headers: impl IntoIterator, + ) -> CacheHeaders { + let mut cc = FxHashMap::, Option>>::default(); + let mut is_valid = true; + + for h in headers.into_iter().filter_map(|v| v.to_str().ok()) { + for part in h.split(',') { + // TODO: lame parsing + if part.trim().is_empty() { + continue; + } + let mut kv = part.splitn(2, '='); + let k = kv.next().unwrap().trim(); + if k.is_empty() { + continue; + } + let v = kv.next().map(str::trim); + match cc.entry(k.into()) { + Entry::Occupied(e) => { + // When there is more than one value present for a given directive (e.g., two Expires header fields, multiple Cache-Control: max-age directives), + // the directive's value is considered invalid. Caches are encouraged to consider responses that have invalid freshness information to be stale + if e.get().as_deref() != v { + is_valid = false; + } + } + Entry::Vacant(e) => { + e.insert(v.map(|v| v.trim_matches('"')).map(From::from)); + // TODO: bad unquoting + } + } + } + } + if !is_valid { + cc.insert("must-revalidate".into(), None); + } + Self(cc) + } + + /// Returns `true` if the response has an `immutable` directive. + pub(crate) fn is_immutable(&self) -> bool { + self.0.contains_key("immutable") + } +} diff --git a/crates/puffin-client/src/cached_client.rs b/crates/puffin-client/src/cached_client.rs index e4b59c80b..56cf7463d 100644 --- a/crates/puffin-client/src/cached_client.rs +++ b/crates/puffin-client/src/cached_client.rs @@ -1,7 +1,7 @@ -use futures::FutureExt; use std::future::Future; use std::time::SystemTime; +use futures::FutureExt; use http_cache_semantics::{AfterResponse, BeforeRequest, CachePolicy}; use reqwest::{Request, Response}; use reqwest_middleware::ClientWithMiddleware; @@ -12,6 +12,8 @@ use tracing::{debug, info_span, instrument, trace, warn, Instrument}; use puffin_cache::CacheEntry; use puffin_fs::write_atomic; +use crate::cache_headers::CacheHeaders; + /// Either a cached client error or a (user specified) error from the callback #[derive(Debug)] pub enum CachedClientError { @@ -46,11 +48,15 @@ enum CachedResponse { ModifiedOrNew(Response, Option>), } -/// Serialize the actual payload together with its caching information +/// Serialize the actual payload together with its caching information. #[derive(Debug, Deserialize, Serialize)] pub struct DataWithCachePolicy { pub data: Payload, - // The cache policy is large (448 bytes at time of writing), reduce the stack size + /// Whether the response should be considered immutable. + immutable: bool, + /// The [`CachePolicy`] is used to determine if the response is fresh or stale. + /// The policy is large (448 bytes at time of writing), so we reduce the stack size by + /// boxing it. cache_policy: Box, } @@ -148,11 +154,18 @@ impl CachedClient { .await } CachedResponse::ModifiedOrNew(res, cache_policy) => { + let headers = CacheHeaders::from_response(res.headers().get_all("cache-control")); + let immutable = headers.is_immutable(); + let data = response_callback(res) .await .map_err(|err| CachedClientError::Callback(err))?; if let Some(cache_policy) = cache_policy { - let data_with_cache_policy = DataWithCachePolicy { data, cache_policy }; + let data_with_cache_policy = DataWithCachePolicy { + data, + immutable, + cache_policy, + }; async { fs_err::tokio::create_dir_all(cache_entry.dir()) .await @@ -187,6 +200,12 @@ impl CachedClient { )?; let url = req.url().clone(); let cached_response = if let Some(cached) = cached { + // Avoid sending revalidation requests for immutable responses. + if cached.immutable && !cached.cache_policy.is_stale(SystemTime::now()) { + debug!("Found immutable response for: {url}"); + return Ok(CachedResponse::FreshCache(cached.data)); + } + match cached .cache_policy .before_request(&converted_req, SystemTime::now()) @@ -231,8 +250,12 @@ impl CachedClient { match after_response { AfterResponse::NotModified(new_policy, _parts) => { debug!("Found not-modified response for: {url}"); + let headers = + CacheHeaders::from_response(res.headers().get_all("cache-control")); + let immutable = headers.is_immutable(); CachedResponse::NotModified(DataWithCachePolicy { data: cached.data, + immutable, cache_policy: Box::new(new_policy), }) } diff --git a/crates/puffin-client/src/lib.rs b/crates/puffin-client/src/lib.rs index c25e68d28..70e5fa2c6 100644 --- a/crates/puffin-client/src/lib.rs +++ b/crates/puffin-client/src/lib.rs @@ -5,6 +5,7 @@ pub use registry_client::{ read_metadata_async, RegistryClient, RegistryClientBuilder, SimpleMetadata, VersionFiles, }; +mod cache_headers; mod cached_client; mod error; mod flat_index;