Fix retry counts in cached client

Previously, we dropped the counts from the middleware layer, potentially doing to many retries and/or reporting too few. Not pretty but fixes the bug.
2025-12-12 18:00:44 +01:00 · 2025-12-12 18:00:44 +01:00 · f4a3fac705
parent 7571ffa385
commit f4a3fac705
6 changed files with 158 additions and 101 deletions
--- a/crates/uv-client/src/cached_client.rs
+++ b/crates/uv-client/src/cached_client.rs
@ -119,51 +119,35 @@ where
    }
 }

-/// Dispatch type: Either a cached client error or a (user specified) error from the callback
+/// Dispatch type: Either a cached client error or a (user specified) error from the callback.
+#[derive(Debug)]
 pub enum CachedClientError<CallbackError: std::error::Error + 'static> {
-    Client {
-        retries: Option<u32>,
-        err: Error,
-    },
-    Callback {
-        retries: Option<u32>,
-        err: CallbackError,
-    },
+    /// The client tracks retries internally.
+    Client(Error),
+    /// Track retries before a callback explicitly, as we can't attach them to the callback error
+    /// type.
+    Callback { retries: u32, err: CallbackError },
 }

 impl<CallbackError: std::error::Error + 'static> CachedClientError<CallbackError> {
-    /// Attach the number of retries to the error context.
-    ///
-    /// Adds to existing errors if any, in case different layers retried.
+    /// Attach the combined number of retries to the error context, discarding the previous count.
    fn with_retries(self, retries: u32) -> Self {
        match self {
-            Self::Client {
-                retries: existing_retries,
-                err,
-            } => Self::Client {
-                retries: Some(existing_retries.unwrap_or_default() + retries),
-                err,
-            },
-            Self::Callback {
-                retries: existing_retries,
-                err,
-            } => Self::Callback {
-                retries: Some(existing_retries.unwrap_or_default() + retries),
-                err,
-            },
+            Self::Client(err) => Self::Client(err.with_retries(retries)),
+            Self::Callback { retries: _, err } => Self::Callback { retries, err },
        }
    }

-    fn retries(&self) -> Option<u32> {
+    fn retries(&self) -> u32 {
        match self {
-            Self::Client { retries, .. } => *retries,
+            Self::Client(err) => err.retries(),
            Self::Callback { retries, .. } => *retries,
        }
    }

    fn error(&self) -> &(dyn std::error::Error + 'static) {
        match self {
-            Self::Client { err, .. } => err,
+            Self::Client(err) => err,
            Self::Callback { err, .. } => err,
        }
    }
@ -171,10 +155,7 @@ impl<CallbackError: std::error::Error + 'static> CachedClientError<CallbackError

 impl<CallbackError: std::error::Error + 'static> From<Error> for CachedClientError<CallbackError> {
    fn from(error: Error) -> Self {
-        Self::Client {
-            retries: None,
-            err: error,
-        }
+        Self::Client(error)
    }
 }

@ -182,10 +163,7 @@ impl<CallbackError: std::error::Error + 'static> From<ErrorKind>
    for CachedClientError<CallbackError>
 {
    fn from(error: ErrorKind) -> Self {
-        Self::Client {
-            retries: None,
-            err: error.into(),
-        }
+        Self::Client(error.into())
    }
 }

@ -193,16 +171,10 @@ impl<E: Into<Self> + std::error::Error + 'static> From<CachedClientError<E>> for
    /// Attach retry error context, if there were retries.
    fn from(error: CachedClientError<E>) -> Self {
        match error {
-            CachedClientError::Client {
-                retries: Some(retries),
-                err,
-            } => Self::new(err.into_kind(), retries),
-            CachedClientError::Client { retries: None, err } => err,
-            CachedClientError::Callback {
-                retries: Some(retries),
-                err,
-            } => Self::new(err.into().into_kind(), retries),
-            CachedClientError::Callback { retries: None, err } => err.into(),
+            CachedClientError::Client(error) => error,
+            CachedClientError::Callback { retries, err } => {
+                Self::new(err.into().into_kind(), retries)
+            }
        }
    }
 }
@ -450,10 +422,16 @@ impl CachedClient {
        response_callback: Callback,
    ) -> Result<Payload::Target, CachedClientError<CallBackError>> {
        let new_cache = info_span!("new_cache", file = %cache_entry.path().display());
+        // Capture retries from the retry middleware
+        let retries = response
+            .extensions()
+            .get::<reqwest_retry::RetryCount>()
+            .map(|retries| retries.value())
+            .unwrap_or_default();
        let data = response_callback(response)
            .boxed_local()
            .await
-            .map_err(|err| CachedClientError::Callback { retries: None, err })?;
+            .map_err(|err| CachedClientError::Callback { retries, err })?;
        let Some(cache_policy) = cache_policy else {
            return Ok(data.into_target());
        };
@ -662,16 +640,10 @@ impl CachedClient {
            } else {
                None
            };
-            return Err(CachedClientError::<Error>::Client {
-                retries: retry_count,
-                err: ErrorKind::from_reqwest_with_problem_details(
-                    url,
-                    status_error,
-                    problem_details,
-                )
-                .into(),
-            }
-            .into());
+            return Err(Error::new(
+                ErrorKind::from_reqwest_with_problem_details(url, status_error, problem_details),
+                retry_count.unwrap_or_default(),
+            ));
        }

        let cache_policy = cache_policy_builder.build(&response);
@ -720,7 +692,7 @@ impl CachedClient {
        cache_control: CacheControl<'_>,
        response_callback: Callback,
    ) -> Result<Payload::Target, CachedClientError<CallBackError>> {
-        let mut past_retries = 0;
+        let mut total_retries = 0;
        let start_time = SystemTime::now();
        let retry_policy = self.uncached().retry_policy();
        loop {
@ -729,20 +701,17 @@ impl CachedClient {
                .get_cacheable(fresh_req, cache_entry, cache_control, &response_callback)
                .await;

+            match result {
+                Ok(ok) => return Ok(ok),
+                Err(err) => {
                    // Check if the middleware already performed retries
-            let middleware_retries = match &result {
-                Err(err) => err.retries().unwrap_or_default(),
-                Ok(_) => 0,
-            };
-
-            if result
-                .as_ref()
-                .is_err_and(|err| is_transient_network_error(err.error()))
-            {
+                    total_retries += err.retries();
+                    if is_transient_network_error(err.error()) {
                        // If middleware already retried, consider that in our retry budget
-                let total_retries = past_retries + middleware_retries;
                        let retry_decision = retry_policy.should_retry(start_time, total_retries);
-                if let reqwest_retry::RetryDecision::Retry { execute_after } = retry_decision {
+                        if let reqwest_retry::RetryDecision::Retry { execute_after } =
+                            retry_decision
+                        {
                            let duration = execute_after
                                .duration_since(SystemTime::now())
                                .unwrap_or_else(|_| Duration::default());
@ -753,16 +722,18 @@ impl CachedClient {
                                duration.as_secs_f32(),
                            );
                            tokio::time::sleep(duration).await;
-                    past_retries += 1;
+                            total_retries += 1;
                            continue;
                        }
                    }

-            if past_retries > 0 {
-                return result.map_err(|err| err.with_retries(past_retries));
+                    return if total_retries > 0 {
+                        Err(err.with_retries(total_retries))
+                    } else {
+                        Err(err)
+                    };
+                }
            }
-
-            return result;
        }
    }

@ -791,14 +762,13 @@ impl CachedClient {

            // Check if the middleware already performed retries
            let middleware_retries = match &result {
-                Err(err) => err.retries().unwrap_or_default(),
+                Err(err) => err.retries(),
                _ => 0,
            };

            if result
                .as_ref()
-                .err()
-                .is_some_and(|err| is_transient_network_error(err.error()))
+                .is_err_and(|err| is_transient_network_error(err.error()))
            {
                let total_retries = past_retries + middleware_retries;
                let retry_decision = retry_policy.should_retry(start_time, total_retries);
--- a/crates/uv-client/src/error.rs
+++ b/crates/uv-client/src/error.rs
@ -123,6 +123,11 @@ impl Error {
        &self.kind
    }

+    pub(crate) fn with_retries(mut self, retries: u32) -> Self {
+        self.retries = retries;
+        self
+    }
+
    /// Create a new error from a JSON parsing error.
    pub(crate) fn from_json_err(err: serde_json::Error, url: DisplaySafeUrl) -> Self {
        ErrorKind::BadJson { source: err, url }.into()
--- a/crates/uv-client/src/flat_index.rs
+++ b/crates/uv-client/src/flat_index.rs
@ -246,7 +246,7 @@ impl<'a> FlatIndexClient<'a> {
                    .collect();
                Ok(FlatIndexEntries::from_entries(files))
            }
-            Err(CachedClientError::Client { err, .. }) if err.is_offline() => {
+            Err(CachedClientError::Client(err)) if err.is_offline() => {
                Ok(FlatIndexEntries::offline())
            }
            Err(err) => Err(err.into()),
--- a/crates/uv-distribution/src/distribution_database.rs
+++ b/crates/uv-distribution/src/distribution_database.rs
@ -721,7 +721,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
            .await
            .map_err(|err| match err {
                CachedClientError::Callback { err, .. } => err,
-                CachedClientError::Client { err, .. } => Error::Client(err),
+                CachedClientError::Client(err) => Error::Client(err),
            })?;

        // If the archive is missing the required hashes, or has since been removed, force a refresh.
@ -745,7 +745,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
                        .await
                        .map_err(|err| match err {
                            CachedClientError::Callback { err, .. } => err,
-                            CachedClientError::Client { err, .. } => Error::Client(err),
+                            CachedClientError::Client(err) => Error::Client(err),
                        })
                })
                .await?
@ -926,7 +926,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
            .await
            .map_err(|err| match err {
                CachedClientError::Callback { err, .. } => err,
-                CachedClientError::Client { err, .. } => Error::Client(err),
+                CachedClientError::Client(err) => Error::Client(err),
            })?;

        // If the archive is missing the required hashes, or has since been removed, force a refresh.
@ -950,7 +950,7 @@ impl<'a, Context: BuildContext> DistributionDatabase<'a, Context> {
                        .await
                        .map_err(|err| match err {
                            CachedClientError::Callback { err, .. } => err,
-                            CachedClientError::Client { err, .. } => Error::Client(err),
+                            CachedClientError::Client(err) => Error::Client(err),
                        })
                })
                .await?
--- a/crates/uv-distribution/src/source/mod.rs
+++ b/crates/uv-distribution/src/source/mod.rs
@ -823,7 +823,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
            .await
            .map_err(|err| match err {
                CachedClientError::Callback { err, .. } => err,
-                CachedClientError::Client { err, .. } => Error::Client(err),
+                CachedClientError::Client(err) => Error::Client(err),
            })?;

        // If the archive is missing the required hashes, force a refresh.
@ -843,7 +843,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
                        .await
                        .map_err(|err| match err {
                            CachedClientError::Callback { err, .. } => err,
-                            CachedClientError::Client { err, .. } => Error::Client(err),
+                            CachedClientError::Client(err) => Error::Client(err),
                        })
                })
                .await
@ -2280,7 +2280,7 @@ impl<'a, T: BuildContext> SourceDistributionBuilder<'a, T> {
                    .await
                    .map_err(|err| match err {
                        CachedClientError::Callback { err, .. } => err,
-                        CachedClientError::Client { err, .. } => Error::Client(err),
+                        CachedClientError::Client(err) => Error::Client(err),
                    })
            })
            .await
--- a/crates/uv/tests/it/network.rs
+++ b/crates/uv/tests/it/network.rs
@ -37,6 +37,30 @@ async fn io_error_server() -> (MockServer, String) {
    (server, mock_server_uri)
 }

+/// Answers with a retryable HTTP status 500 for 2 times, then with a retryable connection reset
+/// IO error.
+///
+/// Tests different errors paths inside uv, which retries 3 times by default, for a total for 4
+/// requests.
+async fn mixed_error_server() -> (MockServer, String) {
+    let server = MockServer::start().await;
+
+    Mock::given(method("GET"))
+        .respond_with_err(connection_reset)
+        .up_to_n_times(2)
+        .mount(&server)
+        .await;
+
+    Mock::given(method("GET"))
+        .respond_with(ResponseTemplate::new(StatusCode::INTERNAL_SERVER_ERROR))
+        .up_to_n_times(2)
+        .mount(&server)
+        .await;
+
+    let mock_server_uri = server.uri();
+    (server, mock_server_uri)
+}
+
 /// Check the simple index error message when the server returns HTTP status 500, a retryable error.
 #[tokio::test]
 async fn simple_http_500() {
@ -149,6 +173,35 @@ async fn find_links_io_error() {
    ");
 }

+/// Check the error message for a find links index page, a non-streaming request, when the server
+/// returns different kinds of retryable errors.
+#[tokio::test]
+async fn find_links_mixed_error() {
+    let context = TestContext::new("3.12");
+
+    let (_server_drop_guard, mock_server_uri) = mixed_error_server().await;
+
+    let filters = vec![(mock_server_uri.as_str(), "[SERVER]")];
+    uv_snapshot!(filters, context
+        .pip_install()
+        .arg("tqdm")
+        .arg("--no-index")
+        .arg("--find-links")
+        .arg(&mock_server_uri)
+        .env_remove(EnvVars::UV_HTTP_RETRIES)
+        .env(EnvVars::UV_TEST_NO_HTTP_RETRY_DELAY, "true"), @r"
+    success: false
+    exit_code: 2
+    ----- stdout -----
+
+    ----- stderr -----
+    error: Failed to read `--find-links` URL: [SERVER]/
+      Caused by: Request failed after 3 retries
+      Caused by: Failed to fetch: `[SERVER]/`
+      Caused by: HTTP status server error (500 Internal Server Error) for url ([SERVER]/)
+    ");
+}
+
 /// Check the direct package URL error message when the server returns HTTP status 500, a retryable
 /// error.
 #[tokio::test]
@ -193,7 +246,7 @@ async fn direct_url_io_error() {
        .pip_install()
        .arg(format!("tqdm @ {tqdm_url}"))
        .env_remove(EnvVars::UV_HTTP_RETRIES)
-        .env(EnvVars::UV_TEST_NO_HTTP_RETRY_DELAY, "true"), @r"
+        .env(EnvVars::UV_TEST_NO_HTTP_RETRY_DELAY, "true"), @r#"
    success: false
    exit_code: 1
    ----- stdout -----
@ -205,6 +258,35 @@ async fn direct_url_io_error() {
      ├─▶ error sending request for url ([SERVER]/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl)
      ├─▶ client error (SendRequest)
      ╰─▶ connection closed before message completed
+    "#);
+}
+
+/// Check the error message for direct package URL, a streaming request, when the server returns
+/// different kinds of retryable errors.
+#[tokio::test]
+async fn direct_url_mixed_error() {
+    let context = TestContext::new("3.12");
+
+    let (_server_drop_guard, mock_server_uri) = mixed_error_server().await;
+
+    let tqdm_url = format!(
+        "{mock_server_uri}/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl"
+    );
+    let filters = vec![(mock_server_uri.as_str(), "[SERVER]")];
+    uv_snapshot!(filters, context
+        .pip_install()
+        .arg(format!("tqdm @ {tqdm_url}"))
+        .env_remove(EnvVars::UV_HTTP_RETRIES)
+        .env(EnvVars::UV_TEST_NO_HTTP_RETRY_DELAY, "true"), @r"
+    success: false
+    exit_code: 1
+    ----- stdout -----
+
+    ----- stderr -----
+      × Failed to download `tqdm @ [SERVER]/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl`
+      ├─▶ Request failed after 3 retries
+      ├─▶ Failed to fetch: `[SERVER]/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl`
+      ╰─▶ HTTP status server error (500 Internal Server Error) for url ([SERVER]/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl)
    ");
 }