From 71964ec7a8d3428d06ea19a3af1d55ee7e3c20a8 Mon Sep 17 00:00:00 2001 From: konsti Date: Sat, 16 Dec 2023 22:01:35 +0100 Subject: [PATCH] Switch to msgpack in the cached client (#662) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This gives a 1.23 speedup on transformers-extras. We could change to msgpack for the entire cache if we want. I only tried this format and postcard so far, where postcard was much slower (like 1.6s). I don't actually want to merge it like this, i wanted to figure out the ballpark of improvement for switching away from json. ``` hyperfine --warmup 3 --runs 10 "target/profiling/puffin pip-compile --cache-dir cache-msgpack scripts/requirements/transformers-extras.in" "target/profiling/branch pip-compile scripts/requirements/transformers-extras.in" Benchmark 1: target/profiling/puffin pip-compile --cache-dir cache-msgpack scripts/requirements/transformers-extras.in Time (mean ± σ): 179.1 ms ± 4.8 ms [User: 157.5 ms, System: 48.1 ms] Range (min … max): 174.9 ms … 188.1 ms 10 runs Benchmark 2: target/profiling/branch pip-compile scripts/requirements/transformers-extras.in Time (mean ± σ): 221.1 ms ± 6.7 ms [User: 208.1 ms, System: 46.5 ms] Range (min … max): 213.5 ms … 235.5 ms 10 runs Summary target/profiling/puffin pip-compile --cache-dir cache-msgpack scripts/requirements/transformers-extras.in ran 1.23 ± 0.05 times faster than target/profiling/branch pip-compile scripts/requirements/transformers-extras.in ``` Disadvantage: We can't manually look into the cache anymore to debug things - [ ] Check more formats, i currently only tested json, msgpack and postcard, there should be other formats, too - [x] Switch over `CachedByTimestamp` serialization (for the interpreter caching) - [x] Switch over error handling and make sure puffin is still resilient to cache failure --- Cargo.lock | 32 +++++++++++- Cargo.toml | 1 + crates/puffin-cache/src/lib.rs | 52 +++++++++---------- crates/puffin-client/Cargo.toml | 5 +- crates/puffin-client/src/cached_client.rs | 6 +-- crates/puffin-client/src/error.rs | 9 ++-- crates/puffin-client/src/registry_client.rs | 6 +-- crates/puffin-distribution/Cargo.toml | 2 +- crates/puffin-distribution/src/source_dist.rs | 27 +++++----- crates/puffin-interpreter/Cargo.toml | 3 +- crates/puffin-interpreter/src/interpreter.rs | 36 ++++++++----- crates/puffin-interpreter/src/lib.rs | 4 ++ 12 files changed, 117 insertions(+), 66 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 29b5e5c06..58adb429f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1971,6 +1971,12 @@ dependencies = [ "windows-targets 0.48.5", ] +[[package]] +name = "paste" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" + [[package]] name = "pep440_rs" version = "0.3.12" @@ -2373,6 +2379,7 @@ dependencies = [ "reqwest", "reqwest-middleware", "reqwest-retry", + "rmp-serde", "serde", "serde_json", "sha2", @@ -2474,9 +2481,9 @@ dependencies = [ "puffin-traits", "pypi-types", "reqwest", + "rmp-serde", "rustc-hash", "serde", - "serde_json", "tempfile", "thiserror", "tokio", @@ -2571,6 +2578,7 @@ dependencies = [ "platform-host", "puffin-cache", "puffin-fs", + "rmp-serde", "serde", "serde_json", "tempfile", @@ -3091,6 +3099,28 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "rmp" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f9860a6cc38ed1da53456442089b4dfa35e7cedaa326df63017af88385e6b20" +dependencies = [ + "byteorder", + "num-traits", + "paste", +] + +[[package]] +name = "rmp-serde" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bffea85eea980d8a74453e5d02a8d93028f3c34725de143085a844ebe953258a" +dependencies = [ + "byteorder", + "rmp", + "serde", +] + [[package]] name = "rustc-demangle" version = "0.1.23" diff --git a/Cargo.toml b/Cargo.toml index b75616122..6e3ba4da6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -64,6 +64,7 @@ reqwest = { version = "0.11.22", default-features = false, features = ["json", " reqwest-middleware = { version = "0.2.4" } reqwest-retry = { version = "0.3.0" } rfc2047-decoder = { version = "1.0.1" } +rmp-serde = { version = "1.1.2" } rustc-hash = { version = "1.1.0" } seahash = { version = "4.1.0" } serde = { version = "1.0.190" } diff --git a/crates/puffin-cache/src/lib.rs b/crates/puffin-cache/src/lib.rs index 9212dae74..8171c0cca 100644 --- a/crates/puffin-cache/src/lib.rs +++ b/crates/puffin-cache/src/lib.rs @@ -173,15 +173,15 @@ impl Cache { pub enum CacheBucket { /// Wheels (excluding built wheels), alongside their metadata and cache policy. /// - /// There are three kinds from cache entries: Wheel metadata and policy as JSON files, the + /// There are three kinds from cache entries: Wheel metadata and policy as MsgPack files, the /// wheels themselves, and the unzipped wheel archives. If a wheel file is over an in-memory /// size threshold, we first download the zip file into the cache, then unzip it into a /// directory with the same name (exclusive of the `.whl` extension). /// /// Cache structure: - /// * `wheel-metadata-v0/pypi/foo/{foo-1.0.0-py3-none-any.json, foo-1.0.0-py3-none-any.whl}` - /// * `wheel-metadata-v0//foo/{foo-1.0.0-py3-none-any.json, foo-1.0.0-py3-none-any.whl}` - /// * `wheel-metadata-v0/url//foo/{foo-1.0.0-py3-none-any.json, foo-1.0.0-py3-none-any.whl}` + /// * `wheel-metadata-v0/pypi/foo/{foo-1.0.0-py3-none-any.msgpack, foo-1.0.0-py3-none-any.whl}` + /// * `wheel-metadata-v0//foo/{foo-1.0.0-py3-none-any.msgpack, foo-1.0.0-py3-none-any.whl}` + /// * `wheel-metadata-v0/url//foo/{foo-1.0.0-py3-none-any.msgpack, foo-1.0.0-py3-none-any.whl}` /// /// See `puffin_client::RegistryClient::wheel_metadata` for information on how wheel metadata /// is fetched. @@ -203,12 +203,12 @@ pub enum CacheBucket { /// ├── pypi /// │ ... /// │ ├── pandas - /// │ │ └── pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.json + /// │ │ └── pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.msgpack /// │ ... /// └── url /// └── 4b8be67c801a7ecb /// └── flask - /// └── flask-3.0.0-py3-none-any.json + /// └── flask-3.0.0-py3-none-any.msgpack /// ``` /// /// We get the following `requirement.txt` from `pip-compile`: @@ -250,7 +250,7 @@ pub enum CacheBucket { /// ├── pypi /// │ ├── ... /// │ ├── pandas - /// │ │ ├── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.json + /// │ │ ├── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.msgpack /// │ │ ├── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl /// │ │ └── pandas-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64 /// │ │ ├── pandas @@ -262,8 +262,8 @@ pub enum CacheBucket { /// └── url /// └── 4b8be67c801a7ecb /// └── flask - /// ├── flask-3.0.0-py3-none-any.json - /// ├── flask-3.0.0-py3-none-any.json + /// ├── flask-3.0.0-py3-none-any.msgpack + /// ├── flask-3.0.0-py3-none-any.msgpack /// └── flask-3.0.0-py3-none-any /// ├── flask /// │ └── ... @@ -287,15 +287,15 @@ pub enum CacheBucket { /// directories in the cache. /// /// Cache structure: - /// * `built-wheels-v0/pypi/foo/34a17436ed1e9669/{metadata.json, foo-1.0.0.zip, foo-1.0.0-py3-none-any.whl, ...other wheels}` - /// * `built-wheels-v0//foo/foo-1.0.0.zip/{metadata.json, foo-1.0.0-py3-none-any.whl, ...other wheels}` - /// * `built-wheels-v0/url//foo/foo-1.0.0.zip/{metadata.json, foo-1.0.0-py3-none-any.whl, ...other wheels}` - /// * `built-wheels-v0/git///foo/foo-1.0.0.zip/{metadata.json, foo-1.0.0-py3-none-any.whl, ...other wheels}` + /// * `built-wheels-v0/pypi/foo/34a17436ed1e9669/{metadata.msgpack, foo-1.0.0.zip, foo-1.0.0-py3-none-any.whl, ...other wheels}` + /// * `built-wheels-v0//foo/foo-1.0.0.zip/{metadata.msgpack, foo-1.0.0-py3-none-any.whl, ...other wheels}` + /// * `built-wheels-v0/url//foo/foo-1.0.0.zip/{metadata.msgpack, foo-1.0.0-py3-none-any.whl, ...other wheels}` + /// * `built-wheels-v0/git///foo/foo-1.0.0.zip/{metadata.msgpack, foo-1.0.0-py3-none-any.whl, ...other wheels}` /// /// But the url filename does not need to be a valid source dist filename /// (), /// so it could also be the following and we have to take any string as filename: - /// * `built-wheels-v0/url//master.zip/metadata.json` + /// * `built-wheels-v0/url//master.zip/metadata.msgpack` /// /// # Example /// @@ -315,22 +315,22 @@ pub enum CacheBucket { /// ├── git /// │ └── a67db8ed076e3814 /// │ └── 843b753e9e8cb74e83cac55598719b39a4d5ef1f - /// │ ├── metadata.json + /// │ ├── metadata.msgpack /// │ └── pydantic_extra_types-2.1.0-py3-none-any.whl /// ├── pypi /// │ └── django /// │ └── django-allauth-0.51.0.tar.gz /// │ ├── django_allauth-0.51.0-py3-none-any.whl - /// │ └── metadata.json + /// │ └── metadata.msgpack /// └── url /// └── 6781bd6440ae72c2 /// └── werkzeug /// └── werkzeug-3.0.1.tar.gz - /// ├── metadata.json + /// ├── metadata.msgpack /// └── werkzeug-3.0.1-py3-none-any.whl /// ``` /// - /// The inside of a `metadata.json`: + /// Structurally, the inside of a `metadata.msgpack` looks like: /// ```json /// { /// "data": { @@ -352,11 +352,11 @@ pub enum CacheBucket { /// without the shim itself changing, we only cache when the path equals `sys.executable`, i.e. /// the path we're running is the python executable itself and not a shim. /// - /// Cache structure: `interpreter-v0/.json` + /// Cache structure: `interpreter-v0/.msgpack` /// /// # Example /// - /// The contents of each of the json files has a timestamp field in unix time, the [PEP 508] + /// The contents of each of the MsgPack files has a timestamp field in unix time, the [PEP 508] /// markers and some information from the `sys`/`sysconfig` modules. /// /// ```json @@ -388,8 +388,8 @@ pub enum CacheBucket { /// Index responses through the simple metadata API. /// /// Cache structure: - /// * `simple-v0/pypi/.json` - /// * `simple-v0//.json` + /// * `simple-v0/pypi/.msgpack` + /// * `simple-v0//.msgpack` /// /// The response is parsed into [`puffin_client::SimpleMetadata`] before storage. Simple, @@ -492,17 +492,17 @@ impl CacheBucket { } } CacheBucket::Simple => { - // For `pypi` wheels, we expect a JSON file per package, indexed by name. + // For `pypi` wheels, we expect a MsgPack file per package, indexed by name. let root = cache.bucket(self).join(WheelCacheKind::Pypi); - if remove(root.join(format!("{name}.json")))? { + if remove(root.join(format!("{name}.msgpack")))? { count += 1; } // For alternate indices, we expect a directory for every index, followed by a - // JSON file per package, indexed by name. + // MsgPack file per package, indexed by name. let root = cache.bucket(self).join(WheelCacheKind::Url); for directory in directories(root) { - if remove(directory.join(format!("{name}.json")))? { + if remove(directory.join(format!("{name}.msgpack")))? { count += 1; } } diff --git a/crates/puffin-client/Cargo.toml b/crates/puffin-client/Cargo.toml index 10cac0c5f..73f77b4a8 100644 --- a/crates/puffin-client/Cargo.toml +++ b/crates/puffin-client/Cargo.toml @@ -15,18 +15,19 @@ pypi-types = { path = "../pypi-types" } async_http_range_reader = { workspace = true } async_zip = { workspace = true, features = ["tokio"] } -futures = { workspace = true } fs-err = { workspace = true, features = ["tokio"] } +futures = { workspace = true } http = { workspace = true } http-cache-semantics = { workspace = true } reqwest = { workspace = true } reqwest-middleware = { workspace = true } reqwest-retry = { workspace = true } +rmp-serde = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } sha2 = { workspace = true } -thiserror = { workspace = true } tempfile = { workspace = true } +thiserror = { workspace = true } tokio = { workspace = true, features = ["fs"] } tokio-util = { workspace = true } tracing = { workspace = true } diff --git a/crates/puffin-client/src/cached_client.rs b/crates/puffin-client/src/cached_client.rs index 230728681..1bda0fe40 100644 --- a/crates/puffin-client/src/cached_client.rs +++ b/crates/puffin-client/src/cached_client.rs @@ -101,7 +101,7 @@ impl CachedClient { CallbackReturn: Future>, { let cached = if let Ok(cached) = fs_err::tokio::read(cache_entry.path()).await { - match serde_json::from_slice::>(&cached) { + match rmp_serde::from_slice::>(&cached) { Ok(data) => Some(data), Err(err) => { warn!( @@ -123,7 +123,7 @@ impl CachedClient { CachedResponse::NotModified(data_with_cache_policy) => { write_atomic( cache_entry.path(), - serde_json::to_vec(&data_with_cache_policy).map_err(crate::Error::from)?, + rmp_serde::to_vec(&data_with_cache_policy).map_err(crate::Error::from)?, ) .await .map_err(crate::Error::CacheWrite)?; @@ -139,7 +139,7 @@ impl CachedClient { .await .map_err(crate::Error::CacheWrite)?; let data = - serde_json::to_vec(&data_with_cache_policy).map_err(crate::Error::from)?; + rmp_serde::to_vec(&data_with_cache_policy).map_err(crate::Error::from)?; write_atomic(cache_entry.path(), data) .await .map_err(crate::Error::CacheWrite)?; diff --git a/crates/puffin-client/src/error.rs b/crates/puffin-client/src/error.rs index 8b53aeab8..dd3aa6bba 100644 --- a/crates/puffin-client/src/error.rs +++ b/crates/puffin-client/src/error.rs @@ -73,12 +73,15 @@ pub enum Error { #[error(transparent)] Io(#[from] io::Error), + #[error("Cache deserialization failed")] + Decode(#[from] rmp_serde::decode::Error), + + #[error("Cache serialization failed")] + Encode(#[from] rmp_serde::encode::Error), + /// An [`io::Error`] with a filename attached #[error(transparent)] Persist(#[from] tempfile::PersistError), - - #[error("Failed to serialize response to cache")] - SerdeJson(#[from] serde_json::Error), } impl Error { diff --git a/crates/puffin-client/src/registry_client.rs b/crates/puffin-client/src/registry_client.rs index 348df4057..e8250a102 100644 --- a/crates/puffin-client/src/registry_client.rs +++ b/crates/puffin-client/src/registry_client.rs @@ -145,7 +145,7 @@ impl RegistryClient { IndexUrl::Pypi => "pypi".to_string(), IndexUrl::Url(url) => digest(&CanonicalUrl::new(url)), }), - format!("{}.json", package_name.as_ref()), + format!("{}.msgpack", package_name.as_ref()), ); let simple_request = self @@ -246,7 +246,7 @@ impl RegistryClient { let cache_entry = self.cache.entry( CacheBucket::Wheels, WheelCache::Index(&index).remote_wheel_dir(filename.name.as_ref()), - format!("{}.json", filename.stem()), + format!("{}.msgpack", filename.stem()), ); let response_callback = |response: Response| async { @@ -281,7 +281,7 @@ impl RegistryClient { let cache_entry = self.cache.entry( CacheBucket::Wheels, cache_shard.remote_wheel_dir(filename.name.as_ref()), - format!("{}.json", filename.stem()), + format!("{}.msgpack", filename.stem()), ); // This response callback is special, we actually make a number of subsequent requests to diff --git a/crates/puffin-distribution/Cargo.toml b/crates/puffin-distribution/Cargo.toml index c657db78d..a06e5fbc9 100644 --- a/crates/puffin-distribution/Cargo.toml +++ b/crates/puffin-distribution/Cargo.toml @@ -34,9 +34,9 @@ fs-err = { workspace = true } fs2 = { workspace = true } futures = { workspace = true } reqwest = { workspace = true } +rmp-serde = { workspace = true } rustc-hash = { workspace = true } serde = { workspace = true , features = ["derive"] } -serde_json = { workspace = true } tempfile = { workspace = true } thiserror = { workspace = true } tokio = { workspace = true } diff --git a/crates/puffin-distribution/src/source_dist.rs b/crates/puffin-distribution/src/source_dist.rs index bf835f63c..0f68a1eff 100644 --- a/crates/puffin-distribution/src/source_dist.rs +++ b/crates/puffin-distribution/src/source_dist.rs @@ -56,8 +56,10 @@ pub enum SourceDistError { // Cache writing error #[error("Failed to write to source dist cache")] Io(#[from] std::io::Error), - #[error("Cache (de)serialization failed")] - Serde(#[from] serde_json::Error), + #[error("Cache deserialization failed")] + Decode(#[from] rmp_serde::decode::Error), + #[error("Cache serialization failed")] + Encode(#[from] rmp_serde::encode::Error), // Build error #[error("Failed to build: {0}")] @@ -179,7 +181,8 @@ pub struct SourceDistCachedBuilder<'a, T: BuildContext> { tags: &'a Tags, } -const METADATA_JSON: &str = "metadata.json"; +/// The name of the file that contains the cached metadata, encoded via `MsgPack`. +const METADATA: &str = "metadata.msgpack"; impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { /// Initialize a [`SourceDistCachedBuilder`] from a [`BuildContext`]. @@ -268,7 +271,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { cache_shard: &CacheShard, subdirectory: Option<&'data Path>, ) -> Result { - let cache_entry = cache_shard.entry(METADATA_JSON.to_string()); + let cache_entry = cache_shard.entry(METADATA.to_string()); let response_callback = |response| async { // At this point, we're seeing a new or updated source distribution; delete all @@ -368,12 +371,12 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { if let Ok(cached) = fs::read(cache_entry.path()).await { // If the file exists and it was just read or written by `CachedClient`, we assume it must // be correct. - let mut cached = serde_json::from_slice::>(&cached)?; + let mut cached = rmp_serde::from_slice::>(&cached)?; cached .data .insert(wheel_filename.clone(), cached_data.clone()); - write_atomic(cache_entry.path(), serde_json::to_vec(&cached)?).await?; + write_atomic(cache_entry.path(), rmp_serde::to_vec(&cached)?).await?; }; Ok(BuiltWheelMetadata::from_cached( @@ -393,7 +396,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { CacheBucket::BuiltWheels, WheelCache::Path(&path_source_dist.url) .remote_wheel_dir(path_source_dist.name().as_ref()), - METADATA_JSON.to_string(), + METADATA.to_string(), ); // Determine the last-modified time of the source distribution. @@ -464,7 +467,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { timestamp: modified, data: manifest, }; - let data = serde_json::to_vec(&cached)?; + let data = rmp_serde::to_vec(&cached)?; write_atomic(cache_entry.path(), data).await?; if let Some(task) = task { @@ -498,7 +501,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { CacheBucket::BuiltWheels, WheelCache::Git(&git_source_dist.url, &git_sha.to_short_string()) .remote_wheel_dir(git_source_dist.name().as_ref()), - METADATA_JSON.to_string(), + METADATA.to_string(), ); // Read the existing metadata from the cache. @@ -540,7 +543,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { metadata: metadata.clone(), }, ); - let data = serde_json::to_vec(&manifest)?; + let data = rmp_serde::to_vec(&manifest)?; write_atomic(cache_entry.path(), data).await?; if let Some(task) = task { @@ -707,7 +710,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { ) -> Result, SourceDistError> { match fs::read(&cache_entry.path()).await { Ok(cached) => { - let cached = serde_json::from_slice::>(&cached)?; + let cached = rmp_serde::from_slice::>(&cached)?; if cached.timestamp == modified { Ok(Some(cached.data)) } else { @@ -729,7 +732,7 @@ impl<'a, T: BuildContext> SourceDistCachedBuilder<'a, T> { /// Read an existing cache entry, if it exists. async fn read_metadata(cache_entry: &CacheEntry) -> Result, SourceDistError> { match fs::read(&cache_entry.path()).await { - Ok(cached) => Ok(Some(serde_json::from_slice::(&cached)?)), + Ok(cached) => Ok(Some(rmp_serde::from_slice::(&cached)?)), Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(None), Err(err) => Err(err.into()), } diff --git a/crates/puffin-interpreter/Cargo.toml b/crates/puffin-interpreter/Cargo.toml index 3262dd34a..def192e10 100644 --- a/crates/puffin-interpreter/Cargo.toml +++ b/crates/puffin-interpreter/Cargo.toml @@ -20,11 +20,12 @@ puffin-cache = { path = "../puffin-cache" } puffin-fs = { path = "../puffin-fs" } fs-err = { workspace = true, features = ["tokio"] } +rmp-serde = { workspace = true } +serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true } thiserror = { workspace = true } tokio = { workspace = true } tracing = { workspace = true } -serde = { workspace = true, features = ["derive"] } [dev-dependencies] indoc = { version = "2.0.4" } diff --git a/crates/puffin-interpreter/src/interpreter.rs b/crates/puffin-interpreter/src/interpreter.rs index 3a2ce9330..9f11b3ca4 100644 --- a/crates/puffin-interpreter/src/interpreter.rs +++ b/crates/puffin-interpreter/src/interpreter.rs @@ -3,7 +3,7 @@ use std::process::Command; use fs_err as fs; use serde::{Deserialize, Serialize}; -use tracing::debug; +use tracing::{debug, warn}; use pep440_rs::Version; use pep508_rs::MarkerEnvironment; @@ -133,9 +133,8 @@ impl InterpreterQueryResult { let data = serde_json::from_slice::(&output.stdout).map_err(|err| { Error::PythonSubcommandOutput { message: format!( - "Querying python at {} did not return the expected data: {}", + "Querying python at {} did not return the expected data: {err}", interpreter.display(), - err, ), stdout: String::from_utf8_lossy(&output.stdout).trim().to_string(), stderr: String::from_utf8_lossy(&output.stderr).trim().to_string(), @@ -155,7 +154,7 @@ impl InterpreterQueryResult { let cache_entry = cache.entry( CacheBucket::Interpreter, "", - format!("{}.json", digest(&executable_bytes)), + format!("{}.msgpack", digest(&executable_bytes)), ); // `modified()` is infallible on windows and unix (i.e., all platforms we support). @@ -163,16 +162,25 @@ impl InterpreterQueryResult { // Read from the cache. if let Ok(data) = fs::read(cache_entry.path()) { - if let Ok(cached) = serde_json::from_slice::>(&data) { - if cached.timestamp == modified { - debug!("Using cached markers for: {}", executable.display()); - return Ok(cached.data); - } + match rmp_serde::from_slice::>(&data) { + Ok(cached) => { + if cached.timestamp == modified { + debug!("Using cached markers for: {}", executable.display()); + return Ok(cached.data); + } - debug!( - "Ignoring stale cached markers for: {}", - executable.display() - ); + debug!( + "Ignoring stale cached markers for: {}", + executable.display() + ); + } + Err(err) => { + warn!( + "Broken cache entry at {}, removing: {err}", + cache_entry.path().display() + ); + let _ = fs_err::remove_file(cache_entry.path()); + } } } @@ -187,7 +195,7 @@ impl InterpreterQueryResult { // Write to the cache. write_atomic_sync( cache_entry.path(), - serde_json::to_vec(&CachedByTimestamp { + rmp_serde::to_vec(&CachedByTimestamp { timestamp: modified, data: info.clone(), })?, diff --git a/crates/puffin-interpreter/src/lib.rs b/crates/puffin-interpreter/src/lib.rs index 524a05fdf..b48b87d40 100644 --- a/crates/puffin-interpreter/src/lib.rs +++ b/crates/puffin-interpreter/src/lib.rs @@ -40,6 +40,10 @@ pub enum Error { }, #[error("Failed to write to cache")] Serde(#[from] serde_json::Error), + #[error("Cache deserialization failed")] + Decode(#[from] rmp_serde::decode::Error), + #[error("Cache serialization failed")] + Encode(#[from] rmp_serde::encode::Error), #[error("Failed to parse pyvenv.cfg")] Cfg(#[from] cfg::Error), }