From d89fbeb6424f39ba78167304337b7dd277c04edd Mon Sep 17 00:00:00 2001 From: konsti Date: Tue, 28 Nov 2023 18:14:59 +0100 Subject: [PATCH] Migrate interpreter query to custom caching (#508) This removes the last usage of cacache by replacing it with a custom, flat json caching keyed by the digest of the executable path. ![image](https://github.com/astral-sh/puffin/assets/6826232/8f777c4c-1f1b-4656-ba7b-002175270556) A step towards #478. I've made `CachedByTimestamp` generic over `T` but intentionally not moved it to `puffin-cache` yet. --- Cargo.lock | 100 +------------------ Cargo.toml | 1 - crates/gourgeist/Cargo.toml | 1 + crates/gourgeist/src/main.rs | 5 +- crates/puffin-cache/src/lib.rs | 37 +++++++ crates/puffin-cli/Cargo.toml | 1 - crates/puffin-cli/src/commands/venv.rs | 10 +- crates/puffin-client/Cargo.toml | 1 + crates/puffin-interpreter/Cargo.toml | 1 - crates/puffin-interpreter/src/interpreter.rs | 69 +++++++------ crates/puffin-interpreter/src/lib.rs | 2 - crates/puffin-interpreter/src/virtual_env.rs | 14 +-- crates/puffin-resolver/Cargo.toml | 2 +- 13 files changed, 88 insertions(+), 156 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a230b227a..f6a447027 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -391,31 +391,6 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3e368af43e418a04d52505cf3dbc23dda4e3407ae2fa99fd0e4f308ce546acc" -[[package]] -name = "cacache" -version = "12.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "142316461ed3a3dfcba10417317472da5bfd0461e4d276bf7c07b330766d9490" -dependencies = [ - "digest", - "either", - "futures", - "hex", - "miette 5.10.0 (registry+https://github.com/rust-lang/crates.io-index)", - "reflink-copy", - "serde", - "serde_derive", - "serde_json", - "sha1", - "sha2", - "ssri", - "tempfile", - "thiserror", - "tokio", - "tokio-stream", - "walkdir", -] - [[package]] name = "camino" version = "1.1.6" @@ -1188,6 +1163,7 @@ dependencies = [ "clap", "fs-err", "platform-host", + "puffin-cache", "puffin-interpreter", "serde", "serde_json", @@ -1753,18 +1729,6 @@ dependencies = [ "autocfg", ] -[[package]] -name = "miette" -version = "5.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59bb584eaeeab6bd0226ccf3509a69d7936d148cf3d036ad350abe35e8c6856e" -dependencies = [ - "miette-derive 5.10.0 (registry+https://github.com/rust-lang/crates.io-index)", - "once_cell", - "thiserror", - "unicode-width", -] - [[package]] name = "miette" version = "5.10.0" @@ -1773,7 +1737,7 @@ dependencies = [ "backtrace", "backtrace-ext", "is-terminal", - "miette-derive 5.10.0 (git+https://github.com/zkat/miette.git?rev=fd77257cee0f5d03aa7dccb4ba8cbaa40c1a88c6)", + "miette-derive", "once_cell", "owo-colors", "supports-color", @@ -1785,17 +1749,6 @@ dependencies = [ "unicode-width", ] -[[package]] -name = "miette-derive" -version = "5.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49e7bc1560b95a3c4a25d03de42fe76ca718ab92d1a22a55b9b4cf67b3ae635c" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.39", -] - [[package]] name = "miette-derive" version = "5.10.0" @@ -2338,7 +2291,6 @@ dependencies = [ "assert_cmd", "assert_fs", "bitflags 2.4.1", - "cacache", "chrono", "clap", "colored", @@ -2351,7 +2303,7 @@ dependencies = [ "insta-cmd", "install-wheel-rs", "itertools 0.11.0", - "miette 5.10.0 (git+https://github.com/zkat/miette.git?rev=fd77257cee0f5d03aa7dccb4ba8cbaa40c1a88c6)", + "miette", "mimalloc", "pep440_rs 0.3.12", "pep508_rs", @@ -2567,7 +2519,6 @@ dependencies = [ name = "puffin-interpreter" version = "0.0.1" dependencies = [ - "cacache", "fs-err", "pep440_rs 0.3.12", "pep508_rs", @@ -3262,28 +3213,6 @@ dependencies = [ "serde", ] -[[package]] -name = "sha-1" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5058ada175748e33390e40e872bd0fe59a19f265d0158daa551c5a88a76009c" -dependencies = [ - "cfg-if 1.0.0", - "cpufeatures", - "digest", -] - -[[package]] -name = "sha1" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" -dependencies = [ - "cfg-if 1.0.0", - "cpufeatures", - "digest", -] - [[package]] name = "sha2" version = "0.10.8" @@ -3363,23 +3292,6 @@ version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" -[[package]] -name = "ssri" -version = "9.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da7a2b3c2bc9693bcb40870c4e9b5bf0d79f9cb46273321bf855ec513e919082" -dependencies = [ - "base64 0.21.5", - "digest", - "hex", - "miette 5.10.0 (registry+https://github.com/rust-lang/crates.io-index)", - "serde", - "sha-1", - "sha2", - "thiserror", - "xxhash-rust", -] - [[package]] name = "stacker" version = "0.1.15" @@ -4440,12 +4352,6 @@ dependencies = [ "libc", ] -[[package]] -name = "xxhash-rust" -version = "0.8.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9828b178da53440fa9c766a3d2f73f7cf5d0ac1fe3980c1e5018d899fd19e07b" - [[package]] name = "yaml-rust" version = "0.4.5" diff --git a/Cargo.toml b/Cargo.toml index a1dfea317..3f95eba3d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,7 +18,6 @@ async_http_range_reader = { git = "https://github.com/baszalmstra/async_http_ran async_zip = { version = "0.0.15", features = ["tokio", "deflate"] } bitflags = { version = "2.4.1" } bytesize = { version = "1.3.0" } -cacache = { version = "12.0.0", default-features = false, features = ["tokio-runtime"] } camino = { version = "1.1.6", features = ["serde1"] } cargo-util = { version = "0.2.6" } chrono = { version = "0.4.31" } diff --git a/crates/gourgeist/Cargo.toml b/crates/gourgeist/Cargo.toml index 44963cc10..0f973bb8d 100644 --- a/crates/gourgeist/Cargo.toml +++ b/crates/gourgeist/Cargo.toml @@ -18,6 +18,7 @@ workspace = true [dependencies] platform-host = { path = "../platform-host" } +puffin-cache = { path = "../puffin-cache" } puffin-interpreter = { path = "../puffin-interpreter" } camino = { workspace = true } diff --git a/crates/gourgeist/src/main.rs b/crates/gourgeist/src/main.rs index 87eb769c2..19fe28fd9 100644 --- a/crates/gourgeist/src/main.rs +++ b/crates/gourgeist/src/main.rs @@ -11,6 +11,7 @@ use tracing_subscriber::{fmt, EnvFilter}; use gourgeist::{create_bare_venv, parse_python_cli}; use platform_host::Platform; +use puffin_cache::Cache; use puffin_interpreter::Interpreter; #[derive(Parser, Debug)] @@ -25,8 +26,8 @@ fn run() -> Result<(), gourgeist::Error> { let location = cli.path.unwrap_or(Utf8PathBuf::from(".venv")); let python = parse_python_cli(cli.python)?; let platform = Platform::current()?; - let cache = tempfile::tempdir()?; - let info = Interpreter::query(python.as_std_path(), platform, cache.path()).unwrap(); + let cache = Cache::temp()?; + let info = Interpreter::query(python.as_std_path(), platform, &cache).unwrap(); create_bare_venv(&location, &info)?; Ok(()) } diff --git a/crates/puffin-cache/src/lib.rs b/crates/puffin-cache/src/lib.rs index 503f94bab..63d7933dc 100644 --- a/crates/puffin-cache/src/lib.rs +++ b/crates/puffin-cache/src/lib.rs @@ -150,6 +150,43 @@ pub enum CacheBucket { /// Git repositories. Git, /// Information about an interpreter at a path. + /// + /// To avoid caching pyenv shims, bash scripts which may redirect to a new python version + /// without the shim itself changing, we only cache when the path equals `sys.executable`, i.e. + /// the path we're running is the python executable itself and not a shim. + /// + /// Cache structure: `interpreter-v0/.json` + /// + /// # Example + /// + /// The contents of each of the json files has a timestamp field in unix time, the [PEP 508] + /// markers and some information from the `sys`/`sysconfig` modules. + /// + /// ```json + /// { + /// "timestamp": 1698047994491, + /// "data": { + /// "markers": { + /// "implementation_name": "cpython", + /// "implementation_version": "3.12.0", + /// "os_name": "posix", + /// "platform_machine": "x86_64", + /// "platform_python_implementation": "CPython", + /// "platform_release": "6.5.0-13-generic", + /// "platform_system": "Linux", + /// "platform_version": "#13-Ubuntu SMP PREEMPT_DYNAMIC Fri Nov 3 12:16:05 UTC 2023", + /// "python_full_version": "3.12.0", + /// "python_version": "3.12", + /// "sys_platform": "linux" + /// }, + /// "base_exec_prefix": "/home/ferris/.pyenv/versions/3.12.0", + /// "base_prefix": "/home/ferris/.pyenv/versions/3.12.0", + /// "sys_executable": "/home/ferris/projects/puffin/.venv/bin/python" + /// } + /// } + /// ``` + /// + /// [PEP 508]: https://peps.python.org/pep-0508/#environment-markers Interpreter, /// Index responses through the simple metadata API. Simple, diff --git a/crates/puffin-cli/Cargo.toml b/crates/puffin-cli/Cargo.toml index a837556fe..1439a17e5 100644 --- a/crates/puffin-cli/Cargo.toml +++ b/crates/puffin-cli/Cargo.toml @@ -39,7 +39,6 @@ requirements-txt = { path = "../requirements-txt" } anstream = { workspace = true } anyhow = { workspace = true } bitflags = { workspace = true } -cacache = { workspace = true } chrono = { workspace = true } clap = { workspace = true, features = ["derive"] } colored = { workspace = true } diff --git a/crates/puffin-cli/src/commands/venv.rs b/crates/puffin-cli/src/commands/venv.rs index daa4f8bd3..89a99fc8c 100644 --- a/crates/puffin-cli/src/commands/venv.rs +++ b/crates/puffin-cli/src/commands/venv.rs @@ -8,7 +8,7 @@ use miette::{Diagnostic, IntoDiagnostic}; use thiserror::Error; use platform_host::Platform; -use puffin_cache::{Cache, CacheBucket}; +use puffin_cache::Cache; use puffin_interpreter::Interpreter; use crate::commands::ExitStatus; @@ -77,12 +77,8 @@ fn venv_impl( }; let platform = Platform::current().into_diagnostic()?; - let interpreter_info = Interpreter::query( - &base_python, - platform, - &cache.bucket(CacheBucket::Interpreter), - ) - .map_err(VenvError::InterpreterError)?; + let interpreter_info = + Interpreter::query(&base_python, platform, cache).map_err(VenvError::InterpreterError)?; writeln!( printer, diff --git a/crates/puffin-client/Cargo.toml b/crates/puffin-client/Cargo.toml index a6c47f261..2f183b8d1 100644 --- a/crates/puffin-client/Cargo.toml +++ b/crates/puffin-client/Cargo.toml @@ -32,3 +32,4 @@ url = { workspace = true } [dev-dependencies] anyhow = { workspace = true } +tokio = { workspace = true, features = ["fs", "macros"] } diff --git a/crates/puffin-interpreter/Cargo.toml b/crates/puffin-interpreter/Cargo.toml index d6bbb0926..a0e7d98cd 100644 --- a/crates/puffin-interpreter/Cargo.toml +++ b/crates/puffin-interpreter/Cargo.toml @@ -18,7 +18,6 @@ pep508_rs = { path = "../pep508-rs", features = ["serde"] } platform-host = { path = "../platform-host" } puffin-cache = { path = "../puffin-cache" } -cacache = { workspace = true } fs-err = { workspace = true, features = ["tokio"] } serde_json = { workspace = true } thiserror = { workspace = true } diff --git a/crates/puffin-interpreter/src/interpreter.rs b/crates/puffin-interpreter/src/interpreter.rs index d076d96a0..332e7fe13 100644 --- a/crates/puffin-interpreter/src/interpreter.rs +++ b/crates/puffin-interpreter/src/interpreter.rs @@ -2,12 +2,14 @@ use std::path::{Path, PathBuf}; use std::process::Command; use std::time::UNIX_EPOCH; +use fs_err as fs; use serde::{Deserialize, Serialize}; use tracing::debug; use pep440_rs::Version; use pep508_rs::MarkerEnvironment; use platform_host::Platform; +use puffin_cache::{digest, Cache, CacheBucket}; use crate::python_platform::PythonPlatform; use crate::Error; @@ -24,7 +26,7 @@ pub struct Interpreter { impl Interpreter { /// Detect the interpreter info for the given Python executable. - pub fn query(executable: &Path, platform: Platform, cache: &Path) -> Result { + pub fn query(executable: &Path, platform: Platform, cache: &Cache) -> Result { let info = InterpreterQueryResult::query_cached(executable, cache)?; debug_assert!( info.base_prefix == info.base_exec_prefix, @@ -101,7 +103,7 @@ impl Interpreter { } } -#[derive(Deserialize, Serialize)] +#[derive(Deserialize, Serialize, Clone)] pub(crate) struct InterpreterQueryResult { pub(crate) markers: MarkerEnvironment, pub(crate) base_exec_prefix: PathBuf, @@ -109,6 +111,12 @@ pub(crate) struct InterpreterQueryResult { pub(crate) sys_executable: PathBuf, } +#[derive(Deserialize, Serialize)] +pub(crate) struct CachedByTimestamp { + pub(crate) timestamp: u128, + pub(crate) data: T, +} + impl InterpreterQueryResult { /// Return the resolved [`InterpreterQueryResult`] for the given Python executable. pub(crate) fn query(interpreter: &Path) -> Result { @@ -153,48 +161,43 @@ impl InterpreterQueryResult { /// Running a Python script is (relatively) expensive, and the markers won't change /// unless the Python executable changes, so we use the executable's last modified /// time as a cache key. - pub(crate) fn query_cached(executable: &Path, cache: &Path) -> Result { + pub(crate) fn query_cached(executable: &Path, cache: &Cache) -> Result { + let executable_bytes = executable.as_os_str().as_encoded_bytes(); + let cache_dir = cache.bucket(CacheBucket::Interpreter); + let cache_path = cache_dir.join(format!("{}.json", digest(&executable_bytes))); + // Read from the cache. - let key = if let Ok(key) = cache_key(executable) { - if let Ok(data) = cacache::read_sync(cache, &key) { - if let Ok(info) = serde_json::from_slice::(&data) { - debug!("Using cached markers for {}", executable.display()); - return Ok(info); - } + if let Ok(data) = fs::read(&cache_path) { + if let Ok(cached) = serde_json::from_slice::>(&data) { + debug!("Using cached markers for {}", executable.display()); + return Ok(cached.data); } - Some(key) - } else { - None - }; + } // Otherwise, run the Python script. debug!("Detecting markers for {}", executable.display()); let info = Self::query(executable)?; - // If the executable is actually a pyenv shim, a bash script that redirects to the activated - // python, we're not allowed to cache the interpreter info + let modified = fs_err::metadata(executable)? + // Note: This is infallible on windows and unix (i.e. all platforms we support) + .modified()? + .duration_since(UNIX_EPOCH) + .map_err(|err| Error::SystemTime(executable.to_path_buf(), err))?; + + // If `executable` is a pyenv shim, a bash script that redirects to the activated + // python executable at another path, we're not allowed to cache the interpreter info if executable == info.sys_executable { + fs::create_dir_all(cache_dir)?; // Write to the cache. - if let Some(key) = key { - cacache::write_sync(cache, key, serde_json::to_vec(&info)?)?; - } + fs::write( + cache_path, + serde_json::to_vec(&CachedByTimestamp { + timestamp: modified.as_millis(), + data: info.clone(), + })?, + )?; } Ok(info) } } - -/// Create a cache key for the Python executable, consisting of the executable's -/// last modified time and the executable's path. -fn cache_key(executable: &Path) -> Result { - let modified = fs_err::metadata(executable)? - // Note: This is infallible on windows and unix (i.e. all platforms we support) - .modified()? - .duration_since(UNIX_EPOCH) - .map_err(|err| Error::SystemTime(executable.to_path_buf(), err))?; - Ok(format!( - "puffin:v0:{}:{}", - executable.display(), - modified.as_millis() - )) -} diff --git a/crates/puffin-interpreter/src/lib.rs b/crates/puffin-interpreter/src/lib.rs index 4eca309be..5d64d9cb5 100644 --- a/crates/puffin-interpreter/src/lib.rs +++ b/crates/puffin-interpreter/src/lib.rs @@ -38,7 +38,5 @@ pub enum Error { stderr: String, }, #[error("Failed to write to cache")] - Cacache(#[from] cacache::Error), - #[error("Failed to write to cache")] Serde(#[from] serde_json::Error), } diff --git a/crates/puffin-interpreter/src/virtual_env.rs b/crates/puffin-interpreter/src/virtual_env.rs index e90a3142d..a9ced6da2 100644 --- a/crates/puffin-interpreter/src/virtual_env.rs +++ b/crates/puffin-interpreter/src/virtual_env.rs @@ -4,7 +4,7 @@ use std::path::{Path, PathBuf}; use tracing::debug; use platform_host::Platform; -use puffin_cache::{Cache, CacheBucket}; +use puffin_cache::Cache; use crate::python_platform::PythonPlatform; use crate::{Error, Interpreter}; @@ -24,11 +24,7 @@ impl Virtualenv { return Err(Error::NotFound); }; let executable = platform.venv_python(&venv); - let interpreter = Interpreter::query( - &executable, - platform.0, - &cache.bucket(CacheBucket::Interpreter), - )?; + let interpreter = Interpreter::query(&executable, platform.0, cache)?; Ok(Self { root: venv, @@ -39,11 +35,7 @@ impl Virtualenv { pub fn from_virtualenv(platform: Platform, root: &Path, cache: &Cache) -> Result { let platform = PythonPlatform::from(platform); let executable = platform.venv_python(root); - let interpreter = Interpreter::query( - &executable, - platform.0, - &cache.bucket(CacheBucket::Interpreter), - )?; + let interpreter = Interpreter::query(&executable, platform.0, cache)?; Ok(Self { root: root.to_path_buf(), diff --git a/crates/puffin-resolver/Cargo.toml b/crates/puffin-resolver/Cargo.toml index b7f06f0a7..aa5ed02c1 100644 --- a/crates/puffin-resolver/Cargo.toml +++ b/crates/puffin-resolver/Cargo.toml @@ -49,7 +49,7 @@ serde_json = { workspace = true } sha2 = { workspace = true } tempfile = { workspace = true } thiserror = { workspace = true } -tokio = { workspace = true } +tokio = { workspace = true, features = ["macros"] } tokio-util = { workspace = true, features = ["compat"] } tracing = { workspace = true } url = { workspace = true }