diff --git a/crates/ruff_cli/resources/test/fixtures/cache_remove_old_files/source.py b/crates/ruff_cli/resources/test/fixtures/cache_remove_old_files/source.py new file mode 100644 index 0000000000..7e397f06e5 --- /dev/null +++ b/crates/ruff_cli/resources/test/fixtures/cache_remove_old_files/source.py @@ -0,0 +1,4 @@ +# NOTE: sync with cache::invalidation test +a = 1 + +__all__ = list(["a", "b"]) diff --git a/crates/ruff_cli/src/cache.rs b/crates/ruff_cli/src/cache.rs index 9c4aa3db73..fe614e7f8a 100644 --- a/crates/ruff_cli/src/cache.rs +++ b/crates/ruff_cli/src/cache.rs @@ -3,8 +3,9 @@ use std::fs::{self, File}; use std::hash::Hasher; use std::io::{self, BufReader, BufWriter, Write}; use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Mutex; -use std::time::SystemTime; +use std::time::{Duration, SystemTime}; use anyhow::{Context, Result}; use serde::{Deserialize, Serialize}; @@ -20,6 +21,9 @@ use ruff_text_size::{TextRange, TextSize}; use crate::diagnostics::Diagnostics; +/// Maximum duration for which we keep a file in cache that hasn't been seen. +const MAX_LAST_SEEN: Duration = Duration::from_secs(30 * 24 * 60 * 60); // 30 days. + /// [`Path`] that is relative to the package root in [`PackageCache`]. pub(crate) type RelativePath = Path; /// [`PathBuf`] that is relative to the package root in [`PackageCache`]. @@ -33,6 +37,7 @@ pub(crate) type RelativePathBuf = PathBuf; /// /// This type manages the cache file, reading it from disk and writing it back /// to disk (if required). +#[derive(Debug)] pub(crate) struct Cache { /// Location of the cache. path: PathBuf, @@ -44,6 +49,9 @@ pub(crate) struct Cache { /// `package.files` but are outdated. This gets merged with `package.files` /// when the cache is written back to disk in [`Cache::store`]. new_files: Mutex>, + /// The "current" timestamp used as cache for the updates of + /// [`FileCache::last_seen`] + last_seen_cache: u64, } impl Cache { @@ -94,26 +102,32 @@ impl Cache { ); package.files.clear(); } - Cache { - path, - package, - new_files: Mutex::new(HashMap::new()), - } + Cache::new(path, package) } /// Create an empty `Cache`. fn empty(path: PathBuf, package_root: PathBuf) -> Cache { + let package = PackageCache { + package_root, + files: HashMap::new(), + }; + Cache::new(path, package) + } + + #[allow(clippy::cast_possible_truncation)] + fn new(path: PathBuf, package: PackageCache) -> Cache { Cache { path, - package: PackageCache { - package_root, - files: HashMap::new(), - }, + package, new_files: Mutex::new(HashMap::new()), + // SAFETY: this will be truncated to the year ~2554 (so don't use + // this code after that!). + last_seen_cache: SystemTime::UNIX_EPOCH.elapsed().unwrap().as_millis() as u64, } } /// Store the cache to disk, if it has been changed. + #[allow(clippy::cast_possible_truncation)] pub(crate) fn store(mut self) -> Result<()> { let new_files = self.new_files.into_inner().unwrap(); if new_files.is_empty() { @@ -122,7 +136,14 @@ impl Cache { return Ok(()); } - // Add/overwrite the changes made. + // Remove cached files that we haven't seen in a while. + let now = self.last_seen_cache; + self.package.files.retain(|_, file| { + // SAFETY: this will be truncated to the year ~2554. + (now - *file.last_seen.get_mut()) <= MAX_LAST_SEEN.as_millis() as u64 + }); + + // Apply any changes made and keep track of when we last saw files. self.package.files.extend(new_files); let file = File::create(&self.path) @@ -161,51 +182,19 @@ impl Cache { return None; } + file.last_seen.store(self.last_seen_cache, Ordering::SeqCst); + Some(file) } /// Add or update a file cache at `path` relative to the package root. - pub(crate) fn update(&self, path: RelativePathBuf, file: FileCache) { - self.new_files.lock().unwrap().insert(path, file); - } -} - -/// On disk representation of a cache of a package. -#[derive(Deserialize, Debug, Serialize)] -struct PackageCache { - /// Path to the root of the package. - /// - /// Usually this is a directory, but it can also be a single file in case of - /// single file "packages", e.g. scripts. - package_root: PathBuf, - /// Mapping of source file path to it's cached data. - files: HashMap, -} - -/// On disk representation of the cache per source file. -#[derive(Clone, Deserialize, Debug, Serialize)] -pub(crate) struct FileCache { - /// Timestamp when the file was last modified before the (cached) check. - last_modified: SystemTime, - /// Imports made. - imports: ImportMap, - /// Diagnostic messages. - messages: Vec, - /// Source code of the file. - /// - /// # Notes - /// - /// This will be empty if `messages` is empty. - source: String, -} - -impl FileCache { - /// Create a new source file cache. - pub(crate) fn new( + pub(crate) fn update( + &self, + path: RelativePathBuf, last_modified: SystemTime, messages: &[Message], imports: &ImportMap, - ) -> FileCache { + ) { let source = if let Some(msg) = messages.first() { msg.file.source_text().to_owned() } else { @@ -229,14 +218,52 @@ impl FileCache { }) .collect(); - FileCache { + let file = FileCache { last_modified, + last_seen: AtomicU64::new(self.last_seen_cache), imports: imports.clone(), messages, source, - } + }; + self.new_files.lock().unwrap().insert(path, file); } +} +/// On disk representation of a cache of a package. +#[derive(Deserialize, Debug, Serialize)] +struct PackageCache { + /// Path to the root of the package. + /// + /// Usually this is a directory, but it can also be a single file in case of + /// single file "packages", e.g. scripts. + package_root: PathBuf, + /// Mapping of source file path to it's cached data. + files: HashMap, +} + +/// On disk representation of the cache per source file. +#[derive(Deserialize, Debug, Serialize)] +pub(crate) struct FileCache { + /// Timestamp when the file was last modified before the (cached) check. + last_modified: SystemTime, + /// Timestamp when we last linted this file. + /// + /// Represented as the number of milliseconds since Unix epoch. This will + /// break in 1970 + ~584 years (~2554). + last_seen: AtomicU64, + /// Imports made. + imports: ImportMap, + /// Diagnostic messages. + messages: Vec, + /// Source code of the file. + /// + /// # Notes + /// + /// This will be empty if `messages` is empty. + source: String, +} + +impl FileCache { /// Convert the file cache into `Diagnostics`, using `path` as file name. pub(crate) fn as_diagnostics(&self, path: &Path) -> Diagnostics { let messages = if self.messages.is_empty() { @@ -259,7 +286,7 @@ impl FileCache { } /// On disk representation of a diagnostic message. -#[derive(Clone, Deserialize, Debug, Serialize)] +#[derive(Deserialize, Debug, Serialize)] struct CacheMessage { kind: DiagnosticKind, /// Range into the message's [`FileCache::source`]. @@ -303,12 +330,15 @@ mod tests { use std::env::temp_dir; use std::fs; use std::io::{self, Write}; - use std::path::Path; + use std::path::{Path, PathBuf}; + use std::sync::atomic::AtomicU64; + use std::time::SystemTime; use ruff::settings::{flags, AllSettings}; use ruff_cache::CACHE_DIR_NAME; + use ruff_python_ast::imports::ImportMap; - use crate::cache::{self, Cache}; + use crate::cache::{self, Cache, FileCache}; use crate::diagnostics::{lint_path, Diagnostics}; #[test] @@ -497,4 +527,54 @@ mod tests { assert!(expected_diagnostics == got_diagnostics); } } + + #[test] + fn remove_old_files() { + let mut cache_dir = temp_dir(); + cache_dir.push("ruff_tests/cache_remove_old_files"); + let _ = fs::remove_dir_all(&cache_dir); + cache::init(&cache_dir).unwrap(); + + let settings = AllSettings::default(); + let package_root = + fs::canonicalize("resources/test/fixtures/cache_remove_old_files").unwrap(); + let mut cache = Cache::open(&cache_dir, package_root.clone(), &settings.lib); + assert_eq!(cache.new_files.lock().unwrap().len(), 0); + + // Add a file to the cache that hasn't been linted or seen since the + // '70s! + cache.package.files.insert( + PathBuf::from("old.py"), + FileCache { + last_modified: SystemTime::UNIX_EPOCH, + last_seen: AtomicU64::new(123), + imports: ImportMap::new(), + messages: Vec::new(), + source: String::new(), + }, + ); + + // Now actually lint a file. + let path = package_root.join("source.py"); + lint_path( + &path, + Some(&package_root), + &settings, + Some(&cache), + flags::Noqa::Enabled, + flags::FixMode::Generate, + ) + .unwrap(); + + // Storing the cache should remove the old (`old.py`) file. + cache.store().unwrap(); + // So we when we open the cache again it shouldn't contain `old.py`. + let cache = Cache::open(&cache_dir, package_root, &settings.lib); + + assert_eq!(cache.package.files.len(), 1, "didn't remove the old file"); + assert!( + !cache.package.files.contains_key(&path), + "removed the wrong file" + ); + } } diff --git a/crates/ruff_cli/src/diagnostics.rs b/crates/ruff_cli/src/diagnostics.rs index 1da81ed4cd..03f5527a5f 100644 --- a/crates/ruff_cli/src/diagnostics.rs +++ b/crates/ruff_cli/src/diagnostics.rs @@ -25,7 +25,7 @@ use ruff_python_ast::imports::ImportMap; use ruff_python_ast::source_code::{LineIndex, SourceCode, SourceFileBuilder}; use ruff_python_stdlib::path::is_project_toml; -use crate::cache::{Cache, FileCache}; +use crate::cache::Cache; #[derive(Debug, Default, PartialEq)] pub(crate) struct Diagnostics { @@ -208,10 +208,14 @@ pub(crate) fn lint_path( let imports = imports.unwrap_or_default(); if let Some((cache, relative_path, file_last_modified)) = caching { + // We don't cache parsing errors. if parse_error.is_none() { - // We don't cache parsing error. - let file_cache = FileCache::new(file_last_modified, &messages, &imports); - cache.update(relative_path.to_owned(), file_cache); + cache.update( + relative_path.to_owned(), + file_last_modified, + &messages, + &imports, + ); } }