From ff6f0b6ab8fef4df5a14651cf9ef86f016ec0367 Mon Sep 17 00:00:00 2001 From: Micha Reiser Date: Wed, 4 Jun 2025 17:05:00 +0200 Subject: [PATCH] Prototype of file inclusion and exclusion --- Cargo.lock | 3 + Cargo.toml | 13 +- crates/ruff_db/src/diagnostic/mod.rs | 3 + crates/ruff_db/src/system/path.rs | 6 +- crates/ty/docs/configuration.md | 17 + crates/ty/src/lib.rs | 10 +- crates/ty_project/Cargo.toml | 3 + crates/ty_project/src/db/changes.rs | 16 +- crates/ty_project/src/lib.rs | 9 +- crates/ty_project/src/metadata/options.rs | 56 ++- crates/ty_project/src/metadata/settings.rs | 41 +- crates/ty_project/src/metadata/value.rs | 46 +++ crates/ty_project/src/walk.rs | 453 ++++++++++++++++++++- ty.schema.json | 10 + 14 files changed, 645 insertions(+), 41 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9f952927cc..12c5fb78df 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3935,10 +3935,13 @@ dependencies = [ "anyhow", "crossbeam", "glob", + "globset", + "ignore", "insta", "notify", "pep440_rs", "rayon", + "regex-automata 0.4.9", "ruff_cache", "ruff_db", "ruff_macros", diff --git a/Cargo.toml b/Cargo.toml index 989b595f6a..dc09f3c707 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -126,6 +126,7 @@ quote = { version = "1.0.23" } rand = { version = "0.9.0" } rayon = { version = "1.10.0" } regex = { version = "1.10.2" } +regex-automata = { version = "0.4.9" } rustc-hash = { version = "2.0.0" } rustc-stable-hash = { version = "0.1.2" } # When updating salsa, make sure to also update the revision in `fuzz/Cargo.toml` @@ -165,7 +166,7 @@ tracing-subscriber = { version = "0.3.18", default-features = false, features = "env-filter", "fmt", "ansi", - "smallvec" + "smallvec", ] } tryfn = { version = "0.2.1" } typed-arena = { version = "2.0.2" } @@ -175,11 +176,7 @@ unicode-width = { version = "0.2.0" } unicode_names2 = { version = "1.2.2" } unicode-normalization = { version = "0.1.23" } url = { version = "2.5.0" } -uuid = { version = "1.6.1", features = [ - "v4", - "fast-rng", 
- "macro-diagnostics", -] } +uuid = { version = "1.6.1", features = ["v4", "fast-rng", "macro-diagnostics"] } walkdir = { version = "2.3.2" } wasm-bindgen = { version = "0.2.92" } wasm-bindgen-test = { version = "0.3.42" } @@ -214,8 +211,8 @@ must_use_candidate = "allow" similar_names = "allow" single_match_else = "allow" too_many_lines = "allow" -needless_continue = "allow" # An explicit continue can be more readable, especially if the alternative is an empty block. -unnecessary_debug_formatting = "allow" # too many instances, the display also doesn't quote the path which is often desired in logs where we use them the most often. +needless_continue = "allow" # An explicit continue can be more readable, especially if the alternative is an empty block. +unnecessary_debug_formatting = "allow" # too many instances, the display also doesn't quote the path which is often desired in logs where we use them the most often. # Without the hashes we run into a `rustfmt` bug in some snapshot tests, see #13250 needless_raw_string_hashes = "allow" # Disallowed restriction lints diff --git a/crates/ruff_db/src/diagnostic/mod.rs b/crates/ruff_db/src/diagnostic/mod.rs index 8c373ca1e4..f6e94702e8 100644 --- a/crates/ruff_db/src/diagnostic/mod.rs +++ b/crates/ruff_db/src/diagnostic/mod.rs @@ -652,6 +652,8 @@ pub enum DiagnosticId { /// Some I/O operation failed Io, + NoFiles, + /// Some code contains a syntax error InvalidSyntax, @@ -699,6 +701,7 @@ impl DiagnosticId { DiagnosticId::Lint(name) => name.as_str(), DiagnosticId::RevealedType => "revealed-type", DiagnosticId::UnknownRule => "unknown-rule", + DiagnosticId::NoFiles => "no-files", } } diff --git a/crates/ruff_db/src/system/path.rs b/crates/ruff_db/src/system/path.rs index c20b2b19f5..7bd15890fa 100644 --- a/crates/ruff_db/src/system/path.rs +++ b/crates/ruff_db/src/system/path.rs @@ -534,6 +534,10 @@ impl SystemPathBuf { self.0 } + pub fn into_string(self) -> String { + self.0.into_string() + } + pub fn 
into_std_path_buf(self) -> PathBuf { self.0.into_std_path_buf() } @@ -822,7 +826,7 @@ impl ruff_cache::CacheKey for SystemVirtualPathBuf { /// /// # Examples /// ```rust -/// use ruff_db::system::{SystemPath, deduplicate_nested_paths};/// +/// use ruff_db::system::{SystemPath, deduplicate_nested_paths}; /// /// let paths = vec![SystemPath::new("/a/b/c"), SystemPath::new("/a/b"), SystemPath::new("/a/beta"), SystemPath::new("/a/b/c")]; /// assert_eq!(deduplicate_nested_paths(paths).collect::>(), &[SystemPath::new("/a/b"), SystemPath::new("/a/beta")]); diff --git a/crates/ty/docs/configuration.md b/crates/ty/docs/configuration.md index a2986eeb27..28bfe18646 100644 --- a/crates/ty/docs/configuration.md +++ b/crates/ty/docs/configuration.md @@ -143,6 +143,23 @@ typeshed = "/path/to/custom/typeshed" ## `src` +#### `files` + +TODO + +**Default value**: `null` + +**Type**: `list[pattern]` + +**Example usage** (`pyproject.toml`): + +```toml +[tool.ty.src] +files = ["./app", "!app/build"] +``` + +--- + #### `respect-ignore-files` Whether to automatically exclude files that are ignored by `.ignore`, diff --git a/crates/ty/src/lib.rs b/crates/ty/src/lib.rs index 91efddde60..7fa1ad4210 100644 --- a/crates/ty/src/lib.rs +++ b/crates/ty/src/lib.rs @@ -19,7 +19,7 @@ use colored::Colorize; use crossbeam::channel as crossbeam_channel; use rayon::ThreadPoolBuilder; use ruff_db::Upcast; -use ruff_db::diagnostic::{Diagnostic, DisplayDiagnosticConfig, Severity}; +use ruff_db::diagnostic::{Diagnostic, DiagnosticId, DisplayDiagnosticConfig, Severity}; use ruff_db::max_parallelism; use ruff_db::system::{OsSystem, SystemPath, SystemPathBuf}; use salsa::plumbing::ZalsaDatabase; @@ -276,7 +276,7 @@ impl MainLoop { } MainLoopMessage::CheckCompleted { - result, + mut result, revision: check_revision, } => { let terminal_settings = db.project().settings(db).terminal(); @@ -286,7 +286,11 @@ impl MainLoop { if check_revision == revision { if db.project().files(db).is_empty() { - 
tracing::warn!("No python files found under the given path(s)"); + result.push(Diagnostic::new( + DiagnosticId::NoFiles, + Severity::Warning, + "No python files found under the given path(s)", + )); } let mut stdout = stdout().lock(); diff --git a/crates/ty_project/Cargo.toml b/crates/ty_project/Cargo.toml index c6e30a6abc..2560b05d49 100644 --- a/crates/ty_project/Cargo.toml +++ b/crates/ty_project/Cargo.toml @@ -25,7 +25,9 @@ ty_vendored = { workspace = true } anyhow = { workspace = true } crossbeam = { workspace = true } +ignore = { workspace = true } glob = { workspace = true } +globset = { workspace = true } notify = { workspace = true } pep440_rs = { workspace = true, features = ["version-ranges"] } rayon = { workspace = true } @@ -35,6 +37,7 @@ schemars = { workspace = true, optional = true } serde = { workspace = true } thiserror = { workspace = true } toml = { workspace = true } +regex-automata = { workspace = true } tracing = { workspace = true } [dev-dependencies] diff --git a/crates/ty_project/src/db/changes.rs b/crates/ty_project/src/db/changes.rs index de9dcd1b42..4757fda138 100644 --- a/crates/ty_project/src/db/changes.rs +++ b/crates/ty_project/src/db/changes.rs @@ -7,7 +7,7 @@ use std::collections::BTreeSet; use crate::walk::ProjectFilesWalker; use ruff_db::Db as _; use ruff_db::files::{File, Files}; -use ruff_db::system::SystemPath; +use ruff_db::system::{FileType, SystemPath}; use rustc_hash::FxHashSet; use ty_python_semantic::Program; @@ -113,8 +113,16 @@ impl ProjectDatabase { // should be included in the project. We can skip this check for // paths that aren't part of the project or shouldn't be included // when checking the project. 
- if project.is_path_included(self, &path) { - if self.system().is_file(&path) { + let metadata = self + .system() + .path_metadata(&path) + .map(|metadata| metadata.file_type()); + if project.is_path_included( + self, + &path, + matches!(metadata, Ok(FileType::Directory)), + ) { + if matches!(metadata, Ok(FileType::File)) { // Add the parent directory because `walkdir` always visits explicitly passed files // even if they match an exclude filter. added_paths.insert(path.parent().unwrap().to_path_buf()); @@ -153,7 +161,7 @@ impl ProjectDatabase { result.custom_stdlib_changed = true; } - if project.is_path_included(self, &path) || path == project_root { + if project.is_path_included(self, &path, true) || path == project_root { // TODO: Shouldn't it be enough to simply traverse the project files and remove all // that start with the given path? tracing::debug!( diff --git a/crates/ty_project/src/lib.rs b/crates/ty_project/src/lib.rs index ecd9501a47..99f2e5027c 100644 --- a/crates/ty_project/src/lib.rs +++ b/crates/ty_project/src/lib.rs @@ -127,7 +127,7 @@ impl Reporter for DummyReporter { #[salsa::tracked] impl Project { pub fn from_metadata(db: &dyn Db, metadata: ProjectMetadata) -> Self { - let (settings, settings_diagnostics) = metadata.options().to_settings(db); + let (settings, settings_diagnostics) = metadata.options().to_settings(db, metadata.root()); Project::builder(metadata, settings, settings_diagnostics) .durability(Durability::MEDIUM) @@ -160,8 +160,8 @@ impl Project { /// the project's include and exclude settings as well as the paths that were passed to `ty check `. /// This means, that this method is an over-approximation of `Self::files` and may return `true` for paths /// that won't be included when checking the project because they're ignored in a `.gitignore` file. 
- pub fn is_path_included(self, db: &dyn Db, path: &SystemPath) -> bool { - ProjectFilesFilter::from_project(db, self).is_included(path) + pub fn is_path_included(self, db: &dyn Db, path: &SystemPath, is_directory: bool) -> bool { + ProjectFilesFilter::from_project(db, self).is_included(path, is_directory) } pub fn reload(self, db: &mut dyn Db, metadata: ProjectMetadata) { @@ -169,7 +169,8 @@ impl Project { assert_eq!(self.root(db), metadata.root()); if &metadata != self.metadata(db) { - let (settings, settings_diagnostics) = metadata.options().to_settings(db); + let (settings, settings_diagnostics) = + metadata.options().to_settings(db, metadata.root()); if self.settings(db) != &settings { self.set_settings(db).to(settings); diff --git a/crates/ty_project/src/metadata/options.rs b/crates/ty_project/src/metadata/options.rs index a9a52a4b07..9f808d4c50 100644 --- a/crates/ty_project/src/metadata/options.rs +++ b/crates/ty_project/src/metadata/options.rs @@ -1,5 +1,9 @@ use crate::Db; -use crate::metadata::value::{RangedValue, RelativePathBuf, ValueSource, ValueSourceGuard}; +use crate::metadata::settings::SrcSettings; +use crate::metadata::value::{ + RangedValue, RelativePathBuf, RelativePathPattern, ValueSource, ValueSourceGuard, +}; +use crate::walk::FilePatternsBuilder; use ruff_db::diagnostic::{Annotation, Diagnostic, DiagnosticFormat, DiagnosticId, Severity, Span}; use ruff_db::files::system_path_to_file; use ruff_db::system::{System, SystemPath, SystemPathBuf}; @@ -199,10 +203,20 @@ impl Options { } #[must_use] - pub(crate) fn to_settings(&self, db: &dyn Db) -> (Settings, Vec) { + pub(crate) fn to_settings( + &self, + db: &dyn Db, + project_root: &SystemPath, + ) -> (Settings, Vec) { let (rules, diagnostics) = self.to_rule_selection(db); - let mut settings = Settings::new(rules, self.src.as_ref()); + let mut settings = Settings::new(rules); + + if let Some(src) = self.src.as_ref() { + tracing::debug!("found src options: {src:?}"); + // TODO: Error handling + 
settings.set_src(src.to_settings(db.system(), project_root).unwrap()); + } if let Some(terminal) = self.terminal.as_ref() { settings.set_terminal(TerminalSettings { @@ -408,6 +422,17 @@ pub struct SrcOptions { )] pub root: Option, + /// TODO + #[serde(skip_serializing_if = "Option::is_none")] + #[option( + default = r#"null"#, + value_type = "list[pattern]", + example = r#" + files = ["./app", "!app/build"] + "# + )] + pub files: Option>, + /// Whether to automatically exclude files that are ignored by `.ignore`, /// `.gitignore`, `.git/info/exclude`, and global `gitignore` files. /// Enabled by default. @@ -422,6 +447,31 @@ pub struct SrcOptions { pub respect_ignore_files: Option, } +impl SrcOptions { + fn to_settings( + &self, + system: &dyn System, + project_root: &SystemPath, + // diagnostics: &mut Vec, + ) -> Result { + // TODO: Error handling, default exclusions + let mut files = FilePatternsBuilder::new(); + + for pattern in self.files.iter().flatten() { + files.add(&pattern.absolute(project_root, system)).unwrap(); + } + + let src = SrcSettings { + respect_ignore_files: self.respect_ignore_files.unwrap_or(true), + files: files.build().unwrap(), + }; + + tracing::debug!("Resolved src settings: {src:?}"); + + Ok(src) + } +} + #[derive(Debug, Default, Clone, Eq, PartialEq, Combine, Serialize, Deserialize)] #[serde(rename_all = "kebab-case", transparent)] #[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))] diff --git a/crates/ty_project/src/metadata/settings.rs b/crates/ty_project/src/metadata/settings.rs index 0a48fc559c..c55c1efe71 100644 --- a/crates/ty_project/src/metadata/settings.rs +++ b/crates/ty_project/src/metadata/settings.rs @@ -1,9 +1,9 @@ -use std::sync::Arc; - -use crate::metadata::options::SrcOptions; use ruff_db::diagnostic::DiagnosticFormat; +use std::sync::Arc; use ty_python_semantic::lint::RuleSelection; +use crate::walk::FilePatterns; + /// The resolved [`super::Options`] for the project. 
/// /// Unlike [`super::Options`], the struct has default values filled in and @@ -23,19 +23,15 @@ pub struct Settings { terminal: TerminalSettings, - respect_ignore_files: bool, + src: SrcSettings, } impl Settings { - pub fn new(rules: RuleSelection, src_options: Option<&SrcOptions>) -> Self { - let respect_ignore_files = src_options - .and_then(|src| src.respect_ignore_files) - .unwrap_or(true); - + pub fn new(rules: RuleSelection) -> Self { Self { rules: Arc::new(rules), terminal: TerminalSettings::default(), - respect_ignore_files, + src: SrcSettings::default(), } } @@ -43,8 +39,12 @@ impl Settings { &self.rules } - pub fn respect_ignore_files(&self) -> bool { - self.respect_ignore_files + pub fn src(&self) -> &SrcSettings { + &self.src + } + + pub fn set_src(&mut self, src: SrcSettings) { + self.src = src; } pub fn to_rules(&self) -> Arc { @@ -65,3 +65,20 @@ pub struct TerminalSettings { pub output_format: DiagnosticFormat, pub error_on_warning: bool, } + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct SrcSettings { + pub respect_ignore_files: bool, + + pub files: FilePatterns, +} + +impl Default for SrcSettings { + fn default() -> Self { + Self { + respect_ignore_files: true, + // TODO: This should include all files by default + files: FilePatterns::empty(), + } + } +} diff --git a/crates/ty_project/src/metadata/value.rs b/crates/ty_project/src/metadata/value.rs index b73c59a3e7..9754618956 100644 --- a/crates/ty_project/src/metadata/value.rs +++ b/crates/ty_project/src/metadata/value.rs @@ -344,3 +344,49 @@ impl RelativePathBuf { SystemPath::absolute(&self.0, relative_to) } } + +/// A relative path pattern that allows for negative patterns (git ignore style). 
+#[derive( + Debug, + Clone, + serde::Serialize, + serde::Deserialize, + PartialEq, + Eq, + PartialOrd, + Ord, + Hash, + Combine, +)] +#[serde(transparent)] +#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))] +pub struct RelativePathPattern(RangedValue); + +impl RelativePathPattern { + pub fn new(pattern: String, source: ValueSource) -> Self { + Self(RangedValue::new(pattern, source)) + } + + pub fn cli(pattern: String) -> Self { + Self::new(pattern, ValueSource::Cli) + } + + /// Returns the relative pattern. + pub fn pattern(&self) -> &str { + &self.0 + } + + /// Resolves the relative pattern to an absolute pattern. + pub fn absolute(&self, project_root: &SystemPath, system: &dyn System) -> String { + let relative_to = match &self.0.source { + ValueSource::File(_) => project_root, + ValueSource::Cli => system.current_directory(), + }; + + if let Some(after) = self.0.strip_prefix('!') { + format!("!{}", SystemPath::absolute(after, relative_to)) + } else { + SystemPath::absolute(&self.0, relative_to).into_string() + } + } +} diff --git a/crates/ty_project/src/walk.rs b/crates/ty_project/src/walk.rs index e6d7f7fdc3..c6b92654fb 100644 --- a/crates/ty_project/src/walk.rs +++ b/crates/ty_project/src/walk.rs @@ -1,10 +1,15 @@ use crate::{Db, IOErrorDiagnostic, IOErrorKind, Project}; +use globset::{Candidate, GlobBuilder, GlobSet, GlobSetBuilder}; +use regex_automata::util::pool::Pool; use ruff_db::files::{File, system_path_to_file}; use ruff_db::system::walk_directory::{ErrorKind, WalkDirectoryBuilder, WalkState}; -use ruff_db::system::{FileType, SystemPath, SystemPathBuf}; +use ruff_db::system::{FileType, SystemPath, SystemPathBuf, deduplicate_nested_paths}; use ruff_python_ast::PySourceType; use rustc_hash::{FxBuildHasher, FxHashSet}; +use std::borrow::Cow; +use std::collections::BTreeSet; use std::path::PathBuf; +use std::sync::Arc; use thiserror::Error; /// Filter that decides which files are included in the project. 
@@ -13,11 +18,15 @@ use thiserror::Error; /// /// This struct mainly exists because `dyn Db` isn't `Send` or `Sync`, making it impossible /// to access fields from within the walker. -#[derive(Default, Debug)] +#[derive(Debug)] pub(crate) struct ProjectFilesFilter<'a> { /// The same as [`Project::included_paths_or_root`]. included_paths: &'a [SystemPathBuf], + files_patterns: &'a FilePatterns, + + project_root: &'a SystemPath, + /// The filter skips checking if the path is in `included_paths` if set to `true`. /// /// Skipping this check is useful when the walker only walks over `included_paths`. @@ -28,6 +37,8 @@ impl<'a> ProjectFilesFilter<'a> { pub(crate) fn from_project(db: &'a dyn Db, project: Project) -> Self { Self { included_paths: project.included_paths_or_root(db), + project_root: project.root(db), + files_patterns: &project.settings(db).src().files, skip_included_paths: false, } } @@ -45,7 +56,7 @@ impl<'a> ProjectFilesFilter<'a> { /// This method may return `true` for files that don't end up being included when walking the /// project tree because it doesn't consider `.gitignore` and other ignore files when deciding /// if a file's included. - pub(crate) fn is_included(&self, path: &SystemPath) -> bool { + pub(crate) fn is_included(&self, path: &SystemPath, is_directory: bool) -> bool { #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] enum CheckPathMatch { /// The path is a partial match of the checked path (it's a sub path) @@ -78,8 +89,26 @@ impl<'a> ProjectFilesFilter<'a> { match m { None => false, Some(CheckPathMatch::Partial) => { + if path == self.project_root { + return true; + } + + // TODO: Do we need to use `matched_path_or_any_parents` when not walking? + + let matched = self.files_patterns.matches(path, is_directory); + tracing::debug!("path `{path} matches {matched:?}"); // TODO: For partial matches, only include the file if it is included by the project's include/exclude settings. 
- true + match matched { + // We need to traverse directories that don't match because `a` doesn't match the pattern `a/b/c/d.py` + // but we need to traverse the directory to successfully match `a/b/c/d.py`. + // This is very unfortunate because it means ty traverses all directories when e.g. using `files = ["src"]`. + // TODO(micha): 04.06.2025: It would be nice if we could avoid traversing directories + // that are known to never match because they don't share a common prefix with any of the globs. + // But we'd need to be careful in the presence of `**/test` patterns because they can match any path. + PatternMatch::None => true, + PatternMatch::Exclude(_) => false, + PatternMatch::Include => true, + } } Some(CheckPathMatch::Full) => true, } @@ -132,7 +161,7 @@ impl<'a> ProjectFilesWalker<'a> { let mut walker = db .system() .walk_directory(paths.next()?.as_ref()) - .standard_filters(db.project().settings(db).respect_ignore_files()) + .standard_filters(db.project().settings(db).src().respect_ignore_files) .ignore_hidden(false); for path in paths { @@ -152,7 +181,10 @@ impl<'a> ProjectFilesWalker<'a> { Box::new(|entry| { match entry { Ok(entry) => { - if !self.filter.is_included(entry.path()) { + if !self + .filter + .is_included(entry.path(), entry.file_type().is_directory()) + { tracing::debug!("Ignoring not-included path: {}", entry.path()); return WalkState::Skip; } @@ -258,3 +290,412 @@ pub(crate) enum WalkError { #[error("`{path}` is not a valid UTF-8 path")] NonUtf8Path { path: PathBuf }, } + +#[derive(Clone)] +pub struct FilePatterns { + set: GlobSet, + patterns: Box<[FilePattern]>, + matches: Option<Arc<Pool<Vec<usize>>>>, + static_prefixes: Option<BTreeSet<SystemPathBuf>>, + num_positive: usize, +} + +impl FilePatterns { + pub(crate) fn empty() -> Self { + Self { + set: GlobSet::empty(), + patterns: Box::default(), + matches: None, + static_prefixes: Some(BTreeSet::new()), + num_positive: 0, + } + } + + pub(crate) fn matches(&self, path: &SystemPath, is_directory: bool) -> PatternMatch { + if
self.patterns.is_empty() { + return PatternMatch::None; + } + + let candidate = Candidate::new(path); + let mut matches = self.matches.as_ref().unwrap().get(); + self.set.matches_candidate_into(&candidate, &mut *matches); + + for &i in matches.iter().rev() { + let pattern = &self.patterns[i]; + + if pattern.is_only_directory && !is_directory { + continue; + } + + return if pattern.negated { + PatternMatch::Exclude(ExcludeReason::Match) + } else { + PatternMatch::Include + }; + } + + if self.num_positive > 0 { + if is_directory { + if let Some(static_prefixes) = self.static_prefixes.as_ref() { + // Skip directories that are known to share no prefix with any glob. + // E.g. if `files = ["src"]`, skip `tests`. + if static_prefixes + .range(..=path.to_path_buf()) + .next() + .is_none() + { + return PatternMatch::Exclude(ExcludeReason::NoIncludePattern); + } + } + } else { + // If this is a file and there's at least one include pattern but the file doesn't match it, + // then the file is excluded. If there are only exclude patterns, then the file should be included.
+ return PatternMatch::Exclude(ExcludeReason::NoIncludePattern); + } + } + + PatternMatch::None + } +} + +impl PartialEq for FilePatterns { + fn eq(&self, other: &Self) -> bool { + self.patterns == other.patterns + } +} + +impl Eq for FilePatterns {} + +impl std::fmt::Debug for FilePatterns { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("FilePatterns") + .field("patterns", &self.patterns) + .finish() + } +} + +#[derive(Clone, Debug)] +pub(crate) struct FilePatternsBuilder { + set: GlobSetBuilder, + patterns: Vec, + static_prefixes: Option>, + num_positive: usize, +} + +impl FilePatternsBuilder { + pub(crate) fn new() -> Self { + Self { + set: GlobSetBuilder::new(), + patterns: Vec::new(), + static_prefixes: Some(Vec::new()), + num_positive: 0, + } + } + + pub(crate) fn add(&mut self, input: &str) -> Result<&mut Self, globset::Error> { + let mut pattern = FilePattern { + negated: false, + is_only_directory: false, + original: input.to_string(), + }; + + let mut glob = input; + + if let Some(after) = glob.strip_prefix('!') { + pattern.negated = true; + glob = after; + } + + // A pattern ending with a `/` should only match directories. E.g. `src/` only matches directories + // whereas `src` matches both files and directories. + // We need to remove the `/` to ensure that a path missing the trailing `/` matches. + if let Some(before) = glob.strip_suffix('/') { + pattern.is_only_directory = true; + glob = before; + + // If the slash was escaped, then remove the escape. + // See: https://github.com/BurntSushi/ripgrep/issues/2236 + let trailing_backslashes = glob.chars().rev().filter(|c| *c == '\\').count(); + if trailing_backslashes % 2 == 1 { + glob = &glob[..glob.len() - trailing_backslashes] + } + } + + // If the last component contains no wildcards or extension, consider it an implicit glob + // This turns `src` into `src/**/*` + // TODO: Should we also enable this behavior for `is_only_directory` patterns? 
+ if is_implicit_glob(glob) && !pattern.negated { + let parsed = GlobBuilder::new(&format!("{glob}/**")) + .literal_separator(true) + .backslash_escape(true) + // TODO: Map the error to the pattern the user provided. + .build()?; + + self.set.add(parsed); + self.patterns.push(FilePattern { + is_only_directory: false, + ..pattern.clone() + }); + } + + let mut actual = Cow::Borrowed(glob); + + // If the glob ends with `/**`, then we should only match everything + // inside a directory, but not the directory itself. Standard globs + // will match the directory. So we add `/*` to force the issue. + if actual.ends_with("/**") { + actual = Cow::Owned(format!("{}/*", actual)); + } + + // Unlike gitignore, anchor paths (don't insert a `**` prefix). + let parsed = GlobBuilder::new(&*actual) + .literal_separator(true) + .backslash_escape(true) + // TODO: Map the error to the pattern the user provided. + .build()?; + + if !pattern.negated { + self.num_positive += 1; + + // Do a best effort at extracting a static prefix from a positive include match. + // This allows short-circuiting traversal of folders that are known to not overlap with any positive + // match. However, we have to be careful. Any path starting with a `**` requires visiting all folders. + if let Some(static_prefixes) = self.static_prefixes.as_mut() { + let mut static_prefix = SystemPathBuf::new(); + for component in SystemPath::new(glob).components() { + if glob::Pattern::escape(component.as_str()) == component.as_str() { + static_prefix.push(component); + } else { + break; + } + } + + if static_prefix.as_str().is_empty() { + // If we see a `**/` pattern, then we have to visit all directories.
+ self.static_prefixes.take(); + } else { + static_prefixes.push(static_prefix); + } + } + } + + self.set.add(parsed); + self.patterns.push(pattern); + + Ok(self) + } + + pub(crate) fn build(self) -> Result { + let static_prefixes = self + .static_prefixes + .map(|prefixes| deduplicate_nested_paths(prefixes).collect::>()); + + Ok(FilePatterns { + set: self.set.build()?, + patterns: self.patterns.into(), + matches: Some(Arc::new(Pool::new(|| vec![]))), + static_prefixes, + num_positive: self.num_positive, + }) + } +} + +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub(crate) enum PatternMatch { + /// The highest precedence pattern is an include pattern. + Include, + + /// The highest precedence pattern is a negated pattern (the file should not be included). + Exclude(ExcludeReason), + + /// No pattern matched the path. + None, +} + +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub(crate) enum ExcludeReason { + /// The path is excluded because it matches a negative pattern. + Match, + + /// It's a file path that doesn't match any include pattern. + NoIncludePattern, +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub(crate) struct FilePattern { + /// The pattern as specified by the user. + original: String, + + /// Whether the glob should only match directories (`src/` matches only directories). + is_only_directory: bool, + + /// Whether this pattern was negated. 
+ negated: bool, +} + +fn is_implicit_glob(pattern: &str) -> bool { + let as_path = SystemPath::new(pattern); + + as_path + .components() + .last() + .is_some_and(|last| !last.as_str().contains(['.', '*', '?'])) +} + +#[cfg(test)] +mod tests { + use ruff_db::system::SystemPath; + + use crate::walk::{ExcludeReason, FilePatterns, FilePatternsBuilder, PatternMatch}; + + fn create_patterns(patterns: impl IntoIterator) -> FilePatterns { + let mut builder = FilePatternsBuilder::new(); + + for pattern in patterns { + builder.add(pattern).unwrap_or_else(|err| { + panic!("Invalid pattern '{pattern}`: {err}"); + }); + } + + builder.build().unwrap() + } + + #[test] + fn all() { + let patterns = create_patterns(["**"]); + + assert_eq!( + patterns.matches(SystemPath::new("/src"), true), + PatternMatch::Include + ); + assert_eq!( + patterns.matches(SystemPath::new("/src/"), true), + PatternMatch::Include + ); + + assert_eq!( + patterns.matches(SystemPath::new("/"), true), + PatternMatch::Include + ); + assert_eq!( + patterns.matches(SystemPath::new("/test.py"), true), + PatternMatch::Include + ); + } + + #[test] + fn implicit_directory_pattern() { + // Patterns ending with a slash only match directories with the given name, but not files. 
+ // It includes all files in said directory + let patterns = create_patterns(["/src/"]); + + assert_eq!( + patterns.matches(SystemPath::new("/src"), true), + PatternMatch::Include + ); + assert_eq!( + patterns.matches(SystemPath::new("/src/"), true), + PatternMatch::Include + ); + + // Don't include files, because the pattern ends with `/` + assert_eq!( + patterns.matches(SystemPath::new("/src"), false), + PatternMatch::Exclude(ExcludeReason::NoIncludePattern) + ); + + // But include the content of src + assert_eq!( + patterns.matches(SystemPath::new("/src/test.py"), false), + PatternMatch::Include + ); + + // Deep nesting + assert_eq!( + patterns.matches(SystemPath::new("/src/glob/builder.py"), false), + PatternMatch::Include + ); + + // Or a file with the same name + assert_eq!( + patterns.matches(SystemPath::new("/src/src"), false), + PatternMatch::Include + ); + + // Or a directory with the same name + assert_eq!( + patterns.matches(SystemPath::new("/src/src"), true), + PatternMatch::Include + ); + } + + #[test] + fn implicit_pattern() { + // Patterns ending without a slash include both files and directories. 
+ // It includes all files in said directory + let patterns = create_patterns(["/src"]); + + assert_eq!( + patterns.matches(SystemPath::new("/src"), true), + PatternMatch::Include + ); + assert_eq!( + patterns.matches(SystemPath::new("/src/"), true), + PatternMatch::Include + ); + + // Also include files + assert_eq!( + patterns.matches(SystemPath::new("/src"), false), + PatternMatch::Include + ); + + assert_eq!( + patterns.matches(SystemPath::new("/src/test.py"), false), + PatternMatch::Include + ); + + // Deep nesting + assert_eq!( + patterns.matches(SystemPath::new("/src/glob/builder.py"), false), + PatternMatch::Include + ); + + // Or a file with the same name + assert_eq!( + patterns.matches(SystemPath::new("/src/src"), false), + PatternMatch::Include + ); + + // Or a directory with the same name + assert_eq!( + patterns.matches(SystemPath::new("/src/src"), true), + PatternMatch::Include + ); + } + + #[test] + fn pattern_with_extension() { + // Patterns with an extension only match files or directories with the exact name. 
+ let patterns = create_patterns(["test.py"]); + + assert_eq!( + patterns.matches(SystemPath::new("test.py"), true), + PatternMatch::Include + ); + assert_eq!( + patterns.matches(SystemPath::new("test.py"), false), + PatternMatch::Include + ); + + assert_eq!( + patterns.matches(SystemPath::new("test.py/abcd"), false), + PatternMatch::Exclude(ExcludeReason::NoIncludePattern) + ); + + assert_eq!( + patterns.matches(SystemPath::new("test.py/abcd"), true), + PatternMatch::None + ); + } +} diff --git a/ty.schema.json b/ty.schema.json index f4ae8d241b..adbafa4e74 100644 --- a/ty.schema.json +++ b/ty.schema.json @@ -851,6 +851,16 @@ "SrcOptions": { "type": "object", "properties": { + "files": { + "description": "TODO", + "type": [ + "array", + "null" + ], + "items": { + "type": "string" + } + }, "respect-ignore-files": { "description": "Whether to automatically exclude files that are ignored by `.ignore`, `.gitignore`, `.git/info/exclude`, and global `gitignore` files. Enabled by default.", "type": [