diff --git a/Cargo.lock b/Cargo.lock index c2f109f23..006dfea70 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4522,7 +4522,6 @@ dependencies = [ "csv", "flate2", "fs-err", - "glob", "globset", "indoc", "insta", @@ -4537,6 +4536,7 @@ dependencies = [ "tracing", "uv-distribution-filename", "uv-fs", + "uv-globfilter", "uv-normalize", "uv-pep440", "uv-pep508", @@ -4974,6 +4974,21 @@ dependencies = [ "which", ] +[[package]] +name = "uv-globfilter" +version = "0.1.0" +dependencies = [ + "fs-err", + "globset", + "insta", + "regex", + "regex-automata 0.4.8", + "tempfile", + "thiserror", + "tracing", + "walkdir", +] + [[package]] name = "uv-install-wheel" version = "0.0.1" diff --git a/Cargo.toml b/Cargo.toml index 82f490c75..609dd4567 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,6 +39,7 @@ uv-distribution-types = { path = "crates/uv-distribution-types" } uv-extract = { path = "crates/uv-extract" } uv-fs = { path = "crates/uv-fs" } uv-git = { path = "crates/uv-git" } +uv-globfilter = { path = "crates/uv-globfilter" } uv-install-wheel = { path = "crates/uv-install-wheel", default-features = false } uv-installer = { path = "crates/uv-installer" } uv-macros = { path = "crates/uv-macros" } @@ -134,6 +135,7 @@ quote = { version = "1.0.37" } rayon = { version = "1.10.0" } reflink-copy = { version = "0.1.19" } regex = { version = "1.10.6" } +regex-automata = { version = "0.4.8", default-features = false, features = ["dfa-build", "dfa-search", "perf", "std", "syntax"] } reqwest = { version = "0.12.7", default-features = false, features = ["json", "gzip", "stream", "rustls-tls", "rustls-tls-native-roots", "socks", "multipart", "http2"] } reqwest-middleware = { version = "0.4.0", features = ["multipart"] } reqwest-retry = { version = "0.7.0" } diff --git a/crates/uv-build-backend/Cargo.toml b/crates/uv-build-backend/Cargo.toml index 962dcaf5d..b07524cb0 100644 --- a/crates/uv-build-backend/Cargo.toml +++ b/crates/uv-build-backend/Cargo.toml @@ -15,6 +15,7 @@ doctest = false 
[dependencies] uv-distribution-filename = { workspace = true } uv-fs = { workspace = true } +uv-globfilter = { workspace = true } uv-normalize = { workspace = true } uv-pep440 = { workspace = true } uv-pep508 = { workspace = true } @@ -24,7 +25,6 @@ uv-warnings = { workspace = true } csv = { workspace = true } flate2 = { workspace = true } fs-err = { workspace = true } -glob = { workspace = true } globset = { workspace = true } itertools = { workspace = true } serde = { workspace = true } diff --git a/crates/uv-build-backend/src/lib.rs b/crates/uv-build-backend/src/lib.rs index 51c8d95bd..771465055 100644 --- a/crates/uv-build-backend/src/lib.rs +++ b/crates/uv-build-backend/src/lib.rs @@ -1,13 +1,10 @@ mod metadata; -mod pep639_glob; use crate::metadata::{PyProjectToml, ValidationError}; -use crate::pep639_glob::Pep639GlobError; use flate2::write::GzEncoder; use flate2::Compression; use fs_err::File; -use glob::{GlobError, PatternError}; -use globset::{Glob, GlobSetBuilder}; +use globset::GlobSetBuilder; use itertools::Itertools; use sha2::{Digest, Sha256}; use std::fs::FileType; @@ -19,6 +16,7 @@ use thiserror::Error; use tracing::{debug, trace}; use uv_distribution_filename::{SourceDistExtension, SourceDistFilename, WheelFilename}; use uv_fs::Simplified; +use uv_globfilter::{parse_portable_glob, GlobDirFilter, PortableGlobError}; use walkdir::WalkDir; use zip::{CompressionMethod, ZipWriter}; @@ -30,16 +28,26 @@ pub enum Error { Toml(#[from] toml::de::Error), #[error("Invalid pyproject.toml")] Validation(#[from] ValidationError), - #[error("Invalid `project.license-files` glob expression: `{0}`")] - Pep639Glob(String, #[source] Pep639GlobError), - #[error("The `project.license-files` entry is not a valid glob pattern: `{0}`")] - Pattern(String, #[source] PatternError), - /// [`GlobError`] is a wrapped io error. 
- #[error(transparent)] - Glob(#[from] GlobError), + #[error("Unsupported glob expression in: `{field}`")] + PortableGlob { + field: String, + #[source] + source: PortableGlobError, + }, + /// + #[error("Glob expressions caused to large regex in: `{field}`")] + GlobSetTooLarge { + field: String, + #[source] + source: globset::Error, + }, /// [`globset::Error`] shows the glob that failed to parse. - #[error(transparent)] - GlobSet(#[from] globset::Error), + #[error("Unsupported glob expression in: `{field}`")] + GlobSet { + field: String, + #[source] + err: globset::Error, + }, #[error("Failed to walk source tree: `{}`", root.user_display())] WalkDir { root: PathBuf, @@ -322,7 +330,10 @@ pub fn build_wheel( err, })?; - let relative_path = entry.path().strip_prefix(&strip_root)?; + let relative_path = entry + .path() + .strip_prefix(&strip_root) + .expect("walkdir starts with root"); let relative_path_str = relative_path .to_str() .ok_or_else(|| Error::NotUtf8Path(relative_path.to_path_buf()))?; @@ -354,10 +365,52 @@ pub fn build_wheel( Ok(filename) } +/// TODO(konsti): Wire this up with actual settings and remove this struct. +/// +/// To select which files to include in the source distribution, we first add the includes, then +/// remove the excludes from that. +pub struct SourceDistSettings { + /// Glob expressions which files and directories to include in the source distribution. + /// + /// Includes are anchored, which means that `pyproject.toml` includes only + /// `/pyproject.toml`. Use for example `assets/**/sample.csv` to include for all + /// `sample.csv` files in `/assets` or any child directory. To recursively include + /// all files under a directory, use a `/**` suffix, e.g. `src/**`. For performance and + /// reproducibility, avoid unanchored matches such as `**/sample.csv`. + /// + /// The glob syntax is the reduced portable glob from + /// [PEP 639](https://peps.python.org/pep-0639/#add-license-FILES-key). 
+ include: Vec<String>, + /// Glob expressions which files and directories to exclude from the previous source + /// distribution includes. + /// + /// Excludes are not anchored, which means that `__pycache__` excludes all directories named + /// `__pycache__` and its children anywhere. To anchor a directory, use a `/` prefix, e.g., + /// `/dist` will exclude only `/dist`. + /// + /// The glob syntax is the reduced portable glob from + /// [PEP 639](https://peps.python.org/pep-0639/#add-license-files-key). + exclude: Vec<String>, +} + +impl Default for SourceDistSettings { + fn default() -> Self { + Self { + include: vec!["src/**".to_string(), "pyproject.toml".to_string()], + exclude: vec![ + "__pycache__".to_string(), + "*.pyc".to_string(), + "*.pyo".to_string(), + ], + } + } +} + /// Build a source distribution from the source tree and place it in the output directory. pub fn build_source_dist( source_tree: &Path, source_dist_directory: &Path, + settings: SourceDistSettings, uv_version: &str, ) -> Result { let contents = fs_err::read_to_string(source_tree.join("pyproject.toml"))?; @@ -392,42 +445,75 @@ pub fn build_source_dist( ) .map_err(|err| Error::TarWrite(source_dist_path.clone(), err))?; - let includes = ["src/**/*", "pyproject.toml"]; - let mut include_builder = GlobSetBuilder::new(); - for include in includes { - include_builder.add(Glob::new(include)?); + let mut include_globs = Vec::new(); + for include in settings.include { + let glob = parse_portable_glob(&include).map_err(|err| Error::PortableGlob { + field: "tool.uv.source-dist.include".to_string(), + source: err, + })?; + include_globs.push(glob.clone()); } - let include_matcher = include_builder.build()?; + let include_matcher = + GlobDirFilter::from_globs(&include_globs).map_err(|err| Error::GlobSetTooLarge { + field: "tool.uv.source-dist.include".to_string(), + source: err, + })?; - let excludes = ["__pycache__", "*.pyc", "*.pyo"]; let mut exclude_builder = GlobSetBuilder::new(); - for exclude in excludes { - 
exclude_builder.add(Glob::new(exclude)?); + for exclude in settings.exclude { + // Excludes are unanchored + let exclude = if let Some(exclude) = exclude.strip_prefix("/") { + exclude.to_string() + } else { + format!("**/{exclude}").to_string() + }; + let glob = parse_portable_glob(&exclude).map_err(|err| Error::PortableGlob { + field: "tool.uv.source-dist.exclude".to_string(), + source: err, + })?; + exclude_builder.add(glob); } - let exclude_matcher = exclude_builder.build()?; + let exclude_matcher = exclude_builder + .build() + .map_err(|err| Error::GlobSetTooLarge { + field: "tool.uv.source-dist.exclude".to_string(), + source: err, + })?; // TODO(konsti): Add files linked by pyproject.toml - for file in WalkDir::new(source_tree).into_iter().filter_entry(|dir| { - let relative = dir - .path() - .strip_prefix(source_tree) - .expect("walkdir starts with root"); - // TODO(konsti): Also check that we're matching at least a prefix of an include matcher. - !exclude_matcher.is_match(relative) - }) { - let entry = file.map_err(|err| Error::WalkDir { - root: source_tree.to_path_buf(), - err, - })?; + for entry in WalkDir::new(source_tree).into_iter().filter_entry(|entry| { + // TODO(konsti): This should be prettier. let relative = entry .path() .strip_prefix(source_tree) - .expect("walkdir starts with root"); - if !include_matcher.is_match(relative) { + .expect("walkdir starts with root") + .to_path_buf(); + + // Fast path: Don't descend into a directory that can't be included. This is the most + // important performance optimization, it avoids descending into directories such as + // `.venv`. While walkdir is generally cheap, we still avoid traversing large data + // directories that often exist on the top level of a project. This is especially noticeable + // on network file systems with high latencies per operation (while contiguous reading may + // still be fast). 
+ include_matcher.match_directory(&relative) && !exclude_matcher.is_match(&relative) + }) { + let entry = entry.map_err(|err| Error::WalkDir { + root: source_tree.to_path_buf(), + err, + })?; + // TODO(konsti): This should be prettier. + let relative = entry + .path() + .strip_prefix(source_tree) + .expect("walkdir starts with root") + .to_path_buf(); + + if !include_matcher.match_path(&relative) || exclude_matcher.is_match(&relative) { trace!("Excluding {}", relative.user_display()); continue; - } + }; + debug!("Including {}", relative.user_display()); let metadata = fs_err::metadata(entry.path())?; @@ -462,7 +548,7 @@ pub fn build_source_dist( .map_err(|err| Error::TarWrite(source_dist_path.clone(), err))?; } else { return Err(Error::UnsupportedFileType( - relative.to_path_buf(), + relative.clone(), entry.file_type(), )); } diff --git a/crates/uv-build-backend/src/metadata.rs b/crates/uv-build-backend/src/metadata.rs index 30219acb7..577a26d85 100644 --- a/crates/uv-build-backend/src/metadata.rs +++ b/crates/uv-build-backend/src/metadata.rs @@ -1,19 +1,21 @@ -use crate::pep639_glob::parse_pep639_glob; use crate::Error; +use globset::{Glob, GlobSetBuilder}; use itertools::Itertools; use serde::Deserialize; use std::collections::{BTreeMap, Bound}; use std::ffi::OsStr; use std::path::{Path, PathBuf}; use std::str::FromStr; -use tracing::debug; +use tracing::{debug, trace}; use uv_fs::Simplified; +use uv_globfilter::parse_portable_glob; use uv_normalize::{ExtraName, PackageName}; use uv_pep440::{Version, VersionSpecifiers}; use uv_pep508::{Requirement, VersionOrUrl}; use uv_pypi_types::{Metadata23, VerbatimParsedUrl}; use uv_warnings::warn_user_once; use version_ranges::Ranges; +use walkdir::WalkDir; #[derive(Debug, Error)] pub enum ValidationError { @@ -312,27 +314,53 @@ impl PyProjectToml { }; let mut license_files = Vec::new(); + let mut license_glob_builder = GlobSetBuilder::new(); for license_glob in license_globs { - let pep639_glob = 
parse_pep639_glob(license_glob) - .map_err(|err| Error::Pep639Glob(license_glob.to_string(), err))?; - let absolute_glob = PathBuf::from(glob::Pattern::escape( + let pep639_glob = + parse_portable_glob(license_glob).map_err(|err| Error::PortableGlob { + field: license_glob.to_string(), + source: err, + })?; + let absolute_glob = PathBuf::from(globset::escape( root.simplified().to_string_lossy().as_ref(), )) .join(pep639_glob.to_string()) .to_string_lossy() .to_string(); - for license_file in glob::glob(&absolute_glob) - .map_err(|err| Error::Pattern(absolute_glob.to_string(), err))? - { - let license_file = license_file - .map_err(Error::Glob)? - .to_string_lossy() - .to_string(); - if !license_files.contains(&license_file) { - license_files.push(license_file); + license_glob_builder.add(Glob::new(&absolute_glob).map_err(|err| { + Error::GlobSet { + field: "project.license-files".to_string(), + err, } + })?); + } + let license_globs = license_glob_builder.build().map_err(|err| Error::GlobSet { + field: "project.license-files".to_string(), + err, + })?; + + for entry in WalkDir::new(".") { + let entry = entry.map_err(|err| Error::WalkDir { + root: PathBuf::from("."), + err, + })?; + let relative = entry + .path() + .strip_prefix("./") + .expect("walkdir starts with root"); + if !license_globs.is_match(relative) { + trace!("Not a license files match: `{}`", relative.user_display()); + continue; + } + + debug!("License files match: `{}`", relative.user_display()); + let license_file = relative.to_string_lossy().to_string(); + + if !license_files.contains(&license_file) { + license_files.push(license_file); } } + // The glob order may be unstable license_files.sort(); diff --git a/crates/uv-build-backend/src/pep639_glob.rs b/crates/uv-build-backend/src/pep639_glob.rs deleted file mode 100644 index aae4d68f2..000000000 --- a/crates/uv-build-backend/src/pep639_glob.rs +++ /dev/null @@ -1,81 +0,0 @@ -//! Implementation of PEP 639 cross-language restricted globs. 
- -use glob::{Pattern, PatternError}; -use thiserror::Error; - -#[derive(Debug, Error)] -pub enum Pep639GlobError { - #[error(transparent)] - PatternError(#[from] PatternError), - #[error("The parent directory operator (`..`) at position {pos} is not allowed in license file globs")] - ParentDirectory { pos: usize }, - #[error("Glob contains invalid character at position {pos}: `{invalid}`")] - InvalidCharacter { pos: usize, invalid: char }, - #[error("Glob contains invalid character in range at position {pos}: `{invalid}`")] - InvalidCharacterRange { pos: usize, invalid: char }, -} - -/// Parse a PEP 639 `license-files` glob. -/// -/// The syntax is more restricted than regular globbing in Python or Rust for platform independent -/// results. Since [`glob::Pattern`] is a superset over this format, we can use it after validating -/// that no unsupported features are in the string. -/// -/// From [PEP 639](https://peps.python.org/pep-0639/#add-license-files-key): -/// -/// > Its value is an array of strings which MUST contain valid glob patterns, -/// > as specified below: -/// > -/// > - Alphanumeric characters, underscores (`_`), hyphens (`-`) and dots (`.`) -/// > MUST be matched verbatim. -/// > -/// > - Special glob characters: `*`, `?`, `**` and character ranges: `[]` -/// > containing only the verbatim matched characters MUST be supported. -/// > Within `[...]`, the hyphen indicates a range (e.g. `a-z`). -/// > Hyphens at the start or end are matched literally. -/// > -/// > - Path delimiters MUST be the forward slash character (`/`). -/// > Patterns are relative to the directory containing `pyproject.toml`, -/// > therefore the leading slash character MUST NOT be used. -/// > -/// > - Parent directory indicators (`..`) MUST NOT be used. -/// > -/// > Any characters or character sequences not covered by this specification are -/// > invalid. Projects MUST NOT use such values. -/// > Tools consuming this field MAY reject invalid values with an error. 
-pub(crate) fn parse_pep639_glob(glob: &str) -> Result { - let mut chars = glob.chars().enumerate().peekable(); - // A `..` is on a parent directory indicator at the start of the string or after a directory - // separator. - let mut start_or_slash = true; - while let Some((pos, c)) = chars.next() { - if c.is_alphanumeric() || matches!(c, '_' | '-' | '*' | '?') { - start_or_slash = false; - } else if c == '.' { - if start_or_slash && matches!(chars.peek(), Some((_, '.'))) { - return Err(Pep639GlobError::ParentDirectory { pos }); - } - start_or_slash = false; - } else if c == '/' { - start_or_slash = true; - } else if c == '[' { - for (pos, c) in chars.by_ref() { - // TODO: https://discuss.python.org/t/pep-639-round-3-improving-license-clarity-with-better-package-metadata/53020/98 - if c.is_alphanumeric() || matches!(c, '_' | '-' | '.') { - // Allowed. - } else if c == ']' { - break; - } else { - return Err(Pep639GlobError::InvalidCharacterRange { pos, invalid: c }); - } - } - start_or_slash = false; - } else { - return Err(Pep639GlobError::InvalidCharacter { pos, invalid: c }); - } - } - Ok(Pattern::new(glob)?) 
-} - -#[cfg(test)] -mod tests; diff --git a/crates/uv-build-backend/src/pep639_glob/tests.rs b/crates/uv-build-backend/src/pep639_glob/tests.rs deleted file mode 100644 index 1bb02520c..000000000 --- a/crates/uv-build-backend/src/pep639_glob/tests.rs +++ /dev/null @@ -1,54 +0,0 @@ -use super::*; -use insta::assert_snapshot; - -#[test] -fn test_error() { - let parse_err = |glob| parse_pep639_glob(glob).unwrap_err().to_string(); - assert_snapshot!( - parse_err(".."), - @"The parent directory operator (`..`) at position 0 is not allowed in license file globs" - ); - assert_snapshot!( - parse_err("licenses/.."), - @"The parent directory operator (`..`) at position 9 is not allowed in license file globs" - ); - assert_snapshot!( - parse_err("licenses/LICEN!E.txt"), - @"Glob contains invalid character at position 14: `!`" - ); - assert_snapshot!( - parse_err("licenses/LICEN[!C]E.txt"), - @"Glob contains invalid character in range at position 15: `!`" - ); - assert_snapshot!( - parse_err("licenses/LICEN[C?]E.txt"), - @"Glob contains invalid character in range at position 16: `?`" - ); - assert_snapshot!(parse_err("******"), @"Pattern syntax error near position 2: wildcards are either regular `*` or recursive `**`"); - assert_snapshot!( - parse_err(r"licenses\eula.txt"), - @r"Glob contains invalid character at position 8: `\`" - ); -} - -#[test] -fn test_valid() { - let cases = [ - "licenses/*.txt", - "licenses/**/*.txt", - "LICEN[CS]E.txt", - "LICEN?E.txt", - "[a-z].txt", - "[a-z._-].txt", - "*/**", - "LICENSE..txt", - "LICENSE_file-1.txt", - // (google translate) - "licenses/라이센스*.txt", - "licenses/ライセンス*.txt", - "licenses/执照*.txt", - ]; - for case in cases { - parse_pep639_glob(case).unwrap(); - } -} diff --git a/crates/uv-build-backend/src/tests.rs b/crates/uv-build-backend/src/tests.rs index 265e083bd..34af7e381 100644 --- a/crates/uv-build-backend/src/tests.rs +++ b/crates/uv-build-backend/src/tests.rs @@ -79,7 +79,7 @@ fn test_prepare_metadata() { .unwrap() .path() 
.strip_prefix(metadata_dir.path()) - .unwrap() + .expect("walkdir starts with root") .portable_display() .to_string() }) diff --git a/crates/uv-globfilter/Cargo.toml b/crates/uv-globfilter/Cargo.toml new file mode 100644 index 000000000..047ede182 --- /dev/null +++ b/crates/uv-globfilter/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "uv-globfilter" +version = "0.1.0" +readme = "README.md" +edition.workspace = true +rust-version.workspace = true +homepage.workspace = true +documentation.workspace = true +repository.workspace = true +authors.workspace = true +license.workspace = true + +[dependencies] +globset = { workspace = true } +regex = { workspace = true } +regex-automata = { workspace = true } +thiserror = { workspace = true } +tracing = { workspace = true } +walkdir = { workspace = true } + +[dev-dependencies] +fs-err = "2.11.0" +insta = "1.41.1" +tempfile = "3.14.0" + +[lints] +workspace = true diff --git a/crates/uv-globfilter/README.md b/crates/uv-globfilter/README.md new file mode 100644 index 000000000..7dfd8b41f --- /dev/null +++ b/crates/uv-globfilter/README.md @@ -0,0 +1,34 @@ +# globfilter + +Portable directory walking with includes and excludes. + +Motivating example: You want to allow the user to select paths within a project. + +```toml +include = ["src", "License.txt", "resources/icons/*.svg"] +exclude = ["target", "/dist", ".cache", "*.tmp"] +``` + +When traversing the directory, you can use +`GlobDirFilter::from_globs(...)?.match_directory(&relative)` to skip directories that never match in +`WalkDir`'s `filter_entry`. + +## Syntax + +This crate supports the cross-language, restricted glob syntax from +[PEP 639](https://peps.python.org/pep-0639/#add-license-files-key): + +- Alphanumeric characters, underscores (`_`), hyphens (`-`) and dots (`.`) are matched verbatim. 
+- The special glob characters are: + - `*`: Matches any number of characters except path separators + - `?`: Matches a single character except the path separator + - `**`: Matches any number of characters including path separators + - `[]`, containing only the verbatim matched characters: Matches a single one of the characters + contained. Within `[...]`, the hyphen indicates a locale-agnostic range (e.g. `a-z`, order based + on Unicode code points). Hyphens at the start or end are matched literally. +- The path separator is the forward slash character (`/`). Patterns are relative to the given + directory; a leading slash character for absolute paths is not supported. +- Parent directory indicators (`..`) are not allowed. + +These rules mean that matching the backslash (`\`) is forbidden, which avoids collisions with the +Windows path separator. diff --git a/crates/uv-globfilter/src/glob_dir_filter.rs b/crates/uv-globfilter/src/glob_dir_filter.rs new file mode 100644 index 000000000..4f8d81657 --- /dev/null +++ b/crates/uv-globfilter/src/glob_dir_filter.rs @@ -0,0 +1,276 @@ +use globset::{Glob, GlobSet, GlobSetBuilder}; +use regex_automata::dfa; +use regex_automata::dfa::Automaton; +use std::path::{Path, MAIN_SEPARATOR, MAIN_SEPARATOR_STR}; +use tracing::warn; + +/// Chosen at a whim -Konsti +const DFA_SIZE_LIMIT: usize = 1_000_000; + +/// Filter a directory tree traversal (walkdir) by whether any paths of a directory can be included +/// at all. +/// +/// Internally, the globs are converted to a regex and then to a DFA, which unlike the globs and the +/// regex allows checking for prefix matches. +pub struct GlobDirFilter { + glob_set: GlobSet, + dfa: Option<dfa::dense::DFA<Vec<u32>>>, +} + +impl GlobDirFilter { + /// The filter matches if any of the globs matches. + /// + /// See [`GlobSetBuilder::build`] for the error returned. 
+ pub fn from_globs(globs: &[Glob]) -> Result<Self, globset::Error> { + let mut glob_set_builder = GlobSetBuilder::new(); + for glob in globs { + glob_set_builder.add(glob.clone()); + } + let glob_set = glob_set_builder.build()?; + + let regexes: Vec<_> = globs + .iter() + .map(|glob| { + let main_separator = regex::escape(MAIN_SEPARATOR_STR); + let regex = glob + .regex() + // We are using a custom DFA builder + .strip_prefix("(?-u)") + .expect("a glob is a non-unicode byte regex") + // Match windows paths if applicable + .replace('/', &main_separator); + regex + }) + .collect(); + + let dfa_builder = dfa::dense::Builder::new() + .syntax( + // The glob regex is a byte matcher + regex_automata::util::syntax::Config::new() + .unicode(false) + .utf8(false), + ) + .configure( + dfa::dense::Config::new() + .start_kind(dfa::StartKind::Anchored) + // DFA can grow exponentially, in which case we bail out + .dfa_size_limit(Some(DFA_SIZE_LIMIT)) + .determinize_size_limit(Some(DFA_SIZE_LIMIT)), + ) + .build_many(&regexes); + let dfa = if let Ok(dfa) = dfa_builder { + Some(dfa) + } else { + // TODO(konsti): `regex_automata::dfa::dense::BuildError` should allow asking whether + // it is a size error + warn!( + "Glob expressions regex is larger than {DFA_SIZE_LIMIT} bytes, \ + falling back to full directory traversal!" + ); + None + }; + + Ok(Self { glob_set, dfa }) + } + + /// Whether the path (file or directory) matches any of the globs. + pub fn match_path(&self, path: &Path) -> bool { + self.glob_set.is_match(path) + } + + /// Check whether a directory or any of its children can be matched by any of the globs. + /// + /// This method never returns false if any child matches, but it may return true even if we + /// don't end up including any child. 
+ pub fn match_directory(&self, path: &Path) -> bool { + let Some(dfa) = &self.dfa else { + return true; // no prefilter: descend everywhere, `match_path` still filters + }; + + // Allow the root path + if path == Path::new("") { + return true; + } + + let config_anchored = + regex_automata::util::start::Config::new().anchored(regex_automata::Anchored::Yes); + let mut state = dfa.start_state(&config_anchored).unwrap(); + + // Paths aren't necessarily UTF-8, which we can gloss over since the globs match bytes only + // anyway. + let byte_path = path.as_os_str().as_encoded_bytes(); + for b in byte_path { + state = dfa.next_state(state, *b); + } + // Say we're looking at a directory `foo/bar`. We want to continue if either `foo/bar` is + // a match, e.g., from `foo/*`, or a path below it can match, e.g., from `foo/bar/*`. + let eoi_state = dfa.next_eoi_state(state); + // We must not call `next_eoi_state` on the slash state, we want to only check if more + // characters (path components) are allowed, not if we're matching the `$` anchor at the + // end. 
+ let slash_state = dfa.next_state(state, u8::try_from(MAIN_SEPARATOR).unwrap()); + + debug_assert!( + !dfa.is_quit_state(eoi_state) && !dfa.is_quit_state(slash_state), + "matcher is in quit state" + ); + + dfa.is_match_state(eoi_state) || !dfa.is_dead_state(slash_state) + } +} + +#[cfg(test)] +mod tests { + use crate::glob_dir_filter::GlobDirFilter; + use crate::portable_glob::parse_portable_glob; + use std::path::{Path, MAIN_SEPARATOR}; + use tempfile::tempdir; + use walkdir::WalkDir; + + const FILES: [&str; 5] = [ + "path1/dir1/subdir/a.txt", + "path2/dir2/subdir/a.txt", + "path3/dir3/subdir/a.txt", + "path4/dir4/subdir/a.txt", + "path5/dir5/subdir/a.txt", + ]; + + const PATTERNS: [&str; 5] = [ + // Only sufficient for descending one level + "path1/*", + // Only sufficient for descending one level + "path2/dir2", + // Sufficient for descending + "path3/dir3/subdir/a.txt", + // Sufficient for descending + "path4/**/*", + // Not sufficient for descending + "path5", + ]; + + #[test] + fn match_directory() { + let patterns = PATTERNS.map(|pattern| parse_portable_glob(pattern).unwrap()); + let matcher = GlobDirFilter::from_globs(&patterns).unwrap(); + assert!(matcher.match_directory(&Path::new("path1").join("dir1"))); + assert!(matcher.match_directory(&Path::new("path2").join("dir2"))); + assert!(matcher.match_directory(&Path::new("path3").join("dir3"))); + assert!(matcher.match_directory(&Path::new("path4").join("dir4"))); + assert!(!matcher.match_directory(&Path::new("path5").join("dir5"))); + } + + /// Check that we skip directories that can never match. 
+ #[test] + fn prefilter() { + let dir = tempdir().unwrap(); + for file in FILES { + let file = dir.path().join(file); + fs_err::create_dir_all(file.parent().unwrap()).unwrap(); + fs_err::File::create(file).unwrap(); + } + let patterns = PATTERNS.map(|pattern| parse_portable_glob(pattern).unwrap()); + let matcher = GlobDirFilter::from_globs(&patterns).unwrap(); + + // Test the prefix filtering + let mut visited: Vec<_> = WalkDir::new(dir.path()) + .into_iter() + .filter_entry(|entry| { + let relative = entry + .path() + .strip_prefix(dir.path()) + .expect("walkdir starts with root"); + matcher.match_directory(relative) + }) + .map(|entry| { + let entry = entry.unwrap(); + let relative = entry + .path() + .strip_prefix(dir.path()) + .expect("walkdir starts with root") + .to_str() + .unwrap() + .to_string(); + // Translate windows paths back to the unix fixture + relative.replace(MAIN_SEPARATOR, "/") + }) + .collect(); + visited.sort(); + assert_eq!( + visited, + [ + "", + "path1", + "path1/dir1", + "path2", + "path2/dir2", + "path3", + "path3/dir3", + "path3/dir3/subdir", + "path3/dir3/subdir/a.txt", + "path4", + "path4/dir4", + "path4/dir4/subdir", + "path4/dir4/subdir/a.txt", + "path5" + ] + ); + } + + /// Check that the walkdir yield the correct set of files. + #[test] + fn walk_dir() { + let dir = tempdir().unwrap(); + + for file in FILES { + let file = dir.path().join(file); + fs_err::create_dir_all(file.parent().unwrap()).unwrap(); + fs_err::File::create(file).unwrap(); + } + let patterns = PATTERNS.map(|pattern| parse_portable_glob(pattern).unwrap()); + + let include_matcher = GlobDirFilter::from_globs(&patterns).unwrap(); + + let walkdir_root = dir.path(); + let mut matches: Vec<_> = WalkDir::new(walkdir_root) + .into_iter() + .filter_entry(|entry| { + // TODO(konsti): This should be prettier. 
+ let relative = entry + .path() + .strip_prefix(walkdir_root) + .expect("walkdir starts with root") + .to_path_buf(); + + include_matcher.match_directory(&relative) + }) + .filter_map(|entry| { + let entry = entry.as_ref().unwrap(); + // TODO(konsti): This should be prettier. + let relative = entry + .path() + .strip_prefix(walkdir_root) + .expect("walkdir starts with root") + .to_path_buf(); + if include_matcher.match_path(&relative) { + // Translate windows paths back to the unix fixture + Some(relative.to_str().unwrap().replace(MAIN_SEPARATOR, "/")) + } else { + None + } + }) + .collect(); + matches.sort(); + assert_eq!( + matches, + [ + "path1/dir1", + "path2/dir2", + "path3/dir3/subdir/a.txt", + "path4/dir4", + "path4/dir4/subdir", + "path4/dir4/subdir/a.txt", + "path5" + ] + ); + } +} diff --git a/crates/uv-globfilter/src/lib.rs b/crates/uv-globfilter/src/lib.rs new file mode 100644 index 000000000..3f18b15f0 --- /dev/null +++ b/crates/uv-globfilter/src/lib.rs @@ -0,0 +1,10 @@ +//! Implementation of PEP 639 cross-language restricted globs and a related directory traversal +//! prefilter. +//! +//! The goal is globs that are portable between languages and operating systems. 
+
+mod glob_dir_filter;
+mod portable_glob;
+
+pub use glob_dir_filter::GlobDirFilter;
+pub use portable_glob::{check_portable_glob, parse_portable_glob, PortableGlobError};
diff --git a/crates/uv-globfilter/src/main.rs b/crates/uv-globfilter/src/main.rs
new file mode 100644
index 000000000..9cdba3dad
--- /dev/null
+++ b/crates/uv-globfilter/src/main.rs
@@ -0,0 +1,65 @@
+#![allow(clippy::print_stdout)]
+
+use globset::GlobSetBuilder;
+use std::env::args;
+use tracing::trace;
+use uv_globfilter::{parse_portable_glob, GlobDirFilter};
+use walkdir::WalkDir;
+
+/// Walk the directory passed as the first command line argument and print each path (relative to
+/// that directory) that matches the hard-coded include globs and none of the exclude globs.
+fn main() {
+    let includes = ["src/**", "pyproject.toml"];
+    let excludes = ["__pycache__", "*.pyc", "*.pyo"];
+
+    let mut include_globs = Vec::new();
+    for include in includes {
+        let glob = parse_portable_glob(include).unwrap();
+        include_globs.push(glob);
+    }
+    let include_matcher = GlobDirFilter::from_globs(&include_globs).unwrap();
+
+    let mut exclude_builder = GlobSetBuilder::new();
+    for exclude in excludes {
+        // Excludes are unanchored unless they start with a `/`.
+        let exclude = if let Some(exclude) = exclude.strip_prefix("/") {
+            exclude.to_string()
+        } else {
+            format!("**/{exclude}")
+        };
+        let glob = parse_portable_glob(&exclude).unwrap();
+        exclude_builder.add(glob);
+    }
+    // https://github.com/BurntSushi/ripgrep/discussions/2927
+    let exclude_matcher = exclude_builder.build().unwrap();
+
+    // The first item of `args()` is the executable path, so the root is the second item.
+    let walkdir_root = args().nth(1).expect("missing directory argument");
+    for entry in WalkDir::new(&walkdir_root)
+        .into_iter()
+        .filter_entry(|entry| {
+            // TODO(konsti): This should be prettier.
+            let relative = entry
+                .path()
+                .strip_prefix(&walkdir_root)
+                .expect("walkdir starts with root")
+                .to_path_buf();
+
+            include_matcher.match_directory(&relative) && !exclude_matcher.is_match(&relative)
+        })
+    {
+        let entry = entry.unwrap();
+        // TODO(konsti): This should be prettier.
+        let relative = entry
+            .path()
+            .strip_prefix(&walkdir_root)
+            .expect("walkdir starts with root")
+            .to_path_buf();
+
+        if !include_matcher.match_path(&relative) || exclude_matcher.is_match(&relative) {
+            trace!("Excluding: {}", relative.display());
+            continue;
+        };
+        println!("{}", relative.display());
+    }
+}
diff --git a/crates/uv-globfilter/src/portable_glob.rs b/crates/uv-globfilter/src/portable_glob.rs
new file mode 100644
index 000000000..095cbb7dd
--- /dev/null
+++ b/crates/uv-globfilter/src/portable_glob.rs
@@ -0,0 +1,198 @@
+//! Cross-language glob syntax from [PEP 639](https://peps.python.org/pep-0639/#add-license-FILES-key).
+
+use globset::{Glob, GlobBuilder};
+use thiserror::Error;
+
+/// A glob that violates the portable glob syntax, or that `globset` failed to compile.
+#[derive(Debug, Error)]
+pub enum PortableGlobError {
+    /// Shows the failing glob in the error message.
+    #[error(transparent)]
+    GlobError(#[from] globset::Error),
+    #[error(
+        "The parent directory operator (`..`) at position {pos} is not allowed in glob: `{glob}`"
+    )]
+    ParentDirectory { glob: String, pos: usize },
+    #[error("Invalid character `{invalid}` at position {pos} in glob: `{glob}`")]
+    InvalidCharacter {
+        glob: String,
+        pos: usize,
+        invalid: char,
+    },
+    #[error("Invalid character `{invalid}` at position {pos} in glob: `{glob}`")]
+    InvalidCharacterRange {
+        glob: String,
+        pos: usize,
+        invalid: char,
+    },
+    #[error("Too many stars at position {pos} in glob: `{glob}`")]
+    TooManyStars { glob: String, pos: usize },
+}
+
+/// Parse cross-language glob syntax from [PEP 639](https://peps.python.org/pep-0639/#add-license-FILES-key):
+///
+/// - Alphanumeric characters, underscores (`_`), hyphens (`-`) and dots (`.`) are matched verbatim.
+/// - The special glob characters are:
+///   - `*`: Matches any number of characters except path separators
+///   - `?`: Matches a single character except the path separator
+///   - `**`: Matches any number of characters including path separators
+///   - `[]`, containing only the verbatim matched characters: Matches a single one of the characters contained. Within
+///     `[...]`, the hyphen indicates a locale-agnostic range (e.g. `a-z`, order based on Unicode code points). Hyphens at
+///     the start or end are matched literally.
+/// - The path separator is the forward slash character (`/`). Patterns are relative to the given directory, a leading slash
+///   character for absolute paths is not supported.
+/// - Parent directory indicators (`..`) are not allowed.
+///
+/// These rules mean that matching the backslash (`\`) is forbidden, which avoids collisions with the Windows path separator.
+pub fn parse_portable_glob(glob: &str) -> Result<Glob, PortableGlobError> {
+    check_portable_glob(glob)?;
+    Ok(GlobBuilder::new(glob).literal_separator(true).build()?)
+}
+
+/// Check that `glob` only uses the syntax described in [`parse_portable_glob`], without compiling
+/// it into a matcher.
+///
+/// Positions in the returned errors are character indices, not byte offsets.
+pub fn check_portable_glob(glob: &str) -> Result<(), PortableGlobError> {
+    let mut chars = glob.chars().enumerate().peekable();
+    // A `..` is only a parent directory indicator at the start of the string or after a
+    // directory separator.
+    let mut start_or_slash = true;
+    // `pos` counts characters (not bytes), matching the positions reported in the errors.
+    while let Some((pos, c)) = chars.next() {
+        // `***` or `**literals` can be correctly represented with fewer stars. They are banned by
+        // `glob`, they are allowed by `globset` and PEP 639 is ambiguous, so we're filtering them
+        // out.
+        if c == '*' {
+            let mut star_run = 1;
+            while let Some((_, c)) = chars.peek() {
+                if *c == '*' {
+                    star_run += 1;
+                    chars.next();
+                } else {
+                    break;
+                }
+            }
+            if star_run >= 3 {
+                return Err(PortableGlobError::TooManyStars {
+                    glob: glob.to_string(),
+                    // We don't update pos for the stars.
+                    pos,
+                });
+            } else if star_run == 2 {
+                if chars.peek().is_some_and(|(_, c)| *c != '/') {
+                    return Err(PortableGlobError::TooManyStars {
+                        glob: glob.to_string(),
+                        // We don't update pos for the stars.
+                        pos,
+                    });
+                }
+            }
+            start_or_slash = false;
+        } else if c.is_alphanumeric() || matches!(c, '_' | '-' | '?') {
+            start_or_slash = false;
+        } else if c == '.' {
+            if start_or_slash && matches!(chars.peek(), Some((_, '.'))) {
+                return Err(PortableGlobError::ParentDirectory {
+                    pos,
+                    glob: glob.to_string(),
+                });
+            }
+            start_or_slash = false;
+        } else if c == '/' {
+            start_or_slash = true;
+        } else if c == '[' {
+            for (pos, c) in chars.by_ref() {
+                if c.is_alphanumeric() || matches!(c, '_' | '-' | '.') {
+                    // Allowed.
+                } else if c == ']' {
+                    break;
+                } else {
+                    return Err(PortableGlobError::InvalidCharacterRange {
+                        glob: glob.to_string(),
+                        pos,
+                        invalid: c,
+                    });
+                }
+            }
+            start_or_slash = false;
+        } else {
+            return Err(PortableGlobError::InvalidCharacter {
+                glob: glob.to_string(),
+                pos,
+                invalid: c,
+            });
+        }
+    }
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use insta::assert_snapshot;
+
+    #[test]
+    fn test_error() {
+        let parse_err = |glob| parse_portable_glob(glob).unwrap_err().to_string();
+        assert_snapshot!(
+            parse_err(".."),
+            @"The parent directory operator (`..`) at position 0 is not allowed in glob: `..`"
+        );
+        assert_snapshot!(
+            parse_err("licenses/.."),
+            @"The parent directory operator (`..`) at position 9 is not allowed in glob: `licenses/..`"
+        );
+        assert_snapshot!(
+            parse_err("licenses/LICEN!E.txt"),
+            @"Invalid character `!` at position 14 in glob: `licenses/LICEN!E.txt`"
+        );
+        assert_snapshot!(
+            parse_err("licenses/LICEN[!C]E.txt"),
+            @"Invalid character `!` at position 15 in glob: `licenses/LICEN[!C]E.txt`"
+        );
+        assert_snapshot!(
+            parse_err("licenses/LICEN[C?]E.txt"),
+            @"Invalid character `?` at position 16 in glob: `licenses/LICEN[C?]E.txt`"
+        );
+        assert_snapshot!(
+            parse_err("******"),
+            @"Too many stars at position 0 in glob: `******`"
+        );
+        assert_snapshot!(
+            parse_err("licenses/**license"),
+            @"Too many stars at position 9 in glob: `licenses/**license`"
+        );
+        assert_snapshot!(
+            parse_err("licenses/***/licenses.csv"),
+            @"Too many stars at position 9 in glob: `licenses/***/licenses.csv`"
+        );
+        assert_snapshot!(
+            parse_err(r"licenses\eula.txt"),
+            @r"Invalid character `\` at position 8 in glob: `licenses\eula.txt`"
+        );
+    }
+
+    #[test]
+    fn test_valid() {
+        let cases = [
+            "licenses/*.txt",
+            "licenses/**/*.txt",
+            "LICEN[CS]E.txt",
+            "LICEN?E.txt",
+            "[a-z].txt",
+            "[a-z._-].txt",
+            "*/**",
+            "LICENSE..txt",
+            "LICENSE_file-1.txt",
+            // (google translate)
+            "licenses/라이센스*.txt",
+            "licenses/ライセンス*.txt",
+            "licenses/执照*.txt",
+            "src/**",
+        ];
+        for case in cases {
+            parse_portable_glob(case).unwrap();
+        }
+    }
+}
diff --git a/crates/uv-install-wheel/src/linker.rs b/crates/uv-install-wheel/src/linker.rs
index af00dc612..923be5ebf 100644
--- a/crates/uv-install-wheel/src/linker.rs
+++ b/crates/uv-install-wheel/src/linker.rs
@@ -470,7 +470,7 @@ fn copy_wheel_files(
         let entry = entry?;
         let path = entry.path();
 
-        let relative = path.strip_prefix(&wheel).unwrap();
+        let relative = path.strip_prefix(&wheel).expect("walkdir starts with root");
         let out_path = site_packages.as_ref().join(relative);
 
         if entry.file_type().is_dir() {
@@ -500,7 +500,7 @@ fn hardlink_wheel_files(
         let entry = entry?;
         let path = entry.path();
 
-        let relative = path.strip_prefix(&wheel).unwrap();
+        let relative = path.strip_prefix(&wheel).expect("walkdir starts with root");
         let out_path = site_packages.as_ref().join(relative);
 
         if entry.file_type().is_dir() {
diff --git a/crates/uv-install-wheel/src/wheel.rs b/crates/uv-install-wheel/src/wheel.rs
index cf640348e..055a3d3d2 100644
--- a/crates/uv-install-wheel/src/wheel.rs
+++ b/crates/uv-install-wheel/src/wheel.rs
@@ -312,12 +312,14 @@ pub(crate) fn move_folder_recorded(
         let src = entry.path();
         // This is the base path for moving to the actual target for the data
         // e.g. for data it's without <..>.data/data/
-        let relative_to_data = src.strip_prefix(src_dir).expect("Prefix must no change");
+        let relative_to_data = src
+            .strip_prefix(src_dir)
+            .expect("walkdir prefix must not change");
         // This is the path stored in RECORD
         // e.g. for data it's with .data/data/
         let relative_to_site_packages = src
             .strip_prefix(site_packages)
-            .expect("Prefix must no change");
+            .expect("prefix must not change");
         let target = dest_dir.join(relative_to_data);
         if entry.file_type().is_dir() {
             fs::create_dir_all(&target)?;
diff --git a/crates/uv/src/commands/build_backend.rs b/crates/uv/src/commands/build_backend.rs
index 475c59252..954450b79 100644
--- a/crates/uv/src/commands/build_backend.rs
+++ b/crates/uv/src/commands/build_backend.rs
@@ -4,11 +4,13 @@ use crate::commands::ExitStatus;
 use anyhow::Result;
 use std::env;
 use std::path::Path;
+use uv_build_backend::SourceDistSettings;
 
 pub(crate) fn build_sdist(sdist_directory: &Path) -> Result<ExitStatus> {
     let filename = uv_build_backend::build_source_dist(
         &env::current_dir()?,
         sdist_directory,
+        SourceDistSettings::default(),
         uv_version::version(),
     )?;
     println!("{filename}");