From a43333351e53210f41e969041f5919fb1f534134 Mon Sep 17 00:00:00 2001 From: konsti Date: Wed, 7 May 2025 18:31:41 +0200 Subject: [PATCH] Build backend: Allow escaping in globs (#13313) PEP 639 does not allow any characters that aren't in either their limited glob syntax or the alphanumeric Unicode characters. This means there's no way to express a glob such as `**/@test` for the excludes. We extend the glob syntax from PEP 639 by introducing backslash escapes, which can escape all characters but path separators (forward and backwards slashes) to be parsed verbatim. This means we have two glob parsers: The strict PEP 639 parser for `project.license-files`, and our extended parser for `tool.uv`, with a slight difference if you need to use special characters, to both adhere to PEP 639 and to support cases such as #13280. Fixes #13280 --- Cargo.lock | 2 + crates/uv-build-backend/src/metadata.rs | 13 +- crates/uv-build-backend/src/source_dist.rs | 6 +- crates/uv-build-backend/src/wheel.rs | 4 +- crates/uv-globfilter/Cargo.toml | 2 + crates/uv-globfilter/src/glob_dir_filter.rs | 8 +- crates/uv-globfilter/src/main.rs | 4 +- crates/uv-globfilter/src/portable_glob.rs | 170 +++++++++++++++++--- docs/configuration/build-backend.md | 5 +- 9 files changed, 169 insertions(+), 45 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8d329c2d5..26bf8aef8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5258,9 +5258,11 @@ dependencies = [ name = "uv-globfilter" version = "0.1.0" dependencies = [ + "anstream", "fs-err 3.1.0", "globset", "insta", + "owo-colors", "regex", "regex-automata 0.4.9", "tempfile", diff --git a/crates/uv-build-backend/src/metadata.rs b/crates/uv-build-backend/src/metadata.rs index b91a2342c..36d2319fb 100644 --- a/crates/uv-build-backend/src/metadata.rs +++ b/crates/uv-build-backend/src/metadata.rs @@ -395,12 +395,13 @@ impl PyProjectToml { let mut license_files = Vec::new(); let mut license_globs_parsed = Vec::new(); for license_glob in license_globs { - let pep639_glob = PortableGlobParser.parse(license_glob).map_err(|err| { - Error::PortableGlob { - field: license_glob.to_string(), - source: err, - } - })?; + let pep639_glob = + PortableGlobParser::Pep639 + .parse(license_glob) + .map_err(|err| Error::PortableGlob { + field: license_glob.to_string(), + source: err, + })?; license_globs_parsed.push(pep639_glob); } let license_globs = diff --git a/crates/uv-build-backend/src/source_dist.rs b/crates/uv-build-backend/src/source_dist.rs index 03489275d..da26e02ad 100644 --- a/crates/uv-build-backend/src/source_dist.rs +++ b/crates/uv-build-backend/src/source_dist.rs @@ -86,7 +86,7 @@ fn source_dist_matcher( .to_string(); includes.push(format!("{}/**", globset::escape(import_path))); for include in includes { - let glob = PortableGlobParser + let glob = PortableGlobParser::Uv .parse(&include) .map_err(|err| Error::PortableGlob { field: "tool.uv.build-backend.source-include".to_string(), @@ -111,7 +111,7 @@ fn source_dist_matcher( // Include the license files for license_files in pyproject_toml.license_files_source_dist() { trace!("Including license files at: `{license_files}`"); - let glob = PortableGlobParser + let glob = PortableGlobParser::Pep639 .parse(license_files) .map_err(|err| Error::PortableGlob { field: "project.license-files".to_string(), @@ -122,7 +122,7 @@ fn source_dist_matcher( // Include the data files for (name, directory) in settings.data.iter() { - let glob = PortableGlobParser + let glob = PortableGlobParser::Uv .parse(&format!("{}/**", globset::escape(directory))) .map_err(|err| Error::PortableGlob { field: format!("tool.uv.build-backend.data.{name}"), diff --git a/crates/uv-build-backend/src/wheel.rs b/crates/uv-build-backend/src/wheel.rs index 6ec36dae4..d3126f88a 100644 --- a/crates/uv-build-backend/src/wheel.rs +++ b/crates/uv-build-backend/src/wheel.rs @@ -431,7 +431,7 @@ pub(crate) fn build_exclude_matcher( } else { format!("**/{exclude}").to_string() }; - let glob = PortableGlobParser + let glob = PortableGlobParser::Uv .parse(&exclude) .map_err(|err| Error::PortableGlob { field: "tool.uv.build-backend.*-exclude".to_string(), @@ -468,7 +468,7 @@ fn wheel_subdir_from_globs( src.user_display(), license_files ); - PortableGlobParser.parse(license_files) + PortableGlobParser::Pep639.parse(license_files) }) .collect::>() .map_err(|err| Error::PortableGlob { diff --git a/crates/uv-globfilter/Cargo.toml b/crates/uv-globfilter/Cargo.toml index 70ba3373a..ca45a92f6 100644 --- a/crates/uv-globfilter/Cargo.toml +++ b/crates/uv-globfilter/Cargo.toml @@ -12,6 +12,7 @@ license.workspace = true [dependencies] globset = { workspace = true } +owo-colors = { workspace = true } regex = { workspace = true } regex-automata = { workspace = true } thiserror = { workspace = true } @@ -19,6 +20,7 @@ tracing = { workspace = true } walkdir = { workspace = true } [dev-dependencies] +anstream = { workspace = true } fs-err = { workspace = true } insta = "1.41.1" tempfile = { workspace = true } diff --git a/crates/uv-globfilter/src/glob_dir_filter.rs b/crates/uv-globfilter/src/glob_dir_filter.rs index e8bece135..fd14bffff 100644 --- a/crates/uv-globfilter/src/glob_dir_filter.rs +++ b/crates/uv-globfilter/src/glob_dir_filter.rs @@ -124,7 +124,7 @@ impl GlobDirFilter { #[cfg(test)] mod tests { use crate::glob_dir_filter::GlobDirFilter; - use crate::portable_glob::PortableGlobParser; + use crate::PortableGlobParser; use std::path::{Path, MAIN_SEPARATOR}; use tempfile::tempdir; use walkdir::WalkDir; @@ -152,7 +152,7 @@ mod tests { #[test] fn match_directory() { - let patterns = PATTERNS.map(|pattern| PortableGlobParser.parse(pattern).unwrap()); + let patterns = PATTERNS.map(|pattern| PortableGlobParser::Pep639.parse(pattern).unwrap()); let matcher = GlobDirFilter::from_globs(&patterns).unwrap(); assert!(matcher.match_directory(&Path::new("path1").join("dir1"))); assert!(matcher.match_directory(&Path::new("path2").join("dir2"))); @@ -170,7 +170,7 @@ mod tests { fs_err::create_dir_all(file.parent().unwrap()).unwrap(); fs_err::File::create(file).unwrap(); } - let patterns = PATTERNS.map(|pattern| PortableGlobParser.parse(pattern).unwrap()); + let patterns = PATTERNS.map(|pattern| PortableGlobParser::Pep639.parse(pattern).unwrap()); let matcher = GlobDirFilter::from_globs(&patterns).unwrap(); // Test the prefix filtering @@ -228,7 +228,7 @@ mod tests { fs_err::create_dir_all(file.parent().unwrap()).unwrap(); fs_err::File::create(file).unwrap(); } - let patterns = PATTERNS.map(|pattern| PortableGlobParser.parse(pattern).unwrap()); + let patterns = PATTERNS.map(|pattern| PortableGlobParser::Pep639.parse(pattern).unwrap()); let include_matcher = GlobDirFilter::from_globs(&patterns).unwrap(); diff --git a/crates/uv-globfilter/src/main.rs b/crates/uv-globfilter/src/main.rs index 602a46e0e..0f54d75ad 100644 --- a/crates/uv-globfilter/src/main.rs +++ b/crates/uv-globfilter/src/main.rs @@ -12,7 +12,7 @@ fn main() { let mut include_globs = Vec::new(); for include in includes { - let glob = PortableGlobParser.parse(include).unwrap(); + let glob = PortableGlobParser::Pep639.parse(include).unwrap(); include_globs.push(glob.clone()); } let include_matcher = GlobDirFilter::from_globs(&include_globs).unwrap(); @@ -25,7 +25,7 @@ fn main() { } else { format!("**/{exclude}").to_string() }; - let glob = PortableGlobParser.parse(&exclude).unwrap(); + let glob = PortableGlobParser::Pep639.parse(&exclude).unwrap(); exclude_builder.add(glob); } // https://github.com/BurntSushi/ripgrep/discussions/2927 diff --git a/crates/uv-globfilter/src/portable_glob.rs b/crates/uv-globfilter/src/portable_glob.rs index 7496938d4..367b7db0f 100644 --- a/crates/uv-globfilter/src/portable_glob.rs +++ b/crates/uv-globfilter/src/portable_glob.rs @@ -2,6 +2,7 @@ //! [PEP 639](https://packaging.python.org/en/latest/specifications/glob-patterns/). use globset::{Glob, GlobBuilder}; +use owo_colors::OwoColorize; use thiserror::Error; #[derive(Debug, Error)] @@ -19,6 +20,24 @@ pub enum PortableGlobError { pos: usize, invalid: char, }, + #[error( + "Invalid character `{invalid}` at position {pos} in glob: `{glob}`. {}{} Characters can be escaped with a backslash", + "hint".bold().cyan(), + ":".bold() + )] + InvalidCharacterUv { + glob: String, + pos: usize, + invalid: char, + }, + #[error( + "Only forward slashes are allowed as path separator, invalid character at position {pos} in glob: `{glob}`" + )] + InvalidBackslash { glob: String, pos: usize }, + #[error( + "Path separators can't be escaped, invalid character at position {pos} in glob: `{glob}`" + )] + InvalidEscapee { glob: String, pos: usize }, #[error("Invalid character `{invalid}` in range at position {pos} in glob: `{glob}`")] InvalidCharacterRange { glob: String, @@ -27,15 +46,35 @@ pub enum PortableGlobError { }, #[error("Too many at stars at position {pos} in glob: `{glob}`")] TooManyStars { glob: String, pos: usize }, + #[error("Trailing backslash at position {pos} in glob: `{glob}`")] + TrailingEscape { glob: String, pos: usize }, } -/// Cross-language glob parser with the glob syntax from +/// Cross-language glob syntax from /// [PEP 639](https://packaging.python.org/en/latest/specifications/glob-patterns/). +/// +/// The variant determines whether the parser strictly adheres to PEP 639 rules or allows extensions +/// such as backslash escapes. #[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub struct PortableGlobParser; +pub enum PortableGlobParser { + /// Follow the PEP 639 rules strictly. + Pep639, + /// In addition to the PEP 639 syntax, allow escaping characters with backslashes. + /// + /// For cross-platform compatibility, escaping path separators is not allowed, i.e., forward + /// slashes and backslashes can't be escaped. + Uv, +} impl PortableGlobParser { - /// Parse cross-language glob syntax from [PEP 639](https://packaging.python.org/en/latest/specifications/glob-patterns/): + fn backslash_escape(self) -> bool { + match self { + PortableGlobParser::Pep639 => false, + PortableGlobParser::Uv => true, + } + } + + /// Parse cross-language glob syntax based on [PEP 639](https://packaging.python.org/en/latest/specifications/glob-patterns/): /// /// - Alphanumeric characters, underscores (`_`), hyphens (`-`) and dots (`.`) are matched verbatim. /// - The special glob characters are: @@ -45,6 +84,7 @@ impl PortableGlobParser { /// - `[]`, containing only the verbatim matched characters: Matches a single of the characters contained. Within /// `[...]`, the hyphen indicates a locale-agnostic range (e.g. `a-z`, order based on Unicode code points). Hyphens at /// the start or end are matched literally. + /// - `\`: Disallowed in PEP 639 mode. In uv mode, it escapes the following character to be matched verbatim. /// - The path separator is the forward slash character (`/`). Patterns are relative to the given directory, a leading slash /// character for absolute paths is not supported. /// - Parent directory indicators (`..`) are not allowed. @@ -52,10 +92,13 @@ impl PortableGlobParser { /// These rules mean that matching the backslash (`\`) is forbidden, which avoid collisions with the windows path separator. pub fn parse(&self, glob: &str) -> Result { self.check(glob)?; - Ok(GlobBuilder::new(glob).literal_separator(true).build()?) + Ok(GlobBuilder::new(glob) + .literal_separator(true) + .backslash_escape(self.backslash_escape()) + .build()?) } - /// See [`Self::parse`]. + /// See [`parse_portable_glob`]. pub fn check(&self, glob: &str) -> Result<(), PortableGlobError> { let mut chars = glob.chars().enumerate().peekable(); // A `..` is on a parent directory indicator at the start of the string or after a directory @@ -119,12 +162,50 @@ impl PortableGlobParser { } } start_or_slash = false; + } else if c == '\\' { + match *self { + PortableGlobParser::Pep639 => { + return Err(PortableGlobError::InvalidBackslash { + glob: glob.to_string(), + pos, + }); + } + PortableGlobParser::Uv => { + match chars.next() { + Some((pos, '/' | '\\')) => { + // For cross-platform compatibility, we don't allow forward slashes or + // backslashes to be escaped. + return Err(PortableGlobError::InvalidEscapee { + glob: glob.to_string(), + pos, + }); + } + Some(_) => { + // Escaped character + } + None => { + return Err(PortableGlobError::TrailingEscape { + glob: glob.to_string(), + pos, + }); + } + } + } + } } else { - return Err(PortableGlobError::InvalidCharacter { - glob: glob.to_string(), - pos, - invalid: c, - }); + let err = match *self { + PortableGlobParser::Pep639 => PortableGlobError::InvalidCharacter { + glob: glob.to_string(), + pos, + invalid: c, + }, + PortableGlobParser::Uv => PortableGlobError::InvalidCharacterUv { + glob: glob.to_string(), + pos, + invalid: c, + }, + }; + return Err(err); } } Ok(()) @@ -138,7 +219,10 @@ mod tests { #[test] fn test_error() { - let parse_err = |glob| PortableGlobParser.parse(glob).unwrap_err().to_string(); + let parse_err = |glob| { + let error = PortableGlobParser::Pep639.parse(glob).unwrap_err(); + anstream::adapter::strip_str(&error.to_string()).to_string() + }; assert_snapshot!( parse_err(".."), @"The parent directory operator (`..`) at position 0 is not allowed in glob: `..`" @@ -173,30 +257,64 @@ mod tests { ); assert_snapshot!( parse_err(r"licenses\eula.txt"), - @r"Invalid character `\` at position 8 in glob: `licenses\eula.txt`" + @r"Only forward slashes are allowed as path separator, invalid character at position 8 in glob: `licenses\eula.txt`" + ); + assert_snapshot!( + parse_err(r"**/@test"), + @"Invalid character `@` at position 3 in glob: `**/@test`" + ); + // Escapes are not allowed in strict PEP 639 mode + assert_snapshot!( + parse_err(r"public domain/Gulliver\\’s Travels.txt"), + @r"Invalid character ` ` at position 6 in glob: `public domain/Gulliver\\’s Travels.txt`" + ); + let parse_err_uv = |glob| { + let error = PortableGlobParser::Uv.parse(glob).unwrap_err(); + anstream::adapter::strip_str(&error.to_string()).to_string() + }; + assert_snapshot!( + parse_err_uv(r"**/@test"), + @"Invalid character `@` at position 3 in glob: `**/@test`. hint: Characters can be escaped with a backslash" + ); + // Escaping slashes is not allowed. + assert_snapshot!( + parse_err_uv(r"licenses\\MIT.txt"), + @r"Path separators can't be escaped, invalid character at position 9 in glob: `licenses\\MIT.txt`" + ); + assert_snapshot!( + parse_err_uv(r"licenses\/MIT.txt"), + @r"Path separators can't be escaped, invalid character at position 9 in glob: `licenses\/MIT.txt`" ); } #[test] fn test_valid() { let cases = [ - "licenses/*.txt", - "licenses/**/*.txt", - "LICEN[CS]E.txt", - "LICEN?E.txt", - "[a-z].txt", - "[a-z._-].txt", - "*/**", - "LICENSE..txt", - "LICENSE_file-1.txt", + r"licenses/*.txt", + r"licenses/**/*.txt", + r"LICEN[CS]E.txt", + r"LICEN?E.txt", + r"[a-z].txt", + r"[a-z._-].txt", + r"*/**", + r"LICENSE..txt", + r"LICENSE_file-1.txt", // (google translate) - "licenses/라이센스*.txt", - "licenses/ライセンス*.txt", - "licenses/执照*.txt", - "src/**", + r"licenses/라이센스*.txt", + r"licenses/ライセンス*.txt", + r"licenses/执照*.txt", + r"src/**", + ]; + let cases_uv = [ + r"public-domain/Gulliver\’s\ Travels.txt", + // https://github.com/astral-sh/uv/issues/13280 + r"**/\@test", ]; for case in cases { - PortableGlobParser.parse(case).unwrap(); + PortableGlobParser::Pep639.parse(case).unwrap(); + } + for case in cases.iter().chain(cases_uv.iter()) { + PortableGlobParser::Uv.parse(case).unwrap(); } } } diff --git a/docs/configuration/build-backend.md b/docs/configuration/build-backend.md index 6825919f0..b05856bdd 100644 --- a/docs/configuration/build-backend.md +++ b/docs/configuration/build-backend.md @@ -19,7 +19,7 @@ existing project, add it to the `[build-system]` section in your `pyproject.toml ```toml [build-system] -requires = ["uv_build>=0.6.13,<0.7"] +requires = ["uv_build>=0.7.2,<0.8.0"] build-backend = "uv_build" ``` @@ -89,4 +89,5 @@ Excludes are not anchored, which means that `__pycache__` excludes all directori exclude only `/dist`. All fields accepting patterns use the reduced portable glob syntax from -[PEP 639](https://peps.python.org/pep-0639/#add-license-FILES-key). +[PEP 639](https://peps.python.org/pep-0639/#add-license-FILES-key), with the addition that +characters can be escaped with a backslash.