diff --git a/Cargo.lock b/Cargo.lock
index a3d7ce814..5c6038aa6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2844,12 +2844,6 @@ version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "640c9bd8497b02465aeef5375144c26062e0dcd5939dfcbb0f5db76cb8c17c73"
 
-[[package]]
-name = "r-shquote"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d76b7a332c240b0b30ad8b52cc9aecf8ec96878ccb927ce1d2feb03920e0f711"
-
 [[package]]
 name = "rancor"
 version = "0.1.0"
@@ -5537,7 +5531,6 @@ dependencies = [
  "indoc",
  "insta",
  "itertools 0.14.0",
- "r-shquote",
  "regex",
  "reqwest",
  "reqwest-middleware",
diff --git a/Cargo.toml b/Cargo.toml
index 3bd1d7e16..6a8df273b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -134,7 +134,6 @@ proc-macro2 = { version = "1.0.86" }
 procfs = { version = "0.17.0", default-features = false, features = ["flate2"] }
 pubgrub = { git = "https://github.com/astral-sh/pubgrub", rev = "b70cf707aa43f21b32f3a61b8a0889b15032d5c4" }
 quote = { version = "1.0.37" }
-r-shquote = { version = "0.1.1" }
 rayon = { version = "1.10.0" }
 reflink-copy = { version = "0.1.19" }
 regex = { version = "1.10.6" }
diff --git a/crates/uv-requirements-txt/Cargo.toml b/crates/uv-requirements-txt/Cargo.toml
index 0f7a4f0fa..7dcc2964e 100644
--- a/crates/uv-requirements-txt/Cargo.toml
+++ b/crates/uv-requirements-txt/Cargo.toml
@@ -26,7 +26,6 @@ uv-pypi-types = { workspace = true }
 uv-warnings = { workspace = true }
 
 fs-err = { workspace = true }
-r-shquote = { workspace = true }
 regex = { workspace = true }
 reqwest = { workspace = true, optional = true }
 reqwest-middleware = { workspace = true, optional = true }
diff --git a/crates/uv-requirements-txt/src/lib.rs b/crates/uv-requirements-txt/src/lib.rs
index 8953b82ab..8840dcc18 100644
--- a/crates/uv-requirements-txt/src/lib.rs
+++ b/crates/uv-requirements-txt/src/lib.rs
@@ -40,7 +40,6 @@ use std::io;
 use std::path::{Path, PathBuf};
 use std::str::FromStr;
 
-use r_shquote::unquote;
 use tracing::instrument;
 use unscanny::{Pattern, Scanner};
 use url::Url;
@@ -56,8 +55,10 @@ use uv_pypi_types::{Requirement, VerbatimParsedUrl};
 
 use crate::requirement::EditableError;
 pub use crate::requirement::RequirementsTxtRequirement;
+use crate::shquote::unquote;
 
 mod requirement;
+mod shquote;
 
 /// We emit one of those for each `requirements.txt` entry.
 enum RequirementsTxtStatement {
diff --git a/crates/uv-requirements-txt/src/shquote.rs b/crates/uv-requirements-txt/src/shquote.rs
new file mode 100644
index 000000000..27eecb065
--- /dev/null
+++ b/crates/uv-requirements-txt/src/shquote.rs
@@ -0,0 +1,208 @@
+//! POSIX Shell Compatible Argument Parser
+//!
+//! This implementation is vendored from the [`r-shquote`](https://github.com/r-util/r-shquote)
+//! crate under the Apache 2.0 license:
+//!
+//! ```text
+//! Licensed under the Apache License, Version 2.0 (the "License");
+//! you may not use this file except in compliance with the License.
+//! You may obtain a copy of the License at
+//!
+//!     https://www.apache.org/licenses/LICENSE-2.0
+//!
+//! Unless required by applicable law or agreed to in writing, software
+//! distributed under the License is distributed on an "AS IS" BASIS,
+//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//! See the License for the specific language governing permissions and
+//! limitations under the License.
+//! ```
+
+/// Error returned by [`unquote`] when a quoted sequence is never closed.
+///
+/// `char_cursor` and `byte_cursor` are the character and byte offsets of the opening quote.
+#[derive(Debug, Clone)]
+// The cursor fields are only read through the derived `Debug` output in this module.
+#[allow(dead_code)]
+pub(crate) enum UnquoteError {
+    UnterminatedSingleQuote {
+        char_cursor: usize,
+        byte_cursor: usize,
+    },
+    UnterminatedDoubleQuote {
+        char_cursor: usize,
+        byte_cursor: usize,
+    },
+}
+
+impl std::fmt::Display for UnquoteError {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{self:?}")
+    }
+}
+
+impl std::error::Error for UnquoteError {}
+
+fn unquote_open_single(
+    acc: &mut String,
+    cursor: &mut std::iter::Enumerate<std::str::CharIndices>,
+) -> bool {
+    // This decodes a single-quote sequence. The opening single-quote was already parsed by
+    // the caller. Both `&source[start]` and `cursor` point to the first character following
+    // the opening single-quote.
+    // Anything inside the single-quote sequence is copied verbatim to the output until the
+    // next single-quote. No escape sequences are supported, not even a single-quote can be
+    // escaped. However, if the sequence is not terminated, the entire operation is considered
+    // invalid.
+    for i in cursor {
+        match i {
+            (_, (_, '\'')) => return true,
+            (_, (_, c)) => acc.push(c),
+        }
+    }
+
+    false
+}
+
+fn unquote_open_double(
+    acc: &mut String,
+    cursor: &mut std::iter::Enumerate<std::str::CharIndices>,
+) -> bool {
+    // This decodes a double-quote sequence. The opening double-quote was already parsed by
+    // the caller. Both `&source[start]` and `cursor` point to the first character following
+    // the opening double-quote.
+    // A double-quote sequence allows escape-sequences and goes until the closing
+    // double-quote. If the sequence is not terminated, though, the entire operation is
+    // considered invalid.
+    loop {
+        match cursor.next() {
+            Some((_, (_, '"'))) => {
+                // An unescaped double-quote character terminates the double-quote sequence.
+                // It produces no output.
+                return true;
+            }
+            Some((_, (_, '\\'))) => {
+                // Inside a double-quote sequence several escape sequences are allowed. In
+                // general, any unknown sequence is copied verbatim in its entirety including
+                // the backslash. Known sequences produce the escaped character in its output
+                // and makes the parser not interpret it. If the sequence is non-terminated,
+                // it implies that the double-quote sequence is non-terminated and thus
+                // invokes the same behavior, meaning the entire operation is refused.
+                match cursor.next() {
+                    Some((_, (_, esc_ch)))
+                        if esc_ch == '"'
+                            || esc_ch == '\\'
+                            || esc_ch == '`'
+                            || esc_ch == '$'
+                            || esc_ch == '\n' =>
+                    {
+                        acc.push(esc_ch);
+                    }
+                    Some((_, (_, esc_ch))) => {
+                        acc.push('\\');
+                        acc.push(esc_ch);
+                    }
+                    None => {
+                        return false;
+                    }
+                }
+            }
+            Some((_, (_, inner_ch))) => {
+                // Any non-special character inside a double-quote is copied
+                // literally just like characters outside of it.
+                acc.push(inner_ch);
+            }
+            None => {
+                // The double-quote sequence was not terminated. The entire
+                // operation is considered invalid and we have to refuse producing
+                // any resulting value.
+                return false;
+            }
+        }
+    }
+}
+
+fn unquote_open_escape(acc: &mut String, cursor: &mut std::iter::Enumerate<std::str::CharIndices>) {
+    // This decodes an escape sequence outside of any quote. The opening backslash was already
+    // parsed by the caller. Both `&source[start]` and `cursor` point to the first character
+    // following the opening backslash.
+    // Outside of quotes, an escape sequence simply treats the next character literally, and
+    // does not interpret it. The exceptions are literal <NL> (newline character) and a single
+    // backslash as last character in the string. In these cases the escape-sequence is
+    // stripped and produces no output. The <NL> case is a remnant of human shell input, where
+    // you can input multiple lines by appending a backslash to the previous line. This causes
+    // both the backslash and <NL> to be ignored, since they purely serve readability of user
+    // input.
+    if let Some((_, (_, esc_ch))) = cursor.next() {
+        if esc_ch != '\n' {
+            acc.push(esc_ch);
+        }
+    }
+}
+
+/// Unquote String
+///
+/// Unquote a single string according to POSIX Shell quoting and escaping rules. If the input
+/// string is not a valid input, the operation will fail and provide diagnosis information on
+/// where the first invalid part was encountered.
+///
+/// The result is canonical. There is only one valid unquoted result for a given input.
+///
+/// # Examples
+///
+/// ```ignore
+/// assert_eq!(unquote("foobar").unwrap(), "foobar");
+/// ```
+pub(crate) fn unquote(source: &str) -> Result<String, UnquoteError> {
+    // An unquote-operation never results in a longer string. Furthermore, the common case is
+    // most of the string is unquoted / unescaped. Hence, we simply allocate the same space
+    // for the resulting string as the input.
+    let mut acc = String::with_capacity(source.len());
+
+    // We loop over the string. When a single-quote, double-quote, or escape sequence is
+    // opened, we let our helpers parse the sub-strings. Anything else is copied over
+    // literally until the end of the line.
+    let mut cursor = source.char_indices().enumerate();
+    loop {
+        match cursor.next() {
+            Some((next_idx, (next_pos, '\''))) => {
+                if !unquote_open_single(&mut acc, &mut cursor) {
+                    break Err(UnquoteError::UnterminatedSingleQuote {
+                        char_cursor: next_idx,
+                        byte_cursor: next_pos,
+                    });
+                }
+            }
+            Some((next_idx, (next_pos, '"'))) => {
+                if !unquote_open_double(&mut acc, &mut cursor) {
+                    break Err(UnquoteError::UnterminatedDoubleQuote {
+                        char_cursor: next_idx,
+                        byte_cursor: next_pos,
+                    });
+                }
+            }
+            Some((_, (_, '\\'))) => {
+                unquote_open_escape(&mut acc, &mut cursor);
+            }
+            Some((_, (_, next_ch))) => {
+                acc.push(next_ch);
+            }
+            None => {
+                break Ok(acc);
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn basic() {
+        assert_eq!(unquote("foobar").unwrap(), "foobar");
+        assert_eq!(unquote("foo'bar'").unwrap(), "foobar");
+        assert_eq!(unquote("foo\"bar\"").unwrap(), "foobar");
+        assert_eq!(unquote("\\foobar\\").unwrap(), "foobar");
+        assert_eq!(unquote("\\'foobar\\'").unwrap(), "'foobar'");
+    }
+}