Vendor r-shquote's unquote implementation (#11812)

## Summary

The `r-shquote` project is archived, so while it's tiny and arguably complete, I'd rather just vendor the one function we need.
2025-02-26 16:45:35 -05:00 · 2025-02-26 16:45:35 -05:00 · a439b7944d
parent 8f0c6f5a6f
commit a439b7944d
5 changed files with 205 additions and 10 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -2844,12 +2844,6 @@ version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "640c9bd8497b02465aeef5375144c26062e0dcd5939dfcbb0f5db76cb8c17c73"
 [[package]]
 name = "r-shquote"
 version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d76b7a332c240b0b30ad8b52cc9aecf8ec96878ccb927ce1d2feb03920e0f711"
 [[package]]
 name = "rancor"
 version = "0.1.0"
@ -5537,7 +5531,6 @@ dependencies = [
 "indoc",
 "insta",
 "itertools 0.14.0",
 "r-shquote",
 "regex",
 "reqwest",
 "reqwest-middleware",
--- a/Cargo.toml
+++ b/Cargo.toml
@ -134,7 +134,6 @@ proc-macro2 = { version = "1.0.86" }
 procfs = { version = "0.17.0", default-features = false, features = ["flate2"] }
 pubgrub = { git = "https://github.com/astral-sh/pubgrub", rev = "b70cf707aa43f21b32f3a61b8a0889b15032d5c4" }
 quote = { version = "1.0.37" }
 r-shquote = { version = "0.1.1" }
 rayon = { version = "1.10.0" }
 reflink-copy = { version = "0.1.19" }
 regex = { version = "1.10.6" }
--- a/crates/uv-requirements-txt/Cargo.toml
+++ b/crates/uv-requirements-txt/Cargo.toml
@ -26,7 +26,6 @@ uv-pypi-types = { workspace = true }
 uv-warnings = { workspace = true }
 fs-err = { workspace = true }
 r-shquote = { workspace = true }
 regex = { workspace = true }
 reqwest = { workspace = true, optional = true }
 reqwest-middleware = { workspace = true, optional = true }
--- a/crates/uv-requirements-txt/src/lib.rs
+++ b/crates/uv-requirements-txt/src/lib.rs
@ -40,7 +40,6 @@ use std::io;
 use std::path::{Path, PathBuf};
 use std::str::FromStr;
 use r_shquote::unquote;
 use tracing::instrument;
 use unscanny::{Pattern, Scanner};
 use url::Url;
@ -56,8 +55,10 @@ use uv_pypi_types::{Requirement, VerbatimParsedUrl};
 use crate::requirement::EditableError;
 pub use crate::requirement::RequirementsTxtRequirement;
 use crate::shquote::unquote;
 mod requirement;
 mod shquote;
 /// We emit one of those for each `requirements.txt` entry.
 enum RequirementsTxtStatement {
--- a/crates/uv-requirements-txt/src/shquote.rs
+++ b/crates/uv-requirements-txt/src/shquote.rs
@ -0,0 +1,203 @@
 //! POSIX Shell Compatible Argument Parser
 //!
 //! This implementation is vendored from the [`r-shquote`](https://github.com/r-util/r-shquote)
 //! crate under the Apache 2.0 license:
 //!
 //! ```text
 //! Licensed under the Apache License, Version 2.0 (the "License");
 //! you may not use this file except in compliance with the License.
 //! You may obtain a copy of the License at
 //!
 //!         https://www.apache.org/licenses/LICENSE-2.0
 //!
 //! Unless required by applicable law or agreed to in writing, software
 //! distributed under the License is distributed on an "AS IS" BASIS,
 //! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 //! See the License for the specific language governing permissions and
 //! limitations under the License.
 //! ```
 /// Reasons why [`unquote`] can reject its input.
 ///
 /// Each variant records where the offending *opening* quote was found, both as a character
 /// index (`char_cursor`) and as a byte offset (`byte_cursor`) into the source string.
 // NOTE(review): presumably `dead_code` is silenced because the cursor fields are only ever
 // surfaced through the derived `Debug` output, which the lint does not count as a read.
 #[allow(dead_code)]
 #[derive(Clone, Debug)]
 pub(crate) enum UnquoteError {
    /// A `'` was opened but the input ended before the matching `'`.
    UnterminatedSingleQuote {
        char_cursor: usize,
        byte_cursor: usize,
    },
    /// A `"` was opened but the input ended before the matching unescaped `"`.
    UnterminatedDoubleQuote {
        char_cursor: usize,
        byte_cursor: usize,
    },
 }
 /// The user-facing rendering is simply the compact `Debug` representation.
 impl std::fmt::Display for UnquoteError {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        // Format through a fresh `Arguments` so that flags set on `f` (for example the
        // alternate flag of `{:#}`) do not leak into the `Debug` rendering.
        f.write_fmt(format_args!("{self:?}"))
    }
 }
 /// No `source`: this error is always the root cause.
 impl std::error::Error for UnquoteError {}
 /// Consumes the body of a single-quoted section and appends it to `acc`.
 ///
 /// The caller has already consumed the opening `'`, so `cursor` yields the first character
 /// after it. Everything up to the closing `'` is copied verbatim: no escape sequences exist
 /// inside single quotes, which also means a single quote cannot appear in the body.
 ///
 /// Returns `true` once the closing quote has been consumed, or `false` when the input runs
 /// out first. In the `false` case the characters seen so far have already been pushed.
 fn unquote_open_single(
    acc: &mut String,
    cursor: &mut std::iter::Enumerate<std::str::CharIndices>,
 ) -> bool {
    for (_, (_, ch)) in cursor {
        if ch == '\'' {
            // Closing quote: swallow it and report success.
            return true;
        }
        acc.push(ch);
    }
    // Ran off the end of the input without seeing the closing quote.
    false
 }
 /// Consumes the body of a double-quoted section and appends the decoded text to `acc`.
 ///
 /// The caller has already consumed the opening `"`, so `cursor` yields the first character
 /// after it. Characters are copied until an unescaped `"` is found. A backslash introduces
 /// an escape sequence:
 ///
 /// * before `"`, `\`, `` ` ``, `$` or a newline, the backslash is dropped and the following
 ///   character is emitted literally;
 /// * before any other character, both the backslash and that character are emitted.
 ///
 /// Returns `true` once the closing quote has been consumed, or `false` when the input ends
 /// inside the quotes (including directly after a backslash).
 fn unquote_open_double(
    acc: &mut String,
    cursor: &mut std::iter::Enumerate<std::str::CharIndices>,
 ) -> bool {
    while let Some((_, (_, ch))) = cursor.next() {
        match ch {
            // An unescaped double quote ends the section and produces no output.
            '"' => return true,
            '\\' => match cursor.next() {
                Some((_, (_, escaped))) => {
                    // NOTE(review): an escaped newline is emitted as a newline here, whereas
                    // a POSIX shell treats it as a line continuation and removes it. This
                    // mirrors the vendored implementation; confirm before changing.
                    let known = matches!(escaped, '"' | '\\' | '`' | '$' | '\n');
                    if !known {
                        // Unknown escapes are preserved in full, backslash included.
                        acc.push('\\');
                    }
                    acc.push(escaped);
                }
                // A dangling backslash means the quote was never closed.
                None => return false,
            },
            // Ordinary characters are copied as-is.
            plain => acc.push(plain),
        }
    }
    // Input exhausted before the closing quote.
    false
 }
 /// Decodes a backslash escape that occurs outside of any quotes.
 ///
 /// The caller has already consumed the backslash, so `cursor` yields the escaped character.
 /// That character is appended to `acc` literally, with two exceptions that produce no output
 /// at all:
 ///
 /// * a newline, so that a backslash at the end of a line joins it with the next one;
 /// * the end of input, i.e. a lone trailing backslash is silently dropped.
 fn unquote_open_escape(acc: &mut String, cursor: &mut std::iter::Enumerate<std::str::CharIndices>) {
    match cursor.next() {
        // Line continuation or dangling backslash: both are ignored.
        Some((_, (_, '\n'))) | None => {}
        Some((_, (_, literal))) => acc.push(literal),
    }
 }
 /// Unquote a single string according to POSIX Shell quoting and escaping rules.
 ///
 /// Single-quoted sections are copied verbatim, double-quoted sections are copied with their
 /// escape sequences decoded, and a backslash outside of quotes makes the next character
 /// literal. All other characters are copied unchanged.
 ///
 /// The result is canonical. There is only one valid unquoted result for a given input.
 ///
 /// # Errors
 ///
 /// Returns [`UnquoteError::UnterminatedSingleQuote`] or
 /// [`UnquoteError::UnterminatedDoubleQuote`] when a quote is opened but never closed. The
 /// error carries the character index and byte offset of the opening quote.
 ///
 /// # Examples
 ///
 /// This function is crate-private and therefore cannot be exercised from a doctest, so the
 /// example is not compiled.
 ///
 /// ```ignore
 /// assert_eq!(unquote("foobar").unwrap(), "foobar");
 /// assert_eq!(unquote("foo'bar'").unwrap(), "foobar");
 /// ```
 pub(crate) fn unquote(source: &str) -> Result<String, UnquoteError> {
    // Unquoting never produces a longer string, and most input is expected to be plain text,
    // so reserving the input length up front avoids any reallocation.
    let mut acc = String::with_capacity(source.len());
    // Each item is `(character index, (byte offset, character))`; both positions are needed
    // for error reporting.
    let mut cursor = source.char_indices().enumerate();
    while let Some((char_cursor, (byte_cursor, ch))) = cursor.next() {
        match ch {
            '\'' => {
                if !unquote_open_single(&mut acc, &mut cursor) {
                    return Err(UnquoteError::UnterminatedSingleQuote {
                        char_cursor,
                        byte_cursor,
                    });
                }
            }
            '"' => {
                if !unquote_open_double(&mut acc, &mut cursor) {
                    return Err(UnquoteError::UnterminatedDoubleQuote {
                        char_cursor,
                        byte_cursor,
                    });
                }
            }
            '\\' => unquote_open_escape(&mut acc, &mut cursor),
            // Anything that is not a quote or an escape is copied literally.
            plain => acc.push(plain),
        }
    }
    Ok(acc)
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    /// Smoke test covering plain text, both quote styles and unquoted backslash escapes
    /// (including a trailing backslash, which is dropped).
    #[test]
    fn basic() {
        // (input, expected unquoted output)
        let cases = [
            ("foobar", "foobar"),
            ("foo'bar'", "foobar"),
            ("foo\"bar\"", "foobar"),
            ("\\foobar\\", "foobar"),
            ("\\'foobar\\'", "'foobar'"),
        ];
        for &(input, expected) in &cases {
            assert_eq!(unquote(input).unwrap(), expected);
        }
    }
 }