Parse JSONPath string literal

This commit is contained in:
Fabrice Reix 2025-12-15 09:11:13 +01:00
parent ed92031d87
commit c79707cf59
No known key found for this signature in database
GPG Key ID: 71B8BAD935E3190A
3 changed files with 439 additions and 8 deletions

View File

@ -33,4 +33,7 @@ impl ParseError {
/// Kinds of error reported by the JSONPath parser.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum ParseErrorKind {
/// Some specific input was required but not found; the payload describes
/// what was expected (for example a closing quote or `hex digit`).
Expecting(String),
/// A character that may not appear unescaped inside a string literal;
/// the payload is the offending character.
InvalidCharacter(char),
/// A backslash followed by a character that does not form a valid escape;
/// the payload is the escape text (backslash plus the character read).
InvalidEscapeSequence(String),
/// A `\u` escape that could not be decoded; the payload is either a short
/// message or the hexadecimal value that was rejected.
InvalidUnicodeEscape(String),
}

View File

@ -17,7 +17,7 @@
*/
use crate::jsonpath2::parser::primitives::{expect_str, match_str};
use crate::jsonpath2::parser::ParseResult;
use crate::jsonpath2::parser::{ParseError, ParseErrorKind, ParseResult};
use hurl_core::reader::Reader;
/// Try to parse a string literal
@ -32,26 +32,256 @@ pub fn try_parse(reader: &mut Reader) -> ParseResult<Option<String>> {
}
}
/// Try to parse a double-quoted string literal.
///
/// Returns `Ok(None)` when the input does not start with `"`. Otherwise the
/// literal is read up to and including its closing `"` and the decoded
/// content (escape sequences resolved) is returned.
///
/// # Errors
///
/// - `InvalidCharacter` when a character that must be escaped appears raw.
/// - `Expecting("\"")` when the input ends before the closing quote.
/// - Any error produced while decoding an escape sequence.
fn try_double_quoted_string(reader: &mut Reader) -> ParseResult<Option<String>> {
    if !match_str("\"", reader) {
        return Ok(None);
    }
    let mut result = String::new();
    while let Some(ch) = reader.peek() {
        // Position of the current character, reported if it turns out to be invalid
        let pos = reader.cursor().pos;
        match ch {
            '"' => {
                // Closing quote: consume it and finish
                reader.read();
                return Ok(Some(result));
            }
            '\\' => {
                // Consume the backslash, then decode what follows
                reader.read();
                result.push(parse_escape_sequence(reader, '"')?);
            }
            // A single quote needs no escaping inside a double-quoted string
            c if c == '\'' || is_unescaped_char(c) => {
                result.push(c);
                reader.read();
            }
            c => return Err(ParseError::new(pos, ParseErrorKind::InvalidCharacter(c))),
        }
    }
    // Input ended before the closing quote
    Err(ParseError::new(
        reader.cursor().pos,
        ParseErrorKind::Expecting("\"".to_string()),
    ))
}
/// Try to parse a single-quoted string literal.
///
/// Returns `Ok(None)` when the input does not start with `'`. Otherwise the
/// literal is read up to and including its closing `'` and the decoded
/// content (escape sequences resolved) is returned.
///
/// # Errors
///
/// - `InvalidCharacter` when a character that must be escaped appears raw.
/// - `Expecting("'")` when the input ends before the closing quote.
/// - Any error produced while decoding an escape sequence.
fn try_single_quoted_string(reader: &mut Reader) -> ParseResult<Option<String>> {
    if !match_str("\'", reader) {
        return Ok(None);
    }
    let mut result = String::new();
    while let Some(ch) = reader.peek() {
        // Position of the current character, reported if it turns out to be invalid
        let pos = reader.cursor().pos;
        match ch {
            '\'' => {
                // Closing quote: consume it and finish
                reader.read();
                return Ok(Some(result));
            }
            '\\' => {
                // Consume the backslash, then decode what follows
                reader.read();
                result.push(parse_escape_sequence(reader, '\'')?);
            }
            // A double quote needs no escaping inside a single-quoted string
            c if c == '"' || is_unescaped_char(c) => {
                result.push(c);
                reader.read();
            }
            c => return Err(ParseError::new(pos, ParseErrorKind::InvalidCharacter(c))),
        }
    }
    // Input ended before the closing quote
    Err(ParseError::new(
        reader.cursor().pos,
        ParseErrorKind::Expecting("'".to_string()),
    ))
}
/// Tell whether `ch` may appear as-is (without a backslash escape) inside a
/// string literal.
///
/// Grammar rule:
/// `unescaped = %x20-21 / %x23-26 / %x28-5B / %x5D-D7FF / %xE000-10FFFF`
///
/// The gaps exclude `"` (0x22), `'` (0x27), `\` (0x5C) and everything below
/// 0x20. Both quote characters are excluded here; callers decide which of
/// them is acceptable for the literal being parsed.
fn is_unescaped_char(ch: char) -> bool {
    matches!(
        ch,
        '\u{20}'..='\u{21}'
            | '\u{23}'..='\u{26}'
            | '\u{28}'..='\u{5B}'
            | '\u{5D}'..='\u{D7FF}'
            | '\u{E000}'..='\u{10FFFF}'
    )
}
/// Decode the escape sequence that follows an already-consumed backslash.
///
/// `quote_char` is the delimiter of the enclosing literal; only that quote
/// character may be escaped.
///
/// # Errors
///
/// - `Expecting("escape character")` when the input ends after the backslash.
/// - `InvalidEscapeSequence` when the character is not a recognised escape.
/// - Any error produced while decoding a `\u` escape.
fn parse_escape_sequence(reader: &mut Reader, quote_char: char) -> ParseResult<char> {
    let pos = reader.cursor().pos;
    let ch = reader.read().ok_or_else(|| {
        ParseError::new(
            pos,
            ParseErrorKind::Expecting("escape character".to_string()),
        )
    })?;
    let decoded = match ch {
        'b' => '\u{0008}', // backspace
        'f' => '\u{000C}', // form feed
        'n' => '\n',       // line feed
        'r' => '\r',       // carriage return
        't' => '\t',       // horizontal tab
        '/' | '\\' => ch,  // slash and backslash stand for themselves
        // A quote may only be escaped when it is the literal's own delimiter
        '"' | '\'' if ch == quote_char => ch,
        // \uXXXX, possibly followed by a second \uXXXX for a surrogate pair
        'u' => return parse_unicode_escape(reader),
        _ => {
            return Err(ParseError::new(
                pos,
                ParseErrorKind::InvalidEscapeSequence(format!("\\{}", ch)),
            ))
        }
    };
    Ok(decoded)
}
/// Decode the hexadecimal part of a Unicode escape, the `\u` prefix having
/// already been consumed.
///
/// A single non-surrogate code unit is tried first, then a surrogate pair.
/// If neither matches, an `InvalidUnicodeEscape` error is returned at the
/// reader's current position.
fn parse_unicode_escape(reader: &mut Reader) -> ParseResult<char> {
    if let Some(ch) = try_non_surrogate(reader)? {
        return Ok(ch);
    }
    match try_surrogate_pair(reader)? {
        Some(ch) => Ok(ch),
        None => Err(ParseError::new(
            reader.cursor().pos,
            ParseErrorKind::InvalidUnicodeEscape("invalid unicode escape".to_string()),
        )),
    }
}
/// Try to parse four hex digits forming a non-surrogate code unit.
///
/// Returns `Ok(None)` and rewinds the reader when the digits start with `D`
/// followed by a digit of 8 or more, i.e. when they fall in the surrogate
/// range `D800`-`DFFF`. A missing or non-hex digit is an error.
fn try_non_surrogate(reader: &mut Reader) -> ParseResult<Option<char>> {
    let save = reader.cursor();
    let c1 = hex_digit(reader)?;
    let c2 = hex_digit(reader)?;
    if c1 == 13 && c2 >= 8 {
        // Surrogate range: leave it for the surrogate-pair parser
        reader.seek(save);
        return Ok(None);
    }
    let c3 = hex_digit(reader)?;
    let c4 = hex_digit(reader)?;
    let code_point = c1 * 4096 + c2 * 256 + c3 * 16 + c4;
    match char::from_u32(code_point) {
        Some(ch) => Ok(Some(ch)),
        None => Err(ParseError::new(
            save.pos,
            ParseErrorKind::InvalidUnicodeEscape(format!("{:04X}", code_point)),
        )),
    }
}
/// Try to parse a surrogate pair: a high surrogate, the text `\u`, then a
/// low surrogate.
///
/// Returns `Ok(None)` when the input does not begin with a high surrogate.
/// Once a high surrogate has been read, a missing `\u` or low surrogate is an
/// error.
fn try_surrogate_pair(reader: &mut Reader) -> ParseResult<Option<char>> {
    let pos = reader.cursor().pos;
    let high = match try_high_surrogate(reader)? {
        Some(bits) => bits,
        None => return Ok(None),
    };
    expect_str("\\u", reader)?;
    let low = low_surrogate(reader)?;
    // Each half contributes 10 bits on top of the 0x10000 base
    let combined = 0x10000 + (high << 10) + low;
    match char::from_u32(combined) {
        Some(ch) => Ok(Some(ch)),
        None => Err(ParseError::new(
            pos,
            ParseErrorKind::InvalidUnicodeEscape(format!("{:06X}", combined)),
        )),
    }
}
/// Try to parse a high surrogate code unit (`D800`-`DBFF`).
///
/// On a match, returns the value of the high surrogate's 10 payload bits.
/// When the input is not a high surrogate, returns `Ok(None)` with the reader
/// moved back to where it started.
///
/// # Errors
///
/// Propagates the `hex_digit` error when a digit after the leading `D` is
/// missing or not hexadecimal.
fn try_high_surrogate(reader: &mut Reader) -> ParseResult<Option<u32>> {
    let save = reader.cursor();
    // ABNF literals are case-insensitive and the other digits already accept
    // both cases, so the leading `D` must accept `d` too.
    if !(match_str("D", reader) || match_str("d", reader)) {
        return Ok(None);
    }
    let c1 = hex_digit(reader)?;
    if !(8..=11).contains(&c1) {
        // Not a high surrogate: give back the characters consumed so far
        reader.seek(save);
        return Ok(None);
    }
    let c2 = hex_digit(reader)?;
    let c3 = hex_digit(reader)?;
    Ok(Some((c1 - 8) * 256 + c2 * 16 + c3))
}
/// Parse a low surrogate code unit
/// If found, returns the value of the low surrogate 10 bits
fn low_surrogate(reader: &mut Reader) -> ParseResult<u32> {
let pos = reader.cursor().pos;
expect_str("D", reader).map_err(|_| {
ParseError::new(pos, ParseErrorKind::Expecting("low surrogate".to_string()))
})?;
let c1 = hex_digit(reader)?;
if c1 >= 12 {
let c2 = hex_digit(reader)?;
let c3 = hex_digit(reader)?;
Ok((c1 - 12) * 256 + c2 * 16 + c3)
} else {
Err(ParseError::new(
pos,
ParseErrorKind::Expecting("low surrogate".to_string()),
))
}
}
/// Read one character and return its value as a hexadecimal digit (0-15).
///
/// Upper- and lower-case letters are both accepted. Fails with
/// `Expecting("hex digit")`, positioned where the read started, when the
/// input is exhausted or the character is not a hex digit.
fn hex_digit(reader: &mut Reader) -> ParseResult<u32> {
    let pos = reader.cursor().pos;
    reader
        .read()
        .and_then(|ch| ch.to_digit(16))
        .ok_or_else(|| ParseError::new(pos, ParseErrorKind::Expecting("hex digit".to_string())))
}
#[cfg(test)]
mod tests {
@ -67,6 +297,7 @@ mod tests {
"store".to_string()
);
assert_eq!(reader.cursor().index, CharPos(7));
let mut reader = Reader::new("\"store\"");
assert_eq!(
try_parse(&mut reader).unwrap().unwrap(),
@ -79,6 +310,86 @@ mod tests {
assert_eq!(reader.cursor().index, CharPos(0));
}
#[test]
fn test_escape_character() {
    // (input literal, expected decoded content)
    let cases = [
        // Escaped quote matching the literal's delimiter
        ("'quoted\\' literal'", "quoted' literal"),
        ("\"quoted\\\" literal\"", "quoted\" literal"),
        // Standard single-character escapes
        ("\"line1\\nline2\"", "line1\nline2"),
        ("\"tab\\there\"", "tab\there"),
        ("\"back\\\\slash\"", "back\\slash"),
        ("\"slash\\/here\"", "slash/here"),
    ];
    for &(input, expected) in cases.iter() {
        let mut reader = Reader::new(input);
        let parsed = try_parse(&mut reader).unwrap().unwrap();
        assert_eq!(parsed, expected.to_string());
    }
}
#[test]
fn test_unicode_escape() {
    // (input literal, expected decoded content)
    let cases = [
        // Basic escape embedded in ordinary text
        ("\"Hello \\u0041\"", "Hello A"),
        // Four-digit escapes outside ASCII: pi and the copyright sign
        ("\"\\u03C0\"", "π"),
        ("\"\\u00A9\"", "©"),
        // Surrogate pair decoding to an emoji
        ("\"\\uD83D\\uDE00\"", "😀"),
    ];
    for &(input, expected) in cases.iter() {
        let mut reader = Reader::new(input);
        let parsed = try_parse(&mut reader).unwrap().unwrap();
        assert_eq!(parsed, expected.to_string());
    }
}
#[test]
fn test_mixed_quotes() {
    // The "other" quote character needs no escaping inside a literal.
    let cases = [
        // Single quote inside a double-quoted string
        ("\"it's fine\"", "it's fine"),
        // Double quotes inside a single-quoted string
        ("'say \"hello\"'", "say \"hello\""),
    ];
    for &(input, expected) in cases.iter() {
        let mut reader = Reader::new(input);
        let parsed = try_parse(&mut reader).unwrap().unwrap();
        assert_eq!(parsed, expected.to_string());
    }
}
#[test]
fn test_string_literal_error() {
let mut reader = Reader::new("'store");
@ -86,5 +397,122 @@ mod tests {
try_parse(&mut reader).unwrap_err(),
ParseError::new(Pos::new(1, 7), ParseErrorKind::Expecting("'".to_string()))
);
let mut reader = Reader::new("\"store");
assert_eq!(
try_parse(&mut reader).unwrap_err(),
ParseError::new(Pos::new(1, 7), ParseErrorKind::Expecting("\"".to_string()))
);
}
#[test]
fn test_invalid_escape_sequences() {
    // Each literal contains an escape that must be rejected.
    let rejected = [
        // Unknown escape character
        "\"invalid\\x escape\"",
        // Unicode escapes with too few hex digits
        "\"incomplete\\u123\"",
        "\"incomplete\\u\"",
    ];
    for &input in rejected.iter() {
        let mut reader = Reader::new(input);
        assert!(try_parse(&mut reader).is_err());
    }
}
#[test]
fn test_parse_escape_sequence() {
    // A plain escapable character consumes exactly one char
    let mut slash = Reader::new("/");
    let decoded = parse_escape_sequence(&mut slash, '"').unwrap();
    assert_eq!(decoded, '/');
    assert_eq!(slash.cursor().index, CharPos(1));

    // Unicode Character 'GRINNING FACE' (U+1F600): both halves of the
    // surrogate pair are consumed, the trailing quote is left in place
    let mut emoji = Reader::new("uD83D\\uDE00\"");
    let decoded = parse_escape_sequence(&mut emoji, '"').unwrap();
    assert_eq!(decoded, '😀');
    assert_eq!(emoji.cursor().index, CharPos(11));
}
#[test]
fn test_parse_unicode_escape() {
    // Single non-surrogate code unit
    let mut single = Reader::new("00E9");
    let decoded = parse_unicode_escape(&mut single).unwrap();
    assert_eq!(decoded, 'é');
    assert_eq!(single.cursor().index, CharPos(4));

    // Unicode Character 'GRINNING FACE' (U+1F600) as a surrogate pair
    let mut pair = Reader::new("D83D\\uDE00\"");
    let decoded = parse_unicode_escape(&mut pair).unwrap();
    assert_eq!(decoded, '😀');
    assert_eq!(pair.cursor().index, CharPos(10));
}
#[test]
fn test_non_surrogate() {
    // A non-surrogate code unit is decoded and fully consumed
    let mut plain = Reader::new("00E9");
    let decoded = try_non_surrogate(&mut plain).unwrap();
    assert_eq!(decoded.unwrap(), 'é');
    assert_eq!(plain.cursor().index, CharPos(4));

    // A high surrogate is not matched and nothing is consumed
    let mut surrogate = Reader::new("D83D");
    let decoded = try_non_surrogate(&mut surrogate).unwrap();
    assert!(decoded.is_none());
    assert_eq!(surrogate.cursor().index, CharPos(0));
}
#[test]
fn test_surrogate_pairs() {
    // Unicode Character 'GRINNING FACE' (U+1F600)
    let mut pair = Reader::new("D83D\\uDE00\"");
    let decoded = try_surrogate_pair(&mut pair).unwrap();
    assert_eq!(decoded.unwrap(), '😀');
    assert_eq!(pair.cursor().index, CharPos(10));

    // Not a surrogate at all: no match, nothing consumed
    let mut plain = Reader::new("00E9");
    let decoded = try_surrogate_pair(&mut plain).unwrap();
    assert!(decoded.is_none());
    assert_eq!(plain.cursor().index, CharPos(0));

    // High surrogate followed by something that is not a low surrogate
    let mut broken = Reader::new("D83D\\u00E9\"");
    let expected_error = ParseError::new(
        Pos::new(1, 7),
        ParseErrorKind::Expecting("low surrogate".to_string()),
    );
    assert_eq!(try_surrogate_pair(&mut broken).unwrap_err(), expected_error);
}
#[test]
fn test_high_surrogate() {
    // D83D: payload bits are 0x03D = 61
    let mut high = Reader::new("D83D");
    let bits = try_high_surrogate(&mut high).unwrap();
    assert_eq!(bits.unwrap(), 61);
    assert_eq!(high.cursor().index, CharPos(4));

    // Not starting with D: no match, nothing consumed
    let mut plain = Reader::new("00E9");
    let bits = try_high_surrogate(&mut plain).unwrap();
    assert!(bits.is_none());
    assert_eq!(plain.cursor().index, CharPos(0));
}
#[test]
fn test_low_surrogate() {
    // DE00: payload bits are 0x200 = 512
    let mut low = Reader::new("DE00");
    let bits = low_surrogate(&mut low).unwrap();
    assert_eq!(bits, 512);
    assert_eq!(low.cursor().index, CharPos(4));

    // Anything not starting with D is rejected at the first character
    let mut plain = Reader::new("00E9");
    let expected_error = ParseError::new(
        Pos::new(1, 1),
        ParseErrorKind::Expecting("low surrogate".to_string()),
    );
    assert_eq!(low_surrogate(&mut plain).unwrap_err(), expected_error);
}
#[test]
fn test_hex_digit() {
    // 'D' is 13 and only one character is consumed
    let mut digits = Reader::new("D83D");
    let value = hex_digit(&mut digits).unwrap();
    assert_eq!(value, 13);
    assert_eq!(digits.cursor().index, CharPos(1));

    // A non-hex character is rejected at its own position
    let mut not_hex = Reader::new("x");
    let expected_error = ParseError::new(
        Pos::new(1, 1),
        ParseErrorKind::Expecting("hex digit".to_string()),
    );
    assert_eq!(hex_digit(&mut not_hex).unwrap_err(), expected_error);
}
}

View File

@ -273,7 +273,7 @@ fn load_testcases() -> Vec<TestCase> {
fn run() {
let testcases = load_testcases();
// TODO: Remove Limit when spec is fully implemented
let testcases = testcases.iter().take(200);
let testcases = testcases.iter().take(220);
let count_total = testcases.len();
let testcases = testcases.filter(|tc| !IGNORED_TESTS.contains(&tc.name.as_str()));
let count_ignored = count_total - testcases.clone().count();