Avoid lookaheads in `sysconfig` parser (#9879)

## Summary

Based on some review feedback from
https://github.com/astral-sh/uv/pull/9857.
This commit is contained in:
Charlie Marsh 2024-12-13 15:02:52 -05:00 committed by GitHub
parent 08ea79cc33
commit 53dfe0de52
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed files with 114 additions and 46 deletions

View File

@ -4,7 +4,7 @@ use std::str::FromStr;
use serde::Serialize; use serde::Serialize;
use serde_json::ser::PrettyFormatter; use serde_json::ser::PrettyFormatter;
use crate::sysconfig::cursor::{Cursor, EOF_CHAR}; use crate::sysconfig::cursor::Cursor;
/// A value in the [`SysconfigData`] map. /// A value in the [`SysconfigData`] map.
/// ///
@ -106,7 +106,11 @@ impl FromStr for SysconfigData {
let mut map = BTreeMap::new(); let mut map = BTreeMap::new();
loop { loop {
match cursor.first() { let Some(next) = cursor.bump() else {
return Err(Error::UnexpectedEof);
};
match next {
'\'' | '"' => { '\'' | '"' => {
// Parse key. // Parse key.
let key = parse_string(&mut cursor)?; let key = parse_string(&mut cursor)?;
@ -136,13 +140,10 @@ impl FromStr for SysconfigData {
} }
// Skip whitespace. // Skip whitespace.
' ' | '\n' | '\r' | '\t' => { ' ' | '\n' | '\r' | '\t' => {}
cursor.bump();
}
// When we see a closing brace, we're done. // When we see a closing brace, we're done.
'}' => { '}' => {
cursor.bump();
break; break;
} }
@ -155,23 +156,24 @@ impl FromStr for SysconfigData {
} }
/// Parse a Python string literal. /// Parse a Python string literal.
///
/// Expects the previous character to be the opening quote character.
fn parse_string(cursor: &mut Cursor) -> Result<String, Error> { fn parse_string(cursor: &mut Cursor) -> Result<String, Error> {
let quote = cursor.bump().expect("Expected opening quote"); let quote = cursor.previous();
assert!(quote == '\'' || quote == '"', "Invalid quote character"); assert!(quote == '\'' || quote == '"', "Invalid quote character");
let mut result = String::new(); let mut result = String::new();
loop { loop {
if cursor.first() == EOF_CHAR { let Some(c) = cursor.bump() else {
return Err(Error::UnexpectedCharacter(EOF_CHAR)); return Err(Error::UnexpectedEof);
} };
match c {
'\\' => {
// Handle escaped quotes. // Handle escaped quotes.
if cursor.first() == '\\' { if cursor.first() == quote {
// Consume the backslash. // Consume the backslash.
cursor.bump(); cursor.bump();
if cursor.first() == quote {
result.push(quote); result.push(quote);
cursor.bump();
continue; continue;
} }
@ -179,53 +181,56 @@ fn parse_string(cursor: &mut Cursor) -> Result<String, Error> {
result.push('\\'); result.push('\\');
result.push(cursor.first()); result.push(cursor.first());
cursor.bump(); cursor.bump();
continue;
} }
// Consume closing quote. // Consume closing quote.
if cursor.first() == quote { c if c == quote => {
cursor.bump();
break; break;
} }
result.push(cursor.first()); c => {
cursor.bump(); result.push(c);
}
}
} }
Ok(result) Ok(result)
} }
/// Parse a Python string, which may be a concatenation of multiple string literals. /// Parse a Python string, which may be a concatenation of multiple string literals.
///
/// Expects the cursor to start at an opening quote character.
fn parse_concatenated_string(cursor: &mut Cursor) -> Result<String, Error> { fn parse_concatenated_string(cursor: &mut Cursor) -> Result<String, Error> {
let mut result = String::new(); let mut result = String::new();
loop { loop {
let c = cursor.first(); let Some(c) = cursor.bump() else {
if c == EOF_CHAR { return Err(Error::UnexpectedEof);
break; };
} match c {
if c == '\'' || c == '"' { '\'' | '"' => {
// Parse a new string fragment and append it. // Parse a new string fragment and append it.
result.push_str(&parse_string(cursor)?); result.push_str(&parse_string(cursor)?);
} else if is_python_whitespace(c) { }
c if is_python_whitespace(c) => {
// Skip whitespace between fragments // Skip whitespace between fragments
cursor.bump(); }
} else if c == ',' || c == '}' { c => return Err(Error::UnexpectedCharacter(c)),
// End of value. }
// Lookahead to the end of the string.
if matches!(cursor.first(), ',' | '}') {
break; break;
} else {
return Err(Error::UnexpectedCharacter(c));
} }
} }
Ok(result) Ok(result)
} }
/// Parse an integer literal. /// Parse an integer literal.
///
/// Expects the cursor to start at the first digit of the integer.
fn parse_int(cursor: &mut Cursor) -> Result<i32, std::num::ParseIntError> { fn parse_int(cursor: &mut Cursor) -> Result<i32, std::num::ParseIntError> {
let mut result = String::new(); let mut result = String::new();
loop { loop {
let c = cursor.first(); let c = cursor.first();
if c == EOF_CHAR {
break;
}
if !c.is_ascii_digit() { if !c.is_ascii_digit() {
break; break;
} }
@ -251,6 +256,8 @@ pub enum Error {
MissingOpenBrace, MissingOpenBrace,
#[error("Unexpected character: {0}")] #[error("Unexpected character: {0}")]
UnexpectedCharacter(char), UnexpectedCharacter(char),
#[error("Unexpected end of file")]
UnexpectedEof,
#[error("Failed to parse integer")] #[error("Failed to parse integer")]
ParseInt(#[from] std::num::ParseIntError), ParseInt(#[from] std::num::ParseIntError),
#[error("`_sysconfigdata_` is missing a header comment")] #[error("`_sysconfigdata_` is missing a header comment")]
@ -289,6 +296,32 @@ mod tests {
"###); "###);
} }
#[test]
fn test_parse_trailing_comma() {
let input = indoc::indoc!(
r#"
# system configuration generated and used by the sysconfig module
build_time_vars = {
"key1": "value1",
"key2": 42,
"key3": "multi-part" " string",
}
"#
);
let result = input.parse::<SysconfigData>().expect("Parsing failed");
let snapshot = result.to_string_pretty().unwrap();
insta::assert_snapshot!(snapshot, @r###"
# system configuration generated and used by the sysconfig module
build_time_vars = {
"key1": "value1",
"key2": 42,
"key3": "multi-part string"
}
"###);
}
#[test] #[test]
fn test_parse_integer_values() { fn test_parse_integer_values() {
let input = indoc::indoc!( let input = indoc::indoc!(
@ -407,4 +440,39 @@ mod tests {
"Expected parsing to fail due to unexpected character" "Expected parsing to fail due to unexpected character"
); );
} }
#[test]
fn test_unexpected_eof() {
let input = indoc::indoc!(
r#"
# system configuration generated and used by the sysconfig module
build_time_vars = {
"key1": 123
"#
);
let result = input.parse::<SysconfigData>();
assert!(
result.is_err(),
"Expected parsing to fail due to unexpected character"
);
}
#[test]
fn test_unexpected_comma() {
let input = indoc::indoc!(
r#"
# system configuration generated and used by the sysconfig module
build_time_vars = {
"key1": 123,,
}
"#
);
let result = input.parse::<SysconfigData>();
assert!(
result.is_err(),
"Expected parsing to fail due to unexpected character"
);
}
} }