uv/crates/uv-python/src/sysconfig/parser.rs

538 lines
16 KiB
Rust

use std::collections::BTreeMap;
use std::str::FromStr;
use serde::Serialize;
use serde_json::ser::PrettyFormatter;
use crate::sysconfig::cursor::Cursor;
/// A value in the [`SysconfigData`] map.
///
/// Values are assumed to be either strings or integers.
#[derive(Debug, Clone, Eq, PartialEq, serde::Serialize)]
#[serde(untagged)]
pub(super) enum Value {
String(String),
Int(i32),
}
/// The data extracted from a `_sysconfigdata_` file.
#[derive(Debug, Clone, Eq, PartialEq, serde::Serialize)]
pub(super) struct SysconfigData(BTreeMap<String, Value>);
impl SysconfigData {
/// Returns an iterator over the key-value pairs in the map.
pub(super) fn iter_mut(&mut self) -> std::collections::btree_map::IterMut<String, Value> {
self.0.iter_mut()
}
/// Inserts a key-value pair into the map.
pub(super) fn insert(&mut self, key: String, value: Value) -> Option<Value> {
self.0.insert(key, value)
}
/// Formats the `sysconfig` data as a pretty-printed string.
pub(super) fn to_string_pretty(&self) -> Result<String, serde_json::Error> {
let output = {
let mut buf = Vec::new();
let mut serializer = serde_json::Serializer::with_formatter(
&mut buf,
PrettyFormatter::with_indent(b" "),
);
self.0.serialize(&mut serializer)?;
String::from_utf8(buf).unwrap()
};
Ok(format!(
"# system configuration generated and used by the sysconfig module\nbuild_time_vars = {output}\n",
))
}
}
impl std::fmt::Display for SysconfigData {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let output = {
let mut buf = Vec::new();
let mut serializer = serde_json::Serializer::new(&mut buf);
self.0.serialize(&mut serializer).unwrap();
String::from_utf8(buf).unwrap()
};
write!(f, "{output}")
}
}
impl FromIterator<(String, Value)> for SysconfigData {
fn from_iter<T: IntoIterator<Item = (String, Value)>>(iter: T) -> Self {
Self(iter.into_iter().collect())
}
}
/// Parse the `_sysconfigdata_` file (e.g., `{real_prefix}/lib/python3.12/_sysconfigdata__darwin_darwin.py"`
/// on macOS).
///
/// `_sysconfigdata_` is structured as follows:
///
/// 1. A comment on the first line (e.g., `# system configuration generated and used by the sysconfig module`).
/// 2. An assignment to `build_time_vars` (e.g., `build_time_vars = { ... }`).
///
/// The right-hand side of the assignment is a JSON object. The keys are strings, and the values
/// are strings or numbers.
impl FromStr for SysconfigData {
type Err = Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
// Read the first line of the file.
let Some(s) =
s.strip_prefix("# system configuration generated and used by the sysconfig module\n")
else {
return Err(Error::MissingHeader);
};
// Read the assignment to `build_time_vars`.
let Some(s) = s.strip_prefix("build_time_vars") else {
return Err(Error::MissingAssignment);
};
let mut cursor = Cursor::new(s);
cursor.eat_while(is_python_whitespace);
if !cursor.eat_char('=') {
return Err(Error::MissingAssignment);
}
cursor.eat_while(is_python_whitespace);
if !cursor.eat_char('{') {
return Err(Error::MissingOpenBrace);
}
let mut map = BTreeMap::new();
loop {
let Some(next) = cursor.bump() else {
return Err(Error::UnexpectedEof);
};
match next {
'\'' | '"' => {
// Parse key.
let key = parse_string(&mut cursor, next)?;
cursor.eat_while(is_python_whitespace);
cursor.eat_char(':');
cursor.eat_while(is_python_whitespace);
// Parse value
let value = match cursor.first() {
'\'' | '"' => Value::String(parse_concatenated_string(&mut cursor)?),
'-' => {
cursor.bump();
Value::Int(-parse_int(&mut cursor)?)
}
c if c.is_ascii_digit() => Value::Int(parse_int(&mut cursor)?),
c => return Err(Error::UnexpectedCharacter(c)),
};
// Insert into map.
map.insert(key, value);
// Skip optional comma.
cursor.eat_while(is_python_whitespace);
cursor.eat_char(',');
cursor.eat_while(is_python_whitespace);
}
// Skip whitespace.
' ' | '\n' | '\r' | '\t' => {}
// When we see a closing brace, we're done.
'}' => {
break;
}
c => return Err(Error::UnexpectedCharacter(c)),
}
}
Ok(Self(map))
}
}
/// Parse a Python string literal.
///
/// Expects the previous character to be the opening quote character.
fn parse_string(cursor: &mut Cursor, quote: char) -> Result<String, Error> {
let mut result = String::new();
loop {
let Some(c) = cursor.bump() else {
return Err(Error::UnexpectedEof);
};
match c {
'\\' => {
// Treat the next character as a literal.
//
// See: https://github.com/astral-sh/ruff/blob/d47fba1e4aeeb18085900dfbbcd187e90d536913/crates/ruff_python_parser/src/string.rs#L194
let Some(c) = cursor.bump() else {
return Err(Error::UnexpectedEof);
};
result.push(match c {
'\\' => '\\',
'\'' => '\'',
'\"' => '"',
_ => {
return Err(Error::UnrecognizedEscape(c));
}
});
}
// Consume closing quote.
c if c == quote => {
break;
}
c => {
result.push(c);
}
}
}
Ok(result)
}
/// Parse a Python string, which may be a concatenation of multiple string literals.
///
/// Expects the cursor to start at an opening quote character.
fn parse_concatenated_string(cursor: &mut Cursor) -> Result<String, Error> {
let mut result = String::new();
loop {
let Some(c) = cursor.bump() else {
return Err(Error::UnexpectedEof);
};
match c {
'\'' | '"' => {
// Parse a new string fragment and append it.
result.push_str(&parse_string(cursor, c)?);
}
c if is_python_whitespace(c) => {
// Skip whitespace between fragments
}
c => return Err(Error::UnexpectedCharacter(c)),
}
// Lookahead to the end of the string.
if matches!(cursor.first(), ',' | '}') {
break;
}
}
Ok(result)
}
/// Parse an integer literal.
///
/// Expects the cursor to start at the first digit of the integer.
fn parse_int(cursor: &mut Cursor) -> Result<i32, std::num::ParseIntError> {
let mut result = String::new();
loop {
let c = cursor.first();
if !c.is_ascii_digit() {
break;
}
result.push(c);
cursor.bump();
}
result.parse()
}
/// Returns `true` for [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens)
/// characters.
const fn is_python_whitespace(c: char) -> bool {
matches!(
c,
// Space, tab, form-feed, newline, or carriage return
' ' | '\t' | '\x0C' | '\n' | '\r'
)
}
#[derive(thiserror::Error, Debug)]
pub enum Error {
#[error("Missing opening brace")]
MissingOpenBrace,
#[error("Unexpected character: {0}")]
UnexpectedCharacter(char),
#[error("Unexpected end of file")]
UnexpectedEof,
#[error("Unrecognized escape sequence: {0}")]
UnrecognizedEscape(char),
#[error("Failed to parse integer")]
ParseInt(#[from] std::num::ParseIntError),
#[error("`_sysconfigdata_` is missing a header comment")]
MissingHeader,
#[error("`_sysconfigdata_` is missing an assignment to `build_time_vars`")]
MissingAssignment,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_string() {
let input = indoc::indoc!(
r#"
# system configuration generated and used by the sysconfig module
build_time_vars = {
"key1": "value1",
"key2": 42,
"key3": "multi-part" " string"
}
"#
);
let result = input.parse::<SysconfigData>().expect("Parsing failed");
let snapshot = result.to_string_pretty().unwrap();
insta::assert_snapshot!(snapshot, @r###"
# system configuration generated and used by the sysconfig module
build_time_vars = {
"key1": "value1",
"key2": 42,
"key3": "multi-part string"
}
"###);
}
#[test]
fn test_parse_backslash() {
let input = indoc::indoc!(
r#"
# system configuration generated and used by the sysconfig module
build_time_vars = {
"key1": "value1\"value2\"value3",
"key2": "value1\\value2\\value3",
"key3": "value1\\\"value2\\\"value3",
"key4": "value1\\\\value2\\\\value3",
}
"#
);
let result = input.parse::<SysconfigData>().expect("Parsing failed");
let snapshot = result.to_string_pretty().unwrap();
insta::assert_snapshot!(snapshot, @r###"
# system configuration generated and used by the sysconfig module
build_time_vars = {
"key1": "value1\"value2\"value3",
"key2": "value1\\value2\\value3",
"key3": "value1\\\"value2\\\"value3",
"key4": "value1\\\\value2\\\\value3"
}
"###);
}
#[test]
fn test_parse_trailing_backslash() {
let input = indoc::indoc!(
r#"
# system configuration generated and used by the sysconfig module
build_time_vars = {
"key1": "value1\\value2\\value3\",
}
"#
);
let result = input.parse::<SysconfigData>();
assert!(matches!(result, Err(Error::UnexpectedEof)));
}
#[test]
fn test_parse_unrecognized_escape() {
let input = indoc::indoc!(
r#"
# system configuration generated and used by the sysconfig module
build_time_vars = {
"key1": "value1\value2",
}
"#
);
let result = input.parse::<SysconfigData>();
assert!(matches!(result, Err(Error::UnrecognizedEscape('v'))));
}
#[test]
fn test_parse_trailing_comma() {
let input = indoc::indoc!(
r#"
# system configuration generated and used by the sysconfig module
build_time_vars = {
"key1": "value1",
"key2": 42,
"key3": "multi-part" " string",
}
"#
);
let result = input.parse::<SysconfigData>().expect("Parsing failed");
let snapshot = result.to_string_pretty().unwrap();
insta::assert_snapshot!(snapshot, @r###"
# system configuration generated and used by the sysconfig module
build_time_vars = {
"key1": "value1",
"key2": 42,
"key3": "multi-part string"
}
"###);
}
#[test]
fn test_parse_integer_values() {
let input = indoc::indoc!(
r#"
# system configuration generated and used by the sysconfig module
build_time_vars = {
"key1": 12345,
"key2": -15
}
"#
);
let result = input.parse::<SysconfigData>().expect("Parsing failed");
let snapshot = result.to_string_pretty().unwrap();
insta::assert_snapshot!(snapshot, @r###"
# system configuration generated and used by the sysconfig module
build_time_vars = {
"key1": 12345,
"key2": -15
}
"###);
}
#[test]
fn test_parse_escaped_quotes() {
let input = indoc::indoc!(
r#"
# system configuration generated and used by the sysconfig module
build_time_vars = {
"key1": "value with \"escaped quotes\"",
"key2": 'single-quoted \'escaped\''
}
"#
);
let result = input.parse::<SysconfigData>().expect("Parsing failed");
let snapshot = result.to_string_pretty().unwrap();
insta::assert_snapshot!(snapshot, @r###"
# system configuration generated and used by the sysconfig module
build_time_vars = {
"key1": "value with \"escaped quotes\"",
"key2": "single-quoted 'escaped'"
}
"###);
}
#[test]
fn test_parse_concatenated_strings() {
let input = indoc::indoc!(
r#"
# system configuration generated and used by the sysconfig module
build_time_vars = {
"key1": "multi-"
"line "
"string"
}
"#
);
let result = input.parse::<SysconfigData>().expect("Parsing failed");
let snapshot = result.to_string_pretty().unwrap();
insta::assert_snapshot!(snapshot, @r###"
# system configuration generated and used by the sysconfig module
build_time_vars = {
"key1": "multi-line string"
}
"###);
}
#[test]
fn test_missing_header_error() {
let input = indoc::indoc!(
r#"
build_time_vars = {
"key1": "value1"
}
"#
);
let result = input.parse::<SysconfigData>();
assert!(matches!(result, Err(Error::MissingHeader)));
}
#[test]
fn test_missing_assignment_error() {
let input = indoc::indoc!(
r#"
# system configuration generated and used by the sysconfig module
{
"key1": "value1"
}
"#
);
let result = input.parse::<SysconfigData>();
assert!(matches!(result, Err(Error::MissingAssignment)));
}
#[test]
fn test_unexpected_character_error() {
let input = indoc::indoc!(
r#"
# system configuration generated and used by the sysconfig module
build_time_vars = {
"key1": &123
}
"#
);
let result = input.parse::<SysconfigData>();
assert!(
result.is_err(),
"Expected parsing to fail due to unexpected character"
);
}
#[test]
fn test_unexpected_eof() {
let input = indoc::indoc!(
r#"
# system configuration generated and used by the sysconfig module
build_time_vars = {
"key1": 123
"#
);
let result = input.parse::<SysconfigData>();
assert!(
result.is_err(),
"Expected parsing to fail due to unexpected character"
);
}
#[test]
fn test_unexpected_comma() {
let input = indoc::indoc!(
r#"
# system configuration generated and used by the sysconfig module
build_time_vars = {
"key1": 123,,
}
"#
);
let result = input.parse::<SysconfigData>();
assert!(
result.is_err(),
"Expected parsing to fail due to unexpected character"
);
}
}