Allow spaces in path requirements (#7767)

## Summary

This PR modifies our parsing to allow spaces in URLs. I don't know if
this is a correct change... But we now parse URLs until we see:

- A newline.
- A semicolon (marker) or hash (comment), _preceded_ by a space. We
parse the URL until the last
  non-whitespace character (inclusive).
- A semicolon (marker) or hash (comment) _followed_ by a space. We treat
this as an error, since
the end of the URL is ambiguous (e.g., `https://foo.com; marker`) would
be a URL that ends in `;`).

Closes https://github.com/astral-sh/uv/issues/6032.
This commit is contained in:
Charlie Marsh 2024-09-30 12:38:38 -04:00 committed by GitHub
parent 5d165b5607
commit 7435ee3b24
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 451 additions and 50 deletions

View File

@ -834,11 +834,19 @@ fn parse_extras_cursor<T: Pep508Url>(
/// Parse a raw string for a URL requirement, which could be either a URL or a local path, and which
/// could contain unexpanded environment variables.
///
/// When parsing, we eat characters until we see any of the following:
/// - A newline.
/// - A semicolon (marker) or hash (comment), _preceded_ by a space. We parse the URL until the last
/// non-whitespace character (inclusive).
/// - A semicolon (marker) or hash (comment) _followed_ by a space. We treat this as an error, since
/// the end of the URL is ambiguous.
///
/// For example:
/// - `https://pypi.org/project/requests/...`
/// - `file:///home/ferris/project/scripts/...`
/// - `file:../editable/`
/// - `../editable/`
/// - `../path to editable/`
/// - `https://download.pytorch.org/whl/torch_stable.html`
fn parse_url<T: Pep508Url>(
cursor: &mut Cursor,
@ -847,7 +855,40 @@ fn parse_url<T: Pep508Url>(
// wsp*
cursor.eat_whitespace();
// <URI_reference>
let (start, len) = cursor.take_while(|char| !char.is_whitespace());
let (start, len) = {
let start = cursor.pos();
let mut len = 0;
while let Some((_, c)) = cursor.next() {
// If we see a line break, we're done.
if matches!(c, '\r' | '\n') {
break;
}
// If we see top-level whitespace, check if it's followed by a semicolon or hash. If so,
// end the URL at the last non-whitespace character.
if c.is_whitespace() {
let mut cursor = cursor.clone();
cursor.eat_whitespace();
if matches!(cursor.peek_char(), None | Some(';' | '#')) {
break;
}
}
len += c.len_utf8();
// If we see a top-level semicolon or hash followed by whitespace, we're done.
match c {
';' if cursor.peek_char().is_some_and(char::is_whitespace) => {
break;
}
'#' if cursor.peek_char().is_some_and(char::is_whitespace) => {
break;
}
_ => {}
}
}
(start, len)
};
let url = cursor.slice(start, len);
if url.is_empty() {
return Err(Pep508Error {
@ -1087,16 +1128,21 @@ fn parse_pep508_requirement<T: Pep508Url>(
// wsp*
cursor.eat_whitespace();
if let Some((pos, char)) = cursor.next() {
if let Some(VersionOrUrl::Url(url)) = requirement_kind {
if marker.is_none() && url.to_string().ends_with(';') {
return Err(Pep508Error {
message: Pep508ErrorSource::String(
"Missing space before ';', the end of the URL is ambiguous".to_string(),
),
start: requirement_end - ';'.len_utf8(),
len: ';'.len_utf8(),
input: cursor.to_string(),
});
if marker.is_none() {
if let Some(VersionOrUrl::Url(url)) = requirement_kind {
let url = url.to_string();
for c in [';', '#'] {
if url.ends_with(c) {
return Err(Pep508Error {
message: Pep508ErrorSource::String(format!(
"Missing space before '{c}', the end of the URL is ambiguous"
)),
start: requirement_end - c.len_utf8(),
len: c.len_utf8(),
input: cursor.to_string(),
});
}
}
}
}
let message = if marker.is_none() {

View File

@ -175,16 +175,20 @@ fn parse_unnamed_requirement<Url: UnnamedRequirementUrl>(
// wsp*
cursor.eat_whitespace();
if let Some((pos, char)) = cursor.next() {
if let Some(given) = url.given() {
if given.ends_with(';') && marker.is_none() {
return Err(Pep508Error {
message: Pep508ErrorSource::String(
"Missing space before ';', the end of the URL is ambiguous".to_string(),
),
start: requirement_end - ';'.len_utf8(),
len: ';'.len_utf8(),
input: cursor.to_string(),
});
if marker.is_none() {
if let Some(given) = url.given() {
for c in [';', '#'] {
if given.ends_with(c) {
return Err(Pep508Error {
message: Pep508ErrorSource::String(format!(
"Missing space before '{c}', the end of the URL is ambiguous"
)),
start: requirement_end - c.len_utf8(),
len: c.len_utf8(),
input: cursor.to_string(),
});
}
}
}
}
let message = if marker.is_none() {
@ -349,9 +353,20 @@ fn preprocess_unnamed_url<Url: UnnamedRequirementUrl>(
/// Like [`crate::parse_url`], but allows for extras to be present at the end of the URL, to comply
/// with the non-PEP 508 extensions.
///
/// When parsing, we eat characters until we see any of the following:
/// - A newline.
/// - A semicolon (marker) or hash (comment), _preceded_ by a space. We parse the URL until the last
/// non-whitespace character (inclusive).
/// - A semicolon (marker) or hash (comment) _followed_ by a space. We treat this as an error, since
/// the end of the URL is ambiguous.
///
/// URLs can include extras at the end, enclosed in square brackets.
///
/// For example:
/// - `https://download.pytorch.org/whl/torch_stable.html[dev]`
/// - `../editable[dev]`
/// - `https://download.pytorch.org/whl/torch_stable.html ; python_version > "3.8"`
/// - `https://download.pytorch.org/whl/torch_stable.html # this is a comment`
fn parse_unnamed_url<Url: UnnamedRequirementUrl>(
cursor: &mut Cursor,
working_dir: Option<&Path>,
@ -363,30 +378,44 @@ fn parse_unnamed_url<Url: UnnamedRequirementUrl>(
let (start, len) = {
let start = cursor.pos();
let mut len = 0;
let mut backslash = false;
let mut depth = 0u32;
while let Some((_, c)) = cursor.next() {
if backslash {
backslash = false;
} else if c == '\\' {
backslash = true;
} else if c == '[' {
depth = depth.saturating_add(1);
} else if c == ']' {
depth = depth.saturating_sub(1);
}
// If we see top-level whitespace, we're done.
if depth == 0 && c.is_whitespace() {
break;
}
// If we see a line break, we're done.
if matches!(c, '\r' | '\n') {
break;
}
// Track the depth of brackets.
if c == '[' {
depth = depth.saturating_add(1);
} else if c == ']' {
depth = depth.saturating_sub(1);
}
// If we see top-level whitespace, check if it's followed by a semicolon or hash. If so,
// end the URL at the last non-whitespace character.
if depth == 0 && c.is_whitespace() {
let mut cursor = cursor.clone();
cursor.eat_whitespace();
if matches!(cursor.peek_char(), None | Some(';' | '#')) {
break;
}
}
len += c.len_utf8();
// If we see a top-level semicolon or hash followed by whitespace, we're done.
if depth == 0 {
match c {
';' if cursor.peek_char().is_some_and(char::is_whitespace) => {
break;
}
'#' if cursor.peek_char().is_some_and(char::is_whitespace) => {
break;
}
_ => {}
}
}
}
(start, len)
};

View File

@ -1325,6 +1325,7 @@ mod test {
#[cfg(unix)]
#[test_case(Path::new("semicolon.txt"))]
#[test_case(Path::new("hash.txt"))]
#[tokio::test]
async fn parse_err(path: &Path) {
let working_dir = workspace_test_data_dir().join("requirements-txt");

View File

@ -158,6 +158,156 @@ RequirementsTxt {
),
hashes: [],
},
RequirementEntry {
requirement: Unnamed(
UnnamedRequirement {
url: VerbatimParsedUrl {
parsed_url: Directory(
ParsedDirectoryUrl {
url: Url {
scheme: "file",
cannot_be_a_base: false,
username: "",
password: None,
host: None,
port: None,
path: "<REQUIREMENTS_DIR>/scripts/packages/black%20editable",
query: None,
fragment: None,
},
install_path: "<REQUIREMENTS_DIR>/scripts/packages/black editable",
editable: false,
virtual: false,
},
),
verbatim: VerbatimUrl {
url: Url {
scheme: "file",
cannot_be_a_base: false,
username: "",
password: None,
host: None,
port: None,
path: "<REQUIREMENTS_DIR>/scripts/packages/black%20editable",
query: None,
fragment: None,
},
given: Some(
"./scripts/packages/black editable",
),
},
},
extras: [],
marker: true,
origin: Some(
File(
"<REQUIREMENTS_DIR>/bare-url.txt",
),
),
},
),
hashes: [],
},
RequirementEntry {
requirement: Unnamed(
UnnamedRequirement {
url: VerbatimParsedUrl {
parsed_url: Directory(
ParsedDirectoryUrl {
url: Url {
scheme: "file",
cannot_be_a_base: false,
username: "",
password: None,
host: None,
port: None,
path: "<REQUIREMENTS_DIR>/scripts/packages/black%20editable",
query: None,
fragment: None,
},
install_path: "<REQUIREMENTS_DIR>/scripts/packages/black editable",
editable: false,
virtual: false,
},
),
verbatim: VerbatimUrl {
url: Url {
scheme: "file",
cannot_be_a_base: false,
username: "",
password: None,
host: None,
port: None,
path: "<REQUIREMENTS_DIR>/scripts/packages/black%20editable",
query: None,
fragment: None,
},
given: Some(
"./scripts/packages/black editable",
),
},
},
extras: [],
marker: python_full_version >= '3.9',
origin: Some(
File(
"<REQUIREMENTS_DIR>/bare-url.txt",
),
),
},
),
hashes: [],
},
RequirementEntry {
requirement: Unnamed(
UnnamedRequirement {
url: VerbatimParsedUrl {
parsed_url: Directory(
ParsedDirectoryUrl {
url: Url {
scheme: "file",
cannot_be_a_base: false,
username: "",
password: None,
host: None,
port: None,
path: "<REQUIREMENTS_DIR>/scripts/packages/black%20editable",
query: None,
fragment: None,
},
install_path: "<REQUIREMENTS_DIR>/scripts/packages/black editable",
editable: false,
virtual: false,
},
),
verbatim: VerbatimUrl {
url: Url {
scheme: "file",
cannot_be_a_base: false,
username: "",
password: None,
host: None,
port: None,
path: "<REQUIREMENTS_DIR>/scripts/packages/black%20editable",
query: None,
fragment: None,
},
given: Some(
"./scripts/packages/black editable",
),
},
},
extras: [],
marker: true,
origin: Some(
File(
"<REQUIREMENTS_DIR>/bare-url.txt",
),
),
},
),
hashes: [],
},
],
constraints: [],
editables: [],

View File

@ -0,0 +1,19 @@
---
source: crates/requirements-txt/src/lib.rs
expression: actual
---
RequirementsTxtFileError {
file: "<REQUIREMENTS_DIR>/hash.txt",
error: Pep508 {
source: Pep508Error {
message: String(
"Missing space before '#', the end of the URL is ambiguous",
),
start: 10,
len: 1,
input: "./editable# comment",
},
start: 49,
end: 68,
},
}

View File

@ -9,7 +9,7 @@ RequirementsTxtFileError {
message: String(
"Missing space before ';', the end of the URL is ambiguous",
),
start: 11,
start: 10,
len: 1,
input: "./editable; python_version >= \"3.9\" and os_name == \"posix\"",
},

View File

@ -158,6 +158,156 @@ RequirementsTxt {
),
hashes: [],
},
RequirementEntry {
requirement: Unnamed(
UnnamedRequirement {
url: VerbatimParsedUrl {
parsed_url: Directory(
ParsedDirectoryUrl {
url: Url {
scheme: "file",
cannot_be_a_base: false,
username: "",
password: None,
host: None,
port: None,
path: "/<REQUIREMENTS_DIR>/scripts/packages/black%20editable",
query: None,
fragment: None,
},
install_path: "<REQUIREMENTS_DIR>/scripts/packages/black editable",
editable: false,
virtual: false,
},
),
verbatim: VerbatimUrl {
url: Url {
scheme: "file",
cannot_be_a_base: false,
username: "",
password: None,
host: None,
port: None,
path: "/<REQUIREMENTS_DIR>/scripts/packages/black%20editable",
query: None,
fragment: None,
},
given: Some(
"./scripts/packages/black editable",
),
},
},
extras: [],
marker: true,
origin: Some(
File(
"<REQUIREMENTS_DIR>/bare-url.txt",
),
),
},
),
hashes: [],
},
RequirementEntry {
requirement: Unnamed(
UnnamedRequirement {
url: VerbatimParsedUrl {
parsed_url: Directory(
ParsedDirectoryUrl {
url: Url {
scheme: "file",
cannot_be_a_base: false,
username: "",
password: None,
host: None,
port: None,
path: "/<REQUIREMENTS_DIR>/scripts/packages/black%20editable",
query: None,
fragment: None,
},
install_path: "<REQUIREMENTS_DIR>/scripts/packages/black editable",
editable: false,
virtual: false,
},
),
verbatim: VerbatimUrl {
url: Url {
scheme: "file",
cannot_be_a_base: false,
username: "",
password: None,
host: None,
port: None,
path: "/<REQUIREMENTS_DIR>/scripts/packages/black%20editable",
query: None,
fragment: None,
},
given: Some(
"./scripts/packages/black editable",
),
},
},
extras: [],
marker: python_full_version >= '3.9',
origin: Some(
File(
"<REQUIREMENTS_DIR>/bare-url.txt",
),
),
},
),
hashes: [],
},
RequirementEntry {
requirement: Unnamed(
UnnamedRequirement {
url: VerbatimParsedUrl {
parsed_url: Directory(
ParsedDirectoryUrl {
url: Url {
scheme: "file",
cannot_be_a_base: false,
username: "",
password: None,
host: None,
port: None,
path: "/<REQUIREMENTS_DIR>/scripts/packages/black%20editable",
query: None,
fragment: None,
},
install_path: "<REQUIREMENTS_DIR>/scripts/packages/black editable",
editable: false,
virtual: false,
},
),
verbatim: VerbatimUrl {
url: Url {
scheme: "file",
cannot_be_a_base: false,
username: "",
password: None,
host: None,
port: None,
path: "/<REQUIREMENTS_DIR>/scripts/packages/black%20editable",
query: None,
fragment: None,
},
given: Some(
"./scripts/packages/black editable",
),
},
},
extras: [],
marker: true,
origin: Some(
File(
"<REQUIREMENTS_DIR>/bare-url.txt",
),
),
},
),
hashes: [],
},
],
constraints: [],
editables: [],

View File

@ -1,3 +1,6 @@
./scripts/packages/black_editable
./scripts/packages/black_editable[dev]
file:///scripts/packages/black_editable
./scripts/packages/black editable
./scripts/packages/black editable ; python_version >= "3.9"
./scripts/packages/black editable # comment

View File

@ -0,0 +1,2 @@
# Disallowed (missing whitespace before hash)
-e ./editable# comment

View File

@ -1,11 +1,7 @@
use fs2::FileExt;
use std::fmt::Display;
use std::io;
use std::path::{Path, PathBuf};
use std::pin::Pin;
use std::task::{Context, Poll};
use tempfile::NamedTempFile;
use tokio::io::{AsyncRead, ReadBuf};
use tracing::{debug, error, info, trace, warn};
pub use crate::path::*;
@ -393,27 +389,32 @@ impl Drop for LockedFile {
}
/// An asynchronous reader that reports progress as bytes are read.
pub struct ProgressReader<Reader: AsyncRead + Unpin, Callback: Fn(usize) + Unpin> {
#[cfg(feature = "tokio")]
pub struct ProgressReader<Reader: tokio::io::AsyncRead + Unpin, Callback: Fn(usize) + Unpin> {
reader: Reader,
callback: Callback,
}
impl<Reader: AsyncRead + Unpin, Callback: Fn(usize) + Unpin> ProgressReader<Reader, Callback> {
#[cfg(feature = "tokio")]
impl<Reader: tokio::io::AsyncRead + Unpin, Callback: Fn(usize) + Unpin>
ProgressReader<Reader, Callback>
{
/// Create a new [`ProgressReader`] that wraps another reader.
pub fn new(reader: Reader, callback: Callback) -> Self {
Self { reader, callback }
}
}
impl<Reader: AsyncRead + Unpin, Callback: Fn(usize) + Unpin> AsyncRead
#[cfg(feature = "tokio")]
impl<Reader: tokio::io::AsyncRead + Unpin, Callback: Fn(usize) + Unpin> tokio::io::AsyncRead
for ProgressReader<Reader, Callback>
{
fn poll_read(
mut self: Pin<&mut Self>,
cx: &mut Context<'_>,
buf: &mut ReadBuf<'_>,
) -> Poll<io::Result<()>> {
Pin::new(&mut self.as_mut().reader)
mut self: std::pin::Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
buf: &mut tokio::io::ReadBuf<'_>,
) -> std::task::Poll<std::io::Result<()>> {
std::pin::Pin::new(&mut self.as_mut().reader)
.poll_read(cx, buf)
.map_ok(|()| {
(self.callback)(buf.filled().len());