Apply percent-decoding to filepaths in HTML find-links (#1544)

## Summary

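Percent-decode the filename taken from each find-links `href` (for example, `%2B` becomes `+`) and report an `UnsupportedFilename` error when the decoded filename is not valid UTF-8. The file's URL is left percent-encoded.

Closes https://github.com/astral-sh/uv/issues/1542.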
This commit is contained in:
Charlie Marsh 2024-02-16 16:47:04 -05:00 committed by GitHub
parent 3aa7a6b796
commit 4f216f3a74
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 61 additions and 1 deletions

1
Cargo.lock generated
View File

@ -4267,6 +4267,7 @@ dependencies = [
"tokio-util", "tokio-util",
"tracing", "tracing",
"url", "url",
"urlencoding",
"uv-cache", "uv-cache",
"uv-fs", "uv-fs",
"uv-normalize", "uv-normalize",

View File

@ -41,6 +41,7 @@ tokio = { workspace = true, features = ["fs"] }
tokio-util = { workspace = true } tokio-util = { workspace = true }
tracing = { workspace = true } tracing = { workspace = true }
url = { workspace = true } url = { workspace = true }
urlencoding = { workspace = true }
[dev-dependencies] [dev-dependencies]
anyhow = { workspace = true } anyhow = { workspace = true }

View File

@ -120,7 +120,7 @@ impl SimpleHtml {
}, },
) )
} else { } else {
(href, Hashes::default()) (decoded.as_ref(), Hashes::default())
}; };
// Extract the filename from the body text, which MUST match that of // Extract the filename from the body text, which MUST match that of
@ -130,6 +130,10 @@ impl SimpleHtml {
.last() .last()
.ok_or_else(|| Error::MissingFilename(href.to_string()))?; .ok_or_else(|| Error::MissingFilename(href.to_string()))?;
// Unquote the filename.
let filename = urlencoding::decode(filename)
.map_err(|_| Error::UnsupportedFilename(filename.to_string()))?;
// Extract the `requires-python` field, which should be set on the // Extract the `requires-python` field, which should be set on the
// `data-requires-python` attribute. // `data-requires-python` attribute.
let requires_python = if let Some(requires_python) = let requires_python = if let Some(requires_python) =
@ -198,6 +202,9 @@ pub enum Error {
#[error("Expected distribution filename as last path component of URL: {0}")] #[error("Expected distribution filename as last path component of URL: {0}")]
MissingFilename(String), MissingFilename(String),
#[error("Expected distribution filename to be UTF-8: {0}")]
UnsupportedFilename(String),
#[error("Missing hash attribute on URL: {0}")] #[error("Missing hash attribute on URL: {0}")]
MissingHash(String), MissingHash(String),
@ -377,6 +384,57 @@ mod tests {
"###); "###);
} }
/// Checks that a link containing a percent-encoded character (`%2B`) parses
/// into a `File` whose `filename` is decoded (`+`) while its `url` keeps the
/// encoding exactly as written in the `href`.
#[test]
fn parse_quoted_filepath() {
// Fixture: both the `href` and the anchor text carry the encoded `%2B`, and
// the path has a `cpu/` directory in front of the wheel name.
// NOTE(review): the `<h1>` says `jinja2` although the wheel is `torchtext`;
// this looks like a copied fixture. Confirm the mismatch is intentional.
let text = r#"
<!DOCTYPE html>
<html>
<body>
<h1>Links for jinja2</h1>
<a href="cpu/torchtext-0.17.0%2Bcpu-cp39-cp39-win_amd64.whl">cpu/torchtext-0.17.0%2Bcpu-cp39-cp39-win_amd64.whl</a><br/>
</body>
</html>
<!--TIMESTAMP 1703347410-->
"#;
// Base URL handed to the parser; it is echoed back as `base` in the snapshot.
let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap();
// Parsing must succeed; an `Err` here fails the test via `unwrap`.
let result = SimpleHtml::parse(text, &base).unwrap();
// Expected output: `filename` is the last path component (no `cpu/` prefix)
// with `%2B` decoded to `+`, and `url` matches the `href` text unchanged.
insta::assert_debug_snapshot!(result, @r###"
SimpleHtml {
base: BaseUrl(
Url {
scheme: "https",
cannot_be_a_base: false,
username: "",
password: None,
host: Some(
Domain(
"download.pytorch.org",
),
),
port: None,
path: "/whl/jinja2/",
query: None,
fragment: None,
},
),
files: [
File {
dist_info_metadata: None,
filename: "torchtext-0.17.0+cpu-cp39-cp39-win_amd64.whl",
hashes: Hashes {
sha256: None,
},
requires_python: None,
size: None,
upload_time: None,
url: "cpu/torchtext-0.17.0%2Bcpu-cp39-cp39-win_amd64.whl",
yanked: None,
},
],
}
"###);
}
#[test] #[test]
fn parse_missing_hash() { fn parse_missing_hash() {
let text = r#" let text = r#"