uv-fs: transparently support reading UTF-16 files

This PR tweaks uv to support reading `requirements.txt` regardless of
whether it is encoded as UTF-8 or UTF-16. This is particularly relevant
on Windows where `uv pip freeze > requirements.txt` will likely write a
UTF-16 encoded `requirements.txt` file.

There is some discussion on #1666 where it's suggested that perhaps
we should explicitly not support this. I didn't see that until I
had already put this PR together, but even so, I think it's worth
considering this. UTF-16 is predominant on Windows. It is very easy
to produce a UTF-16 encoded file. Moreover, there is an easy and well
specified way to recognize and transcode UTF-16 encoded data to UTF-8.

I think the downside of this is that it could encourage the use of UTF-16
encoded `requirements.txt` files *in addition* to UTF-8 encoded
files, and it would probably be nice to converge and standardize on
one encoding. One possible alternative to this PR is that we provide
a better error message. Another alternative is to ensure that a
`-o/--output` flag exists for all commands (neither `uv pip freeze` nor
`pip freeze` have such a flag) so that users can always write output
to a file without relying on their environment's piping behavior.
(Although this last alternative seems a little sad to me.)

It's also worth noting that [PEP-0508] doesn't seem to mention file
encoding at all. So I think from a "do the standards allow this"
perspective, this change is OK.

Finally, `pip` itself seems to work with UTF-16 encoded
`requirements.txt` files.

I think I personally overall lean towards supporting UTF-16 for
`requirements.txt` files. In part because I think it smooths out the
UX a little bit, in part because there is no obvious specification
(that I'm aware of) that mandates that these files are UTF-8, and
finally in part because `pip` supports it too.

Fixes #1666, Fixes #2276

[PEP-0508]: https://peps.python.org/pep-0508/
This commit is contained in:
Andrew Gallant 2024-03-07 12:35:54 -05:00 committed by Andrew Gallant
parent ef806dcb6e
commit b3b5afaf78
7 changed files with 126 additions and 3 deletions

12
Cargo.lock generated
View File

@ -972,6 +972,15 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "encoding_rs_io"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83"
dependencies = [
"encoding_rs",
]
[[package]]
name = "equivalent"
version = "1.0.1"
@ -4186,6 +4195,7 @@ dependencies = [
"assert_cmd",
"assert_fs",
"base64 0.21.7",
"byteorder",
"chrono",
"clap",
"clap_complete_command",
@ -4499,10 +4509,12 @@ name = "uv-fs"
version = "0.0.1"
dependencies = [
"dunce",
"encoding_rs_io",
"fs-err",
"fs2",
"junction",
"tempfile",
"tokio",
"tracing",
"urlencoding",
"uv-warnings",

View File

@ -41,6 +41,7 @@ derivative = { version = "2.2.0" }
directories = { version = "5.0.1" }
dunce = { version = "1.0.4" }
either = { version = "1.9.0" }
encoding_rs_io = { version = "0.1.7" }
flate2 = { version = "1.0.28", default-features = false }
fs-err = { version = "2.11.0" }
fs2 = { version = "0.4.3" }

View File

@ -359,7 +359,7 @@ impl RequirementsTxt {
read_url_to_string(&requirements_txt, client).await
}
} else {
uv_fs::read_to_string(&requirements_txt)
uv_fs::read_to_string_transcode(&requirements_txt)
.await
.map_err(RequirementsTxtParserError::IO)
}

View File

@ -16,13 +16,15 @@ workspace = true
uv-warnings = { path = "../uv-warnings" }
dunce = { workspace = true }
encoding_rs_io = { workspace = true }
fs-err = { workspace = true }
fs2 = { workspace = true }
junction = { workspace = true }
tempfile = { workspace = true }
tokio = { workspace = true, optional = true }
tracing = { workspace = true }
urlencoding = { workspace = true }
[features]
default = []
tokio = ["fs-err/tokio"]
tokio = ["fs-err/tokio", "dep:tokio"]

View File

@ -12,7 +12,7 @@ pub use crate::path::*;
mod path;
/// Reads the contents of the file path into memory as a `String`.
/// Reads data from the path and requires that it be valid UTF-8.
///
/// If the file path is `-`, then contents are read from stdin instead.
#[cfg(feature = "tokio")]
@ -29,6 +29,39 @@ pub async fn read_to_string(path: impl AsRef<Path>) -> std::io::Result<String> {
}
}
/// Loads everything at `path` into a `String`, accepting UTF-8 or UTF-16 input.
///
/// The raw bytes are handed to a BOM-sniffing decoder: when a UTF-16
/// byte-order mark is present the data is transcoded to UTF-8 (the encoding
/// of Rust's `String`); otherwise the bytes must already be valid UTF-8.
///
/// Prefer the plain UTF-8 reader unless transparent UTF-16 support is
/// specifically wanted.
///
/// A path of `-` means "read from stdin" rather than from a file.
///
/// # Errors
///
/// Returns an error if the underlying read fails, or an `io::Error` whose
/// message names the path if the bytes cannot be decoded.
#[cfg(feature = "tokio")]
pub async fn read_to_string_transcode(path: impl AsRef<Path>) -> std::io::Result<String> {
    use encoding_rs_io::DecodeReaderBytes;
    use std::io::Read;

    let path = path.as_ref();

    // Gather the undecoded bytes first, from a real file or from stdin.
    let bytes = if path != Path::new("-") {
        fs_err::tokio::read(path).await?
    } else {
        let mut stdin_bytes = Vec::with_capacity(1024);
        std::io::stdin().read_to_end(&mut stdin_bytes)?;
        stdin_bytes
    };

    // Decode in one pass; the decoder sniffs the BOM for us.
    let mut decoded = String::with_capacity(1024);
    match DecodeReaderBytes::new(bytes.as_slice()).read_to_string(&mut decoded) {
        Ok(_) => Ok(decoded),
        Err(err) => Err(std::io::Error::other(format!(
            "failed to decode file {}: {err}",
            path.display()
        ))),
    }
}
/// Create a symlink from `src` to `dst`, replacing any existing symlink.
///
/// On Windows, this uses the `junction` crate to create a junction point.

View File

@ -82,6 +82,7 @@ tikv-jemallocator = { version = "0.5.4" }
[dev-dependencies]
assert_cmd = { version = "2.0.14" }
assert_fs = { version = "1.1.0" }
byteorder = { version = "1.5.0" }
filetime = { version = "0.2.23" }
indoc = { version = "2.0.4" }
insta = { version = "1.36.1", features = ["filters", "json"] }

View File

@ -2419,3 +2419,77 @@ fn no_build_isolation() -> Result<()> {
Ok(())
}
/// Verifies that a `requirements.txt` encoded as little-endian UTF-16 (with
/// a BOM) is read and installed successfully by `uv`.
///
/// Ref: <https://github.com/astral-sh/uv/issues/2276>
#[test]
fn install_utf16le_requirements() -> Result<()> {
    let ctx = TestContext::new("3.12");

    // Write the single requirement as BOM-prefixed UTF-16LE bytes.
    let encoded = utf8_to_utf16_with_bom_le("tomli");
    let reqs_file = ctx.temp_dir.child("requirements.txt");
    reqs_file.touch()?;
    reqs_file.write_binary(&encoded)?;

    uv_snapshot!(command_without_exclude_newer(&ctx)
        .args(["-r", "requirements.txt"]), @r###"
success: true
exit_code: 0
----- stdout -----
----- stderr -----
Resolved 1 package in [TIME]
Downloaded 1 package in [TIME]
Installed 1 package in [TIME]
+ tomli==2.0.1
"###
    );

    Ok(())
}
/// Verifies that a `requirements.txt` encoded as big-endian UTF-16 (with a
/// BOM) is read and installed successfully by `uv`.
///
/// Ref: <https://github.com/astral-sh/uv/issues/2276>
#[test]
fn install_utf16be_requirements() -> Result<()> {
    let ctx = TestContext::new("3.12");

    // Write the single requirement as BOM-prefixed UTF-16BE bytes.
    let encoded = utf8_to_utf16_with_bom_be("tomli");
    let reqs_file = ctx.temp_dir.child("requirements.txt");
    reqs_file.touch()?;
    reqs_file.write_binary(&encoded)?;

    uv_snapshot!(command_without_exclude_newer(&ctx)
        .args(["-r", "requirements.txt"]), @r###"
success: true
exit_code: 0
----- stdout -----
----- stderr -----
Resolved 1 package in [TIME]
Downloaded 1 package in [TIME]
Installed 1 package in [TIME]
+ tomli==2.0.1
"###
    );

    Ok(())
}
/// Encodes `s` as UTF-16 little-endian bytes, preceded by a byte-order mark.
///
/// The output always starts with the BOM `U+FEFF` serialized as `FF FE`,
/// followed by each UTF-16 code unit of `s` (surrogate pairs included) in
/// little-endian byte order. An empty `s` yields just the two BOM bytes.
fn utf8_to_utf16_with_bom_le(s: &str) -> Vec<u8> {
    // The standard library serializes each code unit directly, so no
    // third-party byte-order helper or zero-filled scratch buffer is needed.
    std::iter::once(0xFEFF_u16)
        .chain(s.encode_utf16())
        .flat_map(u16::to_le_bytes)
        .collect()
}
/// Encodes `s` as UTF-16 big-endian bytes, preceded by a byte-order mark.
///
/// The output always starts with the BOM `U+FEFF` serialized as `FE FF`,
/// followed by each UTF-16 code unit of `s` (surrogate pairs included) in
/// big-endian byte order. An empty `s` yields just the two BOM bytes.
fn utf8_to_utf16_with_bom_be(s: &str) -> Vec<u8> {
    // The standard library serializes each code unit directly, so no
    // third-party byte-order helper or zero-filled scratch buffer is needed.
    std::iter::once(0xFEFF_u16)
        .chain(s.encode_utf16())
        .flat_map(u16::to_be_bytes)
        .collect()
}