diff --git a/Cargo.lock b/Cargo.lock index e1acf6f0f..93fde529a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -972,6 +972,15 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "encoding_rs_io" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83" +dependencies = [ + "encoding_rs", +] + [[package]] name = "equivalent" version = "1.0.1" @@ -4186,6 +4195,7 @@ dependencies = [ "assert_cmd", "assert_fs", "base64 0.21.7", + "byteorder", "chrono", "clap", "clap_complete_command", @@ -4499,10 +4509,12 @@ name = "uv-fs" version = "0.0.1" dependencies = [ "dunce", + "encoding_rs_io", "fs-err", "fs2", "junction", "tempfile", + "tokio", "tracing", "urlencoding", "uv-warnings", diff --git a/Cargo.toml b/Cargo.toml index 0dc4be9bb..18d978e63 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,6 +41,7 @@ derivative = { version = "2.2.0" } directories = { version = "5.0.1" } dunce = { version = "1.0.4" } either = { version = "1.9.0" } +encoding_rs_io = { version = "0.1.7" } flate2 = { version = "1.0.28", default-features = false } fs-err = { version = "2.11.0" } fs2 = { version = "0.4.3" } diff --git a/crates/requirements-txt/src/lib.rs b/crates/requirements-txt/src/lib.rs index 9efd56522..ca3d0935d 100644 --- a/crates/requirements-txt/src/lib.rs +++ b/crates/requirements-txt/src/lib.rs @@ -359,7 +359,7 @@ impl RequirementsTxt { read_url_to_string(&requirements_txt, client).await } } else { - uv_fs::read_to_string(&requirements_txt) + uv_fs::read_to_string_transcode(&requirements_txt) .await .map_err(RequirementsTxtParserError::IO) } diff --git a/crates/uv-fs/Cargo.toml b/crates/uv-fs/Cargo.toml index 5af8bd625..e76d99805 100644 --- a/crates/uv-fs/Cargo.toml +++ b/crates/uv-fs/Cargo.toml @@ -16,13 +16,15 @@ workspace = true uv-warnings = { path = "../uv-warnings" } dunce = { workspace = true } +encoding_rs_io = { workspace = true } fs-err = { workspace = true } fs2 = { workspace = true } junction = { workspace = true } tempfile = { workspace = true } +tokio = { workspace = true, optional = true } tracing = { workspace = true } urlencoding = { workspace = true } [features] default = [] -tokio = ["fs-err/tokio"] +tokio = ["fs-err/tokio", "dep:tokio"] diff --git a/crates/uv-fs/src/lib.rs b/crates/uv-fs/src/lib.rs index 72e75c282..195286219 100644 --- a/crates/uv-fs/src/lib.rs +++ b/crates/uv-fs/src/lib.rs @@ -12,7 +12,7 @@ pub use crate::path::*; mod path; -/// Reads the contents of the file path into memory as a `String`. +/// Reads data from the path and requires that it be valid UTF-8. /// /// If the file path is `-`, then contents are read from stdin instead. #[cfg(feature = "tokio")] @@ -29,6 +29,39 @@ pub async fn read_to_string(path: impl AsRef) -> std::io::Result { } } +/// Reads data from the path and requires that it be valid UTF-8 or UTF-16. +/// +/// This uses BOM sniffing to determine if the data should be transcoded +/// from UTF-16 to Rust's `String` type (which uses UTF-8). +/// +/// This should generally only be used when one specifically wants to support +/// reading UTF-16 transparently. +/// +/// If the file path is `-`, then contents are read from stdin instead. +#[cfg(feature = "tokio")] +pub async fn read_to_string_transcode(path: impl AsRef) -> std::io::Result { + use std::io::Read; + + use encoding_rs_io::DecodeReaderBytes; + + let path = path.as_ref(); + let raw = if path == Path::new("-") { + let mut buf = Vec::with_capacity(1024); + std::io::stdin().read_to_end(&mut buf)?; + buf + } else { + fs_err::tokio::read(path).await? + }; + let mut buf = String::with_capacity(1024); + DecodeReaderBytes::new(&*raw) + .read_to_string(&mut buf) + .map_err(|err| { + let path = path.display(); + std::io::Error::other(format!("failed to decode file {path}: {err}")) + })?; + Ok(buf) +} + /// Create a symlink from `src` to `dst`, replacing any existing symlink. /// /// On Windows, this uses the `junction` crate to create a junction point. diff --git a/crates/uv/Cargo.toml b/crates/uv/Cargo.toml index a043f8aab..c37c28799 100644 --- a/crates/uv/Cargo.toml +++ b/crates/uv/Cargo.toml @@ -82,6 +82,7 @@ tikv-jemallocator = { version = "0.5.4" } [dev-dependencies] assert_cmd = { version = "2.0.14" } assert_fs = { version = "1.1.0" } +byteorder = { version = "1.5.0" } filetime = { version = "0.2.23" } indoc = { version = "2.0.4" } insta = { version = "1.36.1", features = ["filters", "json"] } diff --git a/crates/uv/tests/pip_install.rs b/crates/uv/tests/pip_install.rs index cc6b45852..d58a89aca 100644 --- a/crates/uv/tests/pip_install.rs +++ b/crates/uv/tests/pip_install.rs @@ -2419,3 +2419,77 @@ fn no_build_isolation() -> Result<()> { Ok(()) } + +/// This tests that `uv` can read UTF-16LE encoded requirements.txt files. +/// +/// Ref: +#[test] +fn install_utf16le_requirements() -> Result<()> { + let context = TestContext::new("3.12"); + let requirements_txt = context.temp_dir.child("requirements.txt"); + requirements_txt.touch()?; + requirements_txt.write_binary(&utf8_to_utf16_with_bom_le("tomli"))?; + + uv_snapshot!(command_without_exclude_newer(&context) + .arg("-r") + .arg("requirements.txt"), @r###" + success: true + exit_code: 0 + ----- stdout ----- + + ----- stderr ----- + Resolved 1 package in [TIME] + Downloaded 1 package in [TIME] + Installed 1 package in [TIME] + + tomli==2.0.1 + "### + ); + Ok(()) +} + +/// This tests that `uv` can read UTF-16BE encoded requirements.txt files. +/// +/// Ref: +#[test] +fn install_utf16be_requirements() -> Result<()> { + let context = TestContext::new("3.12"); + let requirements_txt = context.temp_dir.child("requirements.txt"); + requirements_txt.touch()?; + requirements_txt.write_binary(&utf8_to_utf16_with_bom_be("tomli"))?; + + uv_snapshot!(command_without_exclude_newer(&context) + .arg("-r") + .arg("requirements.txt"), @r###" + success: true + exit_code: 0 + ----- stdout ----- + + ----- stderr ----- + Resolved 1 package in [TIME] + Downloaded 1 package in [TIME] + Installed 1 package in [TIME] + + tomli==2.0.1 + "### + ); + Ok(()) +} + +fn utf8_to_utf16_with_bom_le(s: &str) -> Vec { + use byteorder::ByteOrder; + + let mut u16s = vec![0xFEFF]; + u16s.extend(s.encode_utf16()); + let mut u8s = vec![0; u16s.len() * 2]; + byteorder::LittleEndian::write_u16_into(&u16s, &mut u8s); + u8s +} + +fn utf8_to_utf16_with_bom_be(s: &str) -> Vec { + use byteorder::ByteOrder; + + let mut u16s = vec![0xFEFF]; + u16s.extend(s.encode_utf16()); + let mut u8s = vec![0; u16s.len() * 2]; + byteorder::BigEndian::write_u16_into(&u16s, &mut u8s); + u8s +}