mirror of https://github.com/astral-sh/uv
uv-fs: transparently support reading UTF-16 files
uv-fs: transparently support reading UTF-16 files

This PR tweaks uv to support reading `requirements.txt` regardless of whether it is encoded as UTF-8 or UTF-16. This is particularly relevant on Windows, where `uv pip freeze > requirements.txt` will likely write a UTF-16 encoded `requirements.txt` file.

There is some discussion on #1666 where it's suggested that perhaps we should explicitly not support this. I didn't see that until I had already put this PR together, but even so, I think it's worth considering. UTF-16 is predominant on Windows, it is very easy to produce a UTF-16 encoded file, and there is an easy and well specified way to recognize and transcode UTF-16 encoded data to UTF-8. The downside is that it could encourage the use of UTF-16 encoded `requirements.txt` files *in addition* to UTF-8 encoded files, and it would probably be nice to converge and standardize on one encoding.

One possible alternative to this PR is to provide a better error message. Another alternative is to ensure that a `-o/--output` flag exists for all commands (neither `uv pip freeze` nor `pip freeze` has such a flag) so that users can always write output to a file without relying on their environment's piping behavior. (Although this last alternative seems a little sad to me.)

It's also worth noting that [PEP-0508] doesn't seem to mention file encoding at all, so from a "do the standards allow this" perspective, this change is OK. Finally, `pip` itself seems to work with UTF-16 encoded `requirements.txt` files.

Overall, I lean towards supporting UTF-16 for `requirements.txt` files: it smooths out the UX a little, there is no obvious specification (that I'm aware of) that mandates these files be UTF-8, and `pip` supports it too.

Fixes #1666, Fixes #2276

[PEP-0508]: https://peps.python.org/pep-0508/
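The "well specified way" mentioned above is BOM sniffing: UTF-16 files produced by Windows tooling almost always start with a byte order mark (`FF FE` for little-endian, as PowerShell redirection typically writes, or `FE FF` for big-endian). The sketch below illustrates the idea in plain Rust; the function name is hypothetical and not part of this PR, and the actual change delegates the work to the `encoding_rs_io` crate, as the diff shows.

// Illustrative sketch only (not part of this diff): recognize UTF-16 input by
// its byte order mark and convert it to a UTF-8 `String` using only std.
// The real implementation below uses `encoding_rs_io::DecodeReaderBytes`.
fn decode_utf8_or_utf16(raw: &[u8]) -> Result<String, std::string::FromUtf16Error> {
    // Collect 2-byte code units with the given endianness and decode them.
    let decode_utf16 = |bytes: &[u8], read: fn([u8; 2]) -> u16| {
        let units: Vec<u16> = bytes
            .chunks_exact(2)
            .map(|pair| read([pair[0], pair[1]]))
            .collect();
        String::from_utf16(&units)
    };
    match raw {
        // UTF-16LE BOM: FF FE.
        [0xFF, 0xFE, rest @ ..] => decode_utf16(rest, u16::from_le_bytes),
        // UTF-16BE BOM: FE FF.
        [0xFE, 0xFF, rest @ ..] => decode_utf16(rest, u16::from_be_bytes),
        // No BOM: assume UTF-8 (decoded lossily here to keep the sketch short).
        _ => Ok(String::from_utf8_lossy(raw).into_owned()),
    }
}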
This commit is contained in:
parent ef806dcb6e
commit b3b5afaf78

@@ -972,6 +972,15 @@ dependencies = [
  "cfg-if",
 ]
 
+[[package]]
+name = "encoding_rs_io"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83"
+dependencies = [
+ "encoding_rs",
+]
+
 [[package]]
 name = "equivalent"
 version = "1.0.1"

@@ -4186,6 +4195,7 @@ dependencies = [
  "assert_cmd",
  "assert_fs",
  "base64 0.21.7",
+ "byteorder",
  "chrono",
  "clap",
  "clap_complete_command",

@@ -4499,10 +4509,12 @@ name = "uv-fs"
 version = "0.0.1"
 dependencies = [
  "dunce",
+ "encoding_rs_io",
  "fs-err",
  "fs2",
  "junction",
  "tempfile",
+ "tokio",
  "tracing",
  "urlencoding",
  "uv-warnings",

@@ -41,6 +41,7 @@ derivative = { version = "2.2.0" }
 directories = { version = "5.0.1" }
 dunce = { version = "1.0.4" }
 either = { version = "1.9.0" }
+encoding_rs_io = { version = "0.1.7" }
 flate2 = { version = "1.0.28", default-features = false }
 fs-err = { version = "2.11.0" }
 fs2 = { version = "0.4.3" }

@@ -359,7 +359,7 @@ impl RequirementsTxt {
                read_url_to_string(&requirements_txt, client).await
            }
        } else {
-           uv_fs::read_to_string(&requirements_txt)
+           uv_fs::read_to_string_transcode(&requirements_txt)
                .await
                .map_err(RequirementsTxtParserError::IO)
        }

@@ -16,13 +16,15 @@ workspace = true
 uv-warnings = { path = "../uv-warnings" }
 
 dunce = { workspace = true }
+encoding_rs_io = { workspace = true }
 fs-err = { workspace = true }
 fs2 = { workspace = true }
 junction = { workspace = true }
 tempfile = { workspace = true }
+tokio = { workspace = true, optional = true }
 tracing = { workspace = true }
 urlencoding = { workspace = true }
 
 [features]
 default = []
-tokio = ["fs-err/tokio"]
+tokio = ["fs-err/tokio", "dep:tokio"]

@@ -12,7 +12,7 @@ pub use crate::path::*;
 
 mod path;
 
-/// Reads the contents of the file path into memory as a `String`.
+/// Reads data from the path and requires that it be valid UTF-8.
 ///
 /// If the file path is `-`, then contents are read from stdin instead.
 #[cfg(feature = "tokio")]

@@ -29,6 +29,39 @@ pub async fn read_to_string(path: impl AsRef<Path>) -> std::io::Result<String> {
     }
 }
 
+/// Reads data from the path and requires that it be valid UTF-8 or UTF-16.
+///
+/// This uses BOM sniffing to determine if the data should be transcoded
+/// from UTF-16 to Rust's `String` type (which uses UTF-8).
+///
+/// This should generally only be used when one specifically wants to support
+/// reading UTF-16 transparently.
+///
+/// If the file path is `-`, then contents are read from stdin instead.
+#[cfg(feature = "tokio")]
+pub async fn read_to_string_transcode(path: impl AsRef<Path>) -> std::io::Result<String> {
+    use std::io::Read;
+
+    use encoding_rs_io::DecodeReaderBytes;
+
+    let path = path.as_ref();
+    let raw = if path == Path::new("-") {
+        let mut buf = Vec::with_capacity(1024);
+        std::io::stdin().read_to_end(&mut buf)?;
+        buf
+    } else {
+        fs_err::tokio::read(path).await?
+    };
+    let mut buf = String::with_capacity(1024);
+    DecodeReaderBytes::new(&*raw)
+        .read_to_string(&mut buf)
+        .map_err(|err| {
+            let path = path.display();
+            std::io::Error::other(format!("failed to decode file {path}: {err}"))
+        })?;
+    Ok(buf)
+}
+
 /// Create a symlink from `src` to `dst`, replacing any existing symlink.
 ///
 /// On Windows, this uses the `junction` crate to create a junction point.

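For context, a minimal caller sketch: with the `tokio` feature of `uv-fs` enabled, the new helper is a drop-in replacement for `read_to_string` wherever UTF-16 input should be tolerated, which is exactly how the `requirements-txt` call site above uses it. The wrapper below is hypothetical and not part of the diff.

// Hypothetical wrapper, for illustration only; `load_requirements` does not
// exist in uv. UTF-8 input passes through unchanged, UTF-16 is transcoded.
async fn load_requirements(path: &std::path::Path) -> std::io::Result<String> {
    uv_fs::read_to_string_transcode(path).await
}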
@@ -82,6 +82,7 @@ tikv-jemallocator = { version = "0.5.4" }
 [dev-dependencies]
 assert_cmd = { version = "2.0.14" }
 assert_fs = { version = "1.1.0" }
+byteorder = { version = "1.5.0" }
 filetime = { version = "0.2.23" }
 indoc = { version = "2.0.4" }
 insta = { version = "1.36.1", features = ["filters", "json"] }

@@ -2419,3 +2419,77 @@ fn no_build_isolation() -> Result<()> {
 
     Ok(())
 }
+
+/// This tests that `uv` can read UTF-16LE encoded requirements.txt files.
+///
+/// Ref: <https://github.com/astral-sh/uv/issues/2276>
+#[test]
+fn install_utf16le_requirements() -> Result<()> {
+    let context = TestContext::new("3.12");
+    let requirements_txt = context.temp_dir.child("requirements.txt");
+    requirements_txt.touch()?;
+    requirements_txt.write_binary(&utf8_to_utf16_with_bom_le("tomli"))?;
+
+    uv_snapshot!(command_without_exclude_newer(&context)
+        .arg("-r")
+        .arg("requirements.txt"), @r###"
+    success: true
+    exit_code: 0
+    ----- stdout -----
+
+    ----- stderr -----
+    Resolved 1 package in [TIME]
+    Downloaded 1 package in [TIME]
+    Installed 1 package in [TIME]
+     + tomli==2.0.1
+    "###
+    );
+    Ok(())
+}
+
+/// This tests that `uv` can read UTF-16BE encoded requirements.txt files.
+///
+/// Ref: <https://github.com/astral-sh/uv/issues/2276>
+#[test]
+fn install_utf16be_requirements() -> Result<()> {
+    let context = TestContext::new("3.12");
+    let requirements_txt = context.temp_dir.child("requirements.txt");
+    requirements_txt.touch()?;
+    requirements_txt.write_binary(&utf8_to_utf16_with_bom_be("tomli"))?;
+
+    uv_snapshot!(command_without_exclude_newer(&context)
+        .arg("-r")
+        .arg("requirements.txt"), @r###"
+    success: true
+    exit_code: 0
+    ----- stdout -----
+
+    ----- stderr -----
+    Resolved 1 package in [TIME]
+    Downloaded 1 package in [TIME]
+    Installed 1 package in [TIME]
+     + tomli==2.0.1
+    "###
+    );
+    Ok(())
+}
+
+fn utf8_to_utf16_with_bom_le(s: &str) -> Vec<u8> {
+    use byteorder::ByteOrder;
+
+    let mut u16s = vec![0xFEFF];
+    u16s.extend(s.encode_utf16());
+    let mut u8s = vec![0; u16s.len() * 2];
+    byteorder::LittleEndian::write_u16_into(&u16s, &mut u8s);
+    u8s
+}
+
+fn utf8_to_utf16_with_bom_be(s: &str) -> Vec<u8> {
+    use byteorder::ByteOrder;
+
+    let mut u16s = vec![0xFEFF];
+    u16s.extend(s.encode_utf16());
+    let mut u8s = vec![0; u16s.len() * 2];
+    byteorder::BigEndian::write_u16_into(&u16s, &mut u8s);
+    u8s
+}