mirror of https://github.com/astral-sh/uv
uv-fs: transparently support reading UTF-16 files
This PR tweaks uv to support reading `requirements.txt` regardless of whether it is encoded as UTF-8 or UTF-16. This is particularly relevant on Windows where `uv pip freeze > requirements.txt` will likely write a UTF-16 encoded `requirements.txt` file. There is some discussion on #1666 where it's suggested that perhaps we should explicitly not support this. I didn't see that until I had already put this PR together, but even so, I think it's worth considering this. UTF-16 is predominant on Windows. It is very easy to produce a UTF-16 encoded file. Moreover, there is an easy and well-specified way to recognize and transcode UTF-16 encoded data to UTF-8. I think the downside of this is that it could encourage the use of UTF-16 encoded `requirements.txt` files *in addition* to UTF-8 encoded files, and it would probably be nice to converge and standardize on one encoding. One possible alternative to this PR is that we provide a better error message. Another alternative is to ensure that a `-o/--output` flag exists for all commands (neither `uv pip freeze` nor `pip freeze` has such a flag) so that users can always write output to a file without relying on their environment's piping behavior. (Although this last alternative seems a little sad to me.) It's also worth noting that [PEP-0508] doesn't seem to mention file encoding at all. So I think from a "do the standards allow this" perspective, this change is OK. Finally, `pip` itself seems to work with UTF-16 encoded `requirements.txt` files. I think I personally overall lean towards supporting UTF-16 for `requirements.txt` files. In part because I think it smooths out the UX a little bit, in part because there is no obvious specification (that I'm aware of) that mandates that these files are UTF-8, and finally in part because `pip` supports it too. Fixes #1666, Fixes #2276 [PEP-0508]: https://peps.python.org/pep-0508/
This commit is contained in:
parent
ef806dcb6e
commit
b3b5afaf78
|
|
@ -972,6 +972,15 @@ dependencies = [
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "encoding_rs_io"
|
||||||
|
version = "0.1.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83"
|
||||||
|
dependencies = [
|
||||||
|
"encoding_rs",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "equivalent"
|
name = "equivalent"
|
||||||
version = "1.0.1"
|
version = "1.0.1"
|
||||||
|
|
@ -4186,6 +4195,7 @@ dependencies = [
|
||||||
"assert_cmd",
|
"assert_cmd",
|
||||||
"assert_fs",
|
"assert_fs",
|
||||||
"base64 0.21.7",
|
"base64 0.21.7",
|
||||||
|
"byteorder",
|
||||||
"chrono",
|
"chrono",
|
||||||
"clap",
|
"clap",
|
||||||
"clap_complete_command",
|
"clap_complete_command",
|
||||||
|
|
@ -4499,10 +4509,12 @@ name = "uv-fs"
|
||||||
version = "0.0.1"
|
version = "0.0.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"dunce",
|
"dunce",
|
||||||
|
"encoding_rs_io",
|
||||||
"fs-err",
|
"fs-err",
|
||||||
"fs2",
|
"fs2",
|
||||||
"junction",
|
"junction",
|
||||||
"tempfile",
|
"tempfile",
|
||||||
|
"tokio",
|
||||||
"tracing",
|
"tracing",
|
||||||
"urlencoding",
|
"urlencoding",
|
||||||
"uv-warnings",
|
"uv-warnings",
|
||||||
|
|
|
||||||
|
|
@ -41,6 +41,7 @@ derivative = { version = "2.2.0" }
|
||||||
directories = { version = "5.0.1" }
|
directories = { version = "5.0.1" }
|
||||||
dunce = { version = "1.0.4" }
|
dunce = { version = "1.0.4" }
|
||||||
either = { version = "1.9.0" }
|
either = { version = "1.9.0" }
|
||||||
|
encoding_rs_io = { version = "0.1.7" }
|
||||||
flate2 = { version = "1.0.28", default-features = false }
|
flate2 = { version = "1.0.28", default-features = false }
|
||||||
fs-err = { version = "2.11.0" }
|
fs-err = { version = "2.11.0" }
|
||||||
fs2 = { version = "0.4.3" }
|
fs2 = { version = "0.4.3" }
|
||||||
|
|
|
||||||
|
|
@ -359,7 +359,7 @@ impl RequirementsTxt {
|
||||||
read_url_to_string(&requirements_txt, client).await
|
read_url_to_string(&requirements_txt, client).await
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
uv_fs::read_to_string(&requirements_txt)
|
uv_fs::read_to_string_transcode(&requirements_txt)
|
||||||
.await
|
.await
|
||||||
.map_err(RequirementsTxtParserError::IO)
|
.map_err(RequirementsTxtParserError::IO)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -16,13 +16,15 @@ workspace = true
|
||||||
uv-warnings = { path = "../uv-warnings" }
|
uv-warnings = { path = "../uv-warnings" }
|
||||||
|
|
||||||
dunce = { workspace = true }
|
dunce = { workspace = true }
|
||||||
|
encoding_rs_io = { workspace = true }
|
||||||
fs-err = { workspace = true }
|
fs-err = { workspace = true }
|
||||||
fs2 = { workspace = true }
|
fs2 = { workspace = true }
|
||||||
junction = { workspace = true }
|
junction = { workspace = true }
|
||||||
tempfile = { workspace = true }
|
tempfile = { workspace = true }
|
||||||
|
tokio = { workspace = true, optional = true }
|
||||||
tracing = { workspace = true }
|
tracing = { workspace = true }
|
||||||
urlencoding = { workspace = true }
|
urlencoding = { workspace = true }
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
default = []
|
default = []
|
||||||
tokio = ["fs-err/tokio"]
|
tokio = ["fs-err/tokio", "dep:tokio"]
|
||||||
|
|
|
||||||
|
|
@ -12,7 +12,7 @@ pub use crate::path::*;
|
||||||
|
|
||||||
mod path;
|
mod path;
|
||||||
|
|
||||||
/// Reads the contents of the file path into memory as a `String`.
|
/// Reads data from the path and requires that it be valid UTF-8.
|
||||||
///
|
///
|
||||||
/// If the file path is `-`, then contents are read from stdin instead.
|
/// If the file path is `-`, then contents are read from stdin instead.
|
||||||
#[cfg(feature = "tokio")]
|
#[cfg(feature = "tokio")]
|
||||||
|
|
@ -29,6 +29,39 @@ pub async fn read_to_string(path: impl AsRef<Path>) -> std::io::Result<String> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Reads data from the path and requires that it be valid UTF-8 or UTF-16.
///
/// This uses BOM sniffing to determine if the data should be transcoded
/// from UTF-16 to Rust's `String` type (which uses UTF-8).
///
/// This should generally only be used when one specifically wants to support
/// reading UTF-16 transparently.
///
/// If the file path is `-`, then contents are read from stdin instead.
///
/// # Errors
///
/// Returns an error if the file (or stdin) cannot be read, or if the raw
/// bytes cannot be decoded into a `String`. A decoding failure keeps the
/// `ErrorKind` of the underlying error and carries a message that names the
/// path.
#[cfg(feature = "tokio")]
pub async fn read_to_string_transcode(path: impl AsRef<Path>) -> std::io::Result<String> {
    use std::io::Read;

    use encoding_rs_io::DecodeReaderBytes;

    let path = path.as_ref();

    // Slurp the raw bytes first; decoding happens in memory afterwards.
    let raw = if path == Path::new("-") {
        // NOTE: `std::io::stdin()` is a blocking read.
        let mut bytes = Vec::with_capacity(1024);
        std::io::stdin().read_to_end(&mut bytes)?;
        bytes
    } else {
        fs_err::tokio::read(path).await?
    };

    // The input length is a reasonable capacity hint for the decoded text:
    // it is exact for UTF-8 input that is passed through unchanged.
    let mut decoded = String::with_capacity(raw.len());
    DecodeReaderBytes::new(raw.as_slice())
        .read_to_string(&mut decoded)
        .map_err(|err| {
            let path = path.display();
            // Keep the original error kind (rather than collapsing it to
            // `Other`) so callers can still distinguish decode failures.
            std::io::Error::new(err.kind(), format!("failed to decode file {path}: {err}"))
        })?;
    Ok(decoded)
}
|
||||||
|
|
||||||
/// Create a symlink from `src` to `dst`, replacing any existing symlink.
|
/// Create a symlink from `src` to `dst`, replacing any existing symlink.
|
||||||
///
|
///
|
||||||
/// On Windows, this uses the `junction` crate to create a junction point.
|
/// On Windows, this uses the `junction` crate to create a junction point.
|
||||||
|
|
|
||||||
|
|
@ -82,6 +82,7 @@ tikv-jemallocator = { version = "0.5.4" }
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
assert_cmd = { version = "2.0.14" }
|
assert_cmd = { version = "2.0.14" }
|
||||||
assert_fs = { version = "1.1.0" }
|
assert_fs = { version = "1.1.0" }
|
||||||
|
byteorder = { version = "1.5.0" }
|
||||||
filetime = { version = "0.2.23" }
|
filetime = { version = "0.2.23" }
|
||||||
indoc = { version = "2.0.4" }
|
indoc = { version = "2.0.4" }
|
||||||
insta = { version = "1.36.1", features = ["filters", "json"] }
|
insta = { version = "1.36.1", features = ["filters", "json"] }
|
||||||
|
|
|
||||||
|
|
@ -2419,3 +2419,77 @@ fn no_build_isolation() -> Result<()> {
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// This tests that `uv` can read UTF-16LE encoded requirements.txt files.
///
/// The file contains the single requirement `tomli`, encoded by
/// `utf8_to_utf16_with_bom_le` (little-endian UTF-16 with a leading BOM).
/// The snapshot expects the command to succeed and install `tomli==2.0.1`.
|
||||||
|
///
|
||||||
|
/// Ref: <https://github.com/astral-sh/uv/issues/2276>
|
||||||
|
#[test]
|
||||||
|
fn install_utf16le_requirements() -> Result<()> {
|
||||||
|
let context = TestContext::new("3.12");
|
||||||
|
let requirements_txt = context.temp_dir.child("requirements.txt");
|
||||||
|
requirements_txt.touch()?;
|
||||||
|
// Write raw bytes (not a `str`) so the file on disk really is UTF-16LE.
requirements_txt.write_binary(&utf8_to_utf16_with_bom_le("tomli"))?;
|
||||||
|
|
||||||
|
uv_snapshot!(command_without_exclude_newer(&context)
|
||||||
|
.arg("-r")
|
||||||
|
.arg("requirements.txt"), @r###"
|
||||||
|
success: true
|
||||||
|
exit_code: 0
|
||||||
|
----- stdout -----
|
||||||
|
|
||||||
|
----- stderr -----
|
||||||
|
Resolved 1 package in [TIME]
|
||||||
|
Downloaded 1 package in [TIME]
|
||||||
|
Installed 1 package in [TIME]
|
||||||
|
+ tomli==2.0.1
|
||||||
|
"###
|
||||||
|
);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This tests that `uv` can read UTF-16BE encoded requirements.txt files.
///
/// The file contains the single requirement `tomli`, encoded by
/// `utf8_to_utf16_with_bom_be` (big-endian UTF-16 with a leading BOM).
/// The snapshot expects the command to succeed and install `tomli==2.0.1`.
|
||||||
|
///
|
||||||
|
/// Ref: <https://github.com/astral-sh/uv/issues/2276>
|
||||||
|
#[test]
|
||||||
|
fn install_utf16be_requirements() -> Result<()> {
|
||||||
|
let context = TestContext::new("3.12");
|
||||||
|
let requirements_txt = context.temp_dir.child("requirements.txt");
|
||||||
|
requirements_txt.touch()?;
|
||||||
|
// Write raw bytes (not a `str`) so the file on disk really is UTF-16BE.
requirements_txt.write_binary(&utf8_to_utf16_with_bom_be("tomli"))?;
|
||||||
|
|
||||||
|
uv_snapshot!(command_without_exclude_newer(&context)
|
||||||
|
.arg("-r")
|
||||||
|
.arg("requirements.txt"), @r###"
|
||||||
|
success: true
|
||||||
|
exit_code: 0
|
||||||
|
----- stdout -----
|
||||||
|
|
||||||
|
----- stderr -----
|
||||||
|
Resolved 1 package in [TIME]
|
||||||
|
Downloaded 1 package in [TIME]
|
||||||
|
Installed 1 package in [TIME]
|
||||||
|
+ tomli==2.0.1
|
||||||
|
"###
|
||||||
|
);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Encodes `s` as little-endian UTF-16 bytes, prefixed with a byte-order mark.
///
/// The returned buffer always starts with the bytes `FF FE` (U+FEFF written
/// little-endian), followed by two bytes per UTF-16 code unit of `s`.
/// Characters outside the Basic Multilingual Plane are emitted as surrogate
/// pairs, i.e. four bytes.
fn utf8_to_utf16_with_bom_le(s: &str) -> Vec<u8> {
    // U+FEFF is the byte-order mark; serializing it with the same endianness
    // as the payload is what lets a reader sniff the encoding.
    std::iter::once(0xFEFF_u16)
        .chain(s.encode_utf16())
        .flat_map(u16::to_le_bytes)
        .collect()
}
|
||||||
|
|
||||||
|
/// Encodes `s` as big-endian UTF-16 bytes, prefixed with a byte-order mark.
///
/// The returned buffer always starts with the bytes `FE FF` (U+FEFF written
/// big-endian), followed by two bytes per UTF-16 code unit of `s`.
/// Characters outside the Basic Multilingual Plane are emitted as surrogate
/// pairs, i.e. four bytes.
fn utf8_to_utf16_with_bom_be(s: &str) -> Vec<u8> {
    // U+FEFF is the byte-order mark; serializing it with the same endianness
    // as the payload is what lets a reader sniff the encoding.
    std::iter::once(0xFEFF_u16)
        .chain(s.encode_utf16())
        .flat_map(u16::to_be_bytes)
        .collect()
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue