// ruff/crates/ruff_db/src/vendored.rs

use std::borrow::Cow;
use std::collections::BTreeMap;
use std::fmt::{self, Debug};
use std::io::{self, Read, Write};
use std::sync::{Arc, Mutex, MutexGuard};
use zip::result::ZipResult;
use zip::write::FileOptions;
use zip::{CompressionMethod, ZipArchive, ZipWriter, read::ZipFile};
pub use self::path::{VendoredPath, VendoredPathBuf};
use crate::file_revision::FileRevision;
mod path;
type Result<T> = io::Result<T>;
type LockedZipArchive<'a> = MutexGuard<'a, VendoredZipArchive>;
/// File system that stores all content in a static zip archive
/// bundled as part of the Ruff binary.
///
/// "Files" in the `VendoredFileSystem` are read-only and immutable.
/// Directories are supported, but symlinks and hardlinks cannot exist.
///
/// # Path separators
///
/// At time of writing (2025-07-11), this implementation always uses `/` as a
/// path separator, even on Windows, where `\` is traditionally used as the
/// file path separator. In particular, it is currently only used with zip
/// files built by `crates/ty_vendored/build.rs`.
///
/// Callers may still provide paths that use `\` as a separator; such paths are
/// transparently normalized to `/`.
///
/// This is particularly important because the presence of a trailing separator
/// in a zip file is conventionally used to indicate a directory entry.
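///
/// # Example
///
/// A minimal sketch of building and querying a `VendoredFileSystem`. The
/// imports assume this module is exposed as `ruff_db::vendored`, and the
/// entries are purely illustrative:
///
/// ```rust,ignore
/// use ruff_db::vendored::{VendoredFileSystemBuilder, VendoredPath};
/// use zip::CompressionMethod;
///
/// let mut builder = VendoredFileSystemBuilder::new(CompressionMethod::Stored);
/// builder.add_directory("stdlib/").unwrap();
/// builder.add_file("stdlib/functools.pyi", "def update_wrapper(): ...").unwrap();
/// let fs = builder.finish().unwrap();
///
/// // Trailing slashes (and `\` separators) are normalized away by lookups.
/// assert!(fs.is_directory(VendoredPath::new("stdlib")));
/// assert!(fs.is_file(VendoredPath::new("stdlib/functools.pyi")));
/// ```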
#[derive(Clone)]
pub struct VendoredFileSystem {
inner: Arc<Mutex<VendoredZipArchive>>,
}
impl VendoredFileSystem {
pub fn new_static(raw_bytes: &'static [u8]) -> Result<Self> {
Self::new_impl(Cow::Borrowed(raw_bytes))
}
pub fn new(raw_bytes: Vec<u8>) -> Result<Self> {
Self::new_impl(Cow::Owned(raw_bytes))
}
fn new_impl(data: Cow<'static, [u8]>) -> Result<Self> {
Ok(Self {
inner: Arc::new(Mutex::new(VendoredZipArchive::new(data)?)),
})
}
pub fn exists(&self, path: impl AsRef<VendoredPath>) -> bool {
fn exists(fs: &VendoredFileSystem, path: &VendoredPath) -> bool {
let normalized = NormalizedVendoredPath::from(path);
let mut archive = fs.lock_archive();
// Must probe the zipfile twice, as "stdlib" and "stdlib/" are considered
// different paths in a zip file, but we want to abstract over that difference here
// so that paths relative to the `VendoredFileSystem`
// work the same as other paths in Ruff.
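// For example, a lookup for `stdlib` probes both `stdlib` and `stdlib/`.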
archive.lookup_path(&normalized).is_ok()
|| archive
.lookup_path(&normalized.with_trailing_slash())
.is_ok()
}
exists(self, path.as_ref())
}
pub fn metadata(&self, path: impl AsRef<VendoredPath>) -> Result<Metadata> {
fn metadata(fs: &VendoredFileSystem, path: &VendoredPath) -> Result<Metadata> {
let normalized = NormalizedVendoredPath::from(path);
let mut archive = fs.lock_archive();
// Must probe the zipfile twice, as "stdlib" and "stdlib/" are considered
// different paths in a zip file, but we want to abstract over that difference here
// so that paths relative to the `VendoredFileSystem`
// work the same as other paths in Ruff.
if let Ok(zip_file) = archive.lookup_path(&normalized) {
return Ok(Metadata::from_zip_file(zip_file));
}
let zip_file = archive.lookup_path(&normalized.with_trailing_slash())?;
Ok(Metadata::from_zip_file(zip_file))
}
metadata(self, path.as_ref())
}
pub fn is_directory(&self, path: impl AsRef<VendoredPath>) -> bool {
self.metadata(path)
.is_ok_and(|metadata| metadata.kind().is_directory())
}
pub fn is_file(&self, path: impl AsRef<VendoredPath>) -> bool {
self.metadata(path)
.is_ok_and(|metadata| metadata.kind().is_file())
}
/// Read the entire contents of the file at `path` in the zip archive into a string.
///
/// Returns an `Err` if any of the following are true:
/// - The path does not exist in the underlying zip archive
/// - The path exists in the underlying zip archive, but represents a directory
/// - The file at `path` contains invalid UTF-8
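///
/// A short sketch of the success and failure cases, using a hypothetical
/// archive (imports as in the [`VendoredFileSystem`] example):
///
/// ```rust,ignore
/// let mut builder = VendoredFileSystemBuilder::new(CompressionMethod::Stored);
/// builder.add_file("stdlib/functools.pyi", "def update_wrapper(): ...").unwrap();
/// let fs = builder.finish().unwrap();
///
/// let contents = fs.read_to_string(VendoredPath::new("stdlib/functools.pyi")).unwrap();
/// assert_eq!(contents, "def update_wrapper(): ...");
///
/// // Missing paths produce an `Err`, as do paths that resolve to directories.
/// assert!(fs.read_to_string(VendoredPath::new("does/not/exist.pyi")).is_err());
/// ```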
pub fn read_to_string(&self, path: impl AsRef<VendoredPath>) -> Result<String> {
fn read_to_string(fs: &VendoredFileSystem, path: &VendoredPath) -> Result<String> {
let mut archive = fs.lock_archive();
let mut zip_file = archive.lookup_path(&NormalizedVendoredPath::from(path))?;
// Pre-allocate the buffer with the size specified in the ZIP file metadata
// because `read_to_string` passes `None` as the size hint.
// But let's not trust the zip file metadata (even though it's vendored)
// and limit it to a reasonable size.
let mut buffer = String::with_capacity(
usize::try_from(zip_file.size())
.unwrap_or(usize::MAX)
.min(10_000_000),
);
zip_file.read_to_string(&mut buffer)?;
Ok(buffer)
}
read_to_string(self, path.as_ref())
}
/// Read the direct children of the directory identified by `dir`.
///
/// If `dir` is not a directory, then this returns an empty `Vec`.
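///
/// A small sketch using an illustrative archive (imports as in the
/// [`VendoredFileSystem`] example); only *direct* children are returned:
///
/// ```rust,ignore
/// let mut builder = VendoredFileSystemBuilder::new(CompressionMethod::Stored);
/// builder.add_directory("stdlib/").unwrap();
/// builder.add_file("stdlib/functools.pyi", "def update_wrapper(): ...").unwrap();
/// builder.add_directory("stdlib/asyncio/").unwrap();
/// builder.add_file("stdlib/asyncio/tasks.pyi", "class Task: ...").unwrap();
/// let fs = builder.finish().unwrap();
///
/// // Yields `stdlib/asyncio/` and `stdlib/functools.pyi`, but not the
/// // nested `stdlib/asyncio/tasks.pyi`.
/// let children = fs.read_directory(VendoredPath::new("stdlib"));
/// assert_eq!(children.len(), 2);
/// ```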
pub fn read_directory(&self, dir: impl AsRef<VendoredPath>) -> Vec<DirectoryEntry> {
// N.B. We specifically do not return an iterator here to avoid
// holding a lock for the lifetime of the iterator returned.
// That is, it seems like a footgun to keep the zip archive
// locked during iteration, since the unit of work for each
// item in the iterator could be arbitrarily long. Allocating
// up front and stuffing all entries into it is probably the
// simplest solution and what we do here. If this becomes
// a problem, there are other strategies we could pursue.
// (Amortizing allocs, using a different synchronization
// behavior or even exposing additional APIs.) ---AG
fn read_directory(fs: &VendoredFileSystem, dir: &VendoredPath) -> Vec<DirectoryEntry> {
let mut normalized = NormalizedVendoredPath::from(dir);
if !normalized.as_str().ends_with('/') {
normalized = normalized.with_trailing_slash();
}
let archive = fs.lock_archive();
let mut entries = vec![];
for name in archive.0.file_names() {
// Any entry that doesn't have the directory path (with a
// trailing slash) as a prefix cannot possibly be a child of
// the directory referenced by `dir`.
let Some(without_dir_prefix) = name.strip_prefix(normalized.as_str()) else {
continue;
};
// Filter out an entry equivalent to the path given
// since we only want children of the directory.
if without_dir_prefix.is_empty() {
continue;
}
// We only want *direct* children. Files that are
// direct children cannot have any slashes (or else
// they are not direct children). Directories that
// are direct children can only have one slash and
// it must be at the end.
//
// (We do this manually ourselves to avoid doing a
// full file lookup and metadata retrieval via the
// `zip` crate.)
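//
// For example, relative to `stdlib/`:
//   `functools.pyi`     (file, no slash)       -> direct child
//   `asyncio/`          (dir, trailing slash)  -> direct child
//   `asyncio/tasks.pyi` (file, interior slash) -> not a direct child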
let file_type = FileType::from_zip_file_name(without_dir_prefix);
let slash_count = without_dir_prefix.matches('/').count();
match file_type {
FileType::File if slash_count > 0 => continue,
FileType::Directory if slash_count > 1 => continue,
_ => {}
}
entries.push(DirectoryEntry {
path: VendoredPathBuf::from(name),
file_type,
});
}
entries
}
read_directory(self, dir.as_ref())
}
/// Acquire a lock on the underlying zip archive.
/// The call will block until it is able to acquire the lock.
///
/// ## Panics:
/// If the current thread already holds the lock.
fn lock_archive(&self) -> LockedZipArchive<'_> {
self.inner.lock().unwrap()
}
}
impl fmt::Debug for VendoredFileSystem {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let mut archive = self.lock_archive();
if f.alternate() {
let mut paths: Vec<String> = archive.0.file_names().map(String::from).collect();
paths.sort();
let debug_info: BTreeMap<String, ZipFileDebugInfo> = paths
.iter()
.map(|path| {
(
path.to_owned(),
ZipFileDebugInfo::from(archive.0.by_name(path).unwrap()),
)
})
.collect();
f.debug_struct("VendoredFileSystem")
.field("inner_mutex_poisoned", &self.inner.is_poisoned())
.field("paths", &paths)
.field("data_by_path", &debug_info)
.finish()
} else {
write!(f, "VendoredFileSystem(<{} paths>)", archive.len())
}
}
}
impl Default for VendoredFileSystem {
fn default() -> Self {
let mut bytes: Vec<u8> = Vec::new();
let mut cursor = io::Cursor::new(&mut bytes);
{
let mut writer = ZipWriter::new(&mut cursor);
writer.finish().unwrap();
}
VendoredFileSystem::new(bytes).unwrap()
}
}
/// Private struct only used in `Debug` implementations
///
/// This could possibly be unified with the `Metadata` struct,
/// but that is deliberately kept small, and only exposes metadata
/// that users of the `VendoredFileSystem` could realistically need.
/// For debugging purposes, however, we want to have all information
/// available.
#[expect(unused)]
#[derive(Debug)]
struct ZipFileDebugInfo {
crc32_hash: u32,
compressed_size: u64,
uncompressed_size: u64,
kind: FileType,
}
impl<'a> From<ZipFile<'a>> for ZipFileDebugInfo {
fn from(value: ZipFile<'a>) -> Self {
Self {
crc32_hash: value.crc32(),
compressed_size: value.compressed_size(),
uncompressed_size: value.size(),
kind: if value.is_dir() {
FileType::Directory
} else {
FileType::File
},
}
}
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum FileType {
/// The path exists in the zip archive and represents a vendored file
File,
/// The path exists in the zip archive and represents a vendored directory of files
Directory,
}
impl FileType {
fn from_zip_file_name(name: &str) -> FileType {
if name.ends_with('/') {
FileType::Directory
} else {
FileType::File
}
}
pub const fn is_file(self) -> bool {
matches!(self, Self::File)
}
pub const fn is_directory(self) -> bool {
matches!(self, Self::Directory)
}
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Metadata {
kind: FileType,
revision: FileRevision,
}
impl Metadata {
fn from_zip_file(zip_file: ZipFile) -> Self {
let kind = if zip_file.is_dir() {
FileType::Directory
} else {
FileType::File
};
Self {
kind,
revision: FileRevision::new(u128::from(zip_file.crc32())),
}
}
pub fn kind(&self) -> FileType {
self.kind
}
pub fn revision(&self) -> FileRevision {
self.revision
}
}
#[derive(Debug, PartialEq, Eq)]
pub struct DirectoryEntry {
path: VendoredPathBuf,
file_type: FileType,
}
impl DirectoryEntry {
pub fn new(path: VendoredPathBuf, file_type: FileType) -> Self {
Self { path, file_type }
}
pub fn into_path(self) -> VendoredPathBuf {
self.path
}
pub fn path(&self) -> &VendoredPath {
&self.path
}
pub fn file_type(&self) -> FileType {
self.file_type
}
}
/// Newtype wrapper around a ZipArchive.
#[derive(Debug)]
struct VendoredZipArchive(ZipArchive<io::Cursor<Cow<'static, [u8]>>>);
impl VendoredZipArchive {
fn new(data: Cow<'static, [u8]>) -> Result<Self> {
Ok(Self(ZipArchive::new(io::Cursor::new(data))?))
}
fn lookup_path(&mut self, path: &NormalizedVendoredPath) -> Result<ZipFile<'_>> {
Ok(self.0.by_name(path.as_str())?)
}
fn len(&self) -> usize {
self.0.len()
}
}
/// A path that has been normalized via the `From<&VendoredPath>` implementation below.
///
/// Trailing slashes are normalized away by `camino::Utf8PathBuf`s,
/// but trailing slashes are crucial for distinguishing between
/// files and directories inside zip archives.
#[derive(Debug, Clone, PartialEq, Eq)]
struct NormalizedVendoredPath<'a>(Cow<'a, str>);
impl NormalizedVendoredPath<'_> {
fn with_trailing_slash(self) -> Self {
debug_assert!(!self.0.ends_with('/'));
let mut data = self.0.into_owned();
data.push('/');
Self(Cow::Owned(data))
}
fn as_str(&self) -> &str {
&self.0
}
}
impl<'a> From<&'a VendoredPath> for NormalizedVendoredPath<'a> {
/// Normalize the path.
///
/// The normalizations are:
/// - Remove `.` and `..` components
/// - Strip trailing slashes
/// - Normalize `\\` separators to `/`
/// - Validate that the path does not have any unsupported components
///
/// ## Panics:
/// If a path with an unsupported component for vendored paths is passed.
/// Unsupported components are path prefixes and path root directories.
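///
/// A few illustrative normalizations (the `\` case is omitted here, since `\`
/// is only treated as a separator on Windows hosts):
///
/// ```rust,ignore
/// assert_eq!(NormalizedVendoredPath::from(VendoredPath::new("stdlib/")).as_str(), "stdlib");
/// assert_eq!(
///     NormalizedVendoredPath::from(VendoredPath::new("./stdlib/asyncio/../asyncio/")).as_str(),
///     "stdlib/asyncio",
/// );
/// ```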
fn from(path: &'a VendoredPath) -> Self {
/// Remove `.` and `..` components, and validate that unsupported components are not present.
///
/// This inner routine also strips trailing slashes,
/// and normalizes paths to use Unix `/` separators.
/// However, it always allocates, so avoid calling it if possible.
/// In most cases, the path should already be normalized.
fn normalize_unnormalized_path(path: &VendoredPath) -> String {
let mut normalized_parts = Vec::new();
for component in path.components() {
match component {
camino::Utf8Component::Normal(part) => normalized_parts.push(part),
camino::Utf8Component::CurDir => continue,
camino::Utf8Component::ParentDir => {
// `VendoredPath("")`, `VendoredPath("..")` and `VendoredPath("../..")`
// all resolve to the same path relative to the zip archive
// (see https://github.com/astral-sh/ruff/pull/11991#issuecomment-2185278014)
normalized_parts.pop();
}
unsupported => {
panic!("Unsupported component in a vendored path: {unsupported}")
}
}
}
normalized_parts.join("/")
}
let path_str = path.as_str();
if std::path::MAIN_SEPARATOR == '\\' && path_str.contains('\\') {
// Normalize paths so that they always use Unix path separators
NormalizedVendoredPath(Cow::Owned(normalize_unnormalized_path(path)))
} else if !path
.components()
.all(|component| matches!(component, camino::Utf8Component::Normal(_)))
{
// Remove non-`Normal` components
NormalizedVendoredPath(Cow::Owned(normalize_unnormalized_path(path)))
} else {
// Strip trailing slashes from the path
NormalizedVendoredPath(Cow::Borrowed(path_str.trim_end_matches('/')))
}
}
}
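/// Incrementally builds an in-memory zip archive and converts it into a
/// [`VendoredFileSystem`] once all entries have been added.
///
/// A minimal sketch (the entries are illustrative):
///
/// ```rust,ignore
/// let mut builder = VendoredFileSystemBuilder::new(CompressionMethod::Stored);
/// builder.add_directory("stdlib/").unwrap();
/// builder.add_file("stdlib/builtins.pyi", "class object: ...").unwrap();
/// let fs = builder.finish().unwrap();
/// assert!(fs.exists("stdlib/builtins.pyi"));
/// ```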
pub struct VendoredFileSystemBuilder {
writer: ZipWriter<io::Cursor<Vec<u8>>>,
compression_method: CompressionMethod,
}
impl VendoredFileSystemBuilder {
pub fn new(compression_method: CompressionMethod) -> Self {
let buffer = io::Cursor::new(Vec::new());
Self {
writer: ZipWriter::new(buffer),
compression_method,
}
}
pub fn add_file(
&mut self,
path: impl AsRef<VendoredPath>,
content: &str,
) -> std::io::Result<()> {
self.writer
.start_file(path.as_ref().as_str(), self.options())?;
self.writer.write_all(content.as_bytes())
}
pub fn add_directory(&mut self, path: impl AsRef<VendoredPath>) -> ZipResult<()> {
self.writer
.add_directory(path.as_ref().as_str(), self.options())
}
pub fn finish(mut self) -> Result<VendoredFileSystem> {
let buffer = self.writer.finish()?;
VendoredFileSystem::new(buffer.into_inner())
}
fn options(&self) -> FileOptions {
FileOptions::default()
.compression_method(self.compression_method)
.unix_permissions(0o644)
}
}
#[cfg(test)]
pub(crate) mod tests {
use insta::assert_snapshot;
use super::*;
const FUNCTOOLS_CONTENTS: &str = "def update_wrapper(): ...";
const ASYNCIO_TASKS_CONTENTS: &str = "class Task: ...";
fn mock_typeshed() -> VendoredFileSystem {
let mut builder = VendoredFileSystemBuilder::new(CompressionMethod::Stored);
builder.add_directory("stdlib/").unwrap();
builder
.add_file("stdlib/functools.pyi", FUNCTOOLS_CONTENTS)
.unwrap();
builder.add_directory("stdlib/asyncio/").unwrap();
builder
.add_file("stdlib/asyncio/tasks.pyi", ASYNCIO_TASKS_CONTENTS)
.unwrap();
builder.finish().unwrap()
}
#[test]
fn filesystem_debug_implementation() {
assert_snapshot!(
format!("{:?}", mock_typeshed()),
@"VendoredFileSystem(<4 paths>)"
);
}
#[test]
fn filesystem_debug_implementation_alternate() {
assert_snapshot!(format!("{:#?}", mock_typeshed()), @r#"
VendoredFileSystem {
inner_mutex_poisoned: false,
paths: [
"stdlib/",
"stdlib/asyncio/",
"stdlib/asyncio/tasks.pyi",
"stdlib/functools.pyi",
],
data_by_path: {
"stdlib/": ZipFileDebugInfo {
crc32_hash: 0,
compressed_size: 0,
uncompressed_size: 0,
kind: Directory,
},
"stdlib/asyncio/": ZipFileDebugInfo {
crc32_hash: 0,
compressed_size: 0,
uncompressed_size: 0,
kind: Directory,
},
"stdlib/asyncio/tasks.pyi": ZipFileDebugInfo {
crc32_hash: 2826547428,
compressed_size: 15,
uncompressed_size: 15,
kind: File,
},
"stdlib/functools.pyi": ZipFileDebugInfo {
crc32_hash: 1099005079,
compressed_size: 25,
uncompressed_size: 25,
kind: File,
},
},
}
"#);
}
fn test_directory(dirname: &str) {
let mock_typeshed = mock_typeshed();
let path = VendoredPath::new(dirname);
assert!(mock_typeshed.exists(path));
assert!(mock_typeshed.read_to_string(path).is_err());
let metadata = mock_typeshed.metadata(path).unwrap();
assert!(metadata.kind().is_directory());
}
#[test]
fn stdlib_dir_no_trailing_slash() {
test_directory("stdlib")
}
#[test]
fn stdlib_dir_trailing_slash() {
test_directory("stdlib/")
}
#[test]
fn asyncio_dir_no_trailing_slash() {
test_directory("stdlib/asyncio")
}
#[test]
fn asyncio_dir_trailing_slash() {
test_directory("stdlib/asyncio/")
}
#[test]
fn stdlib_dir_parent_components() {
test_directory("stdlib/asyncio/../../stdlib")
}
#[test]
fn asyncio_dir_odd_components() {
test_directory("./stdlib/asyncio/../asyncio/")
}
fn readdir_snapshot(fs: &VendoredFileSystem, path: &str) -> String {
let mut paths = fs
.read_directory(VendoredPath::new(path))
.into_iter()
.map(|entry| entry.path().to_string())
.collect::<Vec<String>>();
paths.sort();
paths.join("\n")
}
#[test]
fn read_directory_stdlib() {
let mock_typeshed = mock_typeshed();
assert_snapshot!(readdir_snapshot(&mock_typeshed, "stdlib"), @r"
vendored://stdlib/asyncio/
vendored://stdlib/functools.pyi
");
assert_snapshot!(readdir_snapshot(&mock_typeshed, "stdlib/"), @r"
vendored://stdlib/asyncio/
vendored://stdlib/functools.pyi
");
assert_snapshot!(readdir_snapshot(&mock_typeshed, "./stdlib"), @r"
vendored://stdlib/asyncio/
vendored://stdlib/functools.pyi
");
assert_snapshot!(readdir_snapshot(&mock_typeshed, "./stdlib/"), @r"
vendored://stdlib/asyncio/
vendored://stdlib/functools.pyi
");
}
#[test]
fn read_directory_asyncio() {
let mock_typeshed = mock_typeshed();
assert_snapshot!(
readdir_snapshot(&mock_typeshed, "stdlib/asyncio"),
@"vendored://stdlib/asyncio/tasks.pyi",
);
assert_snapshot!(
readdir_snapshot(&mock_typeshed, "./stdlib/asyncio"),
@"vendored://stdlib/asyncio/tasks.pyi",
);
assert_snapshot!(
readdir_snapshot(&mock_typeshed, "stdlib/asyncio/"),
@"vendored://stdlib/asyncio/tasks.pyi",
);
assert_snapshot!(
readdir_snapshot(&mock_typeshed, "./stdlib/asyncio/"),
@"vendored://stdlib/asyncio/tasks.pyi",
);
}
fn test_nonexistent_path(path: &str) {
let mock_typeshed = mock_typeshed();
let path = VendoredPath::new(path);
assert!(!mock_typeshed.exists(path));
assert!(mock_typeshed.metadata(path).is_err());
assert!(
mock_typeshed
.read_to_string(path)
.is_err_and(|err| err.to_string().contains("file not found"))
);
}
#[test]
fn simple_nonexistent_path() {
test_nonexistent_path("foo")
}
#[test]
fn nonexistent_path_with_extension() {
test_nonexistent_path("foo.pyi")
}
#[test]
fn nonexistent_path_with_trailing_slash() {
test_nonexistent_path("foo/")
}
#[test]
fn nonexistent_path_with_fancy_components() {
test_nonexistent_path("./foo/../../../foo")
}
fn test_file(mock_typeshed: &VendoredFileSystem, path: &VendoredPath) {
assert!(mock_typeshed.exists(path));
let metadata = mock_typeshed.metadata(path).unwrap();
assert!(metadata.kind().is_file());
}
#[test]
fn functools_file_contents() {
let mock_typeshed = mock_typeshed();
let path = VendoredPath::new("stdlib/functools.pyi");
test_file(&mock_typeshed, path);
let functools_stub = mock_typeshed.read_to_string(path).unwrap();
assert_eq!(functools_stub.as_str(), FUNCTOOLS_CONTENTS);
// Test that a second read through the shared `Mutex` doesn't mutate
// the internal state of the underlying zip archive incorrectly:
let functools_stub_again = mock_typeshed.read_to_string(path).unwrap();
assert_eq!(functools_stub_again.as_str(), FUNCTOOLS_CONTENTS);
}
#[test]
fn functools_file_other_path() {
test_file(
&mock_typeshed(),
VendoredPath::new("stdlib/../stdlib/../stdlib/functools.pyi"),
)
}
#[test]
fn asyncio_file_contents() {
let mock_typeshed = mock_typeshed();
let path = VendoredPath::new("stdlib/asyncio/tasks.pyi");
test_file(&mock_typeshed, path);
let asyncio_stub = mock_typeshed.read_to_string(path).unwrap();
assert_eq!(asyncio_stub.as_str(), ASYNCIO_TASKS_CONTENTS);
}
#[test]
fn asyncio_file_other_path() {
test_file(
&mock_typeshed(),
VendoredPath::new("./stdlib/asyncio/../asyncio/tasks.pyi"),
)
}
}