diff --git a/Cargo.lock b/Cargo.lock index 08554f181..64cba2060 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2243,7 +2243,6 @@ name = "puffin-build" version = "0.0.1" dependencies = [ "anyhow", - "flate2", "fs-err", "gourgeist", "indoc", @@ -2252,6 +2251,7 @@ dependencies = [ "once_cell", "pep508_rs", "platform-host", + "puffin-extract", "puffin-fs", "puffin-interpreter", "puffin-traits", @@ -2260,14 +2260,11 @@ dependencies = [ "regex", "serde", "serde_json", - "tar", "tempfile", "thiserror", "tokio", "toml 0.8.8", "tracing", - "which", - "zip", ] [[package]] @@ -2463,17 +2460,16 @@ dependencies = [ "platform-tags", "puffin-cache", "puffin-client", + "puffin-extract", "puffin-fs", "puffin-git", "puffin-normalize", "puffin-traits", "pypi-types", - "rayon", "reqwest", "rustc-hash", "serde", "serde_json", - "sha2", "tempfile", "thiserror", "tokio", @@ -2483,6 +2479,19 @@ dependencies = [ "zip", ] +[[package]] +name = "puffin-extract" +version = "0.0.1" +dependencies = [ + "flate2", + "fs-err", + "rayon", + "tar", + "thiserror", + "tokio", + "zip", +] + [[package]] name = "puffin-fs" version = "0.0.1" @@ -2527,6 +2536,7 @@ dependencies = [ "puffin-cache", "puffin-client", "puffin-distribution", + "puffin-extract", "puffin-fs", "puffin-git", "puffin-interpreter", diff --git a/crates/README.md b/crates/README.md index 4cfc2a113..221877eb0 100644 --- a/crates/README.md +++ b/crates/README.md @@ -68,6 +68,10 @@ Implements the traits defined in `puffin-traits`. Client for interacting with built distributions (wheels) and source distributions (sdists). Capable of fetching metadata, distribution contents, etc. +## [puffin-extract](./puffin-extract) + +Utilities for extracting files from archives. + ## [puffin-fs](./puffin-fs) Utilities for interacting with the filesystem. diff --git a/crates/puffin-build/Cargo.toml b/crates/puffin-build/Cargo.toml index d2162b26f..c8be7ff26 100644 --- a/crates/puffin-build/Cargo.toml +++ b/crates/puffin-build/Cargo.toml @@ -17,13 +17,13 @@ workspace = true gourgeist = { path = "../gourgeist" } pep508_rs = { path = "../pep508-rs" } platform-host = { path = "../platform-host" } +puffin-extract = { path = "../puffin-extract" } puffin-fs = { path = "../puffin-fs" } puffin-interpreter = { path = "../puffin-interpreter" } puffin-traits = { path = "../puffin-traits" } pypi-types = { path = "../pypi-types" } anyhow = { workspace = true } -flate2 = { workspace = true } fs-err = { workspace = true } indoc = { workspace = true } itertools = { workspace = true } @@ -32,14 +32,11 @@ pyproject-toml = { workspace = true } regex = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } -tar = { workspace = true } tempfile = { workspace = true } thiserror = { workspace = true } tokio = { workspace = true, features = ["sync", "process"] } toml = { workspace = true } tracing = { workspace = true } -which = { workspace = true} -zip = { workspace = true } [dev-dependencies] insta = { version = "1.34.0" } diff --git a/crates/puffin-build/src/lib.rs b/crates/puffin-build/src/lib.rs index ee4570754..52626ab02 100644 --- a/crates/puffin-build/src/lib.rs +++ b/crates/puffin-build/src/lib.rs @@ -13,22 +13,19 @@ use std::process::Output; use std::str::FromStr; use std::sync::Arc; -use flate2::read::GzDecoder; use fs_err as fs; -use fs_err::{DirEntry, File}; +use fs_err::DirEntry; use indoc::formatdoc; use itertools::Itertools; use once_cell::sync::Lazy; use pyproject_toml::{BuildSystem, Project}; use regex::Regex; use serde::{Deserialize, Serialize}; -use tar::Archive; use tempfile::{tempdir, tempdir_in, TempDir}; use thiserror::Error; use tokio::process::Command; use tokio::sync::Mutex; use tracing::{debug, info_span, instrument}; -use zip::ZipArchive; use pep508_rs::Requirement; use puffin_interpreter::{Interpreter, Virtualenv}; @@ -46,8 +43,8 @@ static MISSING_HEADER_RE: Lazy = Lazy::new(|| { pub enum Error { #[error(transparent)] IO(#[from] io::Error), - #[error("Failed to read zip file")] - Zip(#[from] zip::result::ZipError), + #[error("Failed to extract archive: {0}")] + Extraction(PathBuf, #[source] puffin_extract::Error), #[error("Unsupported archive format (extension not recognized): {0}")] UnsupportedArchiveType(String), #[error("Invalid source distribution: {0}")] @@ -671,40 +668,12 @@ async fn create_pep517_build_environment( /// Returns the directory with the `pyproject.toml`/`setup.py` #[instrument(skip_all, fields(sdist = ? sdist.file_name().unwrap_or(sdist.as_os_str())))] fn extract_archive(sdist: &Path, extracted: &PathBuf) -> Result { - if sdist - .extension() - .is_some_and(|ext| ext.eq_ignore_ascii_case("zip")) - { - // .zip - let mut archive = ZipArchive::new(File::open(sdist)?)?; - archive.extract(extracted)?; - } else if sdist - .extension() - .is_some_and(|ext| ext.eq_ignore_ascii_case("gz")) - && sdist.file_stem().is_some_and(|stem| { - Path::new(stem) - .extension() - .is_some_and(|ext| ext.eq_ignore_ascii_case("tar")) - }) - { - // .tar.gz - let mut archive = Archive::new(GzDecoder::new(File::open(sdist)?)); - // https://github.com/alexcrichton/tar-rs/issues/349 - archive.set_preserve_mtime(false); - archive.unpack(extracted)?; - } else { - return Err(Error::UnsupportedArchiveType( - sdist - .file_name() - .unwrap_or(sdist.as_os_str()) - .to_string_lossy() - .to_string(), - )); - } + puffin_extract::extract_archive(sdist, extracted) + .map_err(|err| Error::Extraction(sdist.to_path_buf(), err))?; // > A .tar.gz source distribution (sdist) contains a single top-level directory called // > `{name}-{version}` (e.g. foo-1.0), containing the source files of the package. - // TODO(konstin): Verify the name of the directory + // TODO(konstin): Verify the name of the directory. let top_level = fs::read_dir(extracted)?.collect::>>()?; let [root] = top_level.as_slice() else { return Err(Error::InvalidSourceDist(format!( diff --git a/crates/puffin-distribution/Cargo.toml b/crates/puffin-distribution/Cargo.toml index 0edc875be..4f370a674 100644 --- a/crates/puffin-distribution/Cargo.toml +++ b/crates/puffin-distribution/Cargo.toml @@ -20,6 +20,7 @@ pep440_rs = { path = "../pep440-rs" } platform-tags = { path = "../platform-tags" } puffin-cache = { path = "../puffin-cache" } puffin-client = { path = "../puffin-client" } +puffin-extract = { path = "../puffin-extract" } puffin-fs = { path = "../puffin-fs" } puffin-git = { path = "../puffin-git" } puffin-normalize = { path = "../puffin-normalize" } @@ -31,12 +32,10 @@ bytesize = { workspace = true } fs-err = { workspace = true } fs2 = { workspace = true } futures = { workspace = true } -rayon = { workspace = true } reqwest = { workspace = true } rustc-hash = { workspace = true } serde = { workspace = true , features = ["derive"] } serde_json = { workspace = true } -sha2 = { workspace = true } tempfile = { workspace = true } thiserror = { workspace = true } tokio = { workspace = true } diff --git a/crates/puffin-distribution/src/lib.rs b/crates/puffin-distribution/src/lib.rs index f467235f9..1ce49fd8d 100644 --- a/crates/puffin-distribution/src/lib.rs +++ b/crates/puffin-distribution/src/lib.rs @@ -3,7 +3,7 @@ pub use download::{DiskWheel, InMemoryWheel, LocalWheel}; pub use index::{BuiltWheelIndex, RegistryWheelIndex}; pub use reporter::Reporter; pub use source_dist::{SourceDistCachedBuilder, SourceDistError}; -pub use unzip::{Unzip, UnzipError}; +pub use unzip::Unzip; mod distribution_database; mod download; @@ -13,4 +13,3 @@ mod locks; mod reporter; mod source_dist; mod unzip; -mod vendor; diff --git a/crates/puffin-distribution/src/unzip.rs b/crates/puffin-distribution/src/unzip.rs index 4b1932646..d7192605b 100644 --- a/crates/puffin-distribution/src/unzip.rs +++ b/crates/puffin-distribution/src/unzip.rs @@ -1,49 +1,35 @@ -use std::io; -use std::io::{Read, Seek}; use std::path::Path; -use rayon::prelude::*; -use thiserror::Error; -use zip::result::ZipError; -use zip::ZipArchive; +use puffin_extract::{unzip_archive, Error}; use crate::download::BuiltWheel; -use crate::vendor::{CloneableSeekableReader, HasLength}; use crate::{DiskWheel, InMemoryWheel, LocalWheel}; -#[derive(Debug, Error)] -pub enum UnzipError { - #[error(transparent)] - Zip(#[from] ZipError), - #[error(transparent)] - Io(#[from] io::Error), -} - pub trait Unzip { /// Unzip a wheel into the target directory. - fn unzip(&self, target: &Path) -> Result<(), UnzipError>; + fn unzip(&self, target: &Path) -> Result<(), Error>; } impl Unzip for InMemoryWheel { - fn unzip(&self, target: &Path) -> Result<(), UnzipError> { + fn unzip(&self, target: &Path) -> Result<(), Error> { unzip_archive(std::io::Cursor::new(&self.buffer), target) } } impl Unzip for DiskWheel { - fn unzip(&self, target: &Path) -> Result<(), UnzipError> { + fn unzip(&self, target: &Path) -> Result<(), Error> { unzip_archive(fs_err::File::open(&self.path)?, target) } } impl Unzip for BuiltWheel { - fn unzip(&self, target: &Path) -> Result<(), UnzipError> { + fn unzip(&self, target: &Path) -> Result<(), Error> { unzip_archive(fs_err::File::open(&self.path)?, target) } } impl Unzip for LocalWheel { - fn unzip(&self, target: &Path) -> Result<(), UnzipError> { + fn unzip(&self, target: &Path) -> Result<(), Error> { match self { LocalWheel::InMemory(wheel) => wheel.unzip(target), LocalWheel::Disk(wheel) => wheel.unzip(target), @@ -51,52 +37,3 @@ impl Unzip for LocalWheel { } } } - -/// Unzip a zip archive into the target directory. -fn unzip_archive( - reader: R, - target: &Path, -) -> Result<(), UnzipError> { - // Unzip in parallel. - let archive = ZipArchive::new(CloneableSeekableReader::new(reader))?; - (0..archive.len()) - .par_bridge() - .map(|file_number| { - let mut archive = archive.clone(); - let mut file = archive.by_index(file_number)?; - - // Determine the path of the file within the wheel. - let file_path = match file.enclosed_name() { - Some(path) => path.to_owned(), - None => return Ok(()), - }; - - // Create necessary parent directories. - let path = target.join(file_path); - if file.is_dir() { - fs_err::create_dir_all(path)?; - return Ok(()); - } - if let Some(parent) = path.parent() { - fs_err::create_dir_all(parent)?; - } - - // Write the file. - let mut outfile = fs_err::File::create(&path)?; - std::io::copy(&mut file, &mut outfile)?; - - // Set permissions. - #[cfg(unix)] - { - use std::fs::Permissions; - use std::os::unix::fs::PermissionsExt; - - if let Some(mode) = file.unix_mode() { - std::fs::set_permissions(&path, Permissions::from_mode(mode))?; - } - } - - Ok(()) - }) - .collect::>() -} diff --git a/crates/puffin-distribution/src/vendor/mod.rs b/crates/puffin-distribution/src/vendor/mod.rs deleted file mode 100644 index 3148e2edd..000000000 --- a/crates/puffin-distribution/src/vendor/mod.rs +++ /dev/null @@ -1,3 +0,0 @@ -pub(crate) use cloneable_seekable_reader::{CloneableSeekableReader, HasLength}; - -mod cloneable_seekable_reader; diff --git a/crates/puffin-extract/Cargo.toml b/crates/puffin-extract/Cargo.toml new file mode 100644 index 000000000..d34889118 --- /dev/null +++ b/crates/puffin-extract/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "puffin-extract" +version = "0.0.1" +edition = { workspace = true } +rust-version = { workspace = true } +homepage = { workspace = true } +documentation = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } + +[lints] +workspace = true + +[dependencies] +flate2 = { workspace = true } +fs-err = { workspace = true } +rayon = { workspace = true } +tar = { workspace = true } +thiserror = { workspace = true } +tokio = { workspace = true } +zip = { workspace = true } diff --git a/crates/puffin-extract/src/lib.rs b/crates/puffin-extract/src/lib.rs new file mode 100644 index 000000000..30fd9dbc5 --- /dev/null +++ b/crates/puffin-extract/src/lib.rs @@ -0,0 +1,104 @@ +use std::path::{Path, PathBuf}; + +use rayon::prelude::*; +use zip::result::ZipError; +use zip::ZipArchive; + +pub use crate::vendor::{CloneableSeekableReader, HasLength}; + +mod vendor; + +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error(transparent)] + Zip(#[from] ZipError), + #[error(transparent)] + Io(#[from] std::io::Error), + #[error("Unsupported archive type: {0}")] + UnsupportedArchive(PathBuf), +} + +/// Unzip a zip archive into the target directory. +pub fn unzip_archive( + reader: R, + target: &Path, +) -> Result<(), Error> { + // Unzip in parallel. + let archive = ZipArchive::new(CloneableSeekableReader::new(reader))?; + (0..archive.len()) + .par_bridge() + .map(|file_number| { + let mut archive = archive.clone(); + let mut file = archive.by_index(file_number)?; + + // Determine the path of the file within the wheel. + let file_path = match file.enclosed_name() { + Some(path) => path.to_owned(), + None => return Ok(()), + }; + + // Create necessary parent directories. + let path = target.join(file_path); + if file.is_dir() { + fs_err::create_dir_all(path)?; + return Ok(()); + } + if let Some(parent) = path.parent() { + fs_err::create_dir_all(parent)?; + } + + // Write the file. + let mut outfile = fs_err::File::create(&path)?; + std::io::copy(&mut file, &mut outfile)?; + + // Set permissions. + #[cfg(unix)] + { + use std::fs::Permissions; + use std::os::unix::fs::PermissionsExt; + + if let Some(mode) = file.unix_mode() { + std::fs::set_permissions(&path, Permissions::from_mode(mode))?; + } + } + + Ok(()) + }) + .collect::>() +} + +/// Extract a `.zip` or `.tar.gz` archive into the target directory. +pub fn extract_archive(source: impl AsRef, target: impl AsRef) -> Result<(), Error> { + // .zip + if source + .as_ref() + .extension() + .is_some_and(|ext| ext.eq_ignore_ascii_case("zip")) + { + unzip_archive(fs_err::File::open(source.as_ref())?, target.as_ref())?; + return Ok(()); + } + + // .tar.gz + if source + .as_ref() + .extension() + .is_some_and(|ext| ext.eq_ignore_ascii_case("gz")) + { + if source.as_ref().file_stem().is_some_and(|stem| { + Path::new(stem) + .extension() + .is_some_and(|ext| ext.eq_ignore_ascii_case("tar")) + }) { + let mut archive = tar::Archive::new(flate2::read::GzDecoder::new(fs_err::File::open( + source.as_ref(), + )?)); + // https://github.com/alexcrichton/tar-rs/issues/349 + archive.set_preserve_mtime(false); + archive.unpack(target)?; + return Ok(()); + } + } + + Err(Error::UnsupportedArchive(source.as_ref().to_path_buf())) +} diff --git a/crates/puffin-distribution/src/vendor/cloneable_seekable_reader.rs b/crates/puffin-extract/src/vendor/cloneable_seekable_reader.rs similarity index 97% rename from crates/puffin-distribution/src/vendor/cloneable_seekable_reader.rs rename to crates/puffin-extract/src/vendor/cloneable_seekable_reader.rs index a388e42c8..9048ac21e 100644 --- a/crates/puffin-distribution/src/vendor/cloneable_seekable_reader.rs +++ b/crates/puffin-extract/src/vendor/cloneable_seekable_reader.rs @@ -16,7 +16,8 @@ use std::{ /// A trait to represent some reader which has a total length known in /// advance. This is roughly equivalent to the nightly /// [`Seek::stream_len`] API. -pub(crate) trait HasLength { +#[allow(clippy::len_without_is_empty)] +pub trait HasLength { /// Return the current total length of this stream. fn len(&self) -> u64; } @@ -25,7 +26,7 @@ pub(crate) trait HasLength { /// and thus can be cloned cheaply. It supports seeking; each cloned instance /// maintains its own pointer into the file, and the underlying instance /// is seeked prior to each read. -pub(crate) struct CloneableSeekableReader { +pub struct CloneableSeekableReader { file: Arc>, pos: u64, // TODO determine and store this once instead of per cloneable file diff --git a/crates/puffin-extract/src/vendor/mod.rs b/crates/puffin-extract/src/vendor/mod.rs new file mode 100644 index 000000000..22f8fc671 --- /dev/null +++ b/crates/puffin-extract/src/vendor/mod.rs @@ -0,0 +1,3 @@ +pub use cloneable_seekable_reader::{CloneableSeekableReader, HasLength}; + +mod cloneable_seekable_reader; diff --git a/crates/puffin-installer/Cargo.toml b/crates/puffin-installer/Cargo.toml index 9cba1136f..3b2b9a1a3 100644 --- a/crates/puffin-installer/Cargo.toml +++ b/crates/puffin-installer/Cargo.toml @@ -14,14 +14,15 @@ workspace = true [dependencies] distribution-filename = { path = "../distribution-filename" } +distribution-types = { path = "../distribution-types" } install-wheel-rs = { path = "../install-wheel-rs", default-features = false } pep440_rs = { path = "../pep440-rs" } pep508_rs = { path = "../pep508-rs" } +platform-tags = { path = "../platform-tags" } puffin-cache = { path = "../puffin-cache" } puffin-client = { path = "../puffin-client" } -distribution-types = { path = "../distribution-types" } -platform-tags = { path = "../platform-tags" } puffin-distribution = { path = "../puffin-distribution" } +puffin-extract = { path = "../puffin-extract" } puffin-fs = { path = "../puffin-fs" } puffin-git = { path = "../puffin-git" } puffin-interpreter = { path = "../puffin-interpreter" } diff --git a/crates/puffin-installer/src/downloader.rs b/crates/puffin-installer/src/downloader.rs index 7ceb6900c..585f86512 100644 --- a/crates/puffin-installer/src/downloader.rs +++ b/crates/puffin-installer/src/downloader.rs @@ -1,6 +1,5 @@ use std::cmp::Reverse; use std::path::PathBuf; - use std::sync::Arc; use futures::{StreamExt, TryFutureExt}; @@ -12,18 +11,16 @@ use distribution_types::{CachedDist, Dist, RemoteSource, SourceDist}; use platform_tags::Tags; use puffin_cache::Cache; use puffin_client::RegistryClient; -use puffin_distribution::{ - DistributionDatabase, DistributionDatabaseError, LocalWheel, Unzip, UnzipError, -}; +use puffin_distribution::{DistributionDatabase, DistributionDatabaseError, LocalWheel, Unzip}; use puffin_traits::{BuildContext, OnceMap}; #[derive(thiserror::Error, Debug)] pub enum Error { #[error("Failed to unzip wheel: {0}")] - Unzip(Dist, #[source] UnzipError), + Unzip(Dist, #[source] puffin_extract::Error), #[error("Failed to fetch wheel: {0}")] Fetch(Dist, #[source] DistributionDatabaseError), - /// Should not occur, i've only seen it when another task panicked + /// Should not occur; only seen when another task panicked. #[error("The task executor is broken, did some other task panic?")] Join(#[from] JoinError), #[error("Unzip failed in another thread: {0}")] @@ -160,7 +157,7 @@ impl<'a, Context: BuildContext + Send + Sync> Downloader<'a, Context> { // Unzip the wheel. let normalized_path = tokio::task::spawn_blocking({ - move || -> Result { + move || -> Result { // Unzip the wheel into a temporary directory. let parent = download .target()