13 Commits

Author SHA1 Message Date
Sunshine
1c71e708e1 bump version number (2.6.2 -> 2.7.0), update dependencies 2022-11-10 06:36:32 -10:00
Sunshine
a1bb9a4b74 Update README.md 2022-11-10 06:03:30 -10:00
Sunshine
cf7e368545 Update README.md 2022-11-10 06:02:46 -10:00
Sunshine
c1edde9b3e refine CLI API for white/black-listing of domains 2022-11-10 05:37:36 -10:00
Sunshine
7c0504c4cb Update README.md 2022-11-10 04:45:37 -10:00
Sunshine
1bff2c22ba Update README.md 2022-11-10 04:41:13 -10:00
Sunshine
8113509dcf fix tests 2022-11-10 04:12:31 -10:00
Sunshine
8fc0fc155f parse XML documents, save non-HTML and non-XML targets unparsed 2022-11-10 04:12:31 -10:00
Jakub Jirutka
7c61b462dd disable unnecessary/unused regex features to reduce binary size
This will reduce the monolith binary size by ~15%.
2022-09-20 11:46:26 -04:00
Simone Mosciatti
ef3684025b move to use http instead of https 2022-09-11 14:30:44 -04:00
Simone Mosciatti
db7ee697b3 rewrite small part of the input argument handling
This commit rewrites a small part of the input argument handling, trying
to follow best Rust practices.
We get rid of a variable and of a mutable reference while keeping the
code a bit more concise.
2022-09-11 14:30:44 -04:00
Sunshine
89ce5029b9 add option to blacklist/whitelist domains 2022-09-01 13:35:52 -10:00
dependabot[bot]
54609b10e5 Bump iana-time-zone from 0.1.44 to 0.1.46 (#316)
Bumps [iana-time-zone](https://github.com/strawlab/iana-time-zone) from 0.1.44 to 0.1.46.
- [Release notes](https://github.com/strawlab/iana-time-zone/releases)
- [Changelog](https://github.com/strawlab/iana-time-zone/blob/main/CHANGELOG.md)
- [Commits](https://github.com/strawlab/iana-time-zone/compare/0.1.44...v0.1.46)

---
updated-dependencies:
- dependency-name: iana-time-zone
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2022-08-31 11:35:38 -10:00
9 changed files with 658 additions and 250 deletions

489
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,6 +1,6 @@
[package]
name = "monolith"
version = "2.6.2"
version = "2.7.0"
authors = [
"Sunshine <sunshine@uberspace.net>",
"Mahdi Robatipoor <mahdi.robatipoor@gmail.com>",
@@ -30,10 +30,15 @@ cssparser = "0.29.6"
encoding_rs = "0.8.31"
html5ever = "0.24.1"
percent-encoding = "2.1.0"
regex = "1.6.0" # Used for parsing srcset and NOSCRIPT
sha2 = "0.10.2" # Used for calculating checksums during integrity checks
url = "2.2.2"
# Used for parsing srcset and NOSCRIPT
[dependencies.regex]
version = "1.6.0"
default-features = false
features = ["std", "perf-dfa", "unicode-perl"]
[dependencies.reqwest]
version = "0.11.11"
default-features = false

View File

@@ -24,7 +24,7 @@ If compared to saving websites with `wget -mpk`, this tool embeds all assets as
## Installation
#### Using [Cargo](https://crates.io/crates/monolith)
#### Using [Cargo](https://crates.io/crates/monolith) (cross-platform)
```console
cargo install monolith
@@ -111,8 +111,10 @@ cat index.html | monolith -aIiFfcMv -b https://original.site/ - > result.html
- `-a`: Exclude audio sources
- `-b`: Use custom `base URL`
- `-B`: Forbid retrieving assets from specified domain(s)
- `-c`: Exclude CSS
- `-C`: Save document using custom `charset`
- `-d`: Allow retrieving assets only from specified `domain(s)`
- `-e`: Ignore network errors
- `-f`: Omit frames
- `-F`: Exclude web fonts
@@ -132,6 +134,35 @@ cat index.html | monolith -aIiFfcMv -b https://original.site/ - > result.html
---------------------------------------------------
## Whitelisting and blacklisting domains
Options `-d` and `-B` control which domains assets may be retrieved from. By default, the domains listed with `-d` form a whitelist; adding `-B` turns that list into a blacklist. E.g.:
```console
monolith -I -d example.com -d www.example.com https://example.com -o example-only.html
```
```console
monolith -I -B -d .googleusercontent.com -d googleanalytics.com -d .google.com https://example.com -o example-no-ads.html
```
---------------------------------------------------
## Dynamic content
Monolith doesn't feature a JavaScript engine, hence websites that retrieve and display data after initial load may require usage of additional tools.
For example, Chromium (Chrome) can be used to act as a pre-processor for such pages:
```console
chromium --headless --incognito --dump-dom https://github.com | monolith - -I -b https://github.com -o github.html
```
---------------------------------------------------
## Proxies
Please set `https_proxy`, `http_proxy`, and `no_proxy` environment variables.

View File

@@ -65,10 +65,9 @@ pub fn read_stdin() -> Vec<u8> {
fn main() {
let options = Options::from_args();
let mut target: String = options.target.clone();
// Check if target was provided
if target.len() == 0 {
if options.target.len() == 0 {
if !options.silent {
eprintln!("No target specified");
}
@@ -83,65 +82,62 @@ fn main() {
}
}
let target_url: Url;
let mut use_stdin: bool = false;
// Determine exact target URL
if target.clone() == "-" {
// Read from pipe (stdin)
use_stdin = true;
// Set default target URL to an empty data URL; the user can set it via --base-url
target_url = Url::parse("data:text/html,").unwrap();
} else {
match Url::parse(&target.clone()) {
Ok(parsed_url) => {
if parsed_url.scheme() == "data"
|| parsed_url.scheme() == "file"
|| (parsed_url.scheme() == "http" || parsed_url.scheme() == "https")
{
target_url = parsed_url;
} else {
let target_url = match options.target.as_str() {
"-" => {
// Read from pipe (stdin)
use_stdin = true;
// Set default target URL to an empty data URL; the user can set it via --base-url
Url::parse("data:text/html,").unwrap()
}
target => match Url::parse(&target) {
Ok(url) => match url.scheme() {
"data" | "file" | "http" | "https" => url,
unsupported_scheme => {
if !options.silent {
eprintln!("Unsupported target URL type: {}", &parsed_url.scheme());
eprintln!("Unsupported target URL type: {}", unsupported_scheme);
}
process::exit(1);
process::exit(1)
}
}
Err(_err) => {
},
Err(_) => {
// Failed to parse the given target as a URL (perhaps it's a filesystem path?)
let path: &Path = Path::new(&target);
if path.exists() {
if path.is_file() {
match Url::from_file_path(fs::canonicalize(&path).unwrap()) {
Ok(file_url) => {
target_url = file_url;
}
Err(_err) => {
if !options.silent {
eprintln!(
"Could not generate file URL out of given path: {}",
"err"
);
match path.exists() {
true => match path.is_file() {
true => {
let canonical_path = fs::canonicalize(&path).unwrap();
match Url::from_file_path(canonical_path) {
Ok(url) => url,
Err(_) => {
if !options.silent {
eprintln!(
"Could not generate file URL out of given path: {}",
&target
);
}
process::exit(1);
}
process::exit(1);
}
}
} else {
if !options.silent {
eprintln!("Local target is not a file: {}", &options.target);
false => {
if !options.silent {
eprintln!("Local target is not a file: {}", &target);
}
process::exit(1);
}
process::exit(1);
},
false => {
// It is not a FS path, now we do what browsers do:
// prepend "http://" and hope it points to a website
Url::parse(&format!("http://{hopefully_url}", hopefully_url = &target))
.unwrap()
}
} else {
// Last chance, now we do what browsers do:
// prepend "http://" and hope it points to a website
target.insert_str(0, "http://");
target_url = Url::parse(&target).unwrap();
}
}
}
}
},
};
// Initialize client
let mut cache = HashMap::new();
@@ -179,12 +175,21 @@ fn main() {
{
match retrieve_asset(&mut cache, &client, &target_url, &target_url, &options, 0) {
Ok((retrieved_data, final_url, media_type, charset)) => {
// Make sure the media type is text/html
if !media_type.eq_ignore_ascii_case("text/html") {
if !options.silent {
eprintln!("Unsupported document media type");
}
process::exit(1);
// Provide output as text without processing it, the way browsers do
if !media_type.eq_ignore_ascii_case("text/html")
&& !media_type.eq_ignore_ascii_case("application/xhtml+xml")
{
// Define output
let mut output =
Output::new(&options.output).expect("Could not prepare output");
// Write retrieved data into STDOUT or file
output
.write(&retrieved_data)
.expect("Could not write output");
// Nothing else to do past this point
process::exit(0);
}
if options
@@ -328,6 +333,6 @@ fn main() {
// Define output
let mut output = Output::new(&options.output).expect("Could not prepare output");
// Write result into stdout or file
output.write(&result).expect("Could not write HTML output");
// Write result into STDOUT or file
output.write(&result).expect("Could not write output");
}

View File

@@ -1,12 +1,14 @@
use clap::{App, Arg};
use clap::{App, Arg, ArgAction};
use std::env;
#[derive(Default)]
pub struct Options {
pub no_audio: bool,
pub base_url: Option<String>,
pub blacklist_domains: bool,
pub no_css: bool,
pub charset: Option<String>,
pub domains: Option<Vec<String>>,
pub ignore_errors: bool,
pub no_frames: bool,
pub no_fonts: bool,
@@ -48,8 +50,20 @@ impl Options {
.about(format!("{}\n{}", ASCII, env!("CARGO_PKG_DESCRIPTION")).as_str())
.args_from_usage("-a, --no-audio 'Removes audio sources'")
.args_from_usage("-b, --base-url=[http://localhost/] 'Sets custom base URL'")
.args_from_usage(
"-B, --blacklist-domains 'Treat list of specified domains as blacklist'",
)
.args_from_usage("-c, --no-css 'Removes CSS'")
.args_from_usage("-C, --charset=[UTF-8] 'Enforces custom encoding'")
.arg(
Arg::with_name("domains")
.short('d')
.long("domain")
.takes_value(true)
.value_name("example.com")
.action(ArgAction::Append)
.help("Specify domains to use for white/black-listing"),
)
.args_from_usage("-e, --ignore-errors 'Ignore network errors'")
.args_from_usage("-f, --no-frames 'Removes frames and iframes'")
.args_from_usage("-F, --no-fonts 'Removes fonts'")
@@ -87,10 +101,15 @@ impl Options {
if let Some(base_url) = app.value_of("base-url") {
options.base_url = Some(base_url.to_string());
}
options.blacklist_domains = app.is_present("blacklist-domains");
options.no_css = app.is_present("no-css");
if let Some(charset) = app.value_of("charset") {
options.charset = Some(charset.to_string());
}
if let Some(domains) = app.get_many::<String>("domains") {
let list_of_domains: Vec<String> = domains.map(|v| v.clone()).collect::<Vec<_>>();
options.domains = Some(list_of_domains);
}
options.ignore_errors = app.is_present("ignore-errors");
options.no_frames = app.is_present("no-frames");
options.no_fonts = app.is_present("no-fonts");

View File

@@ -92,6 +92,62 @@ pub fn detect_media_type_by_file_name(filename: &str) -> String {
mime.to_string()
}
/// Reports whether `domain` is covered by the pattern `domain_to_match_against`.
///
/// Both arguments have their trailing dots removed and are split on `.`; the
/// resulting labels are compared from right to left (top-level label first),
/// ignoring ASCII case.
///
/// Pattern rules:
/// - An empty pattern matches nothing.
/// - The pattern `"."` matches every domain, including the empty string.
/// - A pattern with a leading dot (e.g. `".example.com"`) matches `example.com`
///   itself as well as any of its sub-domains.
/// - A pattern without a leading dot (e.g. `"example.com"`) never matches a
///   domain that has more labels than the pattern.
/// - An empty label inside the pattern accepts any label at that position.
pub fn domain_is_within_domain(domain: &str, domain_to_match_against: &str) -> bool {
    // The two degenerate patterns are decided without looking at the domain
    match domain_to_match_against {
        "" => return false,
        "." => return true,
        _ => {}
    }

    // Labels ordered from the top-level label down to the deepest sub-domain
    let labels: Vec<&str> = domain.trim_end_matches('.').rsplit('.').collect();
    let pattern_labels: Vec<&str> = domain_to_match_against
        .trim_end_matches('.')
        .rsplit('.')
        .collect();

    let pattern_is_dotted = domain_to_match_against.starts_with('.');
    let depth = labels.len().max(pattern_labels.len());

    (0..depth).all(|position| match pattern_labels.get(position) {
        // The domain is deeper than the pattern: only a dotted pattern covers sub-domains
        None => pattern_is_dotted,
        // An empty pattern label accepts anything; otherwise the labels must be equal
        Some(pattern_label) => {
            let label = labels.get(position).copied().unwrap_or("");
            pattern_label.is_empty() || pattern_label.eq_ignore_ascii_case(label)
        }
    })
}
pub fn indent(level: u32) -> String {
let mut result: String = String::new();
let mut l: u32 = level;
@@ -148,7 +204,7 @@ pub fn retrieve_asset(
let (media_type, charset, data) = parse_data_url(url);
Ok((data, url.clone(), media_type, charset))
} else if url.scheme() == "file" {
// Check if parent_url is also file:/// (if not, then we don't embed the asset)
// Check if parent_url is also a file: URL (if not, then we don't embed the asset)
if parent_url.scheme() != "file" {
if !options.silent {
eprintln!(
@@ -236,6 +292,17 @@ pub fn retrieve_asset(
"".to_string(),
))
} else {
if let Some(domains) = &options.domains {
let domain_matches = domains
.iter()
.any(|d| domain_is_within_domain(url.host_str().unwrap(), &d.trim()));
if (options.blacklist_domains && domain_matches)
|| (!options.blacklist_domains && !domain_matches)
{
return Err(client.get("").send().unwrap_err());
}
}
// URL not in cache, we retrieve the file
match client.get(url.as_str()).send() {
Ok(response) => {

View File

@@ -196,17 +196,14 @@ mod failing {
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
let out = cmd.arg("data:,Hello%2C%20World!").output().unwrap();
// STDERR should contain error description
assert_eq!(
String::from_utf8_lossy(&out.stderr),
"Unsupported document media type\n"
);
// STDERR should be empty
assert_eq!(String::from_utf8_lossy(&out.stderr), "");
// STDOUT should contain HTML
assert_eq!(String::from_utf8_lossy(&out.stdout), "");
// STDOUT should contain text
assert_eq!(String::from_utf8_lossy(&out.stdout), "Hello, World!\n");
// Exit code should be 1
out.assert().code(1);
// Exit code should be 0
out.assert().code(0);
}
#[test]
@@ -221,7 +218,7 @@ mod failing {
// STDERR should be empty
assert_eq!(String::from_utf8_lossy(&out.stderr), "");
// STDOUT should contain HTML with no JS in it
// STDOUT should contain HTML without contents of local JS file
assert_eq!(
String::from_utf8_lossy(&out.stdout),
"<html><head><script src=\"data:application/javascript;base64,\"></script></head><body></body></html>\n"

View File

@@ -0,0 +1,154 @@
// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗
// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝
// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗
// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod passing {
    use monolith::utils::domain_is_within_domain;

    // A dotted pattern naming the sub-domain itself covers that sub-domain
    #[test]
    fn sub_domain_is_within_dotted_sub_domain() {
        let domain = "news.ycombinator.com";
        let pattern = ".news.ycombinator.com";

        assert!(domain_is_within_domain(domain, pattern));
    }

    // A dotted pattern also covers the bare domain it names
    #[test]
    fn domain_is_within_dotted_domain() {
        let domain = "ycombinator.com";
        let pattern = ".ycombinator.com";

        assert!(domain_is_within_domain(domain, pattern));
    }

    // A dotted pattern covers sub-domains of the domain it names
    #[test]
    fn sub_domain_is_within_dotted_domain() {
        let domain = "news.ycombinator.com";
        let pattern = ".ycombinator.com";

        assert!(domain_is_within_domain(domain, pattern));
    }

    // A dotted top-level pattern covers everything beneath it
    #[test]
    fn sub_domain_is_within_dotted_top_level_domain() {
        let domain = "news.ycombinator.com";
        let pattern = ".com";

        assert!(domain_is_within_domain(domain, pattern));
    }

    // An undotted pattern covers the identical domain
    #[test]
    fn domain_is_within_itself() {
        let domain = "ycombinator.com";
        let pattern = "ycombinator.com";

        assert!(domain_is_within_domain(domain, pattern));
    }

    // A trailing dot on the domain does not affect the comparison
    #[test]
    fn domain_with_trailing_dot_is_within_itself() {
        let domain = "ycombinator.com.";
        let pattern = "ycombinator.com";

        assert!(domain_is_within_domain(domain, pattern));
    }

    // The single-dot pattern covers a domain written with a trailing dot
    #[test]
    fn domain_with_trailing_dot_is_within_single_dot() {
        let domain = "ycombinator.com.";
        let pattern = ".";

        assert!(domain_is_within_domain(domain, pattern));
    }

    // The single-dot pattern covers an ordinary domain
    #[test]
    fn domain_matches_single_dot() {
        let domain = "ycombinator.com";
        let pattern = ".";

        assert!(domain_is_within_domain(domain, pattern));
    }

    // A domain written with a leading dot is covered by the same dotted pattern
    #[test]
    fn dotted_domain_must_be_within_dotted_domain() {
        let domain = ".ycombinator.com";
        let pattern = ".ycombinator.com";

        assert!(domain_is_within_domain(domain, pattern));
    }

    // The single-dot pattern covers even the empty domain
    #[test]
    fn empty_is_within_dot() {
        let domain = "";
        let pattern = ".";

        assert!(domain_is_within_domain(domain, pattern));
    }

    // A single dot is covered by the single-dot pattern
    #[test]
    fn both_dots() {
        let domain = ".";
        let pattern = ".";

        assert!(domain_is_within_domain(domain, pattern));
    }
}
// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗
// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝
// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗
// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod failing {
    use monolith::utils::domain_is_within_domain;

    // Without a leading dot, a pattern does not cover sub-domains
    #[test]
    fn sub_domain_must_not_be_within_domain() {
        let domain = "news.ycombinator.com";
        let pattern = "ycombinator.com";

        assert!(!domain_is_within_domain(domain, pattern));
    }

    // An undotted top-level pattern does not cover domains beneath it
    #[test]
    fn domain_must_not_be_within_top_level_domain() {
        let domain = "ycombinator.com";
        let pattern = "com";

        assert!(!domain_is_within_domain(domain, pattern));
    }

    // Unrelated domains never cover each other
    #[test]
    fn different_domains_must_not_be_within_one_another() {
        let domain = "news.ycombinator.com";
        let pattern = "kernel.org";

        assert!(!domain_is_within_domain(domain, pattern));
    }

    // A mismatching top-level label rules the domain out
    #[test]
    fn sub_domain_is_not_within_wrong_top_level_domain() {
        let domain = "news.ycombinator.com";
        let pattern = "org";

        assert!(!domain_is_within_domain(domain, pattern));
    }

    // A domain written with a leading dot is not covered by an undotted pattern
    #[test]
    fn dotted_domain_is_not_within_domain() {
        let domain = ".ycombinator.com";
        let pattern = "ycombinator.com";

        assert!(!domain_is_within_domain(domain, pattern));
    }

    // A dotted pattern does not cover a different domain under the same top-level label
    #[test]
    fn different_domain_is_not_within_dotted_domain() {
        let domain = "www.doodleoptimize.com";
        let pattern = ".ycombinator.com";

        assert!(!domain_is_within_domain(domain, pattern));
    }

    // The empty pattern covers nothing
    #[test]
    fn no_domain_can_be_within_empty_domain() {
        let domain = "ycombinator.com";
        let pattern = "";

        assert!(!domain_is_within_domain(domain, pattern));
    }

    // The empty pattern does not even cover the empty domain
    #[test]
    fn both_can_not_be_empty() {
        let domain = "";
        let pattern = "";

        assert!(!domain_is_within_domain(domain, pattern));
    }
}

View File

@@ -1,4 +1,5 @@
mod detect_media_type;
mod domain_is_within_domain;
mod indent;
mod parse_content_type;
mod retrieve_asset;