bump version number (2.6.2 -> 2.7.0), update dependencies

Update README.md
2022-11-10 06:36:32 -10:00 · 2022-11-10 06:03:30 -10:00 · 2022-11-10 06:02:46 -10:00 · 2022-11-10 05:37:36 -10:00 · 2022-11-10 04:45:37 -10:00 · 2022-11-10 04:41:13 -10:00
9 changed files with 658 additions and 250 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "monolith"
-version = "2.6.2"
+version = "2.7.0"
 authors = [
    "Sunshine <sunshine@uberspace.net>",
    "Mahdi Robatipoor <mahdi.robatipoor@gmail.com>",
@@ -30,10 +30,15 @@ cssparser = "0.29.6"
 encoding_rs = "0.8.31"
 html5ever = "0.24.1"
 percent-encoding = "2.1.0"
-regex = "1.6.0" # Used for parsing srcset and NOSCRIPT
 sha2 = "0.10.2" # Used for calculating checksums during integrity checks
 url = "2.2.2"

+# Used for parsing srcset and NOSCRIPT
+[dependencies.regex]
+version = "1.6.0"
+default-features = false
+features = ["std", "perf-dfa", "unicode-perl"]
+
 [dependencies.reqwest]
 version = "0.11.11"
 default-features = false
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ If compared to saving websites with `wget -mpk`, this tool embeds all assets as

 ## Installation

-#### Using [Cargo](https://crates.io/crates/monolith)
+#### Using [Cargo](https://crates.io/crates/monolith) (cross-platform)

 ```console
 cargo install monolith
@@ -111,8 +111,10 @@ cat index.html | monolith -aIiFfcMv -b https://original.site/ - > result.html

 - `-a`: Exclude audio sources
 - `-b`: Use custom `base URL`
+ - `-B`: Forbid retrieving assets from specified domain(s)
 - `-c`: Exclude CSS
 - `-C`: Save document using custom `charset`
+ - `-d`: Allow retrieving assets only from specified `domain(s)`
 - `-e`: Ignore network errors
 - `-f`: Omit frames
 - `-F`: Exclude web fonts
@@ -132,6 +134,35 @@ cat index.html | monolith -aIiFfcMv -b https://original.site/ - > result.html
 ---------------------------------------------------


+## Whitelisting and blacklisting domains
+
+Options `-d` and `-B` control which domains assets may be retrieved from. By default, the domains given via `-d` form a whitelist; adding `-B` turns that same list into a blacklist. E.g.:
+
+```console
+monolith -I -d example.com -d www.example.com https://example.com -o example-only.html
+```
+
+```console
+monolith -I -B -d .googleusercontent.com -d googleanalytics.com -d .google.com https://example.com -o example-no-ads.html
+```
+
+---------------------------------------------------
+
+
+## Dynamic content
+
+Monolith doesn't feature a JavaScript engine, hence websites that retrieve and display data after initial load may require usage of additional tools.
+
+For example, Chromium (Chrome) can be used to act as a pre-processor for such pages:
+
+```console
+chromium --headless --incognito --dump-dom https://github.com | monolith - -I -b https://github.com -o github.html
+```
+
+
+---------------------------------------------------
+
+
 ## Proxies

 Please set `https_proxy`, `http_proxy`, and `no_proxy` environment variables.
--- a/src/main.rs
+++ b/src/main.rs
@@ -65,10 +65,9 @@ pub fn read_stdin() -> Vec<u8> {

 fn main() {
    let options = Options::from_args();
-    let mut target: String = options.target.clone();

    // Check if target was provided
-    if target.len() == 0 {
+    if options.target.len() == 0 {
        if !options.silent {
            eprintln!("No target specified");
        }
@@ -83,65 +82,62 @@ fn main() {
        }
    }

-    let target_url: Url;
    let mut use_stdin: bool = false;

-    // Determine exact target URL
-    if target.clone() == "-" {
-        // Read from pipe (stdin)
-        use_stdin = true;
-        // Set default target URL to an empty data URL; the user can set it via --base-url
-        target_url = Url::parse("data:text/html,").unwrap();
-    } else {
-        match Url::parse(&target.clone()) {
-            Ok(parsed_url) => {
-                if parsed_url.scheme() == "data"
-                    || parsed_url.scheme() == "file"
-                    || (parsed_url.scheme() == "http" || parsed_url.scheme() == "https")
-                {
-                    target_url = parsed_url;
-                } else {
+    let target_url = match options.target.as_str() {
+        "-" => {
+            // Read from pipe (stdin)
+            use_stdin = true;
+            // Set default target URL to an empty data URL; the user can set it via --base-url
+            Url::parse("data:text/html,").unwrap()
+        }
+        target => match Url::parse(&target) {
+            Ok(url) => match url.scheme() {
+                "data" | "file" | "http" | "https" => url,
+                unsupported_scheme => {
                    if !options.silent {
-                        eprintln!("Unsupported target URL type: {}", &parsed_url.scheme());
+                        eprintln!("Unsupported target URL type: {}", unsupported_scheme);
                    }
-                    process::exit(1);
+                    process::exit(1)
                }
-            }
-            Err(_err) => {
+            },
+            Err(_) => {
                // Failed to parse given base URL (perhaps it's a filesystem path?)
                let path: &Path = Path::new(&target);
-
-                if path.exists() {
-                    if path.is_file() {
-                        match Url::from_file_path(fs::canonicalize(&path).unwrap()) {
-                            Ok(file_url) => {
-                                target_url = file_url;
-                            }
-                            Err(_err) => {
-                                if !options.silent {
-                                    eprintln!(
-                                        "Could not generate file URL out of given path: {}",
-                                        "err"
-                                    );
+                match path.exists() {
+                    true => match path.is_file() {
+                        true => {
+                            let canonical_path = fs::canonicalize(&path).unwrap();
+                            match Url::from_file_path(canonical_path) {
+                                Ok(url) => url,
+                                Err(_) => {
+                                    if !options.silent {
+                                        eprintln!(
+                                            "Could not generate file URL out of given path: {}",
+                                            &target
+                                        );
+                                    }
+                                    process::exit(1);
                                }
-                                process::exit(1);
                            }
                        }
-                    } else {
-                        if !options.silent {
-                            eprintln!("Local target is not a file: {}", &options.target);
+                        false => {
+                            if !options.silent {
+                                eprintln!("Local target is not a file: {}", &target);
+                            }
+                            process::exit(1);
                        }
-                        process::exit(1);
+                    },
+                    false => {
+                        // It is not a FS path, now we do what browsers do:
+                        // prepend "http://" and hope it points to a website
+                        Url::parse(&format!("http://{hopefully_url}", hopefully_url = &target))
+                            .unwrap()
                    }
-                } else {
-                    // Last chance, now we do what browsers do:
-                    // prepend "http://" and hope it points to a website
-                    target.insert_str(0, "http://");
-                    target_url = Url::parse(&target).unwrap();
                }
            }
-        }
-    }
+        },
+    };

    // Initialize client
    let mut cache = HashMap::new();
@@ -179,12 +175,21 @@ fn main() {
    {
        match retrieve_asset(&mut cache, &client, &target_url, &target_url, &options, 0) {
            Ok((retrieved_data, final_url, media_type, charset)) => {
-                // Make sure the media type is text/html
-                if !media_type.eq_ignore_ascii_case("text/html") {
-                    if !options.silent {
-                        eprintln!("Unsupported document media type");
-                    }
-                    process::exit(1);
+                // Provide output as text without processing it, the way browsers do
+                if !media_type.eq_ignore_ascii_case("text/html")
+                    && !media_type.eq_ignore_ascii_case("application/xhtml+xml")
+                {
+                    // Define output
+                    let mut output =
+                        Output::new(&options.output).expect("Could not prepare output");
+
+                    // Write retrieved data into STDOUT or file
+                    output
+                        .write(&retrieved_data)
+                        .expect("Could not write output");
+
+                    // Nothing else to do past this point
+                    process::exit(0);
                }

                if options
@@ -328,6 +333,6 @@ fn main() {
    // Define output
    let mut output = Output::new(&options.output).expect("Could not prepare output");

-    // Write result into stdout or file
-    output.write(&result).expect("Could not write HTML output");
+    // Write result into STDOUT or file
+    output.write(&result).expect("Could not write output");
 }
--- a/src/opts.rs
+++ b/src/opts.rs
@@ -1,12 +1,14 @@
-use clap::{App, Arg};
+use clap::{App, Arg, ArgAction};
 use std::env;

 #[derive(Default)]
 pub struct Options {
    pub no_audio: bool,
    pub base_url: Option<String>,
+    pub blacklist_domains: bool,
    pub no_css: bool,
    pub charset: Option<String>,
+    pub domains: Option<Vec<String>>,
    pub ignore_errors: bool,
    pub no_frames: bool,
    pub no_fonts: bool,
@@ -48,8 +50,20 @@ impl Options {
            .about(format!("{}\n{}", ASCII, env!("CARGO_PKG_DESCRIPTION")).as_str())
            .args_from_usage("-a, --no-audio 'Removes audio sources'")
            .args_from_usage("-b, --base-url=[http://localhost/] 'Sets custom base URL'")
+            .args_from_usage(
+                "-B, --blacklist-domains 'Treat list of specified domains as blacklist'",
+            )
            .args_from_usage("-c, --no-css 'Removes CSS'")
            .args_from_usage("-C, --charset=[UTF-8] 'Enforces custom encoding'")
+            .arg(
+                Arg::with_name("domains")
+                    .short('d')
+                    .long("domain")
+                    .takes_value(true)
+                    .value_name("example.com")
+                    .action(ArgAction::Append)
+                    .help("Specify domains to use for white/black-listing"),
+            )
            .args_from_usage("-e, --ignore-errors 'Ignore network errors'")
            .args_from_usage("-f, --no-frames 'Removes frames and iframes'")
            .args_from_usage("-F, --no-fonts 'Removes fonts'")
@@ -87,10 +101,15 @@ impl Options {
        if let Some(base_url) = app.value_of("base-url") {
            options.base_url = Some(base_url.to_string());
        }
+        options.blacklist_domains = app.is_present("blacklist-domains");
        options.no_css = app.is_present("no-css");
        if let Some(charset) = app.value_of("charset") {
            options.charset = Some(charset.to_string());
        }
+        if let Some(domains) = app.get_many::<String>("domains") {
+            let list_of_domains: Vec<String> = domains.map(|v| v.clone()).collect::<Vec<_>>();
+            options.domains = Some(list_of_domains);
+        }
        options.ignore_errors = app.is_present("ignore-errors");
        options.no_frames = app.is_present("no-frames");
        options.no_fonts = app.is_present("no-fonts");
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -92,6 +92,62 @@ pub fn detect_media_type_by_file_name(filename: &str) -> String {
    mime.to_string()
 }

+/// Checks whether `domain` falls within `domain_to_match_against`.
+///
+/// Labels are compared from right to left, ignoring ASCII case; trailing dots on
+/// either argument are ignored. A pattern with a leading dot (e.g. `.example.com`)
+/// accepts the domain itself as well as any of its sub-domains, whereas a pattern
+/// without one rejects every domain that has more labels than the pattern.
+/// A lone `.` accepts everything, and an empty pattern accepts nothing.
+///
+/// * `domain` - host name being tested
+/// * `domain_to_match_against` - pattern (white/black-list entry) to test it against
+pub fn domain_is_within_domain(domain: &str, domain_to_match_against: &str) -> bool {
+    if domain_to_match_against.is_empty() {
+        return false;
+    }
+
+    if domain_to_match_against == "." {
+        return true;
+    }
+
+    // Split both names into labels, top-level label first
+    let domain_labels: Vec<&str> = domain.trim_end_matches('.').rsplit('.').collect();
+    let pattern_labels: Vec<&str> = domain_to_match_against
+        .trim_end_matches('.')
+        .rsplit('.')
+        .collect();
+    let pattern_is_dotted = domain_to_match_against.starts_with('.');
+
+    // A pattern without a leading dot can never cover a domain that has more labels
+    if !pattern_is_dotted && domain_labels.len() > pattern_labels.len() {
+        return false;
+    }
+
+    // Empty pattern labels (e.g. the one produced by a leading dot) accept anything,
+    // and domain labels beyond the end of the pattern need no further checking
+    pattern_labels
+        .iter()
+        .enumerate()
+        .filter(|(_, pattern_label)| !pattern_label.is_empty())
+        .all(|(i, pattern_label)| {
+            let domain_label = domain_labels.get(i).copied().unwrap_or("");
+            pattern_label.eq_ignore_ascii_case(domain_label)
+        })
+}
+
 pub fn indent(level: u32) -> String {
    let mut result: String = String::new();
    let mut l: u32 = level;
@@ -148,7 +204,7 @@ pub fn retrieve_asset(
        let (media_type, charset, data) = parse_data_url(url);
        Ok((data, url.clone(), media_type, charset))
    } else if url.scheme() == "file" {
-        // Check if parent_url is also file:/// (if not, then we don't embed the asset)
+        // Check if parent_url is also a file: URL (if not, then we don't embed the asset)
        if parent_url.scheme() != "file" {
            if !options.silent {
                eprintln!(
@@ -236,6 +292,17 @@ pub fn retrieve_asset(
                "".to_string(),
            ))
        } else {
+            if let Some(domains) = &options.domains {
+                let domain_matches = domains
+                    .iter()
+                    .any(|d| domain_is_within_domain(url.host_str().unwrap(), &d.trim()));
+                if (options.blacklist_domains && domain_matches)
+                    || (!options.blacklist_domains && !domain_matches)
+                {
+                    return Err(client.get("").send().unwrap_err());
+                }
+            }
+
            // URL not in cache, we retrieve the file
            match client.get(url.as_str()).send() {
                Ok(response) => {
--- a/tests/cli/data_url.rs
+++ b/tests/cli/data_url.rs
@@ -196,17 +196,14 @@ mod failing {
        let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
        let out = cmd.arg("data:,Hello%2C%20World!").output().unwrap();

-        // STDERR should contain error description
-        assert_eq!(
-            String::from_utf8_lossy(&out.stderr),
-            "Unsupported document media type\n"
-        );
+        // STDERR should be empty
+        assert_eq!(String::from_utf8_lossy(&out.stderr), "");

-        // STDOUT should contain HTML
-        assert_eq!(String::from_utf8_lossy(&out.stdout), "");
+        // STDOUT should contain text
+        assert_eq!(String::from_utf8_lossy(&out.stdout), "Hello, World!\n");

-        // Exit code should be 1
-        out.assert().code(1);
+        // Exit code should be 0
+        out.assert().code(0);
    }

    #[test]
@@ -221,7 +218,7 @@ mod failing {
        // STDERR should be empty
        assert_eq!(String::from_utf8_lossy(&out.stderr), "");

-        // STDOUT should contain HTML with no JS in it
+        // STDOUT should contain HTML without contents of local JS file
        assert_eq!(
            String::from_utf8_lossy(&out.stdout),
            "<html><head><script src=\"data:application/javascript;base64,\"></script></head><body></body></html>\n"
--- a/tests/utils/domain_is_within_domain.rs
+++ b/tests/utils/domain_is_within_domain.rs
@@ -0,0 +1,154 @@
+//  ██████╗  █████╗ ███████╗███████╗██╗███╗   ██╗ ██████╗
+//  ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗  ██║██╔════╝
+//  ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║  ███╗
+//  ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║   ██║
+//  ██║     ██║  ██║███████║███████║██║██║ ╚████║╚██████╔╝
+//  ╚═╝     ╚═╝  ╚═╝╚══════╝╚══════╝╚═╝╚═╝  ╚═══╝ ╚═════╝
+
+#[cfg(test)]
+mod passing {
+    use monolith::utils::domain_is_within_domain;
+
+    /// A dotted pattern accepts the very sub-domain it names.
+    #[test]
+    fn sub_domain_is_within_dotted_sub_domain() {
+        assert!(domain_is_within_domain("news.ycombinator.com", ".news.ycombinator.com"));
+    }
+
+    /// A dotted pattern accepts the bare domain it names.
+    #[test]
+    fn domain_is_within_dotted_domain() {
+        assert!(domain_is_within_domain("ycombinator.com", ".ycombinator.com"));
+    }
+
+    /// A dotted pattern accepts sub-domains of the domain it names.
+    #[test]
+    fn sub_domain_is_within_dotted_domain() {
+        assert!(domain_is_within_domain("news.ycombinator.com", ".ycombinator.com"));
+    }
+
+    /// A dotted top-level pattern accepts anything underneath it.
+    #[test]
+    fn sub_domain_is_within_dotted_top_level_domain() {
+        assert!(domain_is_within_domain("news.ycombinator.com", ".com"));
+    }
+
+    /// An undotted pattern accepts the identical domain.
+    #[test]
+    fn domain_is_within_itself() {
+        assert!(domain_is_within_domain("ycombinator.com", "ycombinator.com"));
+    }
+
+    /// A trailing dot on the domain does not prevent a match.
+    #[test]
+    fn domain_with_trailing_dot_is_within_itself() {
+        assert!(domain_is_within_domain("ycombinator.com.", "ycombinator.com"));
+    }
+
+    /// The single-dot pattern accepts a domain carrying a trailing dot.
+    #[test]
+    fn domain_with_trailing_dot_is_within_single_dot() {
+        assert!(domain_is_within_domain("ycombinator.com.", "."));
+    }
+
+    /// The single-dot pattern accepts an ordinary domain.
+    #[test]
+    fn domain_matches_single_dot() {
+        assert!(domain_is_within_domain("ycombinator.com", "."));
+    }
+
+    /// A domain written with a leading dot is accepted by the same dotted pattern.
+    #[test]
+    fn dotted_domain_must_be_within_dotted_domain() {
+        assert!(domain_is_within_domain(".ycombinator.com", ".ycombinator.com"));
+    }
+
+    /// The single-dot pattern accepts even an empty domain.
+    #[test]
+    fn empty_is_within_dot() {
+        assert!(domain_is_within_domain("", "."));
+    }
+
+    /// A single dot is accepted by the single-dot pattern.
+    #[test]
+    fn both_dots() {
+        assert!(domain_is_within_domain(".", "."));
+    }
+}
+
+//  ███████╗ █████╗ ██╗██╗     ██╗███╗   ██╗ ██████╗
+//  ██╔════╝██╔══██╗██║██║     ██║████╗  ██║██╔════╝
+//  █████╗  ███████║██║██║     ██║██╔██╗ ██║██║  ███╗
+//  ██╔══╝  ██╔══██║██║██║     ██║██║╚██╗██║██║   ██║
+//  ██║     ██║  ██║██║███████╗██║██║ ╚████║╚██████╔╝
+//  ╚═╝     ╚═╝  ╚═╝╚═╝╚══════╝╚═╝╚═╝  ╚═══╝ ╚═════╝
+
+#[cfg(test)]
+mod failing {
+    use monolith::utils::domain_is_within_domain;
+
+    /// An undotted pattern rejects sub-domains of the domain it names.
+    #[test]
+    fn sub_domain_must_not_be_within_domain() {
+        assert!(!domain_is_within_domain("news.ycombinator.com", "ycombinator.com"));
+    }
+
+    /// An undotted top-level pattern rejects domains underneath it.
+    #[test]
+    fn domain_must_not_be_within_top_level_domain() {
+        assert!(!domain_is_within_domain("ycombinator.com", "com"));
+    }
+
+    /// Unrelated domains are rejected.
+    #[test]
+    fn different_domains_must_not_be_within_one_another() {
+        assert!(!domain_is_within_domain("news.ycombinator.com", "kernel.org"));
+    }
+
+    /// A sub-domain is rejected by a different, undotted top-level pattern.
+    #[test]
+    fn sub_domain_is_not_within_wrong_top_level_domain() {
+        assert!(!domain_is_within_domain("news.ycombinator.com", "org"));
+    }
+
+    /// A domain written with a leading dot is rejected by the undotted pattern.
+    #[test]
+    fn dotted_domain_is_not_within_domain() {
+        assert!(!domain_is_within_domain(".ycombinator.com", "ycombinator.com"));
+    }
+
+    /// A dotted pattern rejects sub-domains of a different domain.
+    #[test]
+    fn different_domain_is_not_within_dotted_domain() {
+        assert!(!domain_is_within_domain("www.doodleoptimize.com", ".ycombinator.com"));
+    }
+
+    /// An empty pattern rejects every domain.
+    #[test]
+    fn no_domain_can_be_within_empty_domain() {
+        assert!(!domain_is_within_domain("ycombinator.com", ""));
+    }
+
+    /// An empty pattern rejects even an empty domain.
+    #[test]
+    fn both_can_not_be_empty() {
+        assert!(!domain_is_within_domain("", ""));
+    }
+}
--- a/tests/utils/mod.rs
+++ b/tests/utils/mod.rs
@@ -1,4 +1,5 @@
 mod detect_media_type;
+mod domain_is_within_domain;
 mod indent;
 mod parse_content_type;
 mod retrieve_asset;
Author	SHA1	Message	Date
Sunshine	1c71e708e1	bump version number (2.6.2 -> 2.7.0), update dependencies	2022-11-10 06:36:32 -10:00
Sunshine	a1bb9a4b74	Update README.md	2022-11-10 06:03:30 -10:00
Sunshine	cf7e368545	Update README.md	2022-11-10 06:02:46 -10:00
Sunshine	c1edde9b3e	refine CLI API for white/black-listing of domains	2022-11-10 05:37:36 -10:00
Sunshine	7c0504c4cb	Update README.md	2022-11-10 04:45:37 -10:00
Sunshine	1bff2c22ba	Update README.md	2022-11-10 04:41:13 -10:00
Sunshine	8113509dcf	fix tests	2022-11-10 04:12:31 -10:00
Sunshine	8fc0fc155f	parse XML documents, save non-HTML and non-XML targets unparsed	2022-11-10 04:12:31 -10:00
Jakub Jirutka	7c61b462dd	disable unnecessary/unused regex features to reduce binary size This will reduce the monolith binary size by ~15%.	2022-09-20 11:46:26 -04:00
Simone Mosciatti	ef3684025b	move to use http instead of https	2022-09-11 14:30:44 -04:00
Simone Mosciatti	db7ee697b3	rewrite small part of the input argument handling the commit rewrite a small part of the input argument handling, trying to follow best Rust practices. We get rid of a variable and of a mutable reference while keeping the code a bit more concise.	2022-09-11 14:30:44 -04:00
Sunshine	89ce5029b9	add option to blacklist/whitelist domains	2022-09-01 13:35:52 -10:00
dependabot[bot]	54609b10e5	Bump iana-time-zone from 0.1.44 to 0.1.46 (#316 ) Bumps [iana-time-zone](https://github.com/strawlab/iana-time-zone) from 0.1.44 to 0.1.46. - [Release notes](https://github.com/strawlab/iana-time-zone/releases) - [Changelog](https://github.com/strawlab/iana-time-zone/blob/main/CHANGELOG.md) - [Commits](https://github.com/strawlab/iana-time-zone/compare/0.1.44...v0.1.46) --- updated-dependencies: - dependency-name: iana-time-zone dependency-type: indirect ... Signed-off-by: dependabot[bot] <support@github.com> Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2022-08-31 11:35:38 -10:00