64 Commits

Author SHA1 Message Date
Sunshine
9ff9dd0928 Merge pull request #82 from snshn/str
implement str!() macro
2019-12-13 03:51:44 -05:00
Sunshine
b0fc24d77f Merge pull request #81 from Alch-Emi/shared-client
Use a shared client for HTTP requests
2019-12-13 03:51:19 -05:00
Sunshine
d8abfaf25c Merge pull request #86 from snshn/readme-update
Update README.md
2019-12-13 03:50:21 -05:00
Sunshine
565acdef25 Update README.md 2019-12-12 20:11:46 -05:00
Sunshine
4637fed15c Merge pull request #87 from snshn/let-nightly-rust-fail
let nightly Rust always pass Travis' checks
2019-12-12 20:09:23 -05:00
Sunshine
9a7ea4fdde let nightly Rust always pass Travis' checks 2019-12-12 19:53:43 -05:00
Emi Simpson
3d4a932ac1 Merge Y2Z/master, fix conflicts between shared-client & resolve-css 2019-12-12 19:29:21 -05:00
Sunshine
cf70383165 Merge pull request #83 from snshn/stderr
Get rid of brackets around URLs
2019-12-11 18:32:25 -05:00
Sunshine
9fe913d853 implement str!() macro 2019-12-11 01:36:14 -05:00
Sunshine
862489e41b Get rid of brackets around URLs 2019-12-11 01:17:00 -05:00
Sunshine
919e626b5e Merge pull request #78 from Alch-Emi/load-css-imports
Load URLs in CSS and style attributes
2019-12-10 17:47:44 -05:00
Emi Simpson
cf347e0483 Updated Cargo.lock 2019-12-10 09:12:10 -05:00
Emi Simpson
322ab41b8c Updated tests to reflect API changes 2019-12-10 00:00:15 -05:00
Emi Simpson
1a7336e809 Updated Cargo.toml 2019-12-09 22:21:54 -05:00
Emi Simpson
65d0eab793 Use a shared client initialized in main.rs 2019-12-09 22:17:54 -05:00
Emi Simpson
614af44c92 Gramatical and stylistic fixes 2019-12-09 13:58:12 -05:00
Emi Simpson
028beb821c Rustfmt update for nightly formatter 2019-12-06 16:46:52 -05:00
Emi Simpson
76ccff80f9 Fixed failure of regex to match @imports 2019-12-06 16:15:34 -05:00
Emi Simpson
45335d7507 Support links in style= attributes 2019-12-06 15:28:08 -05:00
Emi Simpson
a4743ca92f Respect the --no-images flag while parsing CSS 2019-12-06 15:00:06 -05:00
Emi Simpson
b96a777e8a Merge commit '4decea7' into load-css-imports 2019-12-06 13:56:36 -05:00
Emi Simpson
4decea716c Fixed css replacement with more than one linked asset 2019-12-06 13:55:43 -05:00
Emi Simpson
695a787206 Moved regex compilation to lazy_static 2019-12-06 13:53:44 -05:00
Emi Simpson
90e6cb1c45 Prevent crash on URLs delimited by single quotes 2019-12-06 11:52:31 -05:00
Emi Simpson
7412d663e0 Use a slightly more efficient .replace_range() instead of cloning the string twice 2019-12-06 11:37:05 -05:00
Emi Simpson
8646af6e9f removed debug code (woops sorry) 2019-12-06 10:52:20 -05:00
Emi Simpson
de383c94b1 Applied rustfmt 2019-12-05 20:41:43 -05:00
Emi Simpson
ab65b44f0d Cleaned up some overcomplicated code 2019-12-05 20:22:39 -05:00
Emi Simpson
13bacb4320 EMPTY_STRING no longer used 2019-12-05 20:11:19 -05:00
Emi Simpson
d574e9a5da Added support for <style> tags 2019-12-05 20:05:52 -05:00
Emi Simpson
1de0fc0961 Add warning and fallback when parsing a rel=stylesheet link 2019-12-05 19:10:47 -05:00
Emi Simpson
ebbf755e09 Fixed misleading variable name 2019-12-05 19:02:11 -05:00
Emi Simpson
d3956a7905 Made merge compatible with Y2Z/master 2019-12-05 19:01:03 -05:00
Emi Simpson
ef7ddcd434 Added fallback to absolute URL on failure to resolve CSS stylesheet @imports 2019-12-05 18:37:37 -05:00
Emi Simpson
11bbfc0851 Added support for recursively nested css @imports 2019-12-05 18:15:06 -05:00
Emi Simpson
a2bf7e3345 Fixed some errors detecting, parsing, and transforming urls in resolve_css_imports 2019-12-05 17:42:07 -05:00
Sunshine
35f5e1353d Merge pull request #77 from chenrui333/add-locked-flag-to-ci
Add --locked flag to the cargo build/test
2019-12-04 07:22:37 -05:00
Rui Chen
f8040f4d8c Add --locked flag to the cargo build/test 2019-12-03 23:12:52 -05:00
Sunshine
31d3fee626 Merge pull request #76 from chenrui333/add-cargo-lock-file
Add Cargo.lock file
2019-12-03 00:49:30 -05:00
Rui Chen
178abd07bd Add Cargo.lock file 2019-12-03 00:30:57 -05:00
Sunshine
491185e191 Bump version number (2.0.22 → 2.0.23) 2019-10-22 18:37:27 -04:00
Sunshine
b0c55d5016 Add Emmanuel to the list of authors 2019-10-22 18:36:10 -04:00
voila
1ff5e91087 Use HashMap as cache to minimize the number of HTTP requests (#75)
Use HashMap as cache to minimize the number of HTTP requests
2019-10-22 18:33:22 -04:00
knidarkness
550e4cc83f Fixed formatting 2019-10-12 14:05:07 +03:00
knidarkness
5443c0cc3f Added loading of the links given as url(...) in css files 2019-10-12 12:32:59 +03:00
Sunshine
8add3a8746 Bump version number 2019-10-10 22:34:38 -04:00
Sunshine
2f592d5561 Merge pull request #70 from robatipoor/master
refactor utils functions
2019-10-10 22:34:13 -04:00
robatipoor
55fe523a1c refactor utils functions 2019-10-10 16:53:00 +03:30
Sunshine
b5d42bd722 Bump version number 2019-10-10 07:49:46 -04:00
Sunshine
cbf3b66f33 Merge pull request #69 from robatipoor/master
move argument parser section to args mod
2019-10-10 07:41:59 -04:00
robatipoor
2e48ea90e1 move argument parser section to args mod 2019-10-10 08:58:12 +03:30
Sunshine
9c006f3258 Update README.md 2019-10-07 23:46:08 -04:00
Sunshine
ab24851b5b Merge pull request #67 from distributed-mind/master
add related projects section
2019-10-07 23:44:45 -04:00
Cobin Bluth
de11559efa add related projects 2019-10-07 13:57:02 +02:00
Sunshine
dbacd76103 Merge pull request #65 from Y2Z/redirects
Properly handle 30x redirects
2019-10-01 07:25:35 -04:00
Sunshine
0896f2e214 Properly handle 30x redirects 2019-09-30 23:58:09 -04:00
Sunshine
b6ba22513d Merge pull request #64 from Y2Z/refactor
Improve code structure
2019-09-30 09:45:24 -04:00
Vincent Flyson
3948ea3aa0 Improve code structure 2019-09-29 17:15:49 -04:00
Sunshine
8b3f3f3a6e Merge pull request #63 from Y2Z/poster
Add support for poster attribute
2019-09-22 14:46:34 -04:00
Vincent Flyson
eec05767cf Add support for poster attribute 2019-09-22 12:57:50 -04:00
Sunshine
c05dc2ae65 Merge pull request #61 from Y2Z/isolate
Add CSP isolation, no CSS and no iframe options
2019-09-22 02:40:04 -04:00
Vincent Flyson
88a230872c Add CSP isolation, no CSS, and no iframe options 2019-09-21 22:59:03 -04:00
Vincent Flyson
ac79a52da0 Merge pull request #57 from Y2Z/fixes
Ignore empty src in images, accept fluid icons
2019-09-08 05:31:46 -04:00
Vincent Flyson
04cbbefafa Ignore empty src in images, accept fluid icons 2019-09-08 02:51:53 -04:00
20 changed files with 3213 additions and 635 deletions

7
.gitignore vendored
View File

@@ -2,9 +2,8 @@
# will have compiled files and executables
/target/
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
Cargo.lock
# These are backup files generated by rustfmt
**/*.rs.bk
# Exclude accidental HTML files
*.html

View File

@@ -9,9 +9,18 @@ os:
rust:
- stable
- beta
- beta
- nightly
before_script:
- rustup component add rustfmt
script:
- cargo build --verbose
- cargo test --verbose
- cargo build --all --locked --verbose
- cargo test --all --locked --verbose
- cargo fmt --all -- --check
jobs:
allow_failures:
- rust: nightly
fast_finish: true

1724
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,18 +1,19 @@
[package]
name = "monolith"
version = "2.0.16"
version = "2.1.0"
authors = [
"Sunshine <sunshine@uberspace.net>",
"Mahdi Robatipoor <mahdi.robatipoor@gmail.com>",
"Emmanuel Delaborde <th3rac25@gmail.com>",
"Emi Simpson <emi@alchemi.dev>",
]
description = "CLI tool for saving web pages as a single HTML file"
[dependencies]
base64 = "0.10.1"
clap = "2.33.0"
html5ever = "0.24.0"
indicatif = "0.11.0"
lazy_static = "1.3.0"
regex = "1.2.1"
html5ever = "0.24.1"
lazy_static = "1.4.0"
regex = "1.3.1"
reqwest = "0.9.20"
url = "2.1.0"

16
Makefile Normal file
View File

@@ -0,0 +1,16 @@
.PHONY: all build install run test lint
all: test build
build:
@cargo build --locked
install:
@cargo install --force --locked --path .
test:
@cargo test --locked
@cargo fmt --all -- --check
lint:
@cargo fmt --all --

View File

@@ -1,7 +1,15 @@
[![Travis CI Build Status](https://travis-ci.org/Y2Z/monolith.svg?branch=master)](https://travis-ci.org/Y2Z/monolith)
[![AppVeyor Build status](https://ci.appveyor.com/api/projects/status/j1v1d96sw952b1ch?svg=true)](https://ci.appveyor.com/project/vflyson/monolith)
[![AppVeyor Build status](https://ci.appveyor.com/api/projects/status/ae7soyjih8jg2bv7/branch/master?svg=true)](https://ci.appveyor.com/project/snshn/monolith/branch/master)
# monolith
```
___ ___________ __________ ___________________ ___
| \ / \ | | | | | |
| \_/ __ \_| __ | | ___ ___ |__| |
| | | | | | | | | | | |
| |__| _ |__| |____| | | | | __ |
| |\_/| | \ | | | | | | |
|___| |__________| \____________________| |___| |___| |___|
```
A data hoarder's dream come true: bundle any web page into a single HTML file.
You can finally replace that gazillion of open tabs with a gazillion of .html files stored somewhere on your precious little drive.
@@ -10,24 +18,34 @@ Unlike the conventional "Save page as", `monolith` not only saves the target doc
If compared to saving websites with `wget -mpk`, this tool embeds all assets as data URLs and therefore lets browsers render the saved page exactly the way it was on the Internet, even when no network connection is available.
<!-- `This program works both on remote and local targets. -->
## Installation
### Installation
### From source
$ git clone https://github.com/Y2Z/monolith.git
$ cd monolith
$ cargo install --path .
### Usage
### On macOS (via Homebrew)
$ brew install monolith
## Usage
$ monolith https://lyrics.github.io/db/p/portishead/dummy/roads/ > portishead-roads-lyrics.html
### Options
## Options
- `-c`: Ignore styles
- `-f`: Exclude iframes
- `-i`: Remove images
- `-I`: Isolate document
- `-j`: Exclude JavaScript
- `-k`: Accept invalid X.509 (TLS) certificates
- `-s`: Silent mode
- `-u`: Specify custom User-Agent
### License
## Related projects
- `Pagesaver`: https://github.com/distributed-mind/pagesaver
- `SingleFile`: https://github.com/gildas-lormeau/SingleFile
## License
The Unlicense
<!-- Microtext -->

View File

@@ -114,6 +114,7 @@ install:
- if defined MINGW_PATH set PATH=%PATH%;%MINGW_PATH%
- rustc -vV
- cargo -vV
- rustup component add rustfmt
## Build Script ##
@@ -125,4 +126,5 @@ build: false
#directly or perform other testing commands. Rust will automatically be placed in the PATH
# environment variable.
test_script:
- cargo test --verbose %cargoflags%
- cargo test --verbose %cargoflags%
- cargo fmt --all -- --check

62
src/args.rs Normal file
View File

@@ -0,0 +1,62 @@
use clap::{App, Arg};
/// Command-line options collected for one run of the program.
///
/// `AppArgs::get` fills every field from the parsed command line; the derived
/// `Default` yields empty strings and `false` flags.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct AppArgs {
    /// Positional `url` argument ("URL to download").
    pub url_target: String,
    /// `-c`, `--no-css`: ignore styles.
    pub no_css: bool,
    /// `-f`, `--no-frames`: exclude iframes.
    pub no_frames: bool,
    /// `-i`, `--no-images`: remove images.
    pub no_images: bool,
    /// `-j`, `--no-js`: exclude JavaScript.
    pub no_js: bool,
    /// `-k`, `--insecure`: accept invalid X.509 (TLS) certificates.
    pub insecure: bool,
    /// `-I`, `--isolate`: cut the saved document off from the Internet.
    pub isolate: bool,
    /// `-s`, `--silent`: suppress verbosity.
    pub silent: bool,
    /// `-u`, `--user-agent`: custom User-Agent string.
    pub user_agent: String,
}
/// User-Agent string that `AppArgs::get` falls back to when no
/// `-u` / `--user-agent` value is given on the command line.
const DEFAULT_USER_AGENT: &str =
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0";
impl AppArgs {
pub fn get() -> AppArgs {
let app = App::new("monolith")
.version(crate_version!())
.author(crate_authors!("\n"))
.about(crate_description!())
.arg(
Arg::with_name("url")
.required(true)
.takes_value(true)
.index(1)
.help("URL to download"),
)
// .args_from_usage("-a, --include-audio 'Embed audio sources'")
.args_from_usage("-c, --no-css 'Ignore styles'")
.args_from_usage("-f, --no-frames 'Exclude iframes'")
.args_from_usage("-i, --no-images 'Remove images'")
.args_from_usage("-I, --isolate 'Cut off from the Internet'")
.args_from_usage("-j, --no-js 'Exclude JavaScript'")
.args_from_usage("-k, --insecure 'Accept invalid X.509 (TLS) certificates'")
.args_from_usage("-s, --silent 'Suppress verbosity'")
.args_from_usage("-u, --user-agent=[Iceweasel] 'Custom User-Agent string'")
// .args_from_usage("-v, --include-video 'Embed video sources'")
.get_matches();
let mut app_args = AppArgs::default();
// Process the command
app_args.url_target = app
.value_of("url")
.expect("please set target url")
.to_string();
app_args.no_css = app.is_present("no-css");
app_args.no_frames = app.is_present("no-frames");
app_args.no_images = app.is_present("no-images");
app_args.no_js = app.is_present("no-js");
app_args.insecure = app.is_present("insecure");
app_args.isolate = app.is_present("isolate");
app_args.silent = app.is_present("silent");
app_args.user_agent = app
.value_of("user-agent")
.unwrap_or_else(|| DEFAULT_USER_AGENT)
.to_string();
app_args
}
}

View File

@@ -1,99 +1,73 @@
use html5ever::interface::QualName;
use html5ever::parse_document;
use html5ever::rcdom::{Handle, NodeData, RcDom};
use html5ever::serialize::{serialize, SerializeOpts};
use html5ever::tendril::TendrilSink;
use http::{is_valid_url, resolve_url, retrieve_asset};
use regex::Regex;
use html5ever::tendril::{format_tendril, TendrilSink};
use html5ever::tree_builder::{Attribute, TreeSink};
use html5ever::{local_name, namespace_url, ns};
use http::retrieve_asset;
use js::attr_is_event_handler;
use reqwest::Client;
use std::collections::HashMap;
use std::default::Default;
use std::io;
use utils::data_to_dataurl;
use utils::{data_to_dataurl, is_valid_url, resolve_css_imports, resolve_url, url_has_protocol};
lazy_static! {
static ref EMPTY_STRING: String = String::new();
static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();
static ref ICON_VALUES: Regex = Regex::new(
r"^icon|shortcut icon|mask-icon|apple-touch-icon$"
).unwrap();
}
const TRANSPARENT_PIXEL: &str = "data:image/png;base64,\
iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=";
const JS_DOM_EVENT_ATTRS: [&str; 21] = [
// Input
"onfocus",
"onblur",
"onselect",
"onchange",
"onsubmit",
"onreset",
"onkeydown",
"onkeypress",
"onkeyup",
// Mouse
"onmouseover",
"onmouseout",
"onmousedown",
"onmouseup",
"onmousemove",
// Click
"onclick",
"ondblclick",
// Load
"onload",
"onunload",
"onabort",
"onerror",
"onresize",
/// Lower-case attribute values that `is_icon` treats as icon links.
const ICON_VALUES: [&str; 5] = [
"icon",
"shortcut icon",
"mask-icon",
"apple-touch-icon",
"fluid-icon",
];
fn get_parent_node_name(node: &Handle) -> String {
let parent = node.parent.take().clone();
let parent_node = parent.and_then(|node| node.upgrade()).unwrap();
/// PNG data URL written in place of image sources when `opt_no_images` is set.
const TRANSPARENT_PIXEL: &str =
"data:image/png;base64,\
iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=";
match &parent_node.data {
NodeData::Document => { EMPTY_STRING.clone() }
NodeData::Doctype { .. } => { EMPTY_STRING.clone() }
NodeData::Text { .. } => { EMPTY_STRING.clone() }
NodeData::Comment { .. } => { EMPTY_STRING.clone() }
NodeData::Element { ref name, attrs: _, .. } => {
name.local.as_ref().to_string()
}
NodeData::ProcessingInstruction { .. } => unreachable!()
/// Returns a strong handle to the parent of `node`.
///
/// The parent link is stored in a `Cell`, so it has to be taken out to be
/// inspected. It is put back before returning; otherwise every call would
/// leave `node` detached from its parent.
///
/// # Panics
///
/// Panics if `node` has no parent, or if the parent has already been dropped.
pub fn get_parent_node(node: &Handle) -> Handle {
    let weak_parent = node.parent.take();
    let parent = weak_parent.as_ref().and_then(|weak| weak.upgrade());
    // Restore the link that `take()` just emptied.
    node.parent.set(weak_parent);
    parent.expect("node has no parent, or its parent has been dropped")
}
/// Returns the local tag name of `node` when it is an element.
///
/// Every other kind of node yields the value of `str!()`.
pub fn get_node_name(node: &Handle) -> String {
    if let NodeData::Element { ref name, .. } = node.data {
        return name.local.as_ref().to_string();
    }
    str!()
}
/// Reports whether `attr_value`, compared case-insensitively, is one of `ICON_VALUES`.
pub fn is_icon(attr_value: &str) -> bool {
    let lowered = attr_value.to_lowercase();
    ICON_VALUES.iter().any(|known| *known == lowered)
}
pub fn walk_and_embed_assets(
cache: &mut HashMap<String, String>,
client: &Client,
url: &str,
node: &Handle,
opt_no_css: bool,
opt_no_js: bool,
opt_no_images: bool,
opt_user_agent: &str,
opt_silent: bool,
opt_insecure: bool,
opt_no_frames: bool,
) {
match node.data {
NodeData::Document => {
// Dig deeper
for child in node.children.borrow().iter() {
walk_and_embed_assets(
&url, child,
opt_no_js,
opt_no_images,
opt_user_agent,
opt_silent,
opt_insecure,
);
cache,
client,
&url,
child,
opt_no_css,
opt_no_js,
opt_no_images,
opt_silent,
opt_no_frames,
);
}
}
NodeData::Doctype { .. } => {}
NodeData::Text { .. } => {}
NodeData::Comment { .. } => {
// Note: in case of opt_no_js being set to true, there's no need to worry about
// getting rid of comments that may contain scripts, e.g. <!--[if IE]><script>...
// since that's not part of W3C standard and therefore gets ignored
// by browsers other than IE [5, 9]
}
NodeData::Element {
ref name,
ref attrs,
@@ -103,7 +77,7 @@ pub fn walk_and_embed_assets(
match name.local.as_ref() {
"link" => {
let mut link_type = "";
let mut link_type: &str = "";
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "rel" {
@@ -122,56 +96,71 @@ pub fn walk_and_embed_assets(
if &attr.name.local == "href" {
if opt_no_images {
attr.value.clear();
attr.value.push_slice(TRANSPARENT_PIXEL);
} else {
let href_full_url: String = resolve_url(
&url,
&attr.value.to_string()
)
.unwrap_or(EMPTY_STRING.clone());
let favicon_datauri = retrieve_asset(
&href_full_url,
true,
"",
opt_user_agent,
opt_silent,
opt_insecure,
)
.unwrap_or(EMPTY_STRING.clone());
let href_full_url: String =
resolve_url(&url, &attr.value.to_string())
.unwrap_or(str!());
let (favicon_dataurl, _) = retrieve_asset(
cache,
client,
&href_full_url,
true,
"",
opt_silent,
)
.unwrap_or((str!(), str!()));
attr.value.clear();
attr.value.push_slice(favicon_datauri.as_str());
attr.value.push_slice(favicon_dataurl.as_str());
}
}
}
} else if link_type == "stylesheet" {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
let href_full_url: String = resolve_url(
&url,
&attr.value.to_string(),
)
.unwrap_or(EMPTY_STRING.clone());
let css_datauri = retrieve_asset(
if opt_no_css {
attr.value.clear();
} else {
let href_full_url: String =
resolve_url(&url, &attr.value.to_string())
.unwrap_or(str!());
let replacement_text = match retrieve_asset(
cache,
client,
&href_full_url,
true,
false,
"text/css",
opt_user_agent,
opt_silent,
opt_insecure,
)
.unwrap_or(EMPTY_STRING.clone());
attr.value.clear();
attr.value.push_slice(css_datauri.as_str());
) {
// On successful retrieval, traverse CSS
Ok((css_data, _)) => resolve_css_imports(
cache,
client,
&css_data,
true,
&href_full_url,
opt_no_images,
opt_silent,
),
// If a network error occurred, warn
Err(e) => {
eprintln!("Warning: {}", e,);
// If failed to resolve, replace with absolute URL
href_full_url
}
};
attr.value.clear();
attr.value.push_slice(&replacement_text);
}
}
}
} else {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
let href_full_url: String = resolve_url(
&url,
&attr.value.to_string(),
)
.unwrap_or(EMPTY_STRING.clone());
let href_full_url: String =
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
attr.value.clear();
attr.value.push_slice(&href_full_url.as_str());
}
@@ -181,54 +170,63 @@ pub fn walk_and_embed_assets(
"img" => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" {
let value = attr.value.to_string();
// Ignore images with empty source
if value == str!() {
continue;
}
if opt_no_images {
attr.value.clear();
attr.value.push_slice(TRANSPARENT_PIXEL);
} else {
let src_full_url: String = resolve_url(
&url,
&attr.value.to_string(),
)
.unwrap_or(EMPTY_STRING.clone());
let img_datauri = retrieve_asset(
&src_full_url,
true,
"",
opt_user_agent,
opt_silent,
opt_insecure,
)
.unwrap_or(EMPTY_STRING.clone());
let src_full_url: String =
resolve_url(&url, &value).unwrap_or(str!());
let (img_dataurl, _) = retrieve_asset(
cache,
client,
&src_full_url,
true,
"",
opt_silent,
)
.unwrap_or((str!(), str!()));
attr.value.clear();
attr.value.push_slice(img_datauri.as_str());
attr.value.push_slice(img_dataurl.as_str());
}
}
}
}
"source" => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "srcset" {
if get_parent_node_name(&node) == "picture" {
let attr_name: &str = &attr.name.local;
if attr_name == "src" {
let src_full_url: String = resolve_url(&url, &attr.value.to_string())
.unwrap_or(attr.value.to_string());
attr.value.clear();
attr.value.push_slice(src_full_url.as_str());
} else if attr_name == "srcset" {
if get_node_name(&get_parent_node(&node)) == "picture" {
if opt_no_images {
attr.value.clear();
attr.value.push_slice(TRANSPARENT_PIXEL);
} else {
let srcset_full_url: String = resolve_url(
&url,
&attr.value.to_string(),
)
.unwrap_or(EMPTY_STRING.clone());
let source_datauri = retrieve_asset(
&srcset_full_url,
true,
"",
opt_user_agent,
opt_silent,
opt_insecure,
)
.unwrap_or(EMPTY_STRING.clone());
let srcset_full_url: String =
resolve_url(&url, &attr.value.to_string())
.unwrap_or(str!());
let (source_dataurl, _) = retrieve_asset(
cache,
client,
&srcset_full_url,
true,
"",
opt_silent,
)
.unwrap_or((str!(), str!()));
attr.value.clear();
attr.value.push_slice(source_datauri.as_str());
attr.value.push_slice(source_dataurl.as_str());
}
}
}
@@ -238,12 +236,12 @@ pub fn walk_and_embed_assets(
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
// Don't touch email links or hrefs which begin with a hash sign
if attr.value.starts_with('#') || has_protocol(&attr.value) {
if attr.value.starts_with('#') || url_has_protocol(&attr.value) {
continue;
}
let href_full_url: String = resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone());
let href_full_url: String =
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
attr.value.clear();
attr.value.push_slice(href_full_url.as_str());
}
@@ -251,7 +249,7 @@ pub fn walk_and_embed_assets(
}
"script" => {
if opt_no_js {
// Get rid of src and inner content of SCRIPT tags
// Empty src and inner content of SCRIPT tags
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" {
attr.value.clear();
@@ -261,22 +259,42 @@ pub fn walk_and_embed_assets(
} else {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" {
let src_full_url: String = resolve_url(
&url,
&attr.value.to_string(),
)
.unwrap_or(EMPTY_STRING.clone());
let js_datauri = retrieve_asset(
&src_full_url,
true,
"application/javascript",
opt_user_agent,
opt_silent,
opt_insecure,
)
.unwrap_or(EMPTY_STRING.clone());
let src_full_url: String =
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
let (js_dataurl, _) = retrieve_asset(
cache,
client,
&src_full_url,
true,
"application/javascript",
opt_silent,
)
.unwrap_or((str!(), str!()));
attr.value.clear();
attr.value.push_slice(js_datauri.as_str());
attr.value.push_slice(js_dataurl.as_str());
}
}
}
}
"style" => {
if opt_no_css {
// Empty inner content of STYLE tags
node.children.borrow_mut().clear();
} else {
for node in node.children.borrow_mut().iter_mut() {
if let NodeData::Text { ref contents } = node.data {
let mut tendril = contents.borrow_mut();
let replacement = resolve_css_imports(
cache,
client,
tendril.as_ref(),
false,
&url,
opt_no_images,
opt_silent,
);
tendril.clear();
tendril.push_slice(&replacement);
}
}
}
@@ -284,90 +302,167 @@ pub fn walk_and_embed_assets(
"form" => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "action" {
// Do not touch action props which are set to a URL
if is_valid_url(&attr.value) {
continue;
// Modify action to be a full URL
if !is_valid_url(&attr.value) {
let href_full_url: String =
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
attr.value.clear();
attr.value.push_slice(href_full_url.as_str());
}
let href_full_url: String = resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone());
attr.value.clear();
attr.value.push_slice(href_full_url.as_str());
}
}
}
"iframe" => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" {
let value = attr.value.to_string();
// Ignore iframes with empty source (they cause infinite loops)
if value == EMPTY_STRING.clone() {
if opt_no_frames {
// Empty the src attribute
attr.value.clear();
continue;
}
let iframe_src: String = attr.value.to_string();
let src_full_url: String = resolve_url(&url, &value)
.unwrap_or(EMPTY_STRING.clone());
let iframe_data = retrieve_asset(
&src_full_url,
false,
"text/html",
opt_user_agent,
opt_silent,
opt_insecure,
)
.unwrap_or(EMPTY_STRING.clone());
// Ignore iframes with empty source (they cause infinite loops)
if iframe_src == str!() {
continue;
}
let src_full_url: String =
resolve_url(&url, &iframe_src).unwrap_or(str!());
let (iframe_data, iframe_final_url) = retrieve_asset(
cache,
client,
&src_full_url,
false,
"text/html",
opt_silent,
)
.unwrap_or((str!(), src_full_url));
let dom = html_to_dom(&iframe_data);
walk_and_embed_assets(
&src_full_url,
&dom.document,
opt_no_js,
opt_no_images,
opt_user_agent,
opt_silent,
opt_insecure,
);
cache,
client,
&iframe_final_url,
&dom.document,
opt_no_css,
opt_no_js,
opt_no_images,
opt_silent,
opt_no_frames,
);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
let iframe_datauri = data_to_dataurl("text/html", &buf);
let iframe_dataurl = data_to_dataurl("text/html", &buf);
attr.value.clear();
attr.value.push_slice(iframe_datauri.as_str());
attr.value.push_slice(iframe_dataurl.as_str());
}
}
}
"video" => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "poster" {
let video_poster = attr.value.to_string();
// Skip posters with empty source
if video_poster == str!() {
continue;
}
if opt_no_images {
attr.value.clear();
} else {
let poster_full_url: String =
resolve_url(&url, &video_poster).unwrap_or(str!());
let (poster_dataurl, _) = retrieve_asset(
cache,
client,
&poster_full_url,
true,
"",
opt_silent,
)
.unwrap_or((poster_full_url, str!()));
attr.value.clear();
attr.value.push_slice(poster_dataurl.as_str());
}
}
}
}
_ => {}
}
// Process style attributes
if opt_no_css {
// Get rid of style attributes
let mut style_attr_indexes = Vec::new();
for (i, attr) in attrs_mut.iter_mut().enumerate() {
if attr.name.local.to_lowercase() == "style" {
style_attr_indexes.push(i);
}
}
style_attr_indexes.reverse();
for attr_index in style_attr_indexes {
attrs_mut.remove(attr_index);
}
} else {
// Otherwise, parse any links found in the attributes
for attribute in attrs_mut
.iter_mut()
.filter(|a| a.name.local.as_ref().eq_ignore_ascii_case("style"))
{
let replacement = resolve_css_imports(
cache,
client,
attribute.value.as_ref(),
false,
&url,
opt_no_images,
opt_silent,
);
attribute.value.clear();
attribute.value.push_slice(&replacement);
}
}
if opt_no_js {
// Get rid of JS event attributes
for attr in attrs_mut.iter_mut() {
if JS_DOM_EVENT_ATTRS.contains(&attr.name.local.to_lowercase().as_str()) {
attr.value.clear();
let mut js_attr_indexes = Vec::new();
for (i, attr) in attrs_mut.iter_mut().enumerate() {
if attr_is_event_handler(&attr.name.local) {
js_attr_indexes.push(i);
}
}
js_attr_indexes.reverse();
for attr_index in js_attr_indexes {
attrs_mut.remove(attr_index);
}
}
// Dig deeper
for child in node.children.borrow().iter() {
walk_and_embed_assets(
&url,
child,
opt_no_js,
opt_no_images,
opt_user_agent,
opt_silent,
opt_insecure,
);
cache,
client,
&url,
child,
opt_no_css,
opt_no_js,
opt_no_images,
opt_silent,
opt_no_frames,
);
}
}
NodeData::ProcessingInstruction { .. } => unreachable!()
_ => {
// Note: in case of opt_no_js being set to true, there's no need to worry about
// getting rid of comments that may contain scripts, e.g. <!--[if IE]><script>...
// since that's not part of W3C standard and therefore gets ignored
// by browsers other than IE [5, 9]
}
}
}
fn has_protocol(url: &str) -> bool {
HAS_PROTOCOL.is_match(&url.to_lowercase())
}
pub fn html_to_dom(data: &str) -> html5ever::rcdom::RcDom {
parse_document(RcDom::default(), Default::default())
.from_utf8()
@@ -375,157 +470,85 @@ pub fn html_to_dom(data: &str) -> html5ever::rcdom::RcDom {
.unwrap()
}
pub fn print_dom(handle: &Handle) {
serialize(&mut io::stdout(), handle, SerializeOpts::default()).unwrap();
}
fn is_icon(attr_value: &str) -> bool {
ICON_VALUES.is_match(&attr_value.to_lowercase())
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_is_icon() {
assert_eq!(is_icon("icon"), true);
assert_eq!(is_icon("Shortcut Icon"), true);
assert_eq!(is_icon("ICON"), true);
assert_eq!(is_icon("stylesheet"), false);
assert_eq!(is_icon(""), false);
}
#[test]
fn test_has_protocol() {
assert_eq!(has_protocol("mailto:somebody@somewhere.com?subject=hello"), true);
assert_eq!(has_protocol("tel:5551234567"), true);
assert_eq!(has_protocol("ftp:user:password@some-ftp-server.com"), true);
assert_eq!(has_protocol("javascript:void(0)"), true);
assert_eq!(has_protocol("http://news.ycombinator.com"), true);
assert_eq!(has_protocol("https://github.com"), true);
assert_eq!(has_protocol("//some-hostname.com/some-file.html"), false);
assert_eq!(has_protocol("some-hostname.com/some-file.html"), false);
assert_eq!(has_protocol("/some-file.html"), false);
assert_eq!(has_protocol(""), false);
assert_eq!(has_protocol("MAILTO:somebody@somewhere.com?subject=hello"), true);
}
#[test]
fn test_get_parent_node_name() {
let html = "<!doctype html><html><HEAD></HEAD><body><div><P></P></div></body></html>";
let dom = html_to_dom(&html);
let mut count = 0;
fn test_walk(node: &Handle, i: &mut i8) {
*i += 1;
match &node.data {
NodeData::Document => {
for child in node.children.borrow().iter() {
test_walk(child, &mut *i);
}
}
NodeData::Doctype { .. } => (),
NodeData::Text { .. } => (),
NodeData::Comment { .. } => (),
NodeData::Element { ref name, attrs: _, .. } => {
let node_name = name.local.as_ref().to_string();
let parent_node_name = get_parent_node_name(node);
if node_name == "head" || node_name == "body" {
assert_eq!(parent_node_name, "html");
} else if node_name == "div" {
assert_eq!(parent_node_name, "body");
} else if node_name == "p" {
assert_eq!(parent_node_name, "div");
}
println!("{}", node_name);
for child in node.children.borrow().iter() {
test_walk(child, &mut *i);
}
}
NodeData::ProcessingInstruction { .. } => unreachable!()
};
fn get_child_node_by_name(handle: &Handle, node_name: &str) -> Handle {
let children = handle.children.borrow();
let matching_children = children.iter().find(|child| match child.data {
NodeData::Element { ref name, .. } => &*name.local == node_name,
_ => false,
});
match matching_children {
Some(node) => node.clone(),
_ => {
return handle.clone();
}
test_walk(&dom.document, &mut count);
assert_eq!(count, 7);
}
#[test]
fn test_walk_and_embed_assets() {
let html = "<div><P></P></div>";
let dom = html_to_dom(&html);
let url = "http://localhost";
walk_and_embed_assets(&url, &dom.document, true, true, "", true, true);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
"<html><head></head><body><div><p></p></div></body></html>"
);
}
#[test]
fn test_walk_and_embed_assets_iframe() {
let html = "<div><P></P><iframe src=\"\"></iframe></div>";
let dom = html_to_dom(&html);
let url = "http://localhost";
walk_and_embed_assets(&url, &dom.document, true, true, "", true, true);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
"<html><head></head><body><div><p></p><iframe src=\"\"></iframe></div></body></html>"
);
}
#[test]
fn test_walk_and_embed_assets_img() {
let html = "<div><img src=\"http://localhost/assets/mono_lisa.png\" /></div>";
let dom = html_to_dom(&html);
let url = "http://localhost";
walk_and_embed_assets(&url, &dom.document, true, true, "", true, true);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
"<html><head></head><body><div>\
<img src=\"data:image/png;base64,\
iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0\
lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=\">\
</div></body></html>"
);
}
#[test]
fn test_walk_and_embed_assets_js() {
let html = "<div><script src=\"http://localhost/assets/some.js\"></script>\
<script>alert(1)</script></div>";
let dom = html_to_dom(&html);
let url = "http://localhost";
walk_and_embed_assets(&url, &dom.document, true, true, "", true, true);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
"<html><head></head><body><div><script src=\"\"></script>\
<script></script></div></body></html>"
);
}
}
/// Serializes the DOM rooted at `handle` into an HTML string.
///
/// When at least one of the `opt_*` flags is set, the serialized markup is
/// parsed again and a `<meta http-equiv="Content-Security-Policy">` element is
/// placed first in `<head>`, carrying one directive group per enabled flag.
///
/// # Panics
///
/// Panics if serialization fails or if the serialized bytes are not valid UTF-8.
pub fn stringify_document(
    handle: &Handle,
    opt_no_css: bool,
    opt_no_frames: bool,
    opt_no_js: bool,
    opt_no_images: bool,
    opt_isolate: bool,
) -> String {
    let mut first_pass: Vec<u8> = Vec::new();
    serialize(&mut first_pass, handle, SerializeOpts::default())
        .expect("unable to serialize DOM into buffer");
    let plain_html: String = String::from_utf8(first_pass).unwrap();

    // Each CSP directive group is paired with the option that switches it on;
    // the order here is the order in which the directives appear in the policy.
    let directives: [(bool, &str); 5] = [
        (opt_isolate, " default-src 'unsafe-inline' data:;"),
        (opt_no_css, " style-src 'none';"),
        (opt_no_frames, " frame-src 'none';child-src 'none';"),
        (opt_no_js, " script-src 'none';"),
        (opt_no_images, " img-src data:;"),
    ];

    // Nothing to restrict: the first serialization is the final result.
    if !directives.iter().any(|entry| entry.0) {
        return plain_html;
    }

    // Note: we can't make it isolate the page right away since it may have no HEAD element,
    // ergo we have to serialize, parse DOM again, and then finally serialize the result
    let mut dom = html_to_dom(&plain_html);
    let doc = dom.get_document();
    let html = get_child_node_by_name(&doc, "html");
    let head = get_child_node_by_name(&html, "head");

    let policy: String = directives
        .iter()
        .filter(|entry| entry.0)
        .map(|entry| entry.1)
        .collect::<String>()
        .trim()
        .to_string();

    let http_equiv_attr = Attribute {
        name: QualName::new(None, ns!(), local_name!("http-equiv")),
        value: format_tendril!("Content-Security-Policy"),
    };
    let content_attr = Attribute {
        name: QualName::new(None, ns!(), local_name!("content")),
        value: format_tendril!("{}", policy),
    };
    let meta = dom.create_element(
        QualName::new(None, ns!(), local_name!("meta")),
        vec![http_equiv_attr, content_attr],
        Default::default(),
    );

    // Note: the CSP meta-tag has to be prepended, never appended,
    // since there already may be one defined in the document,
    // and browsers don't allow re-defining them (for obvious reasons)
    head.children.borrow_mut().insert(0, meta);

    let mut second_pass: Vec<u8> = Vec::new();
    serialize(&mut second_pass, &doc, SerializeOpts::default())
        .expect("unable to serialize DOM into buffer");
    String::from_utf8(second_pass).unwrap()
}

View File

@@ -1,179 +1,63 @@
use regex::Regex;
use reqwest::header::CONTENT_TYPE;
use reqwest::Client;
use reqwest::header::{CONTENT_TYPE, USER_AGENT};
use std::time::Duration;
use url::{ParseError, Url};
use utils::data_to_dataurl;
lazy_static! {
// Matches strings that start with an `http://` or `https://` prefix.
static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();
}
/// Reports whether `url` parses as a URL whose scheme is `data`.
///
/// A string that cannot be parsed yields the parser's error unchanged.
pub fn is_data_url(url: &str) -> Result<bool, ParseError> {
    let parsed_url = Url::parse(url)?;
    Ok(parsed_url.scheme() == "data")
}
/// Returns `true` when `path` matches `REGEX_URL`.
pub fn is_valid_url(path: &str) -> bool {
    let has_http_prefix = REGEX_URL.is_match(path);
    has_http_prefix
}
/// Resolves `to` against the base URL `from`.
///
/// When `to` already passes `is_valid_url` it is returned as-is and `from` is
/// never parsed; otherwise `from` is parsed and `to` is joined onto it.
pub fn resolve_url(from: &str, to: &str) -> Result<String, ParseError> {
    if is_valid_url(to) {
        // (anything, http://site.com/css/main.css)
        return Ok(to.to_string());
    }

    let base = Url::parse(from)?;
    let joined = base.join(to)?;
    Ok(joined.to_string())
}
use std::collections::HashMap;
use utils::{data_to_dataurl, is_data_url};
// NOTE(review): this span is rendered diff output that interleaves the removed and the
// added revision of `retrieve_asset` (two return types, `as_mime` vs `mime`, bracketed vs
// plain `eprintln!` messages); it does not compile as shown and has to be split per
// revision before any code change.
// Visible behaviour: a `data:` URL is handed back without any request; otherwise the
// asset is fetched and returned either as a data URL (`as_dataurl`) or as text.
// NOTE(review): the added revision looks the cache up by `url` but inserts under
// `response.url()`; after a redirect these keys differ - confirm this is intended.
pub fn retrieve_asset(
cache: &mut HashMap<String, String>,
client: &Client,
url: &str,
as_dataurl: bool,
as_mime: &str,
opt_user_agent: &str,
mime: &str,
opt_silent: bool,
opt_insecure: bool,
) -> Result<String, reqwest::Error> {
) -> Result<(String, String), reqwest::Error> {
if is_data_url(&url).unwrap() {
Ok(url.to_string())
Ok((url.to_string(), url.to_string()))
} else {
let client = Client::builder()
.timeout(Duration::from_secs(10))
.danger_accept_invalid_certs(opt_insecure)
.build()?;
let mut response = client
.get(url)
.header(USER_AGENT, opt_user_agent)
.send()?;
let final_url = response.url().as_str();
if cache.contains_key(&url.to_string()) {
// url is in cache
if !opt_silent {
eprintln!("{} (from cache)", &url);
}
let data = cache.get(&url.to_string()).unwrap();
Ok((data.to_string(), url.to_string()))
} else {
// url not in cache, we request it
let mut response = client.get(url).send()?;
if !opt_silent {
if url == final_url {
eprintln!("[ {} ]", &url);
if !opt_silent {
if url == response.url().as_str() {
eprintln!("{}", &url);
} else {
eprintln!("{} -> {}", &url, &response.url().as_str());
}
}
if as_dataurl {
// Convert response into a byte array
let mut data: Vec<u8> = vec![];
response.copy_to(&mut data)?;
// Attempt to obtain MIME type by reading the Content-Type header
let mimetype = if mime == "" {
response
.headers()
.get(CONTENT_TYPE)
.and_then(|header| header.to_str().ok())
.unwrap_or(&mime)
} else {
mime
};
let dataurl = data_to_dataurl(&mimetype, &data);
// insert in cache
cache.insert(response.url().to_string(), dataurl.to_string());
Ok((dataurl, response.url().to_string()))
} else {
eprintln!("[ {} -> {} ]", &url, &final_url);
let content = response.text().unwrap();
// insert in cache
cache.insert(response.url().to_string(), content.clone());
Ok((content, response.url().to_string()))
}
}
if as_dataurl {
// Convert response into a byte array
let mut data: Vec<u8> = vec![];
response.copy_to(&mut data)?;
// Attempt to obtain MIME type by reading the Content-Type header
let mimetype = if as_mime == "" {
response
.headers()
.get(CONTENT_TYPE)
.and_then(|header| header.to_str().ok())
.unwrap_or(&as_mime)
} else {
as_mime
};
Ok(data_to_dataurl(&mimetype, &data))
} else {
Ok(response.text().unwrap())
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Only strings starting with `http://` or `https://` are accepted.
    #[test]
    fn test_is_valid_url() {
        for &accepted in ["https://www.rust-lang.org/", "http://kernel.org"].iter() {
            assert!(is_valid_url(accepted));
        }

        let rejected_inputs = [
            "./index.html",
            "some-local-page.htm",
            "ftp://1.2.3.4/www/index.html",
            "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h",
        ];
        for &rejected in rejected_inputs.iter() {
            assert!(!is_valid_url(rejected));
        }
    }

    /// Table of (base, reference, expected absolute URL) triples.
    #[test]
    fn test_resolve_url() -> Result<(), ParseError> {
        let cases = [
            (
                "https://www.kernel.org",
                "../category/signatures.html",
                "https://www.kernel.org/category/signatures.html",
            ),
            (
                "https://www.kernel.org",
                "category/signatures.html",
                "https://www.kernel.org/category/signatures.html",
            ),
            (
                "saved_page.htm",
                "https://www.kernel.org/category/signatures.html",
                "https://www.kernel.org/category/signatures.html",
            ),
            (
                "https://www.kernel.org",
                "//www.kernel.org/theme/images/logos/tux.png",
                "https://www.kernel.org/theme/images/logos/tux.png",
            ),
            (
                "https://www.kernel.org",
                "//another-host.org/theme/images/logos/tux.png",
                "https://another-host.org/theme/images/logos/tux.png",
            ),
            (
                "https://www.kernel.org/category/signatures.html",
                "/theme/images/logos/tux.png",
                "https://www.kernel.org/theme/images/logos/tux.png",
            ),
            (
                "https://www.w3schools.com/html/html_iframe.asp",
                "default.asp",
                "https://www.w3schools.com/html/default.asp",
            ),
        ];

        for &(from, to, expected) in cases.iter() {
            let resolved_url = resolve_url(from, to)?;
            assert_eq!(resolved_url.as_str(), expected);
        }

        Ok(())
    }

    /// Only the `data:` scheme counts; unparsable input is treated as `false` here.
    #[test]
    fn test_is_data_url() {
        let embedded = "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h";
        assert!(is_data_url(embedded).unwrap_or(false));

        for &other in ["https://kernel.org", "//kernel.org"].iter() {
            assert!(!is_data_url(other).unwrap_or(false));
        }
    }
}

32
src/js.rs Normal file
View File

@@ -0,0 +1,32 @@
/// Attribute names treated as inline JavaScript event handlers, all lowercase.
const JS_DOM_EVENT_ATTRS: [&str; 21] = [
    // Input
    "onfocus",
    "onblur",
    "onselect",
    "onchange",
    "onsubmit",
    "onreset",
    "onkeydown",
    "onkeypress",
    "onkeyup",
    // Mouse
    "onmouseover",
    "onmouseout",
    "onmousedown",
    "onmouseup",
    "onmousemove",
    // Click
    "onclick",
    "ondblclick",
    // Load
    "onload",
    "onunload",
    "onabort",
    "onerror",
    "onresize",
];

/// Returns true if DOM attribute name matches a native JavaScript event handler.
///
/// The comparison ignores ASCII case only, so `onClick` and `ONCLICK` match while a
/// name containing a non-ASCII look-alike (e.g. U+212A KELVIN SIGN in place of `k`)
/// does not. No intermediate lowercase `String` is allocated.
pub fn attr_is_event_handler(attr_name: &str) -> bool {
    JS_DOM_EVENT_ATTRS
        .iter()
        .any(|handler| handler.eq_ignore_ascii_case(attr_name))
}

View File

@@ -1,10 +1,17 @@
extern crate html5ever;
#[macro_use]
extern crate lazy_static;
extern crate html5ever;
extern crate regex;
extern crate reqwest;
extern crate url;
#[macro_use]
mod macros;
pub mod html;
pub mod http;
pub mod js;
pub mod utils;
#[cfg(test)]
pub mod tests;

9
src/macros.rs Normal file
View File

@@ -0,0 +1,9 @@
/// Shorthand for producing an owned `String`.
///
/// * `str!()` expands to an empty `String`.
/// * `str!(value)` expands to `ToString::to_string(&value)`.
///
/// Standard-library items are named by absolute path so that the expansion does not
/// depend on what `String` or `ToString` happen to resolve to at the call site.
#[macro_export]
macro_rules! str {
    () => {
        ::std::string::String::new()
    };
    ($val: expr) => {
        ::std::string::ToString::to_string(&$val)
    };
}

View File

@@ -1,63 +1,70 @@
#[macro_use]
extern crate clap;
extern crate monolith;
extern crate reqwest;
use clap::{App, Arg};
use monolith::html::{html_to_dom, print_dom, walk_and_embed_assets};
use monolith::http::{is_valid_url, retrieve_asset};
mod args;
static DEFAULT_USER_AGENT: &str =
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0";
use args::AppArgs;
use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets};
use monolith::http::retrieve_asset;
use monolith::utils::is_valid_url;
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
use std::collections::HashMap;
use std::time::Duration;
// NOTE(review): rendered diff output - the removed clap-based revision and the added
// `AppArgs`-based revision of `main` are interleaved below, so this span does not
// compile as shown and has to be split per revision before any code change.
// Visible flow of the added revision: build a reqwest client (10 s timeout, configured
// User-Agent as a default header), fetch the target URL, embed its assets, then print
// the stringified document.
fn main() {
let command = App::new("monolith")
.version(crate_version!())
.author(crate_authors!("\n"))
.about(crate_description!())
.arg(
Arg::with_name("url")
.required(true)
.takes_value(true)
.index(1)
.help("URL to download"),
let app_args = AppArgs::get();
let cache = &mut HashMap::new();
if is_valid_url(app_args.url_target.as_str()) {
// Initialize client
let mut header_map = HeaderMap::new();
match HeaderValue::from_str(&app_args.user_agent) {
Ok(header) => header_map.insert(USER_AGENT, header),
Err(err) => {
eprintln!("Invalid user agent! {}", err);
return;
}
};
let client = reqwest::Client::builder()
.timeout(Duration::from_secs(10))
.danger_accept_invalid_certs(app_args.insecure)
.default_headers(header_map)
.build()
.expect("Failed to initialize HTTP client");
let (data, final_url) = retrieve_asset(
cache,
&client,
app_args.url_target.as_str(),
false,
"",
app_args.silent,
)
.args_from_usage("-i, --no-images 'Removes images'")
.args_from_usage("-j, --no-js 'Excludes JavaScript'")
.args_from_usage("-k, --insecure 'Accept invalid X.509 (TLS) certificates'")
.args_from_usage("-s, --silent 'Suppress verbosity'")
.args_from_usage("-u, --user-agent=[Iceweasel] 'Custom User-Agent string'")
.get_matches();
// Process the command
let arg_target = command.value_of("url").unwrap();
let opt_no_images = command.is_present("no-images");
let opt_no_js = command.is_present("no-js");
let opt_insecure = command.is_present("insecure");
let opt_silent = command.is_present("silent");
let opt_user_agent = command.value_of("user-agent").unwrap_or(DEFAULT_USER_AGENT);
if is_valid_url(arg_target) {
let data = retrieve_asset(
&arg_target,
false,
"",
opt_user_agent,
opt_silent,
opt_insecure,
).unwrap();
.unwrap();
let dom = html_to_dom(&data);
walk_and_embed_assets(
&arg_target,
cache,
&client,
&final_url,
&dom.document,
opt_no_js,
opt_no_images,
opt_user_agent,
opt_silent,
opt_insecure,
app_args.no_css,
app_args.no_js,
app_args.no_images,
app_args.silent,
app_args.no_frames,
);
print_dom(&dom.document);
println!(); // Ensure newline at end of output
let html: String = stringify_document(
&dom.document,
app_args.no_css,
app_args.no_frames,
app_args.no_js,
app_args.no_images,
app_args.isolate,
);
println!("{}", html);
}
}

479
src/tests/html.rs Normal file
View File

@@ -0,0 +1,479 @@
use crate::html::{
get_node_name, get_parent_node, html_to_dom, is_icon, stringify_document, walk_and_embed_assets,
};
use html5ever::rcdom::{Handle, NodeData};
use html5ever::serialize::{serialize, SerializeOpts};
use std::collections::HashMap;
/// `is_icon` accepts the icon-like `rel` values regardless of letter case and
/// rejects unrelated or empty values.
#[test]
fn test_is_icon() {
    let icon_rels = ["icon", "Shortcut Icon", "ICON", "mask-icon", "fluid-icon"];
    for &rel in icon_rels.iter() {
        assert_eq!(is_icon(rel), true);
    }

    let other_rels = ["stylesheet", ""];
    for &rel in other_rels.iter() {
        assert_eq!(is_icon(rel), false);
    }
}
/// Walks a parsed document, checks the parent reported for `head`, `body`, `div`
/// and `p`, and expects 7 nodes to be visited in total.
#[test]
fn test_get_parent_node_name() {
    /// Visits `node`, bumping `visited`, then recurses into the children of
    /// document and element nodes.
    fn test_walk(node: &Handle, visited: &mut i8) {
        *visited += 1;

        match &node.data {
            NodeData::Document => {}
            NodeData::Element { ref name, .. } => {
                let node_name = name.local.as_ref().to_string();
                let parent_node_name = get_node_name(&get_parent_node(node));

                let expected_parent = match node_name.as_str() {
                    "head" | "body" => Some("html"),
                    "div" => Some("body"),
                    "p" => Some("div"),
                    _ => None,
                };
                if let Some(expected) = expected_parent {
                    assert_eq!(parent_node_name, expected);
                }

                println!("{}", node_name);
            }
            // Any other node kind is counted but not descended into.
            _ => return,
        }

        for child in node.children.borrow().iter() {
            test_walk(child, visited);
        }
    }

    let markup = "<!doctype html><html><HEAD></HEAD><body><div><P></P></div></body></html>";
    let document = html_to_dom(&markup);
    let mut visited = 0;

    test_walk(&document.document, &mut visited);

    assert_eq!(visited, 7);
}
/// A fragment without assets is expected to serialize as a full document with
/// lowercased tag names.
#[test]
fn test_walk_and_embed_assets() {
    let mut asset_cache = HashMap::new();
    let markup = "<div><P></P></div>";
    let document = html_to_dom(&markup);
    let base_url = "http://localhost";
    let http_client = reqwest::Client::new();

    walk_and_embed_assets(
        &mut asset_cache,
        &http_client,
        &base_url,
        &document.document,
        false, // opt_no_css
        false, // opt_no_js
        false, // opt_no_images
        true,  // opt_silent
        false, // opt_no_frames
    );

    let mut serialized: Vec<u8> = Vec::new();
    serialize(&mut serialized, &document.document, SerializeOpts::default()).unwrap();
    let rendered: String = serialized.iter().map(|&byte| byte as char).collect();
    assert_eq!(
        rendered,
        "<html><head></head><body><div><p></p></div></body></html>"
    );
}
/// An `<iframe>` with an empty `src` is expected to be left with that empty `src`.
#[test]
fn test_walk_and_embed_assets_ensure_no_recursive_iframe() {
    let markup = "<div><P></P><iframe src=\"\"></iframe></div>";
    let document = html_to_dom(&markup);
    let base_url = "http://localhost";
    let mut asset_cache = HashMap::new();
    let http_client = reqwest::Client::new();

    walk_and_embed_assets(
        &mut asset_cache,
        &http_client,
        &base_url,
        &document.document,
        false, // opt_no_css
        false, // opt_no_js
        false, // opt_no_images
        true,  // opt_silent
        false, // opt_no_frames
    );

    let mut serialized: Vec<u8> = Vec::new();
    serialize(&mut serialized, &document.document, SerializeOpts::default()).unwrap();
    let rendered: String = serialized.iter().map(|&byte| byte as char).collect();
    assert_eq!(
        rendered,
        "<html><head></head><body><div><p></p><iframe src=\"\"></iframe></div></body></html>"
    );
}
/// With `opt_no_css` set, the stylesheet link's `href`, the `<style>` body and the
/// inline `style` attribute are all expected to be gone from the output.
#[test]
fn test_walk_and_embed_assets_no_css() {
    let markup = "<link rel=\"stylesheet\" href=\"main.css\">\
                  <style>html{background-color: #000;}</style>\
                  <div style=\"display: none;\"></div>";
    let document = html_to_dom(&markup);
    let base_url = "http://localhost";
    let mut asset_cache = HashMap::new();
    let http_client = reqwest::Client::new();

    walk_and_embed_assets(
        &mut asset_cache,
        &http_client,
        &base_url,
        &document.document,
        true,  // opt_no_css
        false, // opt_no_js
        false, // opt_no_images
        true,  // opt_silent
        false, // opt_no_frames
    );

    let mut serialized: Vec<u8> = Vec::new();
    serialize(&mut serialized, &document.document, SerializeOpts::default()).unwrap();
    let rendered: String = serialized.iter().map(|&byte| byte as char).collect();
    let expected = "<html>\
                    <head>\
                    <link rel=\"stylesheet\" href=\"\">\
                    <style></style>\
                    </head>\
                    <body>\
                    <div></div>\
                    </body>\
                    </html>";
    assert_eq!(rendered, expected);
}
/// With `opt_no_images` set, the icon link's `href` is expected to be emptied and
/// the `<img>` source replaced by a base64 PNG data URL.
#[test]
fn test_walk_and_embed_assets_no_images() {
    let markup = "<link rel=\"icon\" href=\"favicon.ico\">\
                  <div><img src=\"http://localhost/assets/mono_lisa.png\" /></div>";
    let document = html_to_dom(&markup);
    let base_url = "http://localhost";
    let mut asset_cache = HashMap::new();
    let http_client = reqwest::Client::new();

    walk_and_embed_assets(
        &mut asset_cache,
        &http_client,
        &base_url,
        &document.document,
        false, // opt_no_css
        false, // opt_no_js
        true,  // opt_no_images
        true,  // opt_silent
        false, // opt_no_frames
    );

    let mut serialized: Vec<u8> = Vec::new();
    serialize(&mut serialized, &document.document, SerializeOpts::default()).unwrap();
    let rendered: String = serialized.iter().map(|&byte| byte as char).collect();
    let expected = "<html>\
                    <head>\
                    <link rel=\"icon\" href=\"\">\
                    </head>\
                    <body>\
                    <div>\
                    <img src=\"data:image/png;base64,\
                    iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0\
                    lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=\">\
                    </div>\
                    </body>\
                    </html>";
    assert_eq!(rendered, expected);
}
/// With `opt_no_frames` set, the `<iframe>`'s `src` is expected to be emptied.
#[test]
fn test_walk_and_embed_assets_no_frames() {
    let markup = "<iframe src=\"http://trackbook.com\"></iframe>";
    let document = html_to_dom(&markup);
    let base_url = "http://localhost";
    let mut asset_cache = HashMap::new();
    let http_client = reqwest::Client::new();

    walk_and_embed_assets(
        &mut asset_cache,
        &http_client,
        &base_url,
        &document.document,
        false, // opt_no_css
        false, // opt_no_js
        false, // opt_no_images
        true,  // opt_silent
        true,  // opt_no_frames
    );

    let mut serialized: Vec<u8> = Vec::new();
    serialize(&mut serialized, &document.document, SerializeOpts::default()).unwrap();
    let rendered: String = serialized.iter().map(|&byte| byte as char).collect();
    assert_eq!(
        rendered,
        "<html><head></head><body><iframe src=\"\"></iframe></body></html>"
    );
}
/// With `opt_no_js` set, the `onClick` attribute, the external script's `src` and
/// the inline script's body are all expected to be gone from the output.
#[test]
fn test_walk_and_embed_assets_no_js() {
    let markup = "<div onClick=\"void(0)\">\
                  <script src=\"http://localhost/assets/some.js\"></script>\
                  <script>alert(1)</script>\
                  </div>";
    let document = html_to_dom(&markup);
    let base_url = "http://localhost";
    let mut asset_cache = HashMap::new();
    let http_client = reqwest::Client::new();

    walk_and_embed_assets(
        &mut asset_cache,
        &http_client,
        &base_url,
        &document.document,
        false, // opt_no_css
        true,  // opt_no_js
        false, // opt_no_images
        true,  // opt_silent
        false, // opt_no_frames
    );

    let mut serialized: Vec<u8> = Vec::new();
    serialize(&mut serialized, &document.document, SerializeOpts::default()).unwrap();
    let rendered: String = serialized.iter().map(|&byte| byte as char).collect();
    let expected = "<html><head></head><body><div><script src=\"\"></script>\
                    <script></script></div></body></html>";
    assert_eq!(rendered, expected);
}
/// With every flag off, no CSP `<meta>` element is expected in the output.
#[test]
fn test_stringify_document() {
    let markup = "<div><script src=\"some.js\"></script></div>";
    let document = html_to_dom(&markup);

    let rendered = stringify_document(
        &document.document,
        false, // opt_no_css
        false, // opt_no_frames
        false, // opt_no_js
        false, // opt_no_images
        false, // opt_isolate
    );

    assert_eq!(
        rendered,
        "<html><head></head><body><div><script src=\"some.js\"></script></div></body></html>"
    );
}
/// With `opt_isolate` set, a `default-src` CSP `<meta>` is expected as the first
/// child of `<head>`, ahead of the document's own CSP `<meta>`.
#[test]
fn test_stringify_document_isolate() {
    let markup = "<title>Isolated document</title>\
                  <link rel=\"something\" href=\"some.css\" />\
                  <meta http-equiv=\"Content-Security-Policy\" content=\"default-src https:\">\
                  <div><script src=\"some.js\"></script></div>";
    let document = html_to_dom(&markup);

    let rendered = stringify_document(
        &document.document,
        false, // opt_no_css
        false, // opt_no_frames
        false, // opt_no_js
        false, // opt_no_images
        true,  // opt_isolate
    );

    let expected = "<html>\
                    <head>\
                    <meta http-equiv=\"Content-Security-Policy\" content=\"default-src 'unsafe-inline' data:;\"></meta>\
                    <title>Isolated document</title>\
                    <link rel=\"something\" href=\"some.css\">\
                    <meta http-equiv=\"Content-Security-Policy\" content=\"default-src https:\">\
                    </head>\
                    <body>\
                    <div>\
                    <script src=\"some.js\"></script>\
                    </div>\
                    </body>\
                    </html>";
    assert_eq!(rendered, expected);
}
/// With `opt_no_css` set, a `style-src 'none';` CSP `<meta>` is expected as the
/// first child of `<head>`; the rest of the markup is left as parsed.
#[test]
fn test_stringify_document_no_css() {
    let markup = "<!doctype html>\
                  <title>Unstyled document</title>\
                  <link rel=\"stylesheet\" href=\"main.css\"/>\
                  <div style=\"display: none;\"></div>";
    let document = html_to_dom(&markup);

    let rendered = stringify_document(
        &document.document,
        true,  // opt_no_css
        false, // opt_no_frames
        false, // opt_no_js
        false, // opt_no_images
        false, // opt_isolate
    );

    let expected = "<!DOCTYPE html>\
                    <html>\
                    <head>\
                    <meta http-equiv=\"Content-Security-Policy\" content=\"style-src 'none';\"></meta>\
                    <title>Unstyled document</title>\
                    <link rel=\"stylesheet\" href=\"main.css\">\
                    </head>\
                    <body><div style=\"display: none;\"></div></body>\
                    </html>";
    assert_eq!(rendered, expected);
}
/// With `opt_no_frames` set, a `frame-src`/`child-src` CSP `<meta>` is expected as
/// the first child of `<head>`.
#[test]
fn test_stringify_document_no_frames() {
    let markup = "<!doctype html>\
                  <title>Frameless document</title>\
                  <link rel=\"something\"/>\
                  <div><script src=\"some.js\"></script></div>";
    let document = html_to_dom(&markup);

    let rendered = stringify_document(
        &document.document,
        false, // opt_no_css
        true,  // opt_no_frames
        false, // opt_no_js
        false, // opt_no_images
        false, // opt_isolate
    );

    let expected = "<!DOCTYPE html>\
                    <html>\
                    <head>\
                    <meta http-equiv=\"Content-Security-Policy\" content=\"frame-src 'none';child-src 'none';\"></meta>\
                    <title>Frameless document</title>\
                    <link rel=\"something\">\
                    </head>\
                    <body><div><script src=\"some.js\"></script></div></body>\
                    </html>";
    assert_eq!(rendered, expected);
}
/// With every flag set, a single CSP `<meta>` carrying all five directive groups is
/// expected as the first child of `<head>`.
#[test]
fn test_stringify_document_isolate_no_frames_no_js_no_css_no_images() {
    let markup = "<!doctype html>\
                  <title>no-frame no-css no-js no-image isolated document</title>\
                  <meta http-equiv=\"Content-Security-Policy\" content=\"default-src https:\">\
                  <link rel=\"stylesheet\" href=\"some.css\">\
                  <div>\
                  <script src=\"some.js\"></script>\
                  <img style=\"width: 100%;\" src=\"some.png\" />\
                  <iframe src=\"some.html\"></iframe>\
                  </div>";
    let document = html_to_dom(&markup);

    let rendered = stringify_document(
        &document.document,
        true, // opt_no_css
        true, // opt_no_frames
        true, // opt_no_js
        true, // opt_no_images
        true, // opt_isolate
    );

    let expected = "<!DOCTYPE html>\
                    <html>\
                    <head>\
                    <meta http-equiv=\"Content-Security-Policy\" content=\"default-src \'unsafe-inline\' data:; style-src \'none\'; frame-src \'none\';child-src \'none\'; script-src \'none\'; img-src data:;\"></meta>\
                    <title>no-frame no-css no-js no-image isolated document</title>\
                    <meta http-equiv=\"Content-Security-Policy\" content=\"default-src https:\">\
                    <link rel=\"stylesheet\" href=\"some.css\">\
                    </head>\
                    <body>\
                    <div>\
                    <script src=\"some.js\"></script>\
                    <img style=\"width: 100%;\" src=\"some.png\">\
                    <iframe src=\"some.html\"></iframe>\
                    </div>\
                    </body>\
                    </html>";
    assert_eq!(rendered, expected);
}

23
src/tests/http.rs Normal file
View File

@@ -0,0 +1,23 @@
use crate::http::retrieve_asset;
use std::collections::HashMap;
/// A `data:` URL is expected to come back unchanged as both the data and the final
/// URL, with or without a MIME hint.
#[test]
fn test_retrieve_asset() {
    let data_url = "data:text/html;base64,...";
    let mut asset_cache = HashMap::new();
    let http_client = reqwest::Client::new();

    for &mime_hint in ["", "image/png"].iter() {
        let (data, final_url) = retrieve_asset(
            &mut asset_cache,
            &http_client,
            data_url,
            true,
            mime_hint,
            false,
        )
        .unwrap();

        assert_eq!(&data, data_url);
        assert_eq!(&final_url, data_url);
    }
}

13
src/tests/js.rs Normal file
View File

@@ -0,0 +1,13 @@
use crate::js::attr_is_event_handler;
/// Event-handler names match in any letter case; other names and the empty
/// string do not.
#[test]
fn test_attr_is_event_handler() {
    // succeeding
    for &handler_name in ["onBlur", "onclick", "onClick"].iter() {
        assert!(attr_is_event_handler(handler_name));
    }

    // failing
    for &other_name in ["href", "", "class"].iter() {
        assert!(!attr_is_event_handler(other_name));
    }
}

4
src/tests/mod.rs Normal file
View File

@@ -0,0 +1,4 @@
mod html;
mod http;
mod js;
mod utils;

160
src/tests/utils.rs Normal file
View File

@@ -0,0 +1,160 @@
use crate::utils::{
data_to_dataurl, detect_mimetype, is_data_url, is_valid_url, resolve_url, url_has_protocol,
};
use url::ParseError;
/// A JavaScript snippet with an explicit MIME type is expected to become the
/// matching base64 data URL.
#[test]
fn test_data_to_dataurl() {
    let source = "var word = 'hello';\nalert(word);\n";
    let encoded = data_to_dataurl("application/javascript", source.as_bytes());

    let expected =
        "data:application/javascript;base64,dmFyIHdvcmQgPSAnaGVsbG8nOwphbGVydCh3b3JkKTsK";
    assert_eq!(&encoded, expected);
}
/// Each leading byte signature is expected to map to its MIME type.
#[test]
fn test_detect_mimetype() {
    // Asserts that `bytes` is detected as `mime`.
    let expect = |bytes: &[u8], mime: &str| assert_eq!(detect_mimetype(bytes), mime);

    // image
    expect(b"GIF87a", "image/gif");
    expect(b"GIF89a", "image/gif");
    expect(b"\xFF\xD8\xFF", "image/jpeg");
    expect(b"\x89PNG\x0D\x0A\x1A\x0A", "image/png");
    expect(b"<?xml ", "image/svg+xml");
    expect(b"<svg ", "image/svg+xml");
    expect(b"RIFF....WEBPVP8 ", "image/webp");
    expect(b"\x00\x00\x01\x00", "image/x-icon");

    // audio
    expect(b"ID3", "audio/mpeg");
    expect(b"\xFF\x0E", "audio/mpeg");
    expect(b"\xFF\x0F", "audio/mpeg");
    expect(b"OggS", "audio/ogg");
    expect(b"RIFF....WAVEfmt ", "audio/wav");
    expect(b"fLaC", "audio/x-flac");

    // video
    expect(b"RIFF....AVI LIST", "video/avi");
    expect(b"....ftyp", "video/mp4");
    expect(b"\x00\x00\x01\x0B", "video/mpeg");
    expect(b"....moov", "video/quicktime");
    expect(b"\x1A\x45\xDF\xA3", "video/webm");
}
/// Strings with a leading `scheme:` are accepted in any letter case;
/// scheme-relative, bare and empty strings are rejected.
#[test]
fn test_url_has_protocol() {
    // succeeding
    let with_protocol = [
        "mailto:somebody@somewhere.com?subject=hello",
        "tel:5551234567",
        "ftp:user:password@some-ftp-server.com",
        "javascript:void(0)",
        "http://news.ycombinator.com",
        "https://github.com",
        "MAILTO:somebody@somewhere.com?subject=hello",
    ];
    for &url in with_protocol.iter() {
        assert_eq!(url_has_protocol(url), true);
    }

    // failing
    let without_protocol = [
        "//some-hostname.com/some-file.html",
        "some-hostname.com/some-file.html",
        "/some-file.html",
        "",
    ];
    for &url in without_protocol.iter() {
        assert_eq!(url_has_protocol(url), false);
    }
}
/// Only strings starting with `http://` or `https://` are accepted.
#[test]
fn test_is_valid_url() {
    // succeeding
    for &accepted in ["https://www.rust-lang.org/", "http://kernel.org"].iter() {
        assert!(is_valid_url(accepted));
    }

    // failing
    let rejected_inputs = [
        "//kernel.org",
        "./index.html",
        "some-local-page.htm",
        "ftp://1.2.3.4/www/index.html",
        "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h",
    ];
    for &rejected in rejected_inputs.iter() {
        assert!(!is_valid_url(rejected));
    }
}
/// Table of (base, reference, expected absolute URL) triples.
#[test]
fn test_resolve_url() -> Result<(), ParseError> {
    let cases = [
        (
            "https://www.kernel.org",
            "../category/signatures.html",
            "https://www.kernel.org/category/signatures.html",
        ),
        (
            "https://www.kernel.org",
            "category/signatures.html",
            "https://www.kernel.org/category/signatures.html",
        ),
        (
            "saved_page.htm",
            "https://www.kernel.org/category/signatures.html",
            "https://www.kernel.org/category/signatures.html",
        ),
        (
            "https://www.kernel.org",
            "//www.kernel.org/theme/images/logos/tux.png",
            "https://www.kernel.org/theme/images/logos/tux.png",
        ),
        (
            "https://www.kernel.org",
            "//another-host.org/theme/images/logos/tux.png",
            "https://another-host.org/theme/images/logos/tux.png",
        ),
        (
            "https://www.kernel.org/category/signatures.html",
            "/theme/images/logos/tux.png",
            "https://www.kernel.org/theme/images/logos/tux.png",
        ),
        (
            "https://www.w3schools.com/html/html_iframe.asp",
            "default.asp",
            "https://www.w3schools.com/html/default.asp",
        ),
    ];

    for &(from, to, expected) in cases.iter() {
        let resolved_url = resolve_url(from, to)?;
        assert_eq!(resolved_url.as_str(), expected);
    }

    Ok(())
}
/// Only the `data:` scheme counts; unparsable input is treated as `false` here.
#[test]
fn test_is_data_url() {
    // succeeding
    let embedded = "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h";
    assert!(is_data_url(embedded).unwrap_or(false));

    // failing
    for &other in ["https://kernel.org", "//kernel.org", ""].iter() {
        assert!(!is_data_url(other).unwrap_or(false));
    }
}

View File

@@ -1,8 +1,50 @@
extern crate base64;
use self::base64::encode;
use http::retrieve_asset;
use regex::Regex;
use reqwest::Client;
use std::collections::HashMap;
use url::{ParseError, Url};
static MAGIC: [[&[u8]; 2]; 19] = [
/// This monster of a regex is used to match any kind of URL found in CSS.
///
/// There are roughly three different categories that a found URL could fit
/// into:
/// - Font [found after a `src:` property in an `@font-face` rule]
/// - Stylesheet [denoted by an `@import` before the url]
/// - Image [covers all other uses of the `url()` function]
///
/// This regex aims to extract the following information:
/// - What type of URL it is (font/image/css)
/// - Where the part that needs to be replaced is (incl. any wrapping quotes)
/// - What the URL is (excl. any wrapping quotes)
///
/// Essentially, the regex can be broken down into two parts:
///
/// `(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?`
/// This matches the precursor to a font or CSS URL, and fills in a match under
/// either `<stylesheet>` (if it's a CSS URL) or `<font>` (if it's a font).
/// Determining whether or not it's an image can be done by the negation of both
/// of these. Either zero or one of these can match.
///
/// `url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)`
/// This matches the actual URL part of the `url()`, and must always match. It also
/// sets `<to_repl>` and `<url>`, which correspond to everything within
/// `url(...)` and a usable URL, respectively.
///
/// Note, however, that this does not perform any validation of the found URL.
/// Malformed CSS could lead to an invalid URL being present. It is therefore
/// recommended that the URL gets manually validated.
const CSS_URL_REGEX_STR: &str = r###"(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)"###;
lazy_static! {
// Matches a leading run of lowercase ASCII letters/digits followed by `:`.
static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();
// Matches strings that start with an `http://` or `https://` prefix.
static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();
// Compiled form of `CSS_URL_REGEX_STR`.
static ref REGEX_CSS_URL: Regex = Regex::new(CSS_URL_REGEX_STR).unwrap();
}
const MAGIC: [[&[u8]; 2]; 19] = [
// Image
[b"GIF87a", b"image/gif"],
[b"GIF89a", b"image/gif"],
@@ -28,7 +70,7 @@ static MAGIC: [[&[u8]; 2]; 19] = [
];
pub fn data_to_dataurl(mime: &str, data: &[u8]) -> String {
let mimetype = if mime == "" {
let mimetype = if mime.is_empty() {
detect_mimetype(data)
} else {
mime.to_string()
@@ -36,57 +78,121 @@ pub fn data_to_dataurl(mime: &str, data: &[u8]) -> String {
format!("data:{};base64,{}", mimetype, encode(data))
}
// NOTE(review): rendered diff output - the removed revision (private, accumulates into
// `re` and breaks) and the added revision (public, returns early) are interleaved below,
// so this span does not compile as shown.
// Both revisions scan `MAGIC` for the first entry whose signature `data` starts with and
// yield that entry's MIME type, or an empty string when nothing matches.
fn detect_mimetype(data: &[u8]) -> String {
let mut re = String::new();
for item in MAGIC.iter() {
pub fn detect_mimetype(data: &[u8]) -> String {
for item in MAGIC.iter() {
if data.starts_with(item[0]) {
re = String::from_utf8(item[1].to_vec()).unwrap();
break;
return String::from_utf8(item[1].to_vec()).unwrap();
}
}
re
"".to_owned()
}
#[cfg(test)]
mod tests {
use super::*;
/// Returns `true` when the lowercased `url` matches `HAS_PROTOCOL`.
pub fn url_has_protocol<T: AsRef<str>>(url: T) -> bool {
    let lowered = url.as_ref().to_lowercase();
    HAS_PROTOCOL.is_match(lowered.as_str())
}
#[test]
fn test_data_to_dataurl() {
let mime = "application/javascript";
let data = "var word = 'hello';\nalert(word);\n";
let datauri = data_to_dataurl(mime, data.as_bytes());
assert_eq!(
&datauri,
"data:application/javascript;base64,dmFyIHdvcmQgPSAnaGVsbG8nOwphbGVydCh3b3JkKTsK"
);
/// Reports whether `url` parses as a URL whose scheme is `data`.
///
/// A string that cannot be parsed yields the parser's error unchanged.
pub fn is_data_url<T: AsRef<str>>(url: T) -> Result<bool, ParseError> {
    let parsed = Url::parse(url.as_ref())?;
    Ok(parsed.scheme() == "data")
}
/// Returns `true` when `path` matches `REGEX_URL`.
pub fn is_valid_url<T: AsRef<str>>(path: T) -> bool {
    let candidate: &str = path.as_ref();
    REGEX_URL.is_match(candidate)
}
/// Resolves `to` against the base URL `from`.
///
/// When `to` already passes `is_valid_url` it is returned as-is and `from` is
/// never parsed; otherwise `from` is parsed and `to` is joined onto it.
pub fn resolve_url<T: AsRef<str>, U: AsRef<str>>(from: T, to: U) -> Result<String, ParseError> {
    let target: &str = to.as_ref();
    if is_valid_url(target) {
        return Ok(target.to_string());
    }

    let base = Url::parse(from.as_ref())?;
    let joined = base.join(target)?;
    let joined_str: &str = joined.as_ref();
    Ok(joined_str.to_string())
}
/// Inlines the resources referenced from a CSS string.
///
/// Every `REGEX_CSS_URL` match in `css_string` is resolved against `href`, and its `to_repl`
/// span is replaced by a double-quoted string holding either the retrieved content or, as a
/// fallback, the absolute URL:
/// - matches with a `stylesheet` group are retrieved as `text/css`, processed recursively
///   (with the stylesheet's own URL as the new `href`) and embedded as a data URL;
/// - matches with a `font` group, and all remaining matches unless `opt_no_images` is set,
///   are retrieved with the data-URL flag set and an empty MIME hint;
/// - anything else, and any asset whose retrieval fails, keeps its absolute URL.
///
/// Matches whose URL cannot be resolved against `href` are left untouched.
///
/// Returns the rewritten CSS, passed through `data_to_dataurl` as `text/css` when
/// `as_dataurl` is true. `cache`, `client` and `opt_silent` are handed on to `retrieve_asset`.
pub fn resolve_css_imports(
cache: &mut HashMap<String, String>,
client: &Client,
css_string: &str,
as_dataurl: bool,
href: &str,
opt_no_images: bool,
opt_silent: bool,
) -> String {
// Work on a copy; matches are located in the untouched `css_string`
let mut resolved_css = String::from(css_string);
for link in REGEX_CSS_URL.captures_iter(&css_string) {
// NOTE(review): unwrap assumes the `url` group takes part in every match; confirm against the pattern
let target_link = link.name("url").unwrap().as_str();
// Determine the type of link: anything that is neither a stylesheet nor a font is
// treated as an image
let is_stylesheet = link.name("stylesheet").is_some();
let is_font = link.name("font").is_some();
let is_image = !is_stylesheet && !is_font;
// Generate absolute URL for content
let embedded_url = match resolve_url(href, target_link) {
Ok(url) => url,
Err(_) => continue, // Malformed URL: leave this match as it is
};
// Download the asset. If it's more CSS, resolve that too
let content = if is_stylesheet {
// The link is an @import link
retrieve_asset(
cache,
client,
&embedded_url,
false, // Formatting as a data URL is done later, by the recursive call
"text/css", // Expect CSS
opt_silent,
)
.map(|(content, _)| {
resolve_css_imports(
cache,
client,
&content,
true, // Finally, convert to a dataurl
&embedded_url,
opt_no_images,
opt_silent,
)
})
} else if (is_image && !opt_no_images) || is_font {
// The link is some other, non-@import link
retrieve_asset(
cache,
client,
&embedded_url,
true, // Format as data URL
"", // Unknown MIME type
opt_silent,
)
.map(|(a, _)| a)
} else {
// If it's a datatype that has been opt_no'd out of, replace with
// absolute URL
Ok(embedded_url.clone())
}
.unwrap_or_else(|e| {
// NOTE(review): this warning is printed without consulting opt_silent; confirm that is intended
eprintln!("Warning: {}", e);
// If failed to resolve, replace with absolute URL
embedded_url
});
// Wrap the result in double quotes and splice it over the matched `to_repl` span
let replacement = format!("\"{}\"", &content);
let dest = link.name("to_repl").unwrap();
// `dest` is positioned relative to `css_string`; shift it by how much `resolved_css`
// has grown through earlier replacements.
// NOTE(review): this is an unsigned subtraction, so it presumably relies on replacements
// never making the text shorter than the input; confirm.
let offset = resolved_css.len() - css_string.len();
let target_range = (dest.start() + offset)..(dest.end() + offset);
resolved_css.replace_range(target_range, &replacement);
}
// NOTE(review): the #[test] function below appears interleaved with the body of
// resolve_css_imports; this looks like diff residue. Confirm against the real file.
// It checks detect_mimetype against leading-byte samples for image, audio and video types.
#[test]
fn test_detect_mimetype() {
// Image
assert_eq!(detect_mimetype(b"GIF87a"), "image/gif");
assert_eq!(detect_mimetype(b"GIF89a"), "image/gif");
assert_eq!(detect_mimetype(b"\xFF\xD8\xFF"), "image/jpeg");
assert_eq!(detect_mimetype(b"\x89PNG\x0D\x0A\x1A\x0A"), "image/png");
assert_eq!(detect_mimetype(b"<?xml "), "image/svg+xml");
assert_eq!(detect_mimetype(b"<svg "), "image/svg+xml");
assert_eq!(detect_mimetype(b"RIFF....WEBPVP8 "), "image/webp");
assert_eq!(detect_mimetype(b"\x00\x00\x01\x00"), "image/x-icon");
// Audio
assert_eq!(detect_mimetype(b"ID3"), "audio/mpeg");
assert_eq!(detect_mimetype(b"\xFF\x0E"), "audio/mpeg");
assert_eq!(detect_mimetype(b"\xFF\x0F"), "audio/mpeg");
assert_eq!(detect_mimetype(b"OggS"), "audio/ogg");
assert_eq!(detect_mimetype(b"RIFF....WAVEfmt "), "audio/wav");
assert_eq!(detect_mimetype(b"fLaC"), "audio/x-flac");
// Video
assert_eq!(detect_mimetype(b"RIFF....AVI LIST"), "video/avi");
assert_eq!(detect_mimetype(b"....ftyp"), "video/mp4");
assert_eq!(detect_mimetype(b"\x00\x00\x01\x0B"), "video/mpeg");
assert_eq!(detect_mimetype(b"....moov"), "video/quicktime");
assert_eq!(detect_mimetype(b"\x1A\x45\xDF\xA3"), "video/webm");
// Optionally wrap the rewritten stylesheet itself in a text/css data URL
if as_dataurl {
data_to_dataurl("text/css", resolved_css.as_bytes())
} else {
resolved_css
}
}