Compare commits
75 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5ba8931502 | ||
|
|
13d2ea1607 | ||
|
|
88ffde0c3b | ||
|
|
bfb97bd062 | ||
|
|
295931041c | ||
|
|
2e623dd9f8 | ||
|
|
169b9657e5 | ||
|
|
dab4ae6965 | ||
|
|
c7fc121c7c | ||
|
|
9ff9dd0928 | ||
|
|
b0fc24d77f | ||
|
|
d8abfaf25c | ||
|
|
565acdef25 | ||
|
|
4637fed15c | ||
|
|
9a7ea4fdde | ||
|
|
3d4a932ac1 | ||
|
|
cf70383165 | ||
|
|
9fe913d853 | ||
|
|
862489e41b | ||
|
|
919e626b5e | ||
|
|
cf347e0483 | ||
|
|
322ab41b8c | ||
|
|
1a7336e809 | ||
|
|
65d0eab793 | ||
|
|
292221ea28 | ||
|
|
614af44c92 | ||
|
|
feb37f5812 | ||
|
|
028beb821c | ||
|
|
76ccff80f9 | ||
|
|
45335d7507 | ||
|
|
a4743ca92f | ||
|
|
b96a777e8a | ||
|
|
4decea716c | ||
|
|
695a787206 | ||
|
|
90e6cb1c45 | ||
|
|
7412d663e0 | ||
|
|
8646af6e9f | ||
|
|
de383c94b1 | ||
|
|
ab65b44f0d | ||
|
|
13bacb4320 | ||
|
|
d574e9a5da | ||
|
|
1de0fc0961 | ||
|
|
ebbf755e09 | ||
|
|
d3956a7905 | ||
|
|
ef7ddcd434 | ||
|
|
11bbfc0851 | ||
|
|
a2bf7e3345 | ||
|
|
35f5e1353d | ||
|
|
f8040f4d8c | ||
|
|
31d3fee626 | ||
|
|
178abd07bd | ||
|
|
491185e191 | ||
|
|
b0c55d5016 | ||
|
|
1ff5e91087 | ||
|
|
550e4cc83f | ||
|
|
5443c0cc3f | ||
|
|
8add3a8746 | ||
|
|
2f592d5561 | ||
|
|
55fe523a1c | ||
|
|
b5d42bd722 | ||
|
|
cbf3b66f33 | ||
|
|
2e48ea90e1 | ||
|
|
9c006f3258 | ||
|
|
ab24851b5b | ||
|
|
de11559efa | ||
|
|
dbacd76103 | ||
|
|
0896f2e214 | ||
|
|
b6ba22513d | ||
|
|
3948ea3aa0 | ||
|
|
8b3f3f3a6e | ||
|
|
eec05767cf | ||
|
|
c05dc2ae65 | ||
|
|
88a230872c | ||
|
|
ac79a52da0 | ||
|
|
04cbbefafa |
7
.gitignore
vendored
7
.gitignore
vendored
@@ -2,9 +2,8 @@
|
||||
# will have compiled files and executables
|
||||
/target/
|
||||
|
||||
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
|
||||
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
|
||||
Cargo.lock
|
||||
|
||||
# These are backup files generated by rustfmt
|
||||
**/*.rs.bk
|
||||
|
||||
# Exclude accidental HTML files
|
||||
*.html
|
||||
|
||||
24
.travis.yml
24
.travis.yml
@@ -4,14 +4,24 @@ cache: cargo
|
||||
sudo: false
|
||||
|
||||
os:
|
||||
- linux
|
||||
- osx
|
||||
- linux
|
||||
- osx
|
||||
|
||||
rust:
|
||||
- stable
|
||||
- beta
|
||||
- nightly
|
||||
- stable
|
||||
- beta
|
||||
- nightly
|
||||
|
||||
before_script:
|
||||
- rustup component add rustfmt
|
||||
|
||||
script:
|
||||
- cargo build --verbose
|
||||
- cargo test --verbose
|
||||
- cargo build --all --locked --verbose
|
||||
- cargo test --all --locked --verbose
|
||||
- cargo fmt --all -- --check
|
||||
|
||||
jobs:
|
||||
allow_failures:
|
||||
- rust: beta
|
||||
- rust: nightly
|
||||
fast_finish: true
|
||||
|
||||
1724
Cargo.lock
generated
Normal file
1724
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
11
Cargo.toml
11
Cargo.toml
@@ -1,18 +1,19 @@
|
||||
[package]
|
||||
name = "monolith"
|
||||
version = "2.0.16"
|
||||
version = "2.1.1"
|
||||
authors = [
|
||||
"Sunshine <sunshine@uberspace.net>",
|
||||
"Mahdi Robatipoor <mahdi.robatipoor@gmail.com>",
|
||||
"Emmanuel Delaborde <th3rac25@gmail.com>",
|
||||
"Emi Simpson <emi@alchemi.dev>",
|
||||
]
|
||||
description = "CLI tool for saving web pages as a single HTML file"
|
||||
|
||||
[dependencies]
|
||||
base64 = "0.10.1"
|
||||
clap = "2.33.0"
|
||||
html5ever = "0.24.0"
|
||||
indicatif = "0.11.0"
|
||||
lazy_static = "1.3.0"
|
||||
regex = "1.2.1"
|
||||
html5ever = "0.24.1"
|
||||
lazy_static = "1.4.0"
|
||||
regex = "1.3.1"
|
||||
reqwest = "0.9.20"
|
||||
url = "2.1.0"
|
||||
|
||||
16
Makefile
Normal file
16
Makefile
Normal file
@@ -0,0 +1,16 @@
|
||||
.PHONY: all build install run test lint
|
||||
|
||||
all: test build
|
||||
|
||||
build:
|
||||
@cargo build --locked
|
||||
|
||||
install:
|
||||
@cargo install --force --locked --path .
|
||||
|
||||
test:
|
||||
@cargo test --locked
|
||||
@cargo fmt --all -- --check
|
||||
|
||||
lint:
|
||||
@cargo fmt --all --
|
||||
35
README.md
35
README.md
@@ -1,7 +1,15 @@
|
||||
[](https://travis-ci.org/Y2Z/monolith)
|
||||
[](https://ci.appveyor.com/project/vflyson/monolith)
|
||||
[](https://ci.appveyor.com/project/snshn/monolith/branch/master)
|
||||
|
||||
# monolith
|
||||
```
|
||||
___ ___________ __________ ___________________ ___
|
||||
| \ / \ | | | | | |
|
||||
| \_/ __ \_| __ | | ___ ___ |__| |
|
||||
| | | | | | | | | | | |
|
||||
| |__| _ |__| |____| | | | | __ |
|
||||
| |\_/| | \ | | | | | | |
|
||||
|___| |__________| \____________________| |___| |___| |___|
|
||||
```
|
||||
|
||||
A data hoarder's dream come true: bundle any web page into a single HTML file.
|
||||
You can finally replace that gazillion of open tabs with a gazillion of .html files stored somewhere on your precious little drive.
|
||||
@@ -10,24 +18,35 @@ Unlike the conventional "Save page as", `monolith` not only saves the target doc
|
||||
|
||||
If compared to saving websites with `wget -mpk`, this tool embeds all assets as data URLs and therefore lets browsers render the saved page exactly the way it was on the Internet, even when no network connection is available.
|
||||
|
||||
<!-- `This program works both on remote and local targets. -->
|
||||
## Installation
|
||||
|
||||
### Installation
|
||||
### From source
|
||||
$ git clone https://github.com/Y2Z/monolith.git
|
||||
$ cd monolith
|
||||
$ cargo install --path .
|
||||
|
||||
### Usage
|
||||
$ monolith https://lyrics.github.io/db/p/portishead/dummy/roads/ > portishead-roads-lyrics.html
|
||||
### On macOS (via Homebrew)
|
||||
$ brew install monolith
|
||||
|
||||
### Options
|
||||
## Usage
|
||||
$ monolith https://lyrics.github.io/db/p/portishead/dummy/roads/ -o portishead-roads-lyrics.html
|
||||
|
||||
## Options
|
||||
- `-c`: Ignore styles
|
||||
- `-f`: Exclude iframes
|
||||
- `-i`: Remove images
|
||||
- `-I`: Isolate document
|
||||
- `-j`: Exclude JavaScript
|
||||
- `-k`: Accept invalid X.509 (TLS) certificates
|
||||
- `-o`: Write output to file
|
||||
- `-s`: Silent mode
|
||||
- `-u`: Specify custom User-Agent
|
||||
|
||||
### License
|
||||
## Related projects
|
||||
- `Pagesaver`: https://github.com/distributed-mind/pagesaver
|
||||
- `SingleFile`: https://github.com/gildas-lormeau/SingleFile
|
||||
|
||||
## License
|
||||
The Unlicense
|
||||
|
||||
<!-- Microtext -->
|
||||
|
||||
@@ -94,6 +94,7 @@ environment:
|
||||
# or test failure in the matching channels/targets from failing the entire build.
|
||||
matrix:
|
||||
allow_failures:
|
||||
- channel: beta
|
||||
- channel: nightly
|
||||
|
||||
# If you only care about stable channel build failures, uncomment the following line:
|
||||
@@ -114,6 +115,7 @@ install:
|
||||
- if defined MINGW_PATH set PATH=%PATH%;%MINGW_PATH%
|
||||
- rustc -vV
|
||||
- cargo -vV
|
||||
- rustup component add rustfmt
|
||||
|
||||
## Build Script ##
|
||||
|
||||
@@ -125,4 +127,5 @@ build: false
|
||||
# directly or perform other testing commands. Rust will automatically be placed in the PATH
|
||||
# environment variable.
|
||||
test_script:
|
||||
- cargo test --verbose %cargoflags%
|
||||
- cargo test --verbose %cargoflags%
|
||||
- cargo fmt --all -- --check
|
||||
|
||||
65
src/args.rs
Normal file
65
src/args.rs
Normal file
@@ -0,0 +1,65 @@
|
||||
use clap::{App, Arg};
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct AppArgs {
|
||||
pub url_target: String,
|
||||
pub no_css: bool,
|
||||
pub no_frames: bool,
|
||||
pub no_images: bool,
|
||||
pub no_js: bool,
|
||||
pub insecure: bool,
|
||||
pub isolate: bool,
|
||||
pub output: String,
|
||||
pub silent: bool,
|
||||
pub user_agent: String,
|
||||
}
|
||||
|
||||
const DEFAULT_USER_AGENT: &str =
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0";
|
||||
|
||||
impl AppArgs {
|
||||
pub fn get() -> AppArgs {
|
||||
let app = App::new("monolith")
|
||||
.version(crate_version!())
|
||||
.author(crate_authors!("\n"))
|
||||
.about(crate_description!())
|
||||
.arg(
|
||||
Arg::with_name("url")
|
||||
.required(true)
|
||||
.takes_value(true)
|
||||
.index(1)
|
||||
.help("URL to download"),
|
||||
)
|
||||
// .args_from_usage("-a, --include-audio 'Embed audio sources'")
|
||||
.args_from_usage("-c, --no-css 'Ignore styles'")
|
||||
.args_from_usage("-f, --no-frames 'Exclude iframes'")
|
||||
.args_from_usage("-i, --no-images 'Remove images'")
|
||||
.args_from_usage("-I, --isolate 'Cut off from the Internet'")
|
||||
.args_from_usage("-j, --no-js 'Exclude JavaScript'")
|
||||
.args_from_usage("-k, --insecure 'Accept invalid X.509 (TLS) certificates'")
|
||||
.args_from_usage("-o, --output=[document.html] 'Write output to <file>'")
|
||||
.args_from_usage("-s, --silent 'Suppress verbosity'")
|
||||
.args_from_usage("-u, --user-agent=[Iceweasel] 'Custom User-Agent string'")
|
||||
// .args_from_usage("-v, --include-video 'Embed video sources'")
|
||||
.get_matches();
|
||||
let mut app_args = AppArgs::default();
|
||||
// Process the command
|
||||
app_args.url_target = app
|
||||
.value_of("url")
|
||||
.expect("please set target url")
|
||||
.to_string();
|
||||
app_args.no_css = app.is_present("no-css");
|
||||
app_args.no_frames = app.is_present("no-frames");
|
||||
app_args.no_images = app.is_present("no-images");
|
||||
app_args.no_js = app.is_present("no-js");
|
||||
app_args.insecure = app.is_present("insecure");
|
||||
app_args.isolate = app.is_present("isolate");
|
||||
app_args.silent = app.is_present("silent");
|
||||
app_args.output = app.value_of("output").unwrap_or("").to_string();
|
||||
app_args.user_agent = app
|
||||
.value_of("user-agent")
|
||||
.unwrap_or_else(|| DEFAULT_USER_AGENT)
|
||||
.to_string();
|
||||
app_args
|
||||
}
|
||||
}
|
||||
788
src/html.rs
788
src/html.rs
@@ -1,99 +1,73 @@
|
||||
use html5ever::interface::QualName;
|
||||
use html5ever::parse_document;
|
||||
use html5ever::rcdom::{Handle, NodeData, RcDom};
|
||||
use html5ever::serialize::{serialize, SerializeOpts};
|
||||
use html5ever::tendril::TendrilSink;
|
||||
use http::{is_valid_url, resolve_url, retrieve_asset};
|
||||
use regex::Regex;
|
||||
use html5ever::tendril::{format_tendril, Tendril, TendrilSink};
|
||||
use html5ever::tree_builder::{Attribute, TreeSink};
|
||||
use html5ever::{local_name, namespace_url, ns};
|
||||
use http::retrieve_asset;
|
||||
use js::attr_is_event_handler;
|
||||
use reqwest::Client;
|
||||
use std::collections::HashMap;
|
||||
use std::default::Default;
|
||||
use std::io;
|
||||
use utils::data_to_dataurl;
|
||||
use utils::{data_to_dataurl, is_valid_url, resolve_css_imports, resolve_url, url_has_protocol};
|
||||
|
||||
lazy_static! {
|
||||
static ref EMPTY_STRING: String = String::new();
|
||||
static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();
|
||||
static ref ICON_VALUES: Regex = Regex::new(
|
||||
r"^icon|shortcut icon|mask-icon|apple-touch-icon$"
|
||||
).unwrap();
|
||||
}
|
||||
|
||||
const TRANSPARENT_PIXEL: &str = "data:image/png;base64,\
|
||||
iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=";
|
||||
|
||||
const JS_DOM_EVENT_ATTRS: [&str; 21] = [
|
||||
// Input
|
||||
"onfocus",
|
||||
"onblur",
|
||||
"onselect",
|
||||
"onchange",
|
||||
"onsubmit",
|
||||
"onreset",
|
||||
"onkeydown",
|
||||
"onkeypress",
|
||||
"onkeyup",
|
||||
// Mouse
|
||||
"onmouseover",
|
||||
"onmouseout",
|
||||
"onmousedown",
|
||||
"onmouseup",
|
||||
"onmousemove",
|
||||
// Click
|
||||
"onclick",
|
||||
"ondblclick",
|
||||
// Load
|
||||
"onload",
|
||||
"onunload",
|
||||
"onabort",
|
||||
"onerror",
|
||||
"onresize",
|
||||
const ICON_VALUES: [&str; 5] = [
|
||||
"icon",
|
||||
"shortcut icon",
|
||||
"mask-icon",
|
||||
"apple-touch-icon",
|
||||
"fluid-icon",
|
||||
];
|
||||
|
||||
fn get_parent_node_name(node: &Handle) -> String {
|
||||
let parent = node.parent.take().clone();
|
||||
let parent_node = parent.and_then(|node| node.upgrade()).unwrap();
|
||||
const TRANSPARENT_PIXEL: &str =
|
||||
"data:image/png;base64,\
|
||||
iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=";
|
||||
|
||||
match &parent_node.data {
|
||||
NodeData::Document => { EMPTY_STRING.clone() }
|
||||
NodeData::Doctype { .. } => { EMPTY_STRING.clone() }
|
||||
NodeData::Text { .. } => { EMPTY_STRING.clone() }
|
||||
NodeData::Comment { .. } => { EMPTY_STRING.clone() }
|
||||
NodeData::Element { ref name, attrs: _, .. } => {
|
||||
name.local.as_ref().to_string()
|
||||
}
|
||||
NodeData::ProcessingInstruction { .. } => unreachable!()
|
||||
pub fn get_parent_node(node: &Handle) -> Handle {
|
||||
let parent = node.parent.take().clone();
|
||||
parent.and_then(|node| node.upgrade()).unwrap()
|
||||
}
|
||||
|
||||
pub fn get_node_name(node: &Handle) -> String {
|
||||
match &node.data {
|
||||
NodeData::Element { ref name, .. } => name.local.as_ref().to_string(),
|
||||
_ => str!(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_icon(attr_value: &str) -> bool {
|
||||
ICON_VALUES.contains(&&*attr_value.to_lowercase())
|
||||
}
|
||||
|
||||
pub fn walk_and_embed_assets(
|
||||
cache: &mut HashMap<String, String>,
|
||||
client: &Client,
|
||||
url: &str,
|
||||
node: &Handle,
|
||||
opt_no_css: bool,
|
||||
opt_no_js: bool,
|
||||
opt_no_images: bool,
|
||||
opt_user_agent: &str,
|
||||
opt_silent: bool,
|
||||
opt_insecure: bool,
|
||||
opt_no_frames: bool,
|
||||
) {
|
||||
match node.data {
|
||||
NodeData::Document => {
|
||||
// Dig deeper
|
||||
for child in node.children.borrow().iter() {
|
||||
walk_and_embed_assets(
|
||||
&url, child,
|
||||
opt_no_js,
|
||||
opt_no_images,
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
);
|
||||
cache,
|
||||
client,
|
||||
&url,
|
||||
child,
|
||||
opt_no_css,
|
||||
opt_no_js,
|
||||
opt_no_images,
|
||||
opt_silent,
|
||||
opt_no_frames,
|
||||
);
|
||||
}
|
||||
}
|
||||
NodeData::Doctype { .. } => {}
|
||||
NodeData::Text { .. } => {}
|
||||
NodeData::Comment { .. } => {
|
||||
// Note: in case of opt_no_js being set to true, there's no need to worry about
|
||||
// getting rid of comments that may contain scripts, e.g. <!--[if IE]><script>...
|
||||
// since that's not part of W3C standard and therefore gets ignored
|
||||
// by browsers other than IE [5, 9]
|
||||
}
|
||||
NodeData::Element {
|
||||
ref name,
|
||||
ref attrs,
|
||||
@@ -103,7 +77,18 @@ pub fn walk_and_embed_assets(
|
||||
|
||||
match name.local.as_ref() {
|
||||
"link" => {
|
||||
let mut link_type = "";
|
||||
let mut link_type: &str = "";
|
||||
|
||||
// Remove integrity attributes
|
||||
let mut i = 0;
|
||||
while i < attrs_mut.len() {
|
||||
let attr_name = attrs_mut[i].name.local.as_ref();
|
||||
if attr_name.eq_ignore_ascii_case("integrity") {
|
||||
attrs_mut.remove(i);
|
||||
} else {
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "rel" {
|
||||
@@ -122,56 +107,71 @@ pub fn walk_and_embed_assets(
|
||||
if &attr.name.local == "href" {
|
||||
if opt_no_images {
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(TRANSPARENT_PIXEL);
|
||||
} else {
|
||||
let href_full_url: String = resolve_url(
|
||||
&url,
|
||||
&attr.value.to_string()
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let favicon_datauri = retrieve_asset(
|
||||
&href_full_url,
|
||||
true,
|
||||
"",
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let href_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(str!());
|
||||
let (favicon_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&href_full_url,
|
||||
true,
|
||||
"",
|
||||
opt_silent,
|
||||
)
|
||||
.unwrap_or((str!(), str!()));
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(favicon_datauri.as_str());
|
||||
attr.value.push_slice(favicon_dataurl.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if link_type == "stylesheet" {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "href" {
|
||||
let href_full_url: String = resolve_url(
|
||||
&url,
|
||||
&attr.value.to_string(),
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let css_datauri = retrieve_asset(
|
||||
if opt_no_css {
|
||||
attr.value.clear();
|
||||
} else {
|
||||
let href_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(str!());
|
||||
let replacement_text = match retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&href_full_url,
|
||||
true,
|
||||
false,
|
||||
"text/css",
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(css_datauri.as_str());
|
||||
) {
|
||||
// On successful retrieval, traverse CSS
|
||||
Ok((css_data, _)) => resolve_css_imports(
|
||||
cache,
|
||||
client,
|
||||
&css_data,
|
||||
true,
|
||||
&href_full_url,
|
||||
opt_no_images,
|
||||
opt_silent,
|
||||
),
|
||||
|
||||
// If a network error occurred, warn
|
||||
Err(e) => {
|
||||
eprintln!("Warning: {}", e,);
|
||||
|
||||
// If failed to resolve, replace with absolute URL
|
||||
href_full_url
|
||||
}
|
||||
};
|
||||
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(&replacement_text);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "href" {
|
||||
let href_full_url: String = resolve_url(
|
||||
&url,
|
||||
&attr.value.to_string(),
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let href_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(&href_full_url.as_str());
|
||||
}
|
||||
@@ -179,56 +179,80 @@ pub fn walk_and_embed_assets(
|
||||
}
|
||||
}
|
||||
"img" => {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "src" {
|
||||
if opt_no_images {
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(TRANSPARENT_PIXEL);
|
||||
} else {
|
||||
let src_full_url: String = resolve_url(
|
||||
&url,
|
||||
&attr.value.to_string(),
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let img_datauri = retrieve_asset(
|
||||
&src_full_url,
|
||||
true,
|
||||
"",
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(img_datauri.as_str());
|
||||
}
|
||||
// Find source tags
|
||||
let mut found_src: Option<Attribute> = None;
|
||||
let mut found_datasrc: Option<Attribute> = None;
|
||||
let mut i = 0;
|
||||
while i < attrs_mut.len() {
|
||||
let name = attrs_mut[i].name.local.as_ref();
|
||||
if name.eq_ignore_ascii_case("src") {
|
||||
found_src = Some(attrs_mut.remove(i));
|
||||
} else if name.eq_ignore_ascii_case("data-src") {
|
||||
found_datasrc = Some(attrs_mut.remove(i));
|
||||
} else {
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// If images are disabled, clear both sources
|
||||
if opt_no_images {
|
||||
attrs_mut.push(Attribute {
|
||||
name: QualName::new(None, ns!(), local_name!("src")),
|
||||
value: Tendril::from_slice(TRANSPARENT_PIXEL),
|
||||
});
|
||||
} else if let Some((dataurl, _)) = (&found_datasrc)
|
||||
.into_iter()
|
||||
.chain(&found_src) // Give dataurl priority
|
||||
.map(|attr| &attr.value)
|
||||
.filter(|src| !src.is_empty()) // Ignore empty srcs
|
||||
.next()
|
||||
.and_then(|src| resolve_url(&url, src).ok()) // Make absolute
|
||||
.and_then(|abs_src| // Download and convert to dataurl
|
||||
retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&abs_src,
|
||||
true,
|
||||
"",
|
||||
opt_silent,
|
||||
).ok())
|
||||
{
|
||||
// Add the new dataurl src attribute
|
||||
attrs_mut.push(Attribute {
|
||||
name: QualName::new(None, ns!(), local_name!("src")),
|
||||
value: Tendril::from_slice(dataurl.as_ref()),
|
||||
});
|
||||
}
|
||||
}
|
||||
"source" => {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "srcset" {
|
||||
if get_parent_node_name(&node) == "picture" {
|
||||
let attr_name: &str = &attr.name.local;
|
||||
|
||||
if attr_name == "src" {
|
||||
let src_full_url: String = resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(attr.value.to_string());
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(src_full_url.as_str());
|
||||
} else if attr_name == "srcset" {
|
||||
if get_node_name(&get_parent_node(&node)) == "picture" {
|
||||
if opt_no_images {
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(TRANSPARENT_PIXEL);
|
||||
} else {
|
||||
let srcset_full_url: String = resolve_url(
|
||||
&url,
|
||||
&attr.value.to_string(),
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let source_datauri = retrieve_asset(
|
||||
&srcset_full_url,
|
||||
true,
|
||||
"",
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let srcset_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(str!());
|
||||
let (source_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&srcset_full_url,
|
||||
true,
|
||||
"",
|
||||
opt_silent,
|
||||
)
|
||||
.unwrap_or((str!(), str!()));
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(source_datauri.as_str());
|
||||
attr.value.push_slice(source_dataurl.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -238,20 +262,31 @@ pub fn walk_and_embed_assets(
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "href" {
|
||||
// Don't touch email links or hrefs which begin with a hash sign
|
||||
if attr.value.starts_with('#') || has_protocol(&attr.value) {
|
||||
if attr.value.starts_with('#') || url_has_protocol(&attr.value) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let href_full_url: String = resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let href_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(href_full_url.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
"script" => {
|
||||
// Remove integrity attributes
|
||||
let mut i = 0;
|
||||
while i < attrs_mut.len() {
|
||||
let attr_name = attrs_mut[i].name.local.as_ref();
|
||||
if attr_name.eq_ignore_ascii_case("integrity") {
|
||||
attrs_mut.remove(i);
|
||||
} else {
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
if opt_no_js {
|
||||
// Get rid of src and inner content of SCRIPT tags
|
||||
// Empty src and inner content of SCRIPT tags
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "src" {
|
||||
attr.value.clear();
|
||||
@@ -261,22 +296,42 @@ pub fn walk_and_embed_assets(
|
||||
} else {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "src" {
|
||||
let src_full_url: String = resolve_url(
|
||||
&url,
|
||||
&attr.value.to_string(),
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let js_datauri = retrieve_asset(
|
||||
&src_full_url,
|
||||
true,
|
||||
"application/javascript",
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let src_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
|
||||
let (js_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&src_full_url,
|
||||
true,
|
||||
"application/javascript",
|
||||
opt_silent,
|
||||
)
|
||||
.unwrap_or((str!(), str!()));
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(js_datauri.as_str());
|
||||
attr.value.push_slice(js_dataurl.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
"style" => {
|
||||
if opt_no_css {
|
||||
// Empty inner content of STYLE tags
|
||||
node.children.borrow_mut().clear();
|
||||
} else {
|
||||
for node in node.children.borrow_mut().iter_mut() {
|
||||
if let NodeData::Text { ref contents } = node.data {
|
||||
let mut tendril = contents.borrow_mut();
|
||||
let replacement = resolve_css_imports(
|
||||
cache,
|
||||
client,
|
||||
tendril.as_ref(),
|
||||
false,
|
||||
&url,
|
||||
opt_no_images,
|
||||
opt_silent,
|
||||
);
|
||||
tendril.clear();
|
||||
tendril.push_slice(&replacement);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -284,90 +339,167 @@ pub fn walk_and_embed_assets(
|
||||
"form" => {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "action" {
|
||||
// Do not touch action props which are set to a URL
|
||||
if is_valid_url(&attr.value) {
|
||||
continue;
|
||||
// Modify action to be a full URL
|
||||
if !is_valid_url(&attr.value) {
|
||||
let href_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(href_full_url.as_str());
|
||||
}
|
||||
|
||||
let href_full_url: String = resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(href_full_url.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
"iframe" => {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "src" {
|
||||
let value = attr.value.to_string();
|
||||
// Ignore iframes with empty source (they cause infinite loops)
|
||||
if value == EMPTY_STRING.clone() {
|
||||
if opt_no_frames {
|
||||
// Empty the src attribute
|
||||
attr.value.clear();
|
||||
continue;
|
||||
}
|
||||
|
||||
let iframe_src: String = attr.value.to_string();
|
||||
|
||||
let src_full_url: String = resolve_url(&url, &value)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let iframe_data = retrieve_asset(
|
||||
&src_full_url,
|
||||
false,
|
||||
"text/html",
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
// Ignore iframes with empty source (they cause infinite loops)
|
||||
if iframe_src == str!() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let src_full_url: String =
|
||||
resolve_url(&url, &iframe_src).unwrap_or(str!());
|
||||
let (iframe_data, iframe_final_url) = retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&src_full_url,
|
||||
false,
|
||||
"text/html",
|
||||
opt_silent,
|
||||
)
|
||||
.unwrap_or((str!(), src_full_url));
|
||||
let dom = html_to_dom(&iframe_data);
|
||||
walk_and_embed_assets(
|
||||
&src_full_url,
|
||||
&dom.document,
|
||||
opt_no_js,
|
||||
opt_no_images,
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
);
|
||||
cache,
|
||||
client,
|
||||
&iframe_final_url,
|
||||
&dom.document,
|
||||
opt_no_css,
|
||||
opt_no_js,
|
||||
opt_no_images,
|
||||
opt_silent,
|
||||
opt_no_frames,
|
||||
);
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
|
||||
let iframe_datauri = data_to_dataurl("text/html", &buf);
|
||||
let iframe_dataurl = data_to_dataurl("text/html", &buf);
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(iframe_datauri.as_str());
|
||||
attr.value.push_slice(iframe_dataurl.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
"video" => {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "poster" {
|
||||
let video_poster = attr.value.to_string();
|
||||
|
||||
// Skip posters with empty source
|
||||
if video_poster == str!() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if opt_no_images {
|
||||
attr.value.clear();
|
||||
} else {
|
||||
let poster_full_url: String =
|
||||
resolve_url(&url, &video_poster).unwrap_or(str!());
|
||||
let (poster_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&poster_full_url,
|
||||
true,
|
||||
"",
|
||||
opt_silent,
|
||||
)
|
||||
.unwrap_or((poster_full_url, str!()));
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(poster_dataurl.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
// Process style attributes
|
||||
if opt_no_css {
|
||||
// Get rid of style attributes
|
||||
let mut style_attr_indexes = Vec::new();
|
||||
for (i, attr) in attrs_mut.iter_mut().enumerate() {
|
||||
if attr.name.local.to_lowercase() == "style" {
|
||||
style_attr_indexes.push(i);
|
||||
}
|
||||
}
|
||||
style_attr_indexes.reverse();
|
||||
for attr_index in style_attr_indexes {
|
||||
attrs_mut.remove(attr_index);
|
||||
}
|
||||
} else {
|
||||
// Otherwise, parse any links found in the attributes
|
||||
for attribute in attrs_mut
|
||||
.iter_mut()
|
||||
.filter(|a| a.name.local.as_ref().eq_ignore_ascii_case("style"))
|
||||
{
|
||||
let replacement = resolve_css_imports(
|
||||
cache,
|
||||
client,
|
||||
attribute.value.as_ref(),
|
||||
false,
|
||||
&url,
|
||||
opt_no_images,
|
||||
opt_silent,
|
||||
);
|
||||
attribute.value.clear();
|
||||
attribute.value.push_slice(&replacement);
|
||||
}
|
||||
}
|
||||
|
||||
if opt_no_js {
|
||||
// Get rid of JS event attributes
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if JS_DOM_EVENT_ATTRS.contains(&attr.name.local.to_lowercase().as_str()) {
|
||||
attr.value.clear();
|
||||
let mut js_attr_indexes = Vec::new();
|
||||
for (i, attr) in attrs_mut.iter_mut().enumerate() {
|
||||
if attr_is_event_handler(&attr.name.local) {
|
||||
js_attr_indexes.push(i);
|
||||
}
|
||||
}
|
||||
js_attr_indexes.reverse();
|
||||
for attr_index in js_attr_indexes {
|
||||
attrs_mut.remove(attr_index);
|
||||
}
|
||||
}
|
||||
|
||||
// Dig deeper
|
||||
for child in node.children.borrow().iter() {
|
||||
walk_and_embed_assets(
|
||||
&url,
|
||||
child,
|
||||
opt_no_js,
|
||||
opt_no_images,
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
);
|
||||
cache,
|
||||
client,
|
||||
&url,
|
||||
child,
|
||||
opt_no_css,
|
||||
opt_no_js,
|
||||
opt_no_images,
|
||||
opt_silent,
|
||||
opt_no_frames,
|
||||
);
|
||||
}
|
||||
}
|
||||
NodeData::ProcessingInstruction { .. } => unreachable!()
|
||||
_ => {
|
||||
// Note: in case of opt_no_js being set to true, there's no need to worry about
|
||||
// getting rid of comments that may contain scripts, e.g. <!--[if IE]><script>...
|
||||
// since that's not part of W3C standard and therefore gets ignored
|
||||
// by browsers other than IE [5, 9]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn has_protocol(url: &str) -> bool {
|
||||
HAS_PROTOCOL.is_match(&url.to_lowercase())
|
||||
}
|
||||
|
||||
pub fn html_to_dom(data: &str) -> html5ever::rcdom::RcDom {
|
||||
parse_document(RcDom::default(), Default::default())
|
||||
.from_utf8()
|
||||
@@ -375,157 +507,85 @@ pub fn html_to_dom(data: &str) -> html5ever::rcdom::RcDom {
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
pub fn print_dom(handle: &Handle) {
|
||||
serialize(&mut io::stdout(), handle, SerializeOpts::default()).unwrap();
|
||||
}
|
||||
|
||||
fn is_icon(attr_value: &str) -> bool {
|
||||
ICON_VALUES.is_match(&attr_value.to_lowercase())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_is_icon() {
|
||||
assert_eq!(is_icon("icon"), true);
|
||||
assert_eq!(is_icon("Shortcut Icon"), true);
|
||||
assert_eq!(is_icon("ICON"), true);
|
||||
assert_eq!(is_icon("stylesheet"), false);
|
||||
assert_eq!(is_icon(""), false);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_has_protocol() {
|
||||
assert_eq!(has_protocol("mailto:somebody@somewhere.com?subject=hello"), true);
|
||||
assert_eq!(has_protocol("tel:5551234567"), true);
|
||||
assert_eq!(has_protocol("ftp:user:password@some-ftp-server.com"), true);
|
||||
assert_eq!(has_protocol("javascript:void(0)"), true);
|
||||
assert_eq!(has_protocol("http://news.ycombinator.com"), true);
|
||||
assert_eq!(has_protocol("https://github.com"), true);
|
||||
assert_eq!(has_protocol("//some-hostname.com/some-file.html"), false);
|
||||
assert_eq!(has_protocol("some-hostname.com/some-file.html"), false);
|
||||
assert_eq!(has_protocol("/some-file.html"), false);
|
||||
assert_eq!(has_protocol(""), false);
|
||||
assert_eq!(has_protocol("MAILTO:somebody@somewhere.com?subject=hello"), true);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_get_parent_node_name() {
|
||||
let html = "<!doctype html><html><HEAD></HEAD><body><div><P></P></div></body></html>";
|
||||
let dom = html_to_dom(&html);
|
||||
let mut count = 0;
|
||||
|
||||
fn test_walk(node: &Handle, i: &mut i8) {
|
||||
*i += 1;
|
||||
|
||||
match &node.data {
|
||||
NodeData::Document => {
|
||||
for child in node.children.borrow().iter() {
|
||||
test_walk(child, &mut *i);
|
||||
}
|
||||
}
|
||||
NodeData::Doctype { .. } => (),
|
||||
NodeData::Text { .. } => (),
|
||||
NodeData::Comment { .. } => (),
|
||||
NodeData::Element { ref name, attrs: _, .. } => {
|
||||
let node_name = name.local.as_ref().to_string();
|
||||
let parent_node_name = get_parent_node_name(node);
|
||||
if node_name == "head" || node_name == "body" {
|
||||
assert_eq!(parent_node_name, "html");
|
||||
} else if node_name == "div" {
|
||||
assert_eq!(parent_node_name, "body");
|
||||
} else if node_name == "p" {
|
||||
assert_eq!(parent_node_name, "div");
|
||||
}
|
||||
|
||||
println!("{}", node_name);
|
||||
|
||||
for child in node.children.borrow().iter() {
|
||||
test_walk(child, &mut *i);
|
||||
}
|
||||
}
|
||||
NodeData::ProcessingInstruction { .. } => unreachable!()
|
||||
};
|
||||
fn get_child_node_by_name(handle: &Handle, node_name: &str) -> Handle {
|
||||
let children = handle.children.borrow();
|
||||
let matching_children = children.iter().find(|child| match child.data {
|
||||
NodeData::Element { ref name, .. } => &*name.local == node_name,
|
||||
_ => false,
|
||||
});
|
||||
match matching_children {
|
||||
Some(node) => node.clone(),
|
||||
_ => {
|
||||
return handle.clone();
|
||||
}
|
||||
|
||||
test_walk(&dom.document, &mut count);
|
||||
|
||||
assert_eq!(count, 7);
|
||||
}
|
||||
|
||||
#[test]
fn test_walk_and_embed_assets() {
    let html = "<div><P></P></div>";
    let dom = html_to_dom(html);
    let url = "http://localhost";

    walk_and_embed_assets(url, &dom.document, true, true, "", true, true);

    let mut buf: Vec<u8> = Vec::new();
    serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();

    // Decode the serialized bytes as UTF-8; casting each byte to a char would
    // mangle any multi-byte sequence.
    assert_eq!(
        String::from_utf8(buf).unwrap(),
        "<html><head></head><body><div><p></p></div></body></html>"
    );
}
|
||||
|
||||
#[test]
fn test_walk_and_embed_assets_iframe() {
    let html = "<div><P></P><iframe src=\"\"></iframe></div>";
    let dom = html_to_dom(html);
    let url = "http://localhost";

    walk_and_embed_assets(url, &dom.document, true, true, "", true, true);

    let mut buf: Vec<u8> = Vec::new();
    serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();

    // Decode the serialized bytes as UTF-8; casting each byte to a char would
    // mangle any multi-byte sequence.
    assert_eq!(
        String::from_utf8(buf).unwrap(),
        "<html><head></head><body><div><p></p><iframe src=\"\"></iframe></div></body></html>"
    );
}
|
||||
|
||||
#[test]
fn test_walk_and_embed_assets_img() {
    let html = "<div><img src=\"http://localhost/assets/mono_lisa.png\" /></div>";
    let dom = html_to_dom(html);
    let url = "http://localhost";

    walk_and_embed_assets(url, &dom.document, true, true, "", true, true);

    let mut buf: Vec<u8> = Vec::new();
    serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();

    // Decode the serialized bytes as UTF-8; casting each byte to a char would
    // mangle any multi-byte sequence.
    assert_eq!(
        String::from_utf8(buf).unwrap(),
        "<html><head></head><body><div>\
         <img src=\"data:image/png;base64,\
         iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0\
         lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=\">\
         </div></body></html>"
    );
}
|
||||
|
||||
#[test]
fn test_walk_and_embed_assets_js() {
    let html = "<div><script src=\"http://localhost/assets/some.js\"></script>\
                <script>alert(1)</script></div>";
    let dom = html_to_dom(html);
    let url = "http://localhost";

    walk_and_embed_assets(url, &dom.document, true, true, "", true, true);

    let mut buf: Vec<u8> = Vec::new();
    serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();

    // Decode the serialized bytes as UTF-8; casting each byte to a char would
    // mangle any multi-byte sequence.
    assert_eq!(
        String::from_utf8(buf).unwrap(),
        "<html><head></head><body><div><script src=\"\"></script>\
         <script></script></div></body></html>"
    );
}
|
||||
}
|
||||
|
||||
pub fn stringify_document(
|
||||
handle: &Handle,
|
||||
opt_no_css: bool,
|
||||
opt_no_frames: bool,
|
||||
opt_no_js: bool,
|
||||
opt_no_images: bool,
|
||||
opt_isolate: bool,
|
||||
) -> String {
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
serialize(&mut buf, handle, SerializeOpts::default())
|
||||
.expect("unable to serialize DOM into buffer");
|
||||
|
||||
let mut result: String = String::from_utf8(buf).unwrap();
|
||||
|
||||
if opt_isolate || opt_no_css || opt_no_frames || opt_no_js || opt_no_images {
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
let mut dom = html_to_dom(&result);
|
||||
let doc = dom.get_document();
|
||||
let html = get_child_node_by_name(&doc, "html");
|
||||
let head = get_child_node_by_name(&html, "head");
|
||||
let mut content_attr = str!();
|
||||
if opt_isolate {
|
||||
content_attr += " default-src 'unsafe-inline' data:;";
|
||||
}
|
||||
if opt_no_css {
|
||||
content_attr += " style-src 'none';";
|
||||
}
|
||||
if opt_no_frames {
|
||||
content_attr += " frame-src 'none';child-src 'none';";
|
||||
}
|
||||
if opt_no_js {
|
||||
content_attr += " script-src 'none';";
|
||||
}
|
||||
if opt_no_images {
|
||||
content_attr += " img-src data:;";
|
||||
}
|
||||
content_attr = content_attr.trim().to_string();
|
||||
|
||||
let meta = dom.create_element(
|
||||
QualName::new(None, ns!(), local_name!("meta")),
|
||||
vec![
|
||||
Attribute {
|
||||
name: QualName::new(None, ns!(), local_name!("http-equiv")),
|
||||
value: format_tendril!("Content-Security-Policy"),
|
||||
},
|
||||
Attribute {
|
||||
name: QualName::new(None, ns!(), local_name!("content")),
|
||||
value: format_tendril!("{}", content_attr),
|
||||
},
|
||||
],
|
||||
Default::default(),
|
||||
);
|
||||
head.children.borrow_mut().reverse();
|
||||
head.children.borrow_mut().push(meta.clone());
|
||||
head.children.borrow_mut().reverse();
|
||||
// Note: the CSP meta-tag has to be prepended, never appended,
|
||||
// since there already may be one defined in the document,
|
||||
// and browsers don't allow re-defining them (for obvious reasons)
|
||||
|
||||
serialize(&mut buf, &doc, SerializeOpts::default())
|
||||
.expect("unable to serialize DOM into buffer");
|
||||
result = String::from_utf8(buf).unwrap();
|
||||
// Note: we can't make it isolate the page right away since it may have no HEAD element,
|
||||
// ergo we have to serialize, parse DOM again, and then finally serialize the result
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
222
src/http.rs
222
src/http.rs
@@ -1,179 +1,67 @@
|
||||
use regex::Regex;
|
||||
use reqwest::header::CONTENT_TYPE;
|
||||
use reqwest::Client;
|
||||
use reqwest::header::{CONTENT_TYPE, USER_AGENT};
|
||||
use std::time::Duration;
|
||||
use url::{ParseError, Url};
|
||||
use utils::data_to_dataurl;
|
||||
|
||||
lazy_static! {
|
||||
static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();
|
||||
}
|
||||
|
||||
pub fn is_data_url(url: &str) -> Result<bool, ParseError> {
|
||||
match Url::parse(url) {
|
||||
Ok(parsed_url) => Ok(parsed_url.scheme() == "data"),
|
||||
Err(err) => Err(err),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_valid_url(path: &str) -> bool {
|
||||
REGEX_URL.is_match(path)
|
||||
}
|
||||
|
||||
pub fn resolve_url(from: &str, to: &str) -> Result<String, ParseError> {
|
||||
let result = if is_valid_url(to) {
|
||||
// (anything, http://site.com/css/main.css)
|
||||
to.to_string()
|
||||
} else {
|
||||
Url::parse(from)?.join(to)?.to_string()
|
||||
};
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
use std::collections::HashMap;
|
||||
use utils::{clean_url, data_to_dataurl, is_data_url};
|
||||
|
||||
pub fn retrieve_asset(
|
||||
cache: &mut HashMap<String, String>,
|
||||
client: &Client,
|
||||
url: &str,
|
||||
as_dataurl: bool,
|
||||
as_mime: &str,
|
||||
opt_user_agent: &str,
|
||||
mime: &str,
|
||||
opt_silent: bool,
|
||||
opt_insecure: bool,
|
||||
) -> Result<String, reqwest::Error> {
|
||||
if is_data_url(&url).unwrap() {
|
||||
Ok(url.to_string())
|
||||
} else {
|
||||
let client = Client::builder()
|
||||
.timeout(Duration::from_secs(10))
|
||||
.danger_accept_invalid_certs(opt_insecure)
|
||||
.build()?;
|
||||
let mut response = client
|
||||
.get(url)
|
||||
.header(USER_AGENT, opt_user_agent)
|
||||
.send()?;
|
||||
let final_url = response.url().as_str();
|
||||
) -> Result<(String, String), reqwest::Error> {
|
||||
let cache_key = clean_url(&url);
|
||||
|
||||
if !opt_silent {
|
||||
if url == final_url {
|
||||
eprintln!("[ {} ]", &url);
|
||||
if is_data_url(&url).unwrap() {
|
||||
Ok((url.to_string(), url.to_string()))
|
||||
} else {
|
||||
if cache.contains_key(&cache_key) {
|
||||
// url is in cache
|
||||
if !opt_silent {
|
||||
eprintln!("{} (from cache)", &url);
|
||||
}
|
||||
let data = cache.get(&cache_key).unwrap();
|
||||
Ok((data.to_string(), url.to_string()))
|
||||
} else {
|
||||
// url not in cache, we request it
|
||||
let mut response = client.get(url).send()?;
|
||||
|
||||
if !opt_silent {
|
||||
if url == response.url().as_str() {
|
||||
eprintln!("{}", &url);
|
||||
} else {
|
||||
eprintln!("{} -> {}", &url, &response.url().as_str());
|
||||
}
|
||||
}
|
||||
|
||||
let new_cache_key = clean_url(response.url().to_string());
|
||||
|
||||
if as_dataurl {
|
||||
// Convert response into a byte array
|
||||
let mut data: Vec<u8> = vec![];
|
||||
response.copy_to(&mut data)?;
|
||||
|
||||
// Attempt to obtain MIME type by reading the Content-Type header
|
||||
let mimetype = if mime == "" {
|
||||
response
|
||||
.headers()
|
||||
.get(CONTENT_TYPE)
|
||||
.and_then(|header| header.to_str().ok())
|
||||
.unwrap_or(&mime)
|
||||
} else {
|
||||
mime
|
||||
};
|
||||
let dataurl = data_to_dataurl(&mimetype, &data);
|
||||
// insert in cache
|
||||
cache.insert(new_cache_key, dataurl.to_string());
|
||||
Ok((dataurl, response.url().to_string()))
|
||||
} else {
|
||||
eprintln!("[ {} -> {} ]", &url, &final_url);
|
||||
let content = response.text().unwrap();
|
||||
// insert in cache
|
||||
cache.insert(new_cache_key, content.clone());
|
||||
Ok((content, response.url().to_string()))
|
||||
}
|
||||
}
|
||||
|
||||
if as_dataurl {
|
||||
// Convert response into a byte array
|
||||
let mut data: Vec<u8> = vec![];
|
||||
response.copy_to(&mut data)?;
|
||||
|
||||
// Attempt to obtain MIME type by reading the Content-Type header
|
||||
let mimetype = if as_mime == "" {
|
||||
response
|
||||
.headers()
|
||||
.get(CONTENT_TYPE)
|
||||
.and_then(|header| header.to_str().ok())
|
||||
.unwrap_or(&as_mime)
|
||||
} else {
|
||||
as_mime
|
||||
};
|
||||
|
||||
Ok(data_to_dataurl(&mimetype, &data))
|
||||
} else {
|
||||
Ok(response.text().unwrap())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Only `http://` and `https://` URLs count as valid.
    #[test]
    fn test_is_valid_url() {
        let accepted = ["https://www.rust-lang.org/", "http://kernel.org"];
        for url in accepted.iter() {
            assert!(is_valid_url(url));
        }

        let rejected = [
            "./index.html",
            "some-local-page.htm",
            "ftp://1.2.3.4/www/index.html",
            "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h",
        ];
        for url in rejected.iter() {
            assert!(!is_valid_url(url));
        }
    }

    /// Relative, absolute, protocol-relative and root-relative references
    /// must all resolve against the base as expected.
    #[test]
    fn test_resolve_url() -> Result<(), ParseError> {
        // (base, reference, expected result)
        let cases = [
            (
                "https://www.kernel.org",
                "../category/signatures.html",
                "https://www.kernel.org/category/signatures.html",
            ),
            (
                "https://www.kernel.org",
                "category/signatures.html",
                "https://www.kernel.org/category/signatures.html",
            ),
            (
                "saved_page.htm",
                "https://www.kernel.org/category/signatures.html",
                "https://www.kernel.org/category/signatures.html",
            ),
            (
                "https://www.kernel.org",
                "//www.kernel.org/theme/images/logos/tux.png",
                "https://www.kernel.org/theme/images/logos/tux.png",
            ),
            (
                "https://www.kernel.org",
                "//another-host.org/theme/images/logos/tux.png",
                "https://another-host.org/theme/images/logos/tux.png",
            ),
            (
                "https://www.kernel.org/category/signatures.html",
                "/theme/images/logos/tux.png",
                "https://www.kernel.org/theme/images/logos/tux.png",
            ),
            (
                "https://www.w3schools.com/html/html_iframe.asp",
                "default.asp",
                "https://www.w3schools.com/html/default.asp",
            ),
        ];

        for &(from, to, expected) in cases.iter() {
            let resolved_url = resolve_url(from, to)?;
            assert_eq!(resolved_url.as_str(), expected);
        }

        Ok(())
    }

    /// Only URLs with the `data` scheme are data URLs; unparsable input is
    /// treated as "not a data URL" here via `unwrap_or(false)`.
    #[test]
    fn test_is_data_url() {
        let data_url = "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h";
        assert!(is_data_url(data_url).unwrap_or(false));

        for url in ["https://kernel.org", "//kernel.org"].iter() {
            assert!(!is_data_url(url).unwrap_or(false));
        }
    }
}
|
||||
|
||||
32
src/js.rs
Normal file
32
src/js.rs
Normal file
@@ -0,0 +1,32 @@
|
||||
/// Lower-case names of the DOM attributes treated as inline JavaScript event
/// handlers.
const JS_DOM_EVENT_ATTRS: [&str; 21] = [
    // Input
    "onfocus",
    "onblur",
    "onselect",
    "onchange",
    "onsubmit",
    "onreset",
    "onkeydown",
    "onkeypress",
    "onkeyup",
    // Mouse
    "onmouseover",
    "onmouseout",
    "onmousedown",
    "onmouseup",
    "onmousemove",
    // Click
    "onclick",
    "ondblclick",
    // Load
    "onload",
    "onunload",
    "onabort",
    "onerror",
    "onresize",
];

/// Returns true if DOM attribute name matches a native JavaScript event handler.
///
/// The comparison is done on the lower-cased name, so `onClick` and `ONCLICK`
/// both match `onclick`.
pub fn attr_is_event_handler(attr_name: &str) -> bool {
    let lowered = attr_name.to_lowercase();
    JS_DOM_EVENT_ATTRS
        .iter()
        .any(|&known| known == lowered.as_str())
}
|
||||
@@ -1,10 +1,17 @@
|
||||
extern crate html5ever;
|
||||
#[macro_use]
|
||||
extern crate lazy_static;
|
||||
extern crate html5ever;
|
||||
extern crate regex;
|
||||
extern crate reqwest;
|
||||
extern crate url;
|
||||
|
||||
#[macro_use]
|
||||
mod macros;
|
||||
|
||||
pub mod html;
|
||||
pub mod http;
|
||||
pub mod js;
|
||||
pub mod utils;
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests;
|
||||
|
||||
9
src/macros.rs
Normal file
9
src/macros.rs
Normal file
@@ -0,0 +1,9 @@
|
||||
/// Shorthand for building owned strings.
///
/// * `str!()` expands to an empty `String`.
/// * `str!(value)` expands to `ToString::to_string(&value)`; the value is
///   only borrowed, so it stays usable afterwards.
#[macro_export]
macro_rules! str {
    () => {
        ::std::string::String::new()
    };
    ($value: expr) => {
        ::std::string::ToString::to_string(&$value)
    };
}
|
||||
135
src/main.rs
135
src/main.rs
@@ -1,63 +1,104 @@
|
||||
#[macro_use]
|
||||
extern crate clap;
|
||||
extern crate monolith;
|
||||
extern crate reqwest;
|
||||
|
||||
use clap::{App, Arg};
|
||||
use monolith::html::{html_to_dom, print_dom, walk_and_embed_assets};
|
||||
use monolith::http::{is_valid_url, retrieve_asset};
|
||||
mod args;
|
||||
mod macros;
|
||||
|
||||
static DEFAULT_USER_AGENT: &str =
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0";
|
||||
use args::AppArgs;
|
||||
use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets};
|
||||
use monolith::http::retrieve_asset;
|
||||
use monolith::utils::is_valid_url;
|
||||
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
||||
use std::collections::HashMap;
|
||||
use std::fs::{remove_file, File};
|
||||
use std::io::{Error, Write};
|
||||
use std::time::Duration;
|
||||
|
||||
/// Creates (or truncates) the file at `file_path` and writes `content` to it
/// followed by a newline, syncing the data to disk.
///
/// When `content` is empty the freshly created file is removed again, so a
/// call with empty content only checks that the path can be created.
///
/// # Errors
///
/// Returns the underlying I/O error if the file cannot be created, written,
/// synced or removed.
fn create_file(file_path: &String, content: String) -> Result<(), Error> {
    let mut file = File::create(file_path)?;

    if content.is_empty() {
        // Remove the file right away if it had no content; close the handle
        // first so no open descriptor lingers on the deleted file.
        drop(file);
        return remove_file(file_path);
    }

    file.write_all(content.as_bytes())?;
    file.write_all(b"\n")?;
    file.sync_all()?;

    Ok(())
}
|
||||
|
||||
fn main() {
|
||||
let command = App::new("monolith")
|
||||
.version(crate_version!())
|
||||
.author(crate_authors!("\n"))
|
||||
.about(crate_description!())
|
||||
.arg(
|
||||
Arg::with_name("url")
|
||||
.required(true)
|
||||
.takes_value(true)
|
||||
.index(1)
|
||||
.help("URL to download"),
|
||||
let app_args = AppArgs::get();
|
||||
let cache = &mut HashMap::new();
|
||||
|
||||
// Attempt to create output file
|
||||
if app_args.output != str!() {
|
||||
create_file(&app_args.output, str!()).unwrap();
|
||||
}
|
||||
|
||||
if is_valid_url(app_args.url_target.as_str()) {
|
||||
// Initialize client
|
||||
let mut header_map = HeaderMap::new();
|
||||
match HeaderValue::from_str(&app_args.user_agent) {
|
||||
Ok(header) => header_map.insert(USER_AGENT, header),
|
||||
Err(err) => {
|
||||
eprintln!("Invalid user agent! {}", err);
|
||||
return;
|
||||
}
|
||||
};
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(10))
|
||||
.danger_accept_invalid_certs(app_args.insecure)
|
||||
.default_headers(header_map)
|
||||
.build()
|
||||
.expect("Failed to initialize HTTP client");
|
||||
|
||||
// Retrieve root document
|
||||
let (data, final_url) = retrieve_asset(
|
||||
cache,
|
||||
&client,
|
||||
app_args.url_target.as_str(),
|
||||
false,
|
||||
"",
|
||||
app_args.silent,
|
||||
)
|
||||
.args_from_usage("-i, --no-images 'Removes images'")
|
||||
.args_from_usage("-j, --no-js 'Excludes JavaScript'")
|
||||
.args_from_usage("-k, --insecure 'Accept invalid X.509 (TLS) certificates'")
|
||||
.args_from_usage("-s, --silent 'Suppress verbosity'")
|
||||
.args_from_usage("-u, --user-agent=[Iceweasel] 'Custom User-Agent string'")
|
||||
.get_matches();
|
||||
|
||||
// Process the command
|
||||
let arg_target = command.value_of("url").unwrap();
|
||||
let opt_no_images = command.is_present("no-images");
|
||||
let opt_no_js = command.is_present("no-js");
|
||||
let opt_insecure = command.is_present("insecure");
|
||||
let opt_silent = command.is_present("silent");
|
||||
let opt_user_agent = command.value_of("user-agent").unwrap_or(DEFAULT_USER_AGENT);
|
||||
|
||||
if is_valid_url(arg_target) {
|
||||
let data = retrieve_asset(
|
||||
&arg_target,
|
||||
false,
|
||||
"",
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
).unwrap();
|
||||
.unwrap();
|
||||
let dom = html_to_dom(&data);
|
||||
|
||||
walk_and_embed_assets(
|
||||
&arg_target,
|
||||
cache,
|
||||
&client,
|
||||
&final_url,
|
||||
&dom.document,
|
||||
opt_no_js,
|
||||
opt_no_images,
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
app_args.no_css,
|
||||
app_args.no_js,
|
||||
app_args.no_images,
|
||||
app_args.silent,
|
||||
app_args.no_frames,
|
||||
);
|
||||
|
||||
print_dom(&dom.document);
|
||||
println!(); // Ensure newline at end of output
|
||||
let html: String = stringify_document(
|
||||
&dom.document,
|
||||
app_args.no_css,
|
||||
app_args.no_frames,
|
||||
app_args.no_js,
|
||||
app_args.no_images,
|
||||
app_args.isolate,
|
||||
);
|
||||
|
||||
if app_args.output == str!() {
|
||||
println!("{}", html);
|
||||
} else {
|
||||
create_file(&app_args.output, html).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
518
src/tests/html.rs
Normal file
518
src/tests/html.rs
Normal file
@@ -0,0 +1,518 @@
|
||||
use crate::html::{
|
||||
get_node_name, get_parent_node, html_to_dom, is_icon, stringify_document, walk_and_embed_assets,
|
||||
};
|
||||
use html5ever::rcdom::{Handle, NodeData};
|
||||
use html5ever::serialize::{serialize, SerializeOpts};
|
||||
use std::collections::HashMap;
|
||||
|
||||
#[test]
fn test_is_icon() {
    // Icon relations, including vendor-specific variants, in any letter case.
    let icons = ["icon", "Shortcut Icon", "ICON", "mask-icon", "fluid-icon"];
    for value in icons.iter() {
        assert_eq!(is_icon(value), true);
    }

    // Unrelated or empty values must be rejected.
    let non_icons = ["stylesheet", ""];
    for value in non_icons.iter() {
        assert_eq!(is_icon(value), false);
    }
}
|
||||
|
||||
#[test]
fn test_get_parent_node_name() {
    let html = "<!doctype html><html><HEAD></HEAD><body><div><P></P></div></body></html>";
    let dom = html_to_dom(html);
    let mut count = 0;

    // Visits the document node and every element below it, counting each
    // visited node and checking the parent name of the known elements.
    fn test_walk(node: &Handle, visited: &mut i8) {
        *visited += 1;

        if let NodeData::Element { ref name, .. } = node.data {
            let node_name = name.local.as_ref().to_string();
            let parent_node_name = get_node_name(&get_parent_node(node));

            match node_name.as_str() {
                "head" | "body" => assert_eq!(parent_node_name, "html"),
                "div" => assert_eq!(parent_node_name, "body"),
                "p" => assert_eq!(parent_node_name, "div"),
                _ => (),
            }

            println!("{}", node_name);
        }

        // Only the document node and elements are descended into.
        match node.data {
            NodeData::Document | NodeData::Element { .. } => {
                for child in node.children.borrow().iter() {
                    test_walk(child, visited);
                }
            }
            _ => (),
        }
    }

    test_walk(&dom.document, &mut count);

    assert_eq!(count, 7);
}
|
||||
|
||||
#[test]
fn test_walk_and_embed_assets() {
    let cache = &mut HashMap::new();
    let client = reqwest::Client::new();

    let dom = html_to_dom("<div><P></P></div>");
    let url = "http://localhost";

    // Every filtering option is off; only progress output is silenced.
    let opt_no_css = false;
    let opt_no_frames = false;
    let opt_no_js = false;
    let opt_no_images = false;
    let opt_silent = true;

    walk_and_embed_assets(
        cache,
        &client,
        url,
        &dom.document,
        opt_no_css,
        opt_no_js,
        opt_no_images,
        opt_silent,
        opt_no_frames,
    );

    let mut buf: Vec<u8> = Vec::new();
    serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();

    // Decode the serialized bytes as UTF-8; casting each byte to a char would
    // mangle any multi-byte sequence.
    assert_eq!(
        String::from_utf8(buf).unwrap(),
        "<html><head></head><body><div><p></p></div></body></html>"
    );
}
|
||||
|
||||
#[test]
fn test_walk_and_embed_assets_ensure_no_recursive_iframe() {
    let cache = &mut HashMap::new();
    let client = reqwest::Client::new();

    // An iframe with an empty `src` must be left untouched.
    let dom = html_to_dom("<div><P></P><iframe src=\"\"></iframe></div>");
    let url = "http://localhost";

    let opt_no_css = false;
    let opt_no_frames = false;
    let opt_no_js = false;
    let opt_no_images = false;
    let opt_silent = true;

    walk_and_embed_assets(
        cache,
        &client,
        url,
        &dom.document,
        opt_no_css,
        opt_no_js,
        opt_no_images,
        opt_silent,
        opt_no_frames,
    );

    let mut buf: Vec<u8> = Vec::new();
    serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();

    // Decode the serialized bytes as UTF-8; casting each byte to a char would
    // mangle any multi-byte sequence.
    assert_eq!(
        String::from_utf8(buf).unwrap(),
        "<html><head></head><body><div><p></p><iframe src=\"\"></iframe></div></body></html>"
    );
}
|
||||
|
||||
#[test]
fn test_walk_and_embed_assets_no_css() {
    let cache = &mut HashMap::new();
    let client = reqwest::Client::new();

    let html = "<link rel=\"stylesheet\" href=\"main.css\">\
                <style>html{background-color: #000;}</style>\
                <div style=\"display: none;\"></div>";
    let dom = html_to_dom(html);
    let url = "http://localhost";

    // Only CSS removal is enabled.
    let opt_no_css = true;
    let opt_no_frames = false;
    let opt_no_js = false;
    let opt_no_images = false;
    let opt_silent = true;

    walk_and_embed_assets(
        cache,
        &client,
        url,
        &dom.document,
        opt_no_css,
        opt_no_js,
        opt_no_images,
        opt_silent,
        opt_no_frames,
    );

    let mut buf: Vec<u8> = Vec::new();
    serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();

    // Decode the serialized bytes as UTF-8; casting each byte to a char would
    // mangle any multi-byte sequence.
    assert_eq!(
        String::from_utf8(buf).unwrap(),
        "<html>\
         <head>\
         <link rel=\"stylesheet\" href=\"\">\
         <style></style>\
         </head>\
         <body>\
         <div></div>\
         </body>\
         </html>"
    );
}
|
||||
|
||||
#[test]
fn test_walk_and_embed_assets_no_images() {
    let cache = &mut HashMap::new();
    let client = reqwest::Client::new();

    let html = "<link rel=\"icon\" href=\"favicon.ico\">\
                <div><img src=\"http://localhost/assets/mono_lisa.png\" /></div>";
    let dom = html_to_dom(html);
    let url = "http://localhost";

    // Only image removal is enabled.
    let opt_no_css = false;
    let opt_no_frames = false;
    let opt_no_js = false;
    let opt_no_images = true;
    let opt_silent = true;

    walk_and_embed_assets(
        cache,
        &client,
        url,
        &dom.document,
        opt_no_css,
        opt_no_js,
        opt_no_images,
        opt_silent,
        opt_no_frames,
    );

    let mut buf: Vec<u8> = Vec::new();
    serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();

    // Decode the serialized bytes as UTF-8; casting each byte to a char would
    // mangle any multi-byte sequence.
    assert_eq!(
        String::from_utf8(buf).unwrap(),
        "<html>\
         <head>\
         <link rel=\"icon\" href=\"\">\
         </head>\
         <body>\
         <div>\
         <img src=\"data:image/png;base64,\
         iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0\
         lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=\">\
         </div>\
         </body>\
         </html>"
    );
}
|
||||
|
||||
#[test]
fn test_walk_and_embed_assets_no_frames() {
    let cache = &mut HashMap::new();
    let client = reqwest::Client::new();

    let dom = html_to_dom("<iframe src=\"http://trackbook.com\"></iframe>");
    let url = "http://localhost";

    // Only frame removal is enabled.
    let opt_no_css = false;
    let opt_no_frames = true;
    let opt_no_js = false;
    let opt_no_images = false;
    let opt_silent = true;

    walk_and_embed_assets(
        cache,
        &client,
        url,
        &dom.document,
        opt_no_css,
        opt_no_js,
        opt_no_images,
        opt_silent,
        opt_no_frames,
    );

    let mut buf: Vec<u8> = Vec::new();
    serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();

    // Decode the serialized bytes as UTF-8; casting each byte to a char would
    // mangle any multi-byte sequence.
    assert_eq!(
        String::from_utf8(buf).unwrap(),
        "<html><head></head><body><iframe src=\"\"></iframe></body></html>"
    );
}
|
||||
|
||||
#[test]
fn test_walk_and_embed_assets_no_js() {
    let cache = &mut HashMap::new();
    let client = reqwest::Client::new();

    let html = "<div onClick=\"void(0)\">\
                <script src=\"http://localhost/assets/some.js\"></script>\
                <script>alert(1)</script>\
                </div>";
    let dom = html_to_dom(html);
    let url = "http://localhost";

    // Only JavaScript removal is enabled.
    let opt_no_css = false;
    let opt_no_frames = false;
    let opt_no_js = true;
    let opt_no_images = false;
    let opt_silent = true;

    walk_and_embed_assets(
        cache,
        &client,
        url,
        &dom.document,
        opt_no_css,
        opt_no_js,
        opt_no_images,
        opt_silent,
        opt_no_frames,
    );

    let mut buf: Vec<u8> = Vec::new();
    serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();

    // Decode the serialized bytes as UTF-8; casting each byte to a char would
    // mangle any multi-byte sequence.
    assert_eq!(
        String::from_utf8(buf).unwrap(),
        "<html><head></head><body><div><script src=\"\"></script>\
         <script></script></div></body></html>"
    );
}
|
||||
|
||||
#[test]
fn test_walk_and_embed_with_no_integrity() {
    let cache = &mut HashMap::new();
    let client = reqwest::Client::new();

    let html = "<title>No integrity</title>\
                <link integrity=\"sha384-...\" rel=\"something\"/>\
                <script integrity=\"sha384-...\" src=\"some.js\"></script>";
    let dom = html_to_dom(html);
    let url = "http://localhost";

    // Every filtering option is enabled.
    let opt_no_css = true;
    let opt_no_frames = true;
    let opt_no_js = true;
    let opt_no_images = true;
    let opt_silent = true;

    walk_and_embed_assets(
        cache,
        &client,
        url,
        &dom.document,
        opt_no_css,
        opt_no_js,
        opt_no_images,
        opt_silent,
        opt_no_frames,
    );

    let mut buf: Vec<u8> = Vec::new();
    serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();

    // Decode the serialized bytes as UTF-8; casting each byte to a char would
    // mangle any multi-byte sequence.
    assert_eq!(
        String::from_utf8(buf).unwrap(),
        "<html>\
         <head><title>No integrity</title><link rel=\"something\"><script src=\"\"></script></head>\
         <body></body>\
         </html>"
    );
}
|
||||
|
||||
#[test]
fn test_stringify_document() {
    let dom = html_to_dom("<div><script src=\"some.js\"></script></div>");

    // With every option disabled the document is serialized as-is.
    let (opt_no_css, opt_no_frames, opt_no_js, opt_no_images, opt_isolate) =
        (false, false, false, false, false);

    let rendered = stringify_document(
        &dom.document,
        opt_no_css,
        opt_no_frames,
        opt_no_js,
        opt_no_images,
        opt_isolate,
    );

    assert_eq!(
        rendered,
        "<html><head></head><body><div><script src=\"some.js\"></script></div></body></html>"
    );
}
|
||||
|
||||
#[test]
fn test_stringify_document_isolate() {
    let html = "<title>Isolated document</title>\
                <link rel=\"something\" href=\"some.css\" />\
                <meta http-equiv=\"Content-Security-Policy\" content=\"default-src https:\">\
                <div><script src=\"some.js\"></script></div>";
    let dom = html_to_dom(html);

    // Only isolation is enabled; the new CSP tag must come before the existing one.
    let (opt_no_css, opt_no_frames, opt_no_js, opt_no_images, opt_isolate) =
        (false, false, false, false, true);

    let rendered = stringify_document(
        &dom.document,
        opt_no_css,
        opt_no_frames,
        opt_no_js,
        opt_no_images,
        opt_isolate,
    );

    assert_eq!(
        rendered,
        "<html>\
         <head>\
         <meta http-equiv=\"Content-Security-Policy\" content=\"default-src 'unsafe-inline' data:;\"></meta>\
         <title>Isolated document</title>\
         <link rel=\"something\" href=\"some.css\">\
         <meta http-equiv=\"Content-Security-Policy\" content=\"default-src https:\">\
         </head>\
         <body>\
         <div>\
         <script src=\"some.js\"></script>\
         </div>\
         </body>\
         </html>"
    );
}
|
||||
|
||||
#[test]
fn test_stringify_document_no_css() {
    let html = "<!doctype html>\
                <title>Unstyled document</title>\
                <link rel=\"stylesheet\" href=\"main.css\"/>\
                <div style=\"display: none;\"></div>";
    let dom = html_to_dom(html);

    // Only the no-CSS policy is requested.
    let (opt_no_css, opt_no_frames, opt_no_js, opt_no_images, opt_isolate) =
        (true, false, false, false, false);

    let rendered = stringify_document(
        &dom.document,
        opt_no_css,
        opt_no_frames,
        opt_no_js,
        opt_no_images,
        opt_isolate,
    );

    assert_eq!(
        rendered,
        "<!DOCTYPE html>\
         <html>\
         <head>\
         <meta http-equiv=\"Content-Security-Policy\" content=\"style-src 'none';\"></meta>\
         <title>Unstyled document</title>\
         <link rel=\"stylesheet\" href=\"main.css\">\
         </head>\
         <body><div style=\"display: none;\"></div></body>\
         </html>"
    );
}
|
||||
|
||||
#[test]
fn test_stringify_document_no_frames() {
    let source = "<!doctype html>\
                  <title>Frameless document</title>\
                  <link rel=\"something\"/>\
                  <div><script src=\"some.js\"></script></div>";
    let dom = html_to_dom(&source);

    // Flag order: no_css, no_frames, no_js, no_images, isolate.
    let rendered = stringify_document(&dom.document, false, true, false, false, false);

    // The frame restriction is expected as a CSP <meta> at the top of <head>.
    let expected = "<!DOCTYPE html>\
                    <html>\
                    <head>\
                    <meta http-equiv=\"Content-Security-Policy\" content=\"frame-src 'none';child-src 'none';\"></meta>\
                    <title>Frameless document</title>\
                    <link rel=\"something\">\
                    </head>\
                    <body><div><script src=\"some.js\"></script></div></body>\
                    </html>";
    assert_eq!(rendered, expected);
}
|
||||
|
||||
#[test]
fn test_stringify_document_isolate_no_frames_no_js_no_css_no_images() {
    let source = "<!doctype html>\
                  <title>no-frame no-css no-js no-image isolated document</title>\
                  <meta http-equiv=\"Content-Security-Policy\" content=\"default-src https:\">\
                  <link rel=\"stylesheet\" href=\"some.css\">\
                  <div>\
                  <script src=\"some.js\"></script>\
                  <img style=\"width: 100%;\" src=\"some.png\" />\
                  <iframe src=\"some.html\"></iframe>\
                  </div>";
    let dom = html_to_dom(&source);

    // Every restriction switched on at once.
    // Flag order: no_css, no_frames, no_js, no_images, isolate.
    let rendered = stringify_document(&dom.document, true, true, true, true, true);

    // All directives are expected to be merged into a single policy tag.
    let expected = "<!DOCTYPE html>\
                    <html>\
                    <head>\
                    <meta http-equiv=\"Content-Security-Policy\" content=\"default-src \'unsafe-inline\' data:; style-src \'none\'; frame-src \'none\';child-src \'none\'; script-src \'none\'; img-src data:;\"></meta>\
                    <title>no-frame no-css no-js no-image isolated document</title>\
                    <meta http-equiv=\"Content-Security-Policy\" content=\"default-src https:\">\
                    <link rel=\"stylesheet\" href=\"some.css\">\
                    </head>\
                    <body>\
                    <div>\
                    <script src=\"some.js\"></script>\
                    <img style=\"width: 100%;\" src=\"some.png\">\
                    <iframe src=\"some.html\"></iframe>\
                    </div>\
                    </body>\
                    </html>";
    assert_eq!(rendered, expected);
}
|
||||
23
src/tests/http.rs
Normal file
23
src/tests/http.rs
Normal file
@@ -0,0 +1,23 @@
|
||||
use crate::http::retrieve_asset;
|
||||
use std::collections::HashMap;
|
||||
#[test]
fn test_retrieve_asset() {
    let data_url = "data:text/html;base64,...";
    let mut cache = HashMap::new();
    let client = reqwest::Client::new();

    // A data URL is expected to come back untouched, both as the payload and
    // as the final URL, with or without a MIME type hint.
    for &mime in ["", "image/png"].iter() {
        let (data, final_url) =
            retrieve_asset(&mut cache, &client, data_url, true, mime, false).unwrap();
        assert_eq!(&data, data_url);
        assert_eq!(&final_url, data_url);
    }
}
|
||||
13
src/tests/js.rs
Normal file
13
src/tests/js.rs
Normal file
@@ -0,0 +1,13 @@
|
||||
use crate::js::attr_is_event_handler;
|
||||
|
||||
#[test]
fn test_attr_is_event_handler() {
    // Attribute names that must be recognised as event handlers.
    for &name in ["onBlur", "onclick", "onClick"].iter() {
        assert!(attr_is_event_handler(name));
    }

    // Attribute names that must be rejected.
    for &name in ["href", "", "class"].iter() {
        assert!(!attr_is_event_handler(name));
    }
}
|
||||
4
src/tests/mod.rs
Normal file
4
src/tests/mod.rs
Normal file
@@ -0,0 +1,4 @@
|
||||
mod html;
|
||||
mod http;
|
||||
mod js;
|
||||
mod utils;
|
||||
177
src/tests/utils.rs
Normal file
177
src/tests/utils.rs
Normal file
@@ -0,0 +1,177 @@
|
||||
use crate::utils::{
|
||||
clean_url, data_to_dataurl, detect_mimetype, is_data_url, is_valid_url, resolve_url,
|
||||
url_has_protocol,
|
||||
};
|
||||
use url::ParseError;
|
||||
|
||||
#[test]
fn test_data_to_dataurl() {
    // An explicit MIME type is passed, so it should appear verbatim in the
    // result ahead of the base64 payload.
    let script = "var word = 'hello';\nalert(word);\n";
    let encoded = data_to_dataurl("application/javascript", script.as_bytes());

    assert_eq!(
        &encoded,
        "data:application/javascript;base64,dmFyIHdvcmQgPSAnaGVsbG8nOwphbGVydCh3b3JkKTsK"
    );
}
|
||||
|
||||
#[test]
fn test_detect_mimetype() {
    // (leading bytes, expected MIME type)
    let cases: [(&[u8], &str); 19] = [
        // image
        (b"GIF87a", "image/gif"),
        (b"GIF89a", "image/gif"),
        (b"\xFF\xD8\xFF", "image/jpeg"),
        (b"\x89PNG\x0D\x0A\x1A\x0A", "image/png"),
        (b"<?xml ", "image/svg+xml"),
        (b"<svg ", "image/svg+xml"),
        (b"RIFF....WEBPVP8 ", "image/webp"),
        (b"\x00\x00\x01\x00", "image/x-icon"),
        // audio
        (b"ID3", "audio/mpeg"),
        (b"\xFF\x0E", "audio/mpeg"),
        (b"\xFF\x0F", "audio/mpeg"),
        (b"OggS", "audio/ogg"),
        (b"RIFF....WAVEfmt ", "audio/wav"),
        (b"fLaC", "audio/x-flac"),
        // video
        (b"RIFF....AVI LIST", "video/avi"),
        (b"....ftyp", "video/mp4"),
        (b"\x00\x00\x01\x0B", "video/mpeg"),
        (b"....moov", "video/quicktime"),
        (b"\x1A\x45\xDF\xA3", "video/webm"),
    ];

    for &(magic, mime) in cases.iter() {
        assert_eq!(detect_mimetype(magic), mime);
    }
}
|
||||
|
||||
#[test]
fn test_url_has_protocol() {
    // Inputs that start with a scheme, including an upper-case one.
    let with_protocol = [
        "mailto:somebody@somewhere.com?subject=hello",
        "tel:5551234567",
        "ftp:user:password@some-ftp-server.com",
        "javascript:void(0)",
        "http://news.ycombinator.com",
        "https://github.com",
        "MAILTO:somebody@somewhere.com?subject=hello",
    ];
    for &url in with_protocol.iter() {
        assert!(url_has_protocol(url));
    }

    // Scheme-relative, host-only, path-only and empty inputs.
    let without_protocol = [
        "//some-hostname.com/some-file.html",
        "some-hostname.com/some-file.html",
        "/some-file.html",
        "",
    ];
    for &url in without_protocol.iter() {
        assert!(!url_has_protocol(url));
    }
}
|
||||
|
||||
#[test]
fn test_is_valid_url() {
    // Only http:// and https:// URLs are expected to be accepted.
    for &url in ["https://www.rust-lang.org/", "http://kernel.org"].iter() {
        assert!(is_valid_url(url));
    }

    // Relative references and other schemes are expected to be rejected.
    let rejected = [
        "//kernel.org",
        "./index.html",
        "some-local-page.htm",
        "ftp://1.2.3.4/www/index.html",
        "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h",
    ];
    for &url in rejected.iter() {
        assert!(!is_valid_url(url));
    }
}
|
||||
|
||||
#[test]
fn test_resolve_url() -> Result<(), ParseError> {
    // (base, reference, expected absolute URL)
    let cases = [
        (
            "https://www.kernel.org",
            "../category/signatures.html",
            "https://www.kernel.org/category/signatures.html",
        ),
        (
            "https://www.kernel.org",
            "category/signatures.html",
            "https://www.kernel.org/category/signatures.html",
        ),
        // An absolute reference wins even when the base is not a URL at all.
        (
            "saved_page.htm",
            "https://www.kernel.org/category/signatures.html",
            "https://www.kernel.org/category/signatures.html",
        ),
        // Scheme-relative references inherit the scheme of the base.
        (
            "https://www.kernel.org",
            "//www.kernel.org/theme/images/logos/tux.png",
            "https://www.kernel.org/theme/images/logos/tux.png",
        ),
        (
            "https://www.kernel.org",
            "//another-host.org/theme/images/logos/tux.png",
            "https://another-host.org/theme/images/logos/tux.png",
        ),
        (
            "https://www.kernel.org/category/signatures.html",
            "/theme/images/logos/tux.png",
            "https://www.kernel.org/theme/images/logos/tux.png",
        ),
        (
            "https://www.w3schools.com/html/html_iframe.asp",
            "default.asp",
            "https://www.w3schools.com/html/default.asp",
        ),
    ];

    for &(from, to, expected) in cases.iter() {
        let resolved_url = resolve_url(from, to)?;
        assert_eq!(resolved_url.as_str(), expected);
    }

    Ok(())
}
|
||||
|
||||
#[test]
fn test_is_data_url() {
    // A parse error is treated the same as "not a data URL" throughout.
    let data_url = "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h";
    assert!(is_data_url(data_url).unwrap_or(false));

    for &url in ["https://kernel.org", "//kernel.org", ""].iter() {
        assert!(!is_data_url(url).unwrap_or(false));
    }
}
|
||||
|
||||
#[test]
fn test_clean_url() {
    // A fragment, an empty fragment, and an empty query plus empty fragment
    // must all reduce to the same bare URL.
    let expected = "https://somewhere.com/font.eot";
    let inputs = [
        "https://somewhere.com/font.eot#iefix",
        "https://somewhere.com/font.eot#",
        "https://somewhere.com/font.eot?#",
    ];

    for &input in inputs.iter() {
        assert_eq!(clean_url(input), expected);
    }
}
|
||||
209
src/utils.rs
209
src/utils.rs
@@ -1,8 +1,50 @@
|
||||
extern crate base64;
|
||||
|
||||
use self::base64::encode;
|
||||
use http::retrieve_asset;
|
||||
use regex::Regex;
|
||||
use reqwest::Client;
|
||||
use std::collections::HashMap;
|
||||
use url::{ParseError, Url};
|
||||
|
||||
static MAGIC: [[&[u8]; 2]; 19] = [
|
||||
/// This monster of a regex is used to match any kind of URL found in CSS.
|
||||
///
|
||||
/// There are roughly three different categories that a found URL could fit
|
||||
/// into:
|
||||
/// - Font [found after a src: property in an @font-face rule]
|
||||
/// - Stylesheet [denoted by an @import before the url]
|
||||
/// - Image [covers all other uses of the url() function]
|
||||
///
|
||||
/// This regex aims to extract the following information:
|
||||
/// - What type of URL is it (font/image/css)
|
||||
/// - Where is the part that needs to be replaced (incl any wrapping quotes)
|
||||
/// - What is the URL (excl any wrapping quotes)
|
||||
///
|
||||
/// Essentially, the regex can be broken down into two parts:
|
||||
///
|
||||
/// `(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?`
|
||||
/// This matches the precursor to a font or CSS URL, and fills in a match under
|
||||
/// either `<stylesheet>` (if it's a CSS URL) or `<font>` (if it's a font).
|
||||
/// Determining whether or not it's an image can be done by the negation of both
|
||||
/// of these. Either zero or one of these can match.
|
||||
///
|
||||
/// `url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)`
|
||||
/// This matches the actual URL part of the url(), and must always match. It also
|
||||
/// sets `<to_repl>` and `<url>` which correspond to everything within
|
||||
/// `url(...)` and a usable URL, respectively.
|
||||
///
|
||||
/// Note, however, that this does not perform any validation of the found URL.
|
||||
/// Malformed CSS could lead to an invalid URL being present. It is therefore
|
||||
/// recommended that the URL gets manually validated.
|
||||
const CSS_URL_REGEX_STR: &str = r###"(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)"###;
|
||||
|
||||
lazy_static! {
|
||||
static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();
|
||||
static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();
|
||||
static ref REGEX_CSS_URL: Regex = Regex::new(CSS_URL_REGEX_STR).unwrap();
|
||||
}
|
||||
|
||||
const MAGIC: [[&[u8]; 2]; 19] = [
|
||||
// Image
|
||||
[b"GIF87a", b"image/gif"],
|
||||
[b"GIF89a", b"image/gif"],
|
||||
@@ -28,7 +70,7 @@ static MAGIC: [[&[u8]; 2]; 19] = [
|
||||
];
|
||||
|
||||
pub fn data_to_dataurl(mime: &str, data: &[u8]) -> String {
|
||||
let mimetype = if mime == "" {
|
||||
let mimetype = if mime.is_empty() {
|
||||
detect_mimetype(data)
|
||||
} else {
|
||||
mime.to_string()
|
||||
@@ -36,57 +78,132 @@ pub fn data_to_dataurl(mime: &str, data: &[u8]) -> String {
|
||||
format!("data:{};base64,{}", mimetype, encode(data))
|
||||
}
|
||||
|
||||
fn detect_mimetype(data: &[u8]) -> String {
|
||||
let mut re = String::new();
|
||||
|
||||
for item in MAGIC.iter() {
|
||||
pub fn detect_mimetype(data: &[u8]) -> String {
|
||||
for item in MAGIC.iter() {
|
||||
if data.starts_with(item[0]) {
|
||||
re = String::from_utf8(item[1].to_vec()).unwrap();
|
||||
break;
|
||||
return String::from_utf8(item[1].to_vec()).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
re
|
||||
"".to_owned()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
pub fn url_has_protocol<T: AsRef<str>>(url: T) -> bool {
|
||||
HAS_PROTOCOL.is_match(url.as_ref().to_lowercase().as_str())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_data_to_dataurl() {
|
||||
let mime = "application/javascript";
|
||||
let data = "var word = 'hello';\nalert(word);\n";
|
||||
let datauri = data_to_dataurl(mime, data.as_bytes());
|
||||
assert_eq!(
|
||||
&datauri,
|
||||
"data:application/javascript;base64,dmFyIHdvcmQgPSAnaGVsbG8nOwphbGVydCh3b3JkKTsK"
|
||||
);
|
||||
pub fn is_data_url<T: AsRef<str>>(url: T) -> Result<bool, ParseError> {
|
||||
Url::parse(url.as_ref()).and_then(|u| Ok(u.scheme() == "data"))
|
||||
}
|
||||
|
||||
pub fn is_valid_url<T: AsRef<str>>(path: T) -> bool {
|
||||
REGEX_URL.is_match(path.as_ref())
|
||||
}
|
||||
|
||||
pub fn resolve_url<T: AsRef<str>, U: AsRef<str>>(from: T, to: U) -> Result<String, ParseError> {
|
||||
let result = if is_valid_url(to.as_ref()) {
|
||||
to.as_ref().to_string()
|
||||
} else {
|
||||
Url::parse(from.as_ref())?
|
||||
.join(to.as_ref())?
|
||||
.as_ref()
|
||||
.to_string()
|
||||
};
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub fn resolve_css_imports(
|
||||
cache: &mut HashMap<String, String>,
|
||||
client: &Client,
|
||||
css_string: &str,
|
||||
as_dataurl: bool,
|
||||
href: &str,
|
||||
opt_no_images: bool,
|
||||
opt_silent: bool,
|
||||
) -> String {
|
||||
let mut resolved_css = String::from(css_string);
|
||||
|
||||
for link in REGEX_CSS_URL.captures_iter(&css_string) {
|
||||
let target_link = link.name("url").unwrap().as_str();
|
||||
|
||||
// Determine the type of link
|
||||
let is_stylesheet = link.name("stylesheet").is_some();
|
||||
let is_font = link.name("font").is_some();
|
||||
let is_image = !is_stylesheet && !is_font;
|
||||
|
||||
// Generate absolute URL for content
|
||||
let embedded_url = match resolve_url(href, target_link) {
|
||||
Ok(url) => url,
|
||||
Err(_) => continue, // Malformed URL
|
||||
};
|
||||
|
||||
// Download the asset. If it's more CSS, resolve that too
|
||||
let content = if is_stylesheet {
|
||||
// The link is an @import link
|
||||
retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&embedded_url,
|
||||
false, // Formating as data URL will be done later
|
||||
"text/css", // Expect CSS
|
||||
opt_silent,
|
||||
)
|
||||
.map(|(content, _)| {
|
||||
resolve_css_imports(
|
||||
cache,
|
||||
client,
|
||||
&content,
|
||||
true, // Finally, convert to a dataurl
|
||||
&embedded_url,
|
||||
opt_no_images,
|
||||
opt_silent,
|
||||
)
|
||||
})
|
||||
} else if (is_image && !opt_no_images) || is_font {
|
||||
// The link is some other, non-@import link
|
||||
retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&embedded_url,
|
||||
true, // Format as data URL
|
||||
"", // Unknown MIME type
|
||||
opt_silent,
|
||||
)
|
||||
.map(|(a, _)| a)
|
||||
} else {
|
||||
// If it's a datatype that has been opt_no'd out of, replace with
|
||||
// absolute URL
|
||||
|
||||
Ok(embedded_url.clone())
|
||||
}
|
||||
.unwrap_or_else(|e| {
|
||||
eprintln!("Warning: {}", e);
|
||||
|
||||
// If failed to resolve, replace with absolute URL
|
||||
embedded_url
|
||||
});
|
||||
|
||||
let replacement = format!("\"{}\"", &content);
|
||||
let dest = link.name("to_repl").unwrap();
|
||||
let offset = resolved_css.len() - css_string.len();
|
||||
let target_range = (dest.start() + offset)..(dest.end() + offset);
|
||||
|
||||
resolved_css.replace_range(target_range, &replacement);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_detect_mimetype() {
|
||||
// Image
|
||||
assert_eq!(detect_mimetype(b"GIF87a"), "image/gif");
|
||||
assert_eq!(detect_mimetype(b"GIF89a"), "image/gif");
|
||||
assert_eq!(detect_mimetype(b"\xFF\xD8\xFF"), "image/jpeg");
|
||||
assert_eq!(detect_mimetype(b"\x89PNG\x0D\x0A\x1A\x0A"), "image/png");
|
||||
assert_eq!(detect_mimetype(b"<?xml "), "image/svg+xml");
|
||||
assert_eq!(detect_mimetype(b"<svg "), "image/svg+xml");
|
||||
assert_eq!(detect_mimetype(b"RIFF....WEBPVP8 "), "image/webp");
|
||||
assert_eq!(detect_mimetype(b"\x00\x00\x01\x00"), "image/x-icon");
|
||||
// Audio
|
||||
assert_eq!(detect_mimetype(b"ID3"), "audio/mpeg");
|
||||
assert_eq!(detect_mimetype(b"\xFF\x0E"), "audio/mpeg");
|
||||
assert_eq!(detect_mimetype(b"\xFF\x0F"), "audio/mpeg");
|
||||
assert_eq!(detect_mimetype(b"OggS"), "audio/ogg");
|
||||
assert_eq!(detect_mimetype(b"RIFF....WAVEfmt "), "audio/wav");
|
||||
assert_eq!(detect_mimetype(b"fLaC"), "audio/x-flac");
|
||||
// Video
|
||||
assert_eq!(detect_mimetype(b"RIFF....AVI LIST"), "video/avi");
|
||||
assert_eq!(detect_mimetype(b"....ftyp"), "video/mp4");
|
||||
assert_eq!(detect_mimetype(b"\x00\x00\x01\x0B"), "video/mpeg");
|
||||
assert_eq!(detect_mimetype(b"....moov"), "video/quicktime");
|
||||
assert_eq!(detect_mimetype(b"\x1A\x45\xDF\xA3"), "video/webm");
|
||||
if as_dataurl {
|
||||
data_to_dataurl("text/css", resolved_css.as_bytes())
|
||||
} else {
|
||||
resolved_css
|
||||
}
|
||||
}
|
||||
|
||||
pub fn clean_url<T: AsRef<str>>(url: T) -> String {
|
||||
let mut result = Url::parse(url.as_ref()).unwrap();
|
||||
// Clear fragment
|
||||
result.set_fragment(None);
|
||||
// Get rid of stray question mark
|
||||
if result.query() == Some("") {
|
||||
result.set_query(None);
|
||||
}
|
||||
result.to_string()
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user