Compare commits
118 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f876e9243c | ||
|
|
b6896febf1 | ||
|
|
29d2ba5857 | ||
|
|
8b1ebc7871 | ||
|
|
d753c83c76 | ||
|
|
47a825f5ed | ||
|
|
0e12cecd85 | ||
|
|
d8def879b2 | ||
|
|
0420854ed6 | ||
|
|
d47482fcd9 | ||
|
|
b68624f2f3 | ||
|
|
a9d114d04d | ||
|
|
4e4ebe9c98 | ||
|
|
429217d8f7 | ||
|
|
1779f4a374 | ||
|
|
26e89ae6d3 | ||
|
|
b333d19d04 | ||
|
|
c1dc798ded | ||
|
|
69d99b69e8 | ||
|
|
aae53d20f0 | ||
|
|
14cf2ce8a6 | ||
|
|
67b79e92f9 | ||
|
|
b51f41fe34 | ||
|
|
6f158dc6db | ||
|
|
8d7052b39c | ||
|
|
660511b8a0 | ||
|
|
929512f4f5 | ||
|
|
a46d89cefc | ||
|
|
f93646e17a | ||
|
|
9d14b6dfea | ||
|
|
9783b96524 | ||
|
|
106efe58ce | ||
|
|
6e99ad13e7 | ||
|
|
413dd66886 | ||
|
|
dc7ec6e7a8 | ||
|
|
ed879231af | ||
|
|
ddf4b8ac13 | ||
|
|
84c13f0605 | ||
|
|
ce03e0e487 | ||
|
|
63e19998d0 | ||
|
|
e3321bbb07 | ||
|
|
0a38cd0eae | ||
|
|
75fb6961ed | ||
|
|
5ba8931502 | ||
|
|
13d2ea1607 | ||
|
|
88ffde0c3b | ||
|
|
bfb97bd062 | ||
|
|
295931041c | ||
|
|
2e623dd9f8 | ||
|
|
169b9657e5 | ||
|
|
dab4ae6965 | ||
|
|
c7fc121c7c | ||
|
|
9ff9dd0928 | ||
|
|
b0fc24d77f | ||
|
|
d8abfaf25c | ||
|
|
565acdef25 | ||
|
|
4637fed15c | ||
|
|
9a7ea4fdde | ||
|
|
3d4a932ac1 | ||
|
|
cf70383165 | ||
|
|
9fe913d853 | ||
|
|
862489e41b | ||
|
|
919e626b5e | ||
|
|
cf347e0483 | ||
|
|
322ab41b8c | ||
|
|
1a7336e809 | ||
|
|
65d0eab793 | ||
|
|
292221ea28 | ||
|
|
614af44c92 | ||
|
|
feb37f5812 | ||
|
|
028beb821c | ||
|
|
76ccff80f9 | ||
|
|
45335d7507 | ||
|
|
a4743ca92f | ||
|
|
b96a777e8a | ||
|
|
4decea716c | ||
|
|
695a787206 | ||
|
|
90e6cb1c45 | ||
|
|
7412d663e0 | ||
|
|
8646af6e9f | ||
|
|
de383c94b1 | ||
|
|
ab65b44f0d | ||
|
|
13bacb4320 | ||
|
|
d574e9a5da | ||
|
|
1de0fc0961 | ||
|
|
ebbf755e09 | ||
|
|
d3956a7905 | ||
|
|
ef7ddcd434 | ||
|
|
11bbfc0851 | ||
|
|
a2bf7e3345 | ||
|
|
35f5e1353d | ||
|
|
f8040f4d8c | ||
|
|
31d3fee626 | ||
|
|
178abd07bd | ||
|
|
491185e191 | ||
|
|
b0c55d5016 | ||
|
|
1ff5e91087 | ||
|
|
550e4cc83f | ||
|
|
5443c0cc3f | ||
|
|
8add3a8746 | ||
|
|
2f592d5561 | ||
|
|
55fe523a1c | ||
|
|
b5d42bd722 | ||
|
|
cbf3b66f33 | ||
|
|
2e48ea90e1 | ||
|
|
9c006f3258 | ||
|
|
ab24851b5b | ||
|
|
de11559efa | ||
|
|
dbacd76103 | ||
|
|
0896f2e214 | ||
|
|
b6ba22513d | ||
|
|
3948ea3aa0 | ||
|
|
8b3f3f3a6e | ||
|
|
eec05767cf | ||
|
|
c05dc2ae65 | ||
|
|
88a230872c | ||
|
|
ac79a52da0 | ||
|
|
04cbbefafa |
@@ -95,6 +95,7 @@ environment:
|
||||
matrix:
|
||||
allow_failures:
|
||||
- channel: nightly
|
||||
- channel: beta
|
||||
|
||||
# If you only care about stable channel build failures, uncomment the following line:
|
||||
#- channel: beta
|
||||
@@ -114,6 +115,7 @@ install:
|
||||
- if defined MINGW_PATH set PATH=%PATH%;%MINGW_PATH%
|
||||
- rustc -vV
|
||||
- cargo -vV
|
||||
- rustup component add rustfmt
|
||||
|
||||
## Build Script ##
|
||||
|
||||
@@ -125,4 +127,4 @@ build: false
|
||||
#directly or perform other testing commands. Rust will automatically be placed in the PATH
|
||||
# environment variable.
|
||||
test_script:
|
||||
- cargo test --verbose %cargoflags%
|
||||
- cargo test --verbose %cargoflags%
|
||||
7
.gitignore
vendored
7
.gitignore
vendored
@@ -2,9 +2,8 @@
|
||||
# will have compiled files and executables
|
||||
/target/
|
||||
|
||||
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
|
||||
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
|
||||
Cargo.lock
|
||||
|
||||
# These are backup files generated by rustfmt
|
||||
**/*.rs.bk
|
||||
|
||||
# Exclude accidental HTML files
|
||||
*.html
|
||||
|
||||
26
.travis.yml
26
.travis.yml
@@ -4,14 +4,26 @@ cache: cargo
|
||||
sudo: false
|
||||
|
||||
os:
|
||||
- linux
|
||||
- osx
|
||||
- linux
|
||||
- osx
|
||||
|
||||
rust:
|
||||
- stable
|
||||
- beta
|
||||
- nightly
|
||||
- stable
|
||||
- beta
|
||||
- nightly
|
||||
|
||||
before_script:
|
||||
- rustup component add rustfmt
|
||||
|
||||
script:
|
||||
- cargo build --verbose
|
||||
- cargo test --verbose
|
||||
- cargo build --all --locked --verbose
|
||||
- cargo test --all --locked --verbose
|
||||
- |
|
||||
if [[ "$TRAVIS_RUST_VERSION" == "stable" ]]; then
|
||||
cargo fmt --all -- --check
|
||||
fi
|
||||
|
||||
jobs:
|
||||
allow_failures:
|
||||
- rust: nightly
|
||||
fast_finish: true
|
||||
|
||||
1547
Cargo.lock
generated
Normal file
1547
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
19
Cargo.toml
19
Cargo.toml
@@ -1,18 +1,25 @@
|
||||
[package]
|
||||
name = "monolith"
|
||||
version = "2.0.16"
|
||||
version = "2.1.2"
|
||||
edition = "2018"
|
||||
authors = [
|
||||
"Sunshine <sunshine@uberspace.net>",
|
||||
"Mahdi Robatipoor <mahdi.robatipoor@gmail.com>",
|
||||
"Emmanuel Delaborde <th3rac25@gmail.com>",
|
||||
"Emi Simpson <emi@alchemi.dev>",
|
||||
"rhysd <lin90162@yahoo.co.jp>",
|
||||
]
|
||||
description = "CLI tool for saving web pages as a single HTML file"
|
||||
|
||||
[dependencies]
|
||||
base64 = "0.10.1"
|
||||
clap = "2.33.0"
|
||||
html5ever = "0.24.0"
|
||||
indicatif = "0.11.0"
|
||||
lazy_static = "1.3.0"
|
||||
regex = "1.2.1"
|
||||
reqwest = "0.9.20"
|
||||
html5ever = "0.24.1"
|
||||
lazy_static = "1.4.0"
|
||||
regex = "1.3.1"
|
||||
url = "2.1.0"
|
||||
|
||||
[dependencies.reqwest]
|
||||
version = "0.10.*"
|
||||
default-features = false
|
||||
features = ["default-tls", "blocking", "gzip"]
|
||||
|
||||
16
Makefile
Normal file
16
Makefile
Normal file
@@ -0,0 +1,16 @@
|
||||
# Convenience targets wrapping cargo.
# --locked makes cargo use the committed Cargo.lock as-is instead of updating it.
.PHONY: all build install run test lint

# Default target: run the test suite, then build.
all: test build

build:
	@cargo build --locked

# Install the binary from this checkout, replacing any existing install.
install:
	@cargo install --force --locked --path .

# Run the tests, then fail if any file is not rustfmt-formatted.
test:
	@cargo test --locked
	@cargo fmt --all -- --check

# Reformat all sources in place.
lint:
	@cargo fmt --all --
|
||||
49
README.md
49
README.md
@@ -1,33 +1,62 @@
|
||||
[](https://travis-ci.org/Y2Z/monolith)
|
||||
[](https://ci.appveyor.com/project/vflyson/monolith)
|
||||
[](https://ci.appveyor.com/project/snshn/monolith/branch/master)
|
||||
|
||||
# monolith
|
||||
```
|
||||
___ ___________ __________ ___________________ ___
|
||||
| \ / \ | | | | | |
|
||||
| \_/ __ \_| __ | | ___ ___ |__| |
|
||||
| | | | | | | | | | | |
|
||||
| |__| _ |__| |____| | | | | __ |
|
||||
| |\_/| | \ | | | | | | |
|
||||
|___| |__________| \____________________| |___| |___| |___|
|
||||
```
|
||||
|
||||
A data hoarder's dream come true: bundle any web page into a single HTML file.
|
||||
You can finally replace that gazillion of open tabs with a gazillion of .html files stored somewhere on your precious little drive.
|
||||
A data hoarder's dream come true: bundle any web page into a single HTML file. You can finally replace that gazillion of open tabs with a gazillion of .html files stored somewhere on your precious little drive.
|
||||
|
||||
Unlike the conventional "Save page as", `monolith` not only saves the target document, it embeds CSS, image, and JavaScript assets **all at once**, producing a single HTML5 document that is a joy to store and share.
|
||||
|
||||
If compared to saving websites with `wget -mpk`, this tool embeds all assets as data URLs and therefore lets browsers render the saved page exactly the way it was on the Internet, even when no network connection is available.
|
||||
|
||||
<!-- This program works both on remote and local targets. -->
|
||||
## Installation
|
||||
|
||||
### Installation
|
||||
### From source
|
||||
$ git clone https://github.com/Y2Z/monolith.git
|
||||
$ cd monolith
|
||||
$ cargo install --path .
|
||||
|
||||
### Usage
|
||||
$ monolith https://lyrics.github.io/db/p/portishead/dummy/roads/ > portishead-roads-lyrics.html
|
||||
### With Homebrew (on macOS and GNU/Linux)
|
||||
$ brew install monolith
|
||||
|
||||
### Options
|
||||
### Using Snapcraft (on GNU/Linux)
|
||||
$ snap install monolith
|
||||
|
||||
## Usage
|
||||
$ monolith https://lyrics.github.io/db/P/Portishead/Dummy/Roads/ -o portishead-roads-lyrics.html
|
||||
|
||||
## Options
|
||||
- `-c`: Ignore styles
|
||||
- `-f`: Exclude iframes
|
||||
- `-i`: Remove images
|
||||
- `-I`: Isolate document
|
||||
- `-j`: Exclude JavaScript
|
||||
- `-k`: Accept invalid X.509 (TLS) certificates
|
||||
- `-o`: Write output to file
|
||||
- `-s`: Silent mode
|
||||
- `-u`: Specify custom User-Agent
|
||||
|
||||
### License
|
||||
## HTTPS and HTTP proxies
|
||||
Please set `https_proxy`, `http_proxy` and `no_proxy` environment variables.
|
||||
|
||||
## Contributing
|
||||
Please open an issue if something is wrong, that helps make this project better.
|
||||
|
||||
## Related projects
|
||||
- `Monolith Chrome Extension`: https://github.com/rhysd/monolith-of-web
|
||||
- `Pagesaver`: https://github.com/distributed-mind/pagesaver
|
||||
- `Personal WayBack Machine`: https://github.com/popey/pwbm
|
||||
- `SingleFile`: https://github.com/gildas-lormeau/SingleFile
|
||||
|
||||
## License
|
||||
The Unlicense
|
||||
|
||||
<!-- Microtext -->
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
name: monolith
|
||||
base: core18
|
||||
version: git
|
||||
# Version data defined inside the monolith part below
|
||||
adopt-info: monolith
|
||||
summary: Monolith - Save HTML pages with ease
|
||||
description: |
|
||||
A data hoarder's dream come true: bundle any web page into a single
|
||||
@@ -17,6 +18,14 @@ description: |
|
||||
|
||||
confinement: strict
|
||||
|
||||
# Building on armhf fails, so we specify all supported non-armhf architectures
|
||||
architectures:
|
||||
- build-on: amd64
|
||||
- build-on: i386
|
||||
- build-on: arm64
|
||||
- build-on: ppc64el
|
||||
- build-on: s390x
|
||||
|
||||
parts:
|
||||
monolith:
|
||||
plugin: rust
|
||||
@@ -24,6 +33,21 @@ parts:
|
||||
build-packages:
|
||||
- libssl-dev
|
||||
- pkg-config
|
||||
override-pull: |
|
||||
snapcraftctl pull
|
||||
# Determine the current tag
|
||||
last_committed_tag="$(git describe --tags --abbrev=0)"
|
||||
last_committed_tag_ver="$(echo ${last_committed_tag} | sed 's/v//')"
|
||||
# Determine the most recent version in the beta channel in the Snap Store
|
||||
last_released_tag="$(snap info $SNAPCRAFT_PROJECT_NAME | awk '$1 == "beta:" { print $2 }')"
|
||||
# If the latest tag from the upstream project has not been released to
|
||||
# beta, build that tag instead of master.
|
||||
if [ "${last_committed_tag_ver}" != "${last_released_tag}" ]; then
|
||||
git fetch
|
||||
git checkout "${last_committed_tag}"
|
||||
fi
|
||||
# set version number of the snap based on what we did above
|
||||
snapcraftctl set-version $(git describe --tags --abbrev=0)
|
||||
|
||||
apps:
|
||||
monolith:
|
||||
|
||||
65
src/args.rs
Normal file
65
src/args.rs
Normal file
@@ -0,0 +1,65 @@
|
||||
use clap::{App, Arg};
|
||||
|
||||
/// Command-line options for a single `monolith` invocation.
///
/// `Default` yields empty strings and `false` for every flag.
#[derive(Clone, Debug, Default)]
pub struct AppArgs {
    /// Value of the positional `url` argument.
    pub url_target: String,
    /// `-c, --no-css` was given.
    pub no_css: bool,
    /// `-f, --no-frames` was given.
    pub no_frames: bool,
    /// `-i, --no-images` was given.
    pub no_images: bool,
    /// `-j, --no-js` was given.
    pub no_js: bool,
    /// `-k, --insecure` was given.
    pub insecure: bool,
    /// `-I, --isolate` was given.
    pub isolate: bool,
    /// Value of `-o, --output`; empty when the option is absent.
    pub output: String,
    /// `-s, --silent` was given.
    pub silent: bool,
    /// Value of `-u, --user-agent`.
    pub user_agent: String,
}
|
||||
|
||||
const DEFAULT_USER_AGENT: &str =
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0";
|
||||
|
||||
impl AppArgs {
|
||||
pub fn get() -> AppArgs {
|
||||
let app = App::new("monolith")
|
||||
.version(crate_version!())
|
||||
.author(crate_authors!("\n"))
|
||||
.about(crate_description!())
|
||||
.arg(
|
||||
Arg::with_name("url")
|
||||
.required(true)
|
||||
.takes_value(true)
|
||||
.index(1)
|
||||
.help("URL to download"),
|
||||
)
|
||||
// .args_from_usage("-a, --include-audio 'Embed audio sources'")
|
||||
.args_from_usage("-c, --no-css 'Ignore styles'")
|
||||
.args_from_usage("-f, --no-frames 'Exclude iframes'")
|
||||
.args_from_usage("-i, --no-images 'Remove images'")
|
||||
.args_from_usage("-I, --isolate 'Cut off from the Internet'")
|
||||
.args_from_usage("-j, --no-js 'Exclude JavaScript'")
|
||||
.args_from_usage("-k, --insecure 'Accept invalid X.509 (TLS) certificates'")
|
||||
.args_from_usage("-o, --output=[document.html] 'Write output to <file>'")
|
||||
.args_from_usage("-s, --silent 'Suppress verbosity'")
|
||||
.args_from_usage("-u, --user-agent=[Iceweasel] 'Custom User-Agent string'")
|
||||
// .args_from_usage("-v, --include-video 'Embed video sources'")
|
||||
.get_matches();
|
||||
let mut app_args = AppArgs::default();
|
||||
// Process the command
|
||||
app_args.url_target = app
|
||||
.value_of("url")
|
||||
.expect("please set target url")
|
||||
.to_string();
|
||||
app_args.no_css = app.is_present("no-css");
|
||||
app_args.no_frames = app.is_present("no-frames");
|
||||
app_args.no_images = app.is_present("no-images");
|
||||
app_args.no_js = app.is_present("no-js");
|
||||
app_args.insecure = app.is_present("insecure");
|
||||
app_args.isolate = app.is_present("isolate");
|
||||
app_args.silent = app.is_present("silent");
|
||||
app_args.output = app.value_of("output").unwrap_or("").to_string();
|
||||
app_args.user_agent = app
|
||||
.value_of("user-agent")
|
||||
.unwrap_or(DEFAULT_USER_AGENT)
|
||||
.to_string();
|
||||
app_args
|
||||
}
|
||||
}
|
||||
832
src/html.rs
832
src/html.rs
@@ -1,99 +1,78 @@
|
||||
use crate::http::retrieve_asset;
|
||||
use crate::js::attr_is_event_handler;
|
||||
use crate::utils::{
|
||||
data_to_dataurl, is_valid_url, resolve_css_imports, resolve_url, url_has_protocol,
|
||||
};
|
||||
use html5ever::interface::QualName;
|
||||
use html5ever::parse_document;
|
||||
use html5ever::rcdom::{Handle, NodeData, RcDom};
|
||||
use html5ever::serialize::{serialize, SerializeOpts};
|
||||
use html5ever::tendril::TendrilSink;
|
||||
use http::{is_valid_url, resolve_url, retrieve_asset};
|
||||
use regex::Regex;
|
||||
use html5ever::tendril::{format_tendril, Tendril, TendrilSink};
|
||||
use html5ever::tree_builder::{Attribute, TreeSink};
|
||||
use html5ever::{local_name, namespace_url, ns};
|
||||
use reqwest::blocking::Client;
|
||||
use std::collections::HashMap;
|
||||
use std::default::Default;
|
||||
use std::io;
|
||||
use utils::data_to_dataurl;
|
||||
|
||||
lazy_static! {
|
||||
static ref EMPTY_STRING: String = String::new();
|
||||
static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();
|
||||
static ref ICON_VALUES: Regex = Regex::new(
|
||||
r"^icon|shortcut icon|mask-icon|apple-touch-icon$"
|
||||
).unwrap();
|
||||
}
|
||||
|
||||
const TRANSPARENT_PIXEL: &str = "data:image/png;base64,\
|
||||
iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=";
|
||||
|
||||
const JS_DOM_EVENT_ATTRS: [&str; 21] = [
|
||||
// Input
|
||||
"onfocus",
|
||||
"onblur",
|
||||
"onselect",
|
||||
"onchange",
|
||||
"onsubmit",
|
||||
"onreset",
|
||||
"onkeydown",
|
||||
"onkeypress",
|
||||
"onkeyup",
|
||||
// Mouse
|
||||
"onmouseover",
|
||||
"onmouseout",
|
||||
"onmousedown",
|
||||
"onmouseup",
|
||||
"onmousemove",
|
||||
// Click
|
||||
"onclick",
|
||||
"ondblclick",
|
||||
// Load
|
||||
"onload",
|
||||
"onunload",
|
||||
"onabort",
|
||||
"onerror",
|
||||
"onresize",
|
||||
const ICON_VALUES: &[&str] = &[
|
||||
"icon",
|
||||
"shortcut icon",
|
||||
"mask-icon",
|
||||
"apple-touch-icon",
|
||||
"fluid-icon",
|
||||
];
|
||||
|
||||
fn get_parent_node_name(node: &Handle) -> String {
|
||||
let parent = node.parent.take().clone();
|
||||
let parent_node = parent.and_then(|node| node.upgrade()).unwrap();
|
||||
const TRANSPARENT_PIXEL: &str =
|
||||
"data:image/png;base64,\
|
||||
iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=";
|
||||
|
||||
match &parent_node.data {
|
||||
NodeData::Document => { EMPTY_STRING.clone() }
|
||||
NodeData::Doctype { .. } => { EMPTY_STRING.clone() }
|
||||
NodeData::Text { .. } => { EMPTY_STRING.clone() }
|
||||
NodeData::Comment { .. } => { EMPTY_STRING.clone() }
|
||||
NodeData::Element { ref name, attrs: _, .. } => {
|
||||
name.local.as_ref().to_string()
|
||||
}
|
||||
NodeData::ProcessingInstruction { .. } => unreachable!()
|
||||
/// Returns the parent of `node`.
///
/// # Panics
///
/// Panics if `node` has no parent, or if the parent can no longer be upgraded
/// to a strong handle.
pub fn get_parent_node(node: &Handle) -> Handle {
    // `take()` empties the parent slot, so put a copy of the weak handle straight
    // back; otherwise this lookup would detach `node` from its parent and a second
    // call on the same node would panic.
    let parent = node.parent.take();
    node.parent.set(parent.clone());

    parent
        .and_then(|weak| weak.upgrade())
        .expect("node has no live parent")
}
|
||||
|
||||
pub fn get_node_name(node: &Handle) -> &'_ str {
|
||||
match &node.data {
|
||||
NodeData::Element { ref name, .. } => name.local.as_ref(),
|
||||
_ => "",
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_icon(attr_value: &str) -> bool {
|
||||
ICON_VALUES
|
||||
.iter()
|
||||
.find(|a| attr_value.eq_ignore_ascii_case(a))
|
||||
.is_some()
|
||||
}
|
||||
|
||||
pub fn walk_and_embed_assets(
|
||||
cache: &mut HashMap<String, String>,
|
||||
client: &Client,
|
||||
url: &str,
|
||||
node: &Handle,
|
||||
opt_no_css: bool,
|
||||
opt_no_js: bool,
|
||||
opt_no_images: bool,
|
||||
opt_user_agent: &str,
|
||||
opt_silent: bool,
|
||||
opt_insecure: bool,
|
||||
opt_no_frames: bool,
|
||||
) {
|
||||
match node.data {
|
||||
NodeData::Document => {
|
||||
// Dig deeper
|
||||
for child in node.children.borrow().iter() {
|
||||
walk_and_embed_assets(
|
||||
&url, child,
|
||||
opt_no_js,
|
||||
opt_no_images,
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
);
|
||||
cache,
|
||||
client,
|
||||
&url,
|
||||
child,
|
||||
opt_no_css,
|
||||
opt_no_js,
|
||||
opt_no_images,
|
||||
opt_silent,
|
||||
opt_no_frames,
|
||||
);
|
||||
}
|
||||
}
|
||||
NodeData::Doctype { .. } => {}
|
||||
NodeData::Text { .. } => {}
|
||||
NodeData::Comment { .. } => {
|
||||
// Note: in case of opt_no_js being set to true, there's no need to worry about
|
||||
// getting rid of comments that may contain scripts, e.g. <!--[if IE]><script>...
|
||||
// since that's not part of W3C standard and therefore gets ignored
|
||||
// by browsers other than IE [5, 9]
|
||||
}
|
||||
NodeData::Element {
|
||||
ref name,
|
||||
ref attrs,
|
||||
@@ -103,132 +82,206 @@ pub fn walk_and_embed_assets(
|
||||
|
||||
match name.local.as_ref() {
|
||||
"link" => {
|
||||
let mut link_type = "";
|
||||
// Remove integrity attributes
|
||||
let mut i = 0;
|
||||
while i < attrs_mut.len() {
|
||||
let attr_name = attrs_mut[i].name.local.as_ref();
|
||||
if attr_name.eq_ignore_ascii_case("integrity") {
|
||||
attrs_mut.remove(i);
|
||||
} else {
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
enum LinkType {
|
||||
Icon,
|
||||
Stylesheet,
|
||||
Preload,
|
||||
DnsPrefetch,
|
||||
Unknown,
|
||||
}
|
||||
|
||||
let mut link_type = LinkType::Unknown;
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "rel" {
|
||||
if is_icon(&attr.value.to_string()) {
|
||||
link_type = "icon";
|
||||
let value = attr.value.trim();
|
||||
if is_icon(value) {
|
||||
link_type = LinkType::Icon;
|
||||
break;
|
||||
} else if attr.value.to_string() == "stylesheet" {
|
||||
link_type = "stylesheet";
|
||||
} else if value.eq_ignore_ascii_case("stylesheet") {
|
||||
link_type = LinkType::Stylesheet;
|
||||
break;
|
||||
} else if value.eq_ignore_ascii_case("preload") {
|
||||
link_type = LinkType::Preload;
|
||||
break;
|
||||
} else if value.eq_ignore_ascii_case("dns-prefetch") {
|
||||
link_type = LinkType::DnsPrefetch;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
let link_type = link_type;
|
||||
|
||||
if link_type == "icon" {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "href" {
|
||||
if opt_no_images {
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(TRANSPARENT_PIXEL);
|
||||
} else {
|
||||
let href_full_url: String = resolve_url(
|
||||
&url,
|
||||
&attr.value.to_string()
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let favicon_datauri = retrieve_asset(
|
||||
match link_type {
|
||||
LinkType::Icon => {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "href" {
|
||||
if opt_no_images {
|
||||
attr.value.clear();
|
||||
} else {
|
||||
let href_full_url = resolve_url(&url, attr.value.as_ref())
|
||||
.unwrap_or_default();
|
||||
let (favicon_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&href_full_url,
|
||||
true,
|
||||
"",
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(favicon_datauri.as_str());
|
||||
.unwrap_or_default();
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(favicon_dataurl.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if link_type == "stylesheet" {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "href" {
|
||||
let href_full_url: String = resolve_url(
|
||||
&url,
|
||||
&attr.value.to_string(),
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let css_datauri = retrieve_asset(
|
||||
&href_full_url,
|
||||
true,
|
||||
"text/css",
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(css_datauri.as_str());
|
||||
LinkType::Stylesheet => {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "href" {
|
||||
if opt_no_css {
|
||||
attr.value.clear();
|
||||
} else {
|
||||
let href_full_url = resolve_url(&url, &attr.value.as_ref())
|
||||
.unwrap_or_default();
|
||||
let replacement_text = match retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&href_full_url,
|
||||
false,
|
||||
"text/css",
|
||||
opt_silent,
|
||||
) {
|
||||
// On successful retrieval, traverse CSS
|
||||
Ok((css_data, _)) => resolve_css_imports(
|
||||
cache,
|
||||
client,
|
||||
&css_data,
|
||||
true,
|
||||
&href_full_url,
|
||||
opt_no_images,
|
||||
opt_silent,
|
||||
),
|
||||
|
||||
// If a network error occurred, warn
|
||||
Err(e) => {
|
||||
eprintln!("Warning: {}", e);
|
||||
|
||||
// If failed to resolve, replace with absolute URL
|
||||
href_full_url
|
||||
}
|
||||
};
|
||||
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(&replacement_text);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "href" {
|
||||
let href_full_url: String = resolve_url(
|
||||
&url,
|
||||
&attr.value.to_string(),
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
LinkType::Preload | LinkType::DnsPrefetch => {
|
||||
// Since all resources are embedded as data URL, preloading and prefetching are unnecessary
|
||||
if let Some(attr) =
|
||||
attrs_mut.iter_mut().find(|a| &a.name.local == "href")
|
||||
{
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(&href_full_url.as_str());
|
||||
}
|
||||
}
|
||||
LinkType::Unknown => {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "href" {
|
||||
let href_full_url =
|
||||
resolve_url(&url, attr.value.as_ref()).unwrap_or_default();
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(&href_full_url.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
"img" => {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "src" {
|
||||
if opt_no_images {
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(TRANSPARENT_PIXEL);
|
||||
} else {
|
||||
let src_full_url: String = resolve_url(
|
||||
&url,
|
||||
&attr.value.to_string(),
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let img_datauri = retrieve_asset(
|
||||
&src_full_url,
|
||||
true,
|
||||
"",
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(img_datauri.as_str());
|
||||
}
|
||||
// Find source tags
|
||||
let mut found_src: Option<Attribute> = None;
|
||||
let mut found_datasrc: Option<Attribute> = None;
|
||||
let mut i = 0;
|
||||
while i < attrs_mut.len() {
|
||||
let name = attrs_mut[i].name.local.as_ref();
|
||||
if name.eq_ignore_ascii_case("src") {
|
||||
found_src = Some(attrs_mut.remove(i));
|
||||
} else if name.eq_ignore_ascii_case("data-src") {
|
||||
found_datasrc = Some(attrs_mut.remove(i));
|
||||
} else {
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// If images are disabled, clear both sources
|
||||
if opt_no_images {
|
||||
attrs_mut.push(Attribute {
|
||||
name: QualName::new(None, ns!(), local_name!("src")),
|
||||
value: Tendril::from_slice(TRANSPARENT_PIXEL),
|
||||
});
|
||||
} else if let Some((dataurl, _)) = found_datasrc
|
||||
.iter()
|
||||
.chain(&found_src) // Give dataurl priority
|
||||
.map(|attr| attr.value.trim())
|
||||
.filter(|src| !src.is_empty()) // Ignore empty srcs
|
||||
.next()
|
||||
.and_then(|src| resolve_url(&url, src).ok()) // Make absolute
|
||||
.and_then(|abs_src| // Download and convert to dataurl
|
||||
retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&abs_src,
|
||||
true,
|
||||
"",
|
||||
opt_silent,
|
||||
).ok())
|
||||
{
|
||||
// Add the new dataurl src attribute
|
||||
attrs_mut.push(Attribute {
|
||||
name: QualName::new(None, ns!(), local_name!("src")),
|
||||
value: Tendril::from_slice(dataurl.as_ref()),
|
||||
});
|
||||
}
|
||||
}
|
||||
"source" => {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "srcset" {
|
||||
if get_parent_node_name(&node) == "picture" {
|
||||
let attr_name: &str = &attr.name.local;
|
||||
|
||||
if attr_name == "src" {
|
||||
let src_full_url = resolve_url(&url, attr.value.trim())
|
||||
.unwrap_or_else(|_| attr.value.to_string());
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(src_full_url.as_str());
|
||||
} else if attr_name == "srcset" {
|
||||
if get_node_name(&get_parent_node(&node)) == "picture" {
|
||||
if opt_no_images {
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(TRANSPARENT_PIXEL);
|
||||
} else {
|
||||
let srcset_full_url: String = resolve_url(
|
||||
&url,
|
||||
&attr.value.to_string(),
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let source_datauri = retrieve_asset(
|
||||
&srcset_full_url,
|
||||
true,
|
||||
"",
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let srcset_full_url =
|
||||
resolve_url(&url, attr.value.trim()).unwrap_or_default();
|
||||
let (source_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&srcset_full_url,
|
||||
true,
|
||||
"",
|
||||
opt_silent,
|
||||
)
|
||||
.unwrap_or((str!(), str!()));
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(source_datauri.as_str());
|
||||
attr.value.push_slice(source_dataurl.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -237,21 +290,32 @@ pub fn walk_and_embed_assets(
|
||||
"a" => {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "href" {
|
||||
let attr_value = attr.value.trim();
|
||||
// Don't touch email links or hrefs which begin with a hash sign
|
||||
if attr.value.starts_with('#') || has_protocol(&attr.value) {
|
||||
if attr_value.starts_with('#') || url_has_protocol(attr_value) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let href_full_url: String = resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let href_full_url = resolve_url(&url, attr_value).unwrap_or_default();
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(href_full_url.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
"script" => {
|
||||
// Remove integrity attributes
|
||||
let mut i = 0;
|
||||
while i < attrs_mut.len() {
|
||||
let attr_name = attrs_mut[i].name.local.as_ref();
|
||||
if attr_name.eq_ignore_ascii_case("integrity") {
|
||||
attrs_mut.remove(i);
|
||||
} else {
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
if opt_no_js {
|
||||
// Get rid of src and inner content of SCRIPT tags
|
||||
// Empty src and inner content of SCRIPT tags
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "src" {
|
||||
attr.value.clear();
|
||||
@@ -261,22 +325,42 @@ pub fn walk_and_embed_assets(
|
||||
} else {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "src" {
|
||||
let src_full_url: String = resolve_url(
|
||||
&url,
|
||||
&attr.value.to_string(),
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let js_datauri = retrieve_asset(
|
||||
&src_full_url,
|
||||
true,
|
||||
"application/javascript",
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let src_full_url =
|
||||
resolve_url(&url, attr.value.trim()).unwrap_or_default();
|
||||
let (js_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&src_full_url,
|
||||
true,
|
||||
"application/javascript",
|
||||
opt_silent,
|
||||
)
|
||||
.unwrap_or((str!(), str!()));
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(js_datauri.as_str());
|
||||
attr.value.push_slice(js_dataurl.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
"style" => {
|
||||
if opt_no_css {
|
||||
// Empty inner content of STYLE tags
|
||||
node.children.borrow_mut().clear();
|
||||
} else {
|
||||
for node in node.children.borrow_mut().iter_mut() {
|
||||
if let NodeData::Text { ref contents } = node.data {
|
||||
let mut tendril = contents.borrow_mut();
|
||||
let replacement = resolve_css_imports(
|
||||
cache,
|
||||
client,
|
||||
tendril.as_ref(),
|
||||
false,
|
||||
&url,
|
||||
opt_no_images,
|
||||
opt_silent,
|
||||
);
|
||||
tendril.clear();
|
||||
tendril.push_slice(&replacement);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -284,90 +368,167 @@ pub fn walk_and_embed_assets(
|
||||
"form" => {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "action" {
|
||||
// Do not touch action props which are set to a URL
|
||||
if is_valid_url(&attr.value) {
|
||||
continue;
|
||||
let attr_value = attr.value.trim();
|
||||
// Modify action to be a full URL
|
||||
if !is_valid_url(attr_value) {
|
||||
let href_full_url =
|
||||
resolve_url(&url, attr_value).unwrap_or_default();
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(href_full_url.as_str());
|
||||
}
|
||||
|
||||
let href_full_url: String = resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(href_full_url.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
"iframe" => {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "src" {
|
||||
let value = attr.value.to_string();
|
||||
// Ignore iframes with empty source (they cause infinite loops)
|
||||
if value == EMPTY_STRING.clone() {
|
||||
if opt_no_frames {
|
||||
// Empty the src attribute
|
||||
attr.value.clear();
|
||||
continue;
|
||||
}
|
||||
|
||||
let iframe_src = attr.value.trim();
|
||||
|
||||
let src_full_url: String = resolve_url(&url, &value)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
let iframe_data = retrieve_asset(
|
||||
&src_full_url,
|
||||
false,
|
||||
"text/html",
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
)
|
||||
.unwrap_or(EMPTY_STRING.clone());
|
||||
// Ignore iframes with empty source (they cause infinite loops)
|
||||
if iframe_src.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let src_full_url = resolve_url(&url, iframe_src).unwrap_or_default();
|
||||
let (iframe_data, iframe_final_url) = retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&src_full_url,
|
||||
false,
|
||||
"text/html",
|
||||
opt_silent,
|
||||
)
|
||||
.unwrap_or((str!(), src_full_url));
|
||||
let dom = html_to_dom(&iframe_data);
|
||||
walk_and_embed_assets(
|
||||
&src_full_url,
|
||||
&dom.document,
|
||||
opt_no_js,
|
||||
opt_no_images,
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
);
|
||||
cache,
|
||||
client,
|
||||
&iframe_final_url,
|
||||
&dom.document,
|
||||
opt_no_css,
|
||||
opt_no_js,
|
||||
opt_no_images,
|
||||
opt_silent,
|
||||
opt_no_frames,
|
||||
);
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
|
||||
let iframe_datauri = data_to_dataurl("text/html", &buf);
|
||||
let iframe_dataurl = data_to_dataurl("text/html", &buf);
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(iframe_datauri.as_str());
|
||||
attr.value.push_slice(iframe_dataurl.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
"video" => {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "poster" {
|
||||
let video_poster = attr.value.trim();
|
||||
|
||||
// Skip posters with empty source
|
||||
if video_poster.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if opt_no_images {
|
||||
attr.value.clear();
|
||||
} else {
|
||||
let poster_full_url =
|
||||
resolve_url(&url, video_poster).unwrap_or_default();
|
||||
let (poster_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&poster_full_url,
|
||||
true,
|
||||
"",
|
||||
opt_silent,
|
||||
)
|
||||
.unwrap_or((poster_full_url, str!()));
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(poster_dataurl.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
// Process style attributes
|
||||
if opt_no_css {
|
||||
// Get rid of style attributes
|
||||
let mut style_attr_indexes = Vec::new();
|
||||
for (i, attr) in attrs_mut.iter_mut().enumerate() {
|
||||
if attr.name.local.to_lowercase() == "style" {
|
||||
style_attr_indexes.push(i);
|
||||
}
|
||||
}
|
||||
style_attr_indexes.reverse();
|
||||
for attr_index in style_attr_indexes {
|
||||
attrs_mut.remove(attr_index);
|
||||
}
|
||||
} else {
|
||||
// Otherwise, parse any links found in the attributes
|
||||
for attribute in attrs_mut
|
||||
.iter_mut()
|
||||
.filter(|a| a.name.local.as_ref().eq_ignore_ascii_case("style"))
|
||||
{
|
||||
let replacement = resolve_css_imports(
|
||||
cache,
|
||||
client,
|
||||
attribute.value.as_ref(),
|
||||
false,
|
||||
&url,
|
||||
opt_no_images,
|
||||
opt_silent,
|
||||
);
|
||||
attribute.value.clear();
|
||||
attribute.value.push_slice(&replacement);
|
||||
}
|
||||
}
|
||||
|
||||
if opt_no_js {
|
||||
// Get rid of JS event attributes
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if JS_DOM_EVENT_ATTRS.contains(&attr.name.local.to_lowercase().as_str()) {
|
||||
attr.value.clear();
|
||||
let mut js_attr_indexes = Vec::new();
|
||||
for (i, attr) in attrs_mut.iter_mut().enumerate() {
|
||||
if attr_is_event_handler(&attr.name.local) {
|
||||
js_attr_indexes.push(i);
|
||||
}
|
||||
}
|
||||
js_attr_indexes.reverse();
|
||||
for attr_index in js_attr_indexes {
|
||||
attrs_mut.remove(attr_index);
|
||||
}
|
||||
}
|
||||
|
||||
// Dig deeper
|
||||
for child in node.children.borrow().iter() {
|
||||
walk_and_embed_assets(
|
||||
&url,
|
||||
child,
|
||||
opt_no_js,
|
||||
opt_no_images,
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
);
|
||||
cache,
|
||||
client,
|
||||
&url,
|
||||
child,
|
||||
opt_no_css,
|
||||
opt_no_js,
|
||||
opt_no_images,
|
||||
opt_silent,
|
||||
opt_no_frames,
|
||||
);
|
||||
}
|
||||
}
|
||||
NodeData::ProcessingInstruction { .. } => unreachable!()
|
||||
_ => {
|
||||
// Note: in case of opt_no_js being set to true, there's no need to worry about
|
||||
// getting rid of comments that may contain scripts, e.g. <!--[if IE]><script>...
|
||||
// since that's not part of W3C standard and therefore gets ignored
|
||||
// by browsers other than IE [5, 9]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn has_protocol(url: &str) -> bool {
|
||||
HAS_PROTOCOL.is_match(&url.to_lowercase())
|
||||
}
|
||||
|
||||
pub fn html_to_dom(data: &str) -> html5ever::rcdom::RcDom {
|
||||
parse_document(RcDom::default(), Default::default())
|
||||
.from_utf8()
|
||||
@@ -375,157 +536,82 @@ pub fn html_to_dom(data: &str) -> html5ever::rcdom::RcDom {
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
pub fn print_dom(handle: &Handle) {
|
||||
serialize(&mut io::stdout(), handle, SerializeOpts::default()).unwrap();
|
||||
fn get_child_node_by_name(handle: &Handle, node_name: &str) -> Handle {
|
||||
let children = handle.children.borrow();
|
||||
let matching_children = children.iter().find(|child| match child.data {
|
||||
NodeData::Element { ref name, .. } => &*name.local == node_name,
|
||||
_ => false,
|
||||
});
|
||||
match matching_children {
|
||||
Some(node) => node.clone(),
|
||||
_ => handle.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
fn is_icon(attr_value: &str) -> bool {
|
||||
ICON_VALUES.is_match(&attr_value.to_lowercase())
|
||||
}
|
||||
pub fn stringify_document(
|
||||
handle: &Handle,
|
||||
opt_no_css: bool,
|
||||
opt_no_frames: bool,
|
||||
opt_no_js: bool,
|
||||
opt_no_images: bool,
|
||||
opt_isolate: bool,
|
||||
) -> String {
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
serialize(&mut buf, handle, SerializeOpts::default())
|
||||
.expect("unable to serialize DOM into buffer");
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
let mut result = String::from_utf8(buf).unwrap();
|
||||
|
||||
#[test]
|
||||
fn test_is_icon() {
|
||||
assert_eq!(is_icon("icon"), true);
|
||||
assert_eq!(is_icon("Shortcut Icon"), true);
|
||||
assert_eq!(is_icon("ICON"), true);
|
||||
assert_eq!(is_icon("stylesheet"), false);
|
||||
assert_eq!(is_icon(""), false);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_has_protocol() {
|
||||
assert_eq!(has_protocol("mailto:somebody@somewhere.com?subject=hello"), true);
|
||||
assert_eq!(has_protocol("tel:5551234567"), true);
|
||||
assert_eq!(has_protocol("ftp:user:password@some-ftp-server.com"), true);
|
||||
assert_eq!(has_protocol("javascript:void(0)"), true);
|
||||
assert_eq!(has_protocol("http://news.ycombinator.com"), true);
|
||||
assert_eq!(has_protocol("https://github.com"), true);
|
||||
assert_eq!(has_protocol("//some-hostname.com/some-file.html"), false);
|
||||
assert_eq!(has_protocol("some-hostname.com/some-file.html"), false);
|
||||
assert_eq!(has_protocol("/some-file.html"), false);
|
||||
assert_eq!(has_protocol(""), false);
|
||||
assert_eq!(has_protocol("MAILTO:somebody@somewhere.com?subject=hello"), true);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_get_parent_node_name() {
|
||||
let html = "<!doctype html><html><HEAD></HEAD><body><div><P></P></div></body></html>";
|
||||
let dom = html_to_dom(&html);
|
||||
let mut count = 0;
|
||||
|
||||
fn test_walk(node: &Handle, i: &mut i8) {
|
||||
*i += 1;
|
||||
|
||||
match &node.data {
|
||||
NodeData::Document => {
|
||||
for child in node.children.borrow().iter() {
|
||||
test_walk(child, &mut *i);
|
||||
}
|
||||
}
|
||||
NodeData::Doctype { .. } => (),
|
||||
NodeData::Text { .. } => (),
|
||||
NodeData::Comment { .. } => (),
|
||||
NodeData::Element { ref name, attrs: _, .. } => {
|
||||
let node_name = name.local.as_ref().to_string();
|
||||
let parent_node_name = get_parent_node_name(node);
|
||||
if node_name == "head" || node_name == "body" {
|
||||
assert_eq!(parent_node_name, "html");
|
||||
} else if node_name == "div" {
|
||||
assert_eq!(parent_node_name, "body");
|
||||
} else if node_name == "p" {
|
||||
assert_eq!(parent_node_name, "div");
|
||||
}
|
||||
|
||||
println!("{}", node_name);
|
||||
|
||||
for child in node.children.borrow().iter() {
|
||||
test_walk(child, &mut *i);
|
||||
}
|
||||
}
|
||||
NodeData::ProcessingInstruction { .. } => unreachable!()
|
||||
};
|
||||
if opt_isolate || opt_no_css || opt_no_frames || opt_no_js || opt_no_images {
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
let mut dom = html_to_dom(&result);
|
||||
let doc = dom.get_document();
|
||||
let html = get_child_node_by_name(&doc, "html");
|
||||
let head = get_child_node_by_name(&html, "head");
|
||||
let mut content_attr = str!();
|
||||
if opt_isolate {
|
||||
content_attr += " default-src 'unsafe-inline' data:;";
|
||||
}
|
||||
if opt_no_css {
|
||||
content_attr += " style-src 'none';";
|
||||
}
|
||||
if opt_no_frames {
|
||||
content_attr += " frame-src 'none';child-src 'none';";
|
||||
}
|
||||
if opt_no_js {
|
||||
content_attr += " script-src 'none';";
|
||||
}
|
||||
if opt_no_images {
|
||||
content_attr += " img-src data:;";
|
||||
}
|
||||
|
||||
test_walk(&dom.document, &mut count);
|
||||
|
||||
assert_eq!(count, 7);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_walk_and_embed_assets() {
|
||||
let html = "<div><P></P></div>";
|
||||
let dom = html_to_dom(&html);
|
||||
let url = "http://localhost";
|
||||
|
||||
walk_and_embed_assets(&url, &dom.document, true, true, "", true, true);
|
||||
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
buf.iter().map(|&c| c as char).collect::<String>(),
|
||||
"<html><head></head><body><div><p></p></div></body></html>"
|
||||
let meta = dom.create_element(
|
||||
QualName::new(None, ns!(), local_name!("meta")),
|
||||
vec![
|
||||
Attribute {
|
||||
name: QualName::new(None, ns!(), local_name!("http-equiv")),
|
||||
value: format_tendril!("Content-Security-Policy"),
|
||||
},
|
||||
Attribute {
|
||||
name: QualName::new(None, ns!(), local_name!("content")),
|
||||
value: format_tendril!("{}", content_attr.trim()),
|
||||
},
|
||||
],
|
||||
Default::default(),
|
||||
);
|
||||
head.children.borrow_mut().reverse();
|
||||
head.children.borrow_mut().push(meta.clone());
|
||||
head.children.borrow_mut().reverse();
|
||||
// Note: the CSP meta-tag has to be prepended, never appended,
|
||||
// since there already may be one defined in the document,
|
||||
// and browsers don't allow re-defining them (for obvious reasons)
|
||||
|
||||
serialize(&mut buf, &doc, SerializeOpts::default())
|
||||
.expect("unable to serialize DOM into buffer");
|
||||
result = String::from_utf8(buf).unwrap();
|
||||
// Note: we can't make it isolate the page right away since it may have no HEAD element,
|
||||
// ergo we have to serialize, parse DOM again, and then finally serialize the result
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_walk_and_embed_assets_iframe() {
|
||||
let html = "<div><P></P><iframe src=\"\"></iframe></div>";
|
||||
let dom = html_to_dom(&html);
|
||||
let url = "http://localhost";
|
||||
|
||||
walk_and_embed_assets(&url, &dom.document, true, true, "", true, true);
|
||||
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
buf.iter().map(|&c| c as char).collect::<String>(),
|
||||
"<html><head></head><body><div><p></p><iframe src=\"\"></iframe></div></body></html>"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_walk_and_embed_assets_img() {
|
||||
let html = "<div><img src=\"http://localhost/assets/mono_lisa.png\" /></div>";
|
||||
let dom = html_to_dom(&html);
|
||||
let url = "http://localhost";
|
||||
|
||||
walk_and_embed_assets(&url, &dom.document, true, true, "", true, true);
|
||||
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
buf.iter().map(|&c| c as char).collect::<String>(),
|
||||
"<html><head></head><body><div>\
|
||||
<img src=\"data:image/png;base64,\
|
||||
iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0\
|
||||
lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=\">\
|
||||
</div></body></html>"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_walk_and_embed_assets_js() {
|
||||
let html = "<div><script src=\"http://localhost/assets/some.js\"></script>\
|
||||
<script>alert(1)</script></div>";
|
||||
let dom = html_to_dom(&html);
|
||||
let url = "http://localhost";
|
||||
|
||||
walk_and_embed_assets(&url, &dom.document, true, true, "", true, true);
|
||||
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
buf.iter().map(|&c| c as char).collect::<String>(),
|
||||
"<html><head></head><body><div><script src=\"\"></script>\
|
||||
<script></script></div></body></html>"
|
||||
);
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
225
src/http.rs
225
src/http.rs
@@ -1,179 +1,68 @@
|
||||
use regex::Regex;
|
||||
use reqwest::Client;
|
||||
use reqwest::header::{CONTENT_TYPE, USER_AGENT};
|
||||
use std::time::Duration;
|
||||
use url::{ParseError, Url};
|
||||
use utils::data_to_dataurl;
|
||||
|
||||
lazy_static! {
    // Matches strings that begin with an `http://` or `https://` scheme
    // (lowercase only; the pattern carries no case-insensitivity flag).
    static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();
}
|
||||
|
||||
pub fn is_data_url(url: &str) -> Result<bool, ParseError> {
|
||||
match Url::parse(url) {
|
||||
Ok(parsed_url) => Ok(parsed_url.scheme() == "data"),
|
||||
Err(err) => Err(err),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_valid_url(path: &str) -> bool {
|
||||
REGEX_URL.is_match(path)
|
||||
}
|
||||
|
||||
pub fn resolve_url(from: &str, to: &str) -> Result<String, ParseError> {
|
||||
let result = if is_valid_url(to) {
|
||||
// (anything, http://site.com/css/main.css)
|
||||
to.to_string()
|
||||
} else {
|
||||
Url::parse(from)?.join(to)?.to_string()
|
||||
};
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
use crate::utils::{clean_url, data_to_dataurl, is_data_url};
|
||||
use reqwest::blocking::Client;
|
||||
use reqwest::header::CONTENT_TYPE;
|
||||
use std::collections::HashMap;
|
||||
|
||||
pub fn retrieve_asset(
|
||||
cache: &mut HashMap<String, String>,
|
||||
client: &Client,
|
||||
url: &str,
|
||||
as_dataurl: bool,
|
||||
as_mime: &str,
|
||||
opt_user_agent: &str,
|
||||
mime: &str,
|
||||
opt_silent: bool,
|
||||
opt_insecure: bool,
|
||||
) -> Result<String, reqwest::Error> {
|
||||
if is_data_url(&url).unwrap() {
|
||||
Ok(url.to_string())
|
||||
} else {
|
||||
let client = Client::builder()
|
||||
.timeout(Duration::from_secs(10))
|
||||
.danger_accept_invalid_certs(opt_insecure)
|
||||
.build()?;
|
||||
let mut response = client
|
||||
.get(url)
|
||||
.header(USER_AGENT, opt_user_agent)
|
||||
.send()?;
|
||||
let final_url = response.url().as_str();
|
||||
) -> Result<(String, String), reqwest::Error> {
|
||||
let cache_key = clean_url(&url);
|
||||
|
||||
if !opt_silent {
|
||||
if url == final_url {
|
||||
eprintln!("[ {} ]", &url);
|
||||
if is_data_url(&url).unwrap() {
|
||||
Ok((url.to_string(), url.to_string()))
|
||||
} else {
|
||||
if cache.contains_key(&cache_key) {
|
||||
// url is in cache
|
||||
if !opt_silent {
|
||||
eprintln!("{} (from cache)", &url);
|
||||
}
|
||||
let data = cache.get(&cache_key).unwrap();
|
||||
Ok((data.to_string(), url.to_string()))
|
||||
} else {
|
||||
// url not in cache, we request it
|
||||
let mut response = client.get(url).send()?;
|
||||
let res_url = response.url().to_string();
|
||||
|
||||
if !opt_silent {
|
||||
if url == res_url {
|
||||
eprintln!("{}", &url);
|
||||
} else {
|
||||
eprintln!("{} -> {}", &url, &res_url);
|
||||
}
|
||||
}
|
||||
|
||||
let new_cache_key = clean_url(&res_url);
|
||||
|
||||
if as_dataurl {
|
||||
// Convert response into a byte array
|
||||
let mut data: Vec<u8> = vec![];
|
||||
response.copy_to(&mut data)?;
|
||||
|
||||
// Attempt to obtain MIME type by reading the Content-Type header
|
||||
let mimetype = if mime == "" {
|
||||
response
|
||||
.headers()
|
||||
.get(CONTENT_TYPE)
|
||||
.and_then(|header| header.to_str().ok())
|
||||
.unwrap_or(&mime)
|
||||
} else {
|
||||
mime
|
||||
};
|
||||
let dataurl = data_to_dataurl(&mimetype, &data);
|
||||
// insert in cache
|
||||
cache.insert(new_cache_key, dataurl.clone());
|
||||
Ok((dataurl, res_url))
|
||||
} else {
|
||||
eprintln!("[ {} -> {} ]", &url, &final_url);
|
||||
let content = response.text().unwrap();
|
||||
// insert in cache
|
||||
cache.insert(new_cache_key, content.clone());
|
||||
Ok((content, res_url))
|
||||
}
|
||||
}
|
||||
|
||||
if as_dataurl {
|
||||
// Convert response into a byte array
|
||||
let mut data: Vec<u8> = vec![];
|
||||
response.copy_to(&mut data)?;
|
||||
|
||||
// Attempt to obtain MIME type by reading the Content-Type header
|
||||
let mimetype = if as_mime == "" {
|
||||
response
|
||||
.headers()
|
||||
.get(CONTENT_TYPE)
|
||||
.and_then(|header| header.to_str().ok())
|
||||
.unwrap_or(&as_mime)
|
||||
} else {
|
||||
as_mime
|
||||
};
|
||||
|
||||
Ok(data_to_dataurl(&mimetype, &data))
|
||||
} else {
|
||||
Ok(response.text().unwrap())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Only `http://` and `https://` prefixes count as valid.
    #[test]
    fn test_is_valid_url() {
        assert!(is_valid_url("https://www.rust-lang.org/"));
        assert!(is_valid_url("http://kernel.org"));
        assert!(!is_valid_url("./index.html"));
        assert!(!is_valid_url("some-local-page.htm"));
        assert!(!is_valid_url("ftp://1.2.3.4/www/index.html"));
        assert!(!is_valid_url(
            "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h"
        ));
    }

    /// Each case is `(base, reference, expected resolution)`.
    #[test]
    fn test_resolve_url() -> Result<(), ParseError> {
        let cases = [
            (
                "https://www.kernel.org",
                "../category/signatures.html",
                "https://www.kernel.org/category/signatures.html",
            ),
            (
                "https://www.kernel.org",
                "category/signatures.html",
                "https://www.kernel.org/category/signatures.html",
            ),
            (
                "saved_page.htm",
                "https://www.kernel.org/category/signatures.html",
                "https://www.kernel.org/category/signatures.html",
            ),
            (
                "https://www.kernel.org",
                "//www.kernel.org/theme/images/logos/tux.png",
                "https://www.kernel.org/theme/images/logos/tux.png",
            ),
            (
                "https://www.kernel.org",
                "//another-host.org/theme/images/logos/tux.png",
                "https://another-host.org/theme/images/logos/tux.png",
            ),
            (
                "https://www.kernel.org/category/signatures.html",
                "/theme/images/logos/tux.png",
                "https://www.kernel.org/theme/images/logos/tux.png",
            ),
            (
                "https://www.w3schools.com/html/html_iframe.asp",
                "default.asp",
                "https://www.w3schools.com/html/default.asp",
            ),
        ];

        for (from, to, expected) in cases.iter() {
            let resolved_url = resolve_url(from, to)?;
            assert_eq!(
                resolved_url.as_str(),
                *expected,
                "resolving {:?} against {:?}",
                to,
                from
            );
        }

        Ok(())
    }

    /// Inputs that fail to parse are treated as "not a data URL" here.
    #[test]
    fn test_is_data_url() {
        assert!(
            is_data_url("data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h")
                .unwrap_or(false)
        );
        assert!(!is_data_url("https://kernel.org").unwrap_or(false));
        assert!(!is_data_url("//kernel.org").unwrap_or(false));
    }
}
|
||||
|
||||
103
src/js.rs
Normal file
103
src/js.rs
Normal file
@@ -0,0 +1,103 @@
|
||||
/// Names of HTML attributes that hold inline JavaScript event handlers.
const JS_DOM_EVENT_ATTRS: &[&str] = &[
    // From WHATWG HTML spec 8.1.5.2 'Event handlers on elements, Document objects, and Window objects':
    //   https://html.spec.whatwg.org/#event-handlers-on-elements,-document-objects,-and-window-objects
    //   https://html.spec.whatwg.org/#attributes-3 (table 'List of event handler content attributes')

    // Global event handlers
    "onabort",
    "onauxclick",
    "onblur",
    "oncancel",
    "oncanplay",
    "oncanplaythrough",
    "onchange",
    "onclick",
    "onclose",
    "oncontextmenu",
    "oncuechange",
    "ondblclick",
    "ondrag",
    "ondragend",
    "ondragenter",
    "ondragexit",
    "ondragleave",
    "ondragover",
    "ondragstart",
    "ondrop",
    "ondurationchange",
    "onemptied",
    "onended",
    "onerror",
    "onfocus",
    "onformdata",
    "oninput",
    "oninvalid",
    "onkeydown",
    "onkeypress",
    "onkeyup",
    "onload",
    "onloadeddata",
    "onloadedmetadata",
    "onloadstart",
    "onmousedown",
    "onmouseenter",
    "onmouseleave",
    "onmousemove",
    "onmouseout",
    "onmouseover",
    "onmouseup",
    "onwheel",
    "onpause",
    "onplay",
    "onplaying",
    "onprogress",
    "onratechange",
    "onreset",
    "onresize",
    "onscroll",
    "onsecuritypolicyviolation",
    "onseeked",
    "onseeking",
    "onselect",
    "onslotchange",
    "onstalled",
    "onsubmit",
    "onsuspend",
    "ontimeupdate",
    "ontoggle",
    "onvolumechange",
    "onwaiting",
    "onwebkitanimationend",
    "onwebkitanimationiteration",
    "onwebkitanimationstart",
    "onwebkittransitionend",
    // Event handlers for <body/> and <frameset/> elements
    "onafterprint",
    "onbeforeprint",
    "onbeforeunload",
    "onhashchange",
    "onlanguagechange",
    "onmessage",
    "onmessageerror",
    "onoffline",
    "ononline",
    "onpagehide",
    "onpageshow",
    "onpopstate",
    "onrejectionhandled",
    "onstorage",
    "onunhandledrejection",
    "onunload",
    // Event handlers for <html/> element
    "oncut",
    "oncopy",
    "onpaste",
];

/// Returns true if DOM attribute name matches a native JavaScript event handler.
///
/// The comparison against `JS_DOM_EVENT_ATTRS` ignores ASCII case, so
/// `"onclick"` and `"OnClick"` are both recognised.
pub fn attr_is_event_handler(attr_name: &str) -> bool {
    JS_DOM_EVENT_ATTRS
        .iter()
        .any(|event_attr| attr_name.eq_ignore_ascii_case(event_attr))
}
|
||||
11
src/lib.rs
11
src/lib.rs
@@ -1,10 +1,13 @@
|
||||
#[macro_use]
|
||||
extern crate lazy_static;
|
||||
extern crate html5ever;
|
||||
extern crate regex;
|
||||
extern crate reqwest;
|
||||
extern crate url;
|
||||
|
||||
#[macro_use]
|
||||
mod macros;
|
||||
|
||||
pub mod html;
|
||||
pub mod http;
|
||||
pub mod js;
|
||||
pub mod utils;
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests;
|
||||
|
||||
9
src/macros.rs
Normal file
9
src/macros.rs
Normal file
@@ -0,0 +1,9 @@
|
||||
/// Builds an owned `String`.
///
/// * `str!()` yields an empty `String`.
/// * `str!(value)` yields `value` converted with `ToString::to_string`.
#[macro_export]
macro_rules! str {
    () => {
        String::new()
    };
    ($val:expr) => {
        ToString::to_string(&$val)
    };
}
|
||||
156
src/main.rs
156
src/main.rs
@@ -1,63 +1,111 @@
|
||||
#[macro_use]
|
||||
extern crate clap;
|
||||
extern crate monolith;
|
||||
|
||||
use clap::{App, Arg};
|
||||
use monolith::html::{html_to_dom, print_dom, walk_and_embed_assets};
|
||||
use monolith::http::{is_valid_url, retrieve_asset};
|
||||
mod args;
|
||||
mod macros;
|
||||
|
||||
static DEFAULT_USER_AGENT: &str =
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0";
|
||||
use crate::args::AppArgs;
|
||||
use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets};
|
||||
use monolith::http::retrieve_asset;
|
||||
use monolith::utils::is_valid_url;
|
||||
use reqwest::blocking::Client;
|
||||
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::io::{self, Error, Write};
|
||||
use std::process;
|
||||
use std::time::Duration;
|
||||
|
||||
fn main() {
|
||||
let command = App::new("monolith")
|
||||
.version(crate_version!())
|
||||
.author(crate_authors!("\n"))
|
||||
.about(crate_description!())
|
||||
.arg(
|
||||
Arg::with_name("url")
|
||||
.required(true)
|
||||
.takes_value(true)
|
||||
.index(1)
|
||||
.help("URL to download"),
|
||||
)
|
||||
.args_from_usage("-i, --no-images 'Removes images'")
|
||||
.args_from_usage("-j, --no-js 'Excludes JavaScript'")
|
||||
.args_from_usage("-k, --insecure 'Accept invalid X.509 (TLS) certificates'")
|
||||
.args_from_usage("-s, --silent 'Suppress verbosity'")
|
||||
.args_from_usage("-u, --user-agent=[Iceweasel] 'Custom User-Agent string'")
|
||||
.get_matches();
|
||||
/// Destination the program writes its result to.
enum Output {
    /// Write to the process's standard output.
    Stdout(io::Stdout),
    /// Write to an opened file.
    File(File),
}
|
||||
|
||||
// Process the command
|
||||
let arg_target = command.value_of("url").unwrap();
|
||||
let opt_no_images = command.is_present("no-images");
|
||||
let opt_no_js = command.is_present("no-js");
|
||||
let opt_insecure = command.is_present("insecure");
|
||||
let opt_silent = command.is_present("silent");
|
||||
let opt_user_agent = command.value_of("user-agent").unwrap_or(DEFAULT_USER_AGENT);
|
||||
impl Output {
|
||||
fn new(file_path: &str) -> Result<Output, Error> {
|
||||
if file_path.is_empty() {
|
||||
Ok(Output::Stdout(io::stdout()))
|
||||
} else {
|
||||
Ok(Output::File(File::create(file_path)?))
|
||||
}
|
||||
}
|
||||
|
||||
if is_valid_url(arg_target) {
|
||||
let data = retrieve_asset(
|
||||
&arg_target,
|
||||
false,
|
||||
"",
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
).unwrap();
|
||||
let dom = html_to_dom(&data);
|
||||
|
||||
walk_and_embed_assets(
|
||||
&arg_target,
|
||||
&dom.document,
|
||||
opt_no_js,
|
||||
opt_no_images,
|
||||
opt_user_agent,
|
||||
opt_silent,
|
||||
opt_insecure,
|
||||
);
|
||||
|
||||
print_dom(&dom.document);
|
||||
println!(); // Ensure newline at end of output
|
||||
fn writeln_str(&mut self, s: &str) -> Result<(), Error> {
|
||||
match self {
|
||||
Output::Stdout(stdout) => {
|
||||
writeln!(stdout, "{}", s)?;
|
||||
stdout.flush()
|
||||
}
|
||||
Output::File(f) => {
|
||||
writeln!(f, "{}", s)?;
|
||||
f.flush()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let app_args = AppArgs::get();
|
||||
|
||||
if !is_valid_url(app_args.url_target.as_str()) {
|
||||
eprintln!(
|
||||
"Only HTTP and HTTPS URLs are allowed but got: {}",
|
||||
&app_args.url_target
|
||||
);
|
||||
process::exit(1);
|
||||
}
|
||||
|
||||
let mut output = Output::new(&app_args.output).expect("Could not prepare output");
|
||||
|
||||
// Initialize client
|
||||
let mut cache = HashMap::new();
|
||||
let mut header_map = HeaderMap::new();
|
||||
header_map.insert(
|
||||
USER_AGENT,
|
||||
HeaderValue::from_str(&app_args.user_agent).expect("Invalid User-Agent header specified"),
|
||||
);
|
||||
|
||||
let client = Client::builder()
|
||||
.timeout(Duration::from_secs(10))
|
||||
.danger_accept_invalid_certs(app_args.insecure)
|
||||
.default_headers(header_map)
|
||||
.build()
|
||||
.expect("Failed to initialize HTTP client");
|
||||
|
||||
// Retrieve root document
|
||||
let (data, final_url) = retrieve_asset(
|
||||
&mut cache,
|
||||
&client,
|
||||
app_args.url_target.as_str(),
|
||||
false,
|
||||
"",
|
||||
app_args.silent,
|
||||
)
|
||||
.expect("Could not retrieve assets in HTML");
|
||||
let dom = html_to_dom(&data);
|
||||
|
||||
walk_and_embed_assets(
|
||||
&mut cache,
|
||||
&client,
|
||||
&final_url,
|
||||
&dom.document,
|
||||
app_args.no_css,
|
||||
app_args.no_js,
|
||||
app_args.no_images,
|
||||
app_args.silent,
|
||||
app_args.no_frames,
|
||||
);
|
||||
|
||||
let html: String = stringify_document(
|
||||
&dom.document,
|
||||
app_args.no_css,
|
||||
app_args.no_frames,
|
||||
app_args.no_js,
|
||||
app_args.no_images,
|
||||
app_args.isolate,
|
||||
);
|
||||
|
||||
output
|
||||
.writeln_str(&html)
|
||||
.expect("Could not write HTML output");
|
||||
}
|
||||
|
||||
520
src/tests/html.rs
Normal file
520
src/tests/html.rs
Normal file
@@ -0,0 +1,520 @@
|
||||
use crate::html::{
|
||||
get_node_name, get_parent_node, html_to_dom, is_icon, stringify_document, walk_and_embed_assets,
|
||||
};
|
||||
use html5ever::rcdom::{Handle, NodeData};
|
||||
use html5ever::serialize::{serialize, SerializeOpts};
|
||||
use reqwest::blocking::Client;
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// `is_icon` accepts icon-like `rel` values in any letter case and rejects
/// everything else, including the empty string.
#[test]
fn test_is_icon() {
    assert!(is_icon("icon"));
    assert!(is_icon("Shortcut Icon"));
    assert!(is_icon("ICON"));
    assert!(is_icon("mask-icon"));
    assert!(is_icon("fluid-icon"));
    assert!(!is_icon("stylesheet"));
    assert!(!is_icon(""));
}
|
||||
|
||||
/// Walks a small document and checks the parent name reported for each
/// element, then checks how many nodes were visited in total.
#[test]
fn test_get_parent_node_name() {
    /// Visits `node` and its descendants, bumping `i` once per visited node.
    fn test_walk(node: &Handle, i: &mut i8) {
        *i += 1;

        match &node.data {
            NodeData::Document => {
                for child in node.children.borrow().iter() {
                    test_walk(child, i);
                }
            }
            NodeData::Element { name, .. } => {
                let node_name = name.local.as_ref().to_string();
                let parent = get_parent_node(node);
                let parent_node_name = get_node_name(&parent);
                if node_name == "head" || node_name == "body" {
                    assert_eq!(parent_node_name, "html");
                } else if node_name == "div" {
                    assert_eq!(parent_node_name, "body");
                } else if node_name == "p" {
                    assert_eq!(parent_node_name, "div");
                }

                println!("{}", node_name);

                for child in node.children.borrow().iter() {
                    test_walk(child, i);
                }
            }
            // Other node kinds are counted but have no children to descend into.
            _ => (),
        };
    }

    let html = "<!doctype html><html><HEAD></HEAD><body><div><P></P></div></body></html>";
    let dom = html_to_dom(html);
    let mut count = 0;

    test_walk(&dom.document, &mut count);

    assert_eq!(count, 7);
}
|
||||
|
||||
/// A fragment without any assets comes back unchanged apart from the
/// `<html>`, `<head>` and `<body>` wrappers present in the serialized DOM.
#[test]
fn test_walk_and_embed_assets() {
    let cache = &mut HashMap::new();

    let html = "<div><P></P></div>";
    let dom = html_to_dom(html);
    let url = "http://localhost";

    let opt_no_css = false;
    let opt_no_frames = false;
    let opt_no_js = false;
    let opt_no_images = false;
    let opt_silent = true;

    let client = Client::new();

    walk_and_embed_assets(
        cache,
        &client,
        url,
        &dom.document,
        opt_no_css,
        opt_no_js,
        opt_no_images,
        opt_silent,
        opt_no_frames,
    );

    let mut buf: Vec<u8> = Vec::new();
    serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();

    assert_eq!(
        String::from_utf8(buf).expect("serialized DOM is valid UTF-8"),
        "<html><head></head><body><div><p></p></div></body></html>"
    );
}
|
||||
|
||||
/// An `<iframe>` with an empty `src` must be left as it is rather than being
/// fetched and embedded.
#[test]
fn test_walk_and_embed_assets_ensure_no_recursive_iframe() {
    let html = "<div><P></P><iframe src=\"\"></iframe></div>";
    let dom = html_to_dom(html);
    let url = "http://localhost";
    let cache = &mut HashMap::new();

    let opt_no_css = false;
    let opt_no_frames = false;
    let opt_no_js = false;
    let opt_no_images = false;
    let opt_silent = true;

    let client = Client::new();

    walk_and_embed_assets(
        cache,
        &client,
        url,
        &dom.document,
        opt_no_css,
        opt_no_js,
        opt_no_images,
        opt_silent,
        opt_no_frames,
    );

    let mut buf: Vec<u8> = Vec::new();
    serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();

    assert_eq!(
        String::from_utf8(buf).expect("serialized DOM is valid UTF-8"),
        "<html><head></head><body><div><p></p><iframe src=\"\"></iframe></div></body></html>"
    );
}
|
||||
|
||||
/// With `opt_no_css` set, the stylesheet link loses its `href`, the `<style>`
/// element is emptied and inline `style` attributes are removed.
#[test]
fn test_walk_and_embed_assets_no_css() {
    let html = "<link rel=\"stylesheet\" href=\"main.css\">\
                <style>html{background-color: #000;}</style>\
                <div style=\"display: none;\"></div>";
    let dom = html_to_dom(html);
    let url = "http://localhost";
    let cache = &mut HashMap::new();

    let opt_no_css = true;
    let opt_no_frames = false;
    let opt_no_js = false;
    let opt_no_images = false;
    let opt_silent = true;
    let client = Client::new();

    walk_and_embed_assets(
        cache,
        &client,
        url,
        &dom.document,
        opt_no_css,
        opt_no_js,
        opt_no_images,
        opt_silent,
        opt_no_frames,
    );

    let mut buf: Vec<u8> = Vec::new();
    serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();

    assert_eq!(
        String::from_utf8(buf).expect("serialized DOM is valid UTF-8"),
        "<html>\
         <head>\
         <link rel=\"stylesheet\" href=\"\">\
         <style></style>\
         </head>\
         <body>\
         <div></div>\
         </body>\
         </html>"
    );
}
|
||||
|
||||
#[test]
fn test_walk_and_embed_assets_no_images() {
    // With images excluded, the icon link loses its href and the <img>
    // source is swapped for a fixed PNG data URL.
    let url = "http://localhost";
    let html = "<link rel=\"icon\" href=\"favicon.ico\">\
                <div><img src=\"http://localhost/assets/mono_lisa.png\" /></div>";
    let dom = html_to_dom(&html);

    let mut cache = HashMap::new();
    let client = Client::new();

    let opt_silent = true;
    let opt_no_css = false;
    let opt_no_js = false;
    let opt_no_images = true;
    let opt_no_frames = false;

    walk_and_embed_assets(
        &mut cache,
        &client,
        &url,
        &dom.document,
        opt_no_css,
        opt_no_js,
        opt_no_images,
        opt_silent,
        opt_no_frames,
    );

    let mut serialized: Vec<u8> = Vec::new();
    serialize(&mut serialized, &dom.document, SerializeOpts::default()).unwrap();
    let rendered: String = serialized.iter().map(|&byte| char::from(byte)).collect();

    assert_eq!(
        rendered,
        "<html>\
         <head>\
         <link rel=\"icon\" href=\"\">\
         </head>\
         <body>\
         <div>\
         <img src=\"data:image/png;base64,\
         iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0\
         lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=\">\
         </div>\
         </body>\
         </html>"
    );
}
|
||||
|
||||
#[test]
fn test_walk_and_embed_assets_no_frames() {
    // With frames excluded, the iframe keeps its tag but its `src` is emptied.
    let url = "http://localhost";
    let html = "<iframe src=\"http://trackbook.com\"></iframe>";
    let dom = html_to_dom(&html);

    let mut cache = HashMap::new();
    let client = Client::new();

    let opt_silent = true;
    let opt_no_css = false;
    let opt_no_js = false;
    let opt_no_images = false;
    let opt_no_frames = true;

    walk_and_embed_assets(
        &mut cache,
        &client,
        &url,
        &dom.document,
        opt_no_css,
        opt_no_js,
        opt_no_images,
        opt_silent,
        opt_no_frames,
    );

    let mut serialized: Vec<u8> = Vec::new();
    serialize(&mut serialized, &dom.document, SerializeOpts::default()).unwrap();
    let rendered: String = serialized.iter().map(|&byte| char::from(byte)).collect();

    assert_eq!(
        rendered,
        "<html><head></head><body><iframe src=\"\"></iframe></body></html>"
    );
}
|
||||
|
||||
#[test]
fn test_walk_and_embed_assets_no_js() {
    // With JS excluded, the `onClick` handler is removed, the external
    // script loses its `src` and the inline script loses its body.
    let url = "http://localhost";
    let html = "<div onClick=\"void(0)\">\
                <script src=\"http://localhost/assets/some.js\"></script>\
                <script>alert(1)</script>\
                </div>";
    let dom = html_to_dom(&html);

    let mut cache = HashMap::new();
    let client = Client::new();

    let opt_silent = true;
    let opt_no_css = false;
    let opt_no_js = true;
    let opt_no_images = false;
    let opt_no_frames = false;

    walk_and_embed_assets(
        &mut cache,
        &client,
        &url,
        &dom.document,
        opt_no_css,
        opt_no_js,
        opt_no_images,
        opt_silent,
        opt_no_frames,
    );

    let mut serialized: Vec<u8> = Vec::new();
    serialize(&mut serialized, &dom.document, SerializeOpts::default()).unwrap();
    let rendered: String = serialized.iter().map(|&byte| char::from(byte)).collect();

    assert_eq!(
        rendered,
        "<html><head></head><body><div><script src=\"\"></script>\
         <script></script></div></body></html>"
    );
}
|
||||
|
||||
#[test]
fn test_walk_and_embed_with_no_integrity() {
    // The `integrity` attribute is expected to be gone from both the
    // <link> and the <script> once the document has been walked.
    let url = "http://localhost";
    let html = "<title>No integrity</title>\
                <link integrity=\"sha384-...\" rel=\"something\"/>\
                <script integrity=\"sha384-...\" src=\"some.js\"></script>";
    let dom = html_to_dom(&html);

    let mut cache = HashMap::new();
    let client = Client::new();

    // Every asset category is excluded in this scenario
    let opt_silent = true;
    let opt_no_css = true;
    let opt_no_js = true;
    let opt_no_images = true;
    let opt_no_frames = true;

    walk_and_embed_assets(
        &mut cache,
        &client,
        &url,
        &dom.document,
        opt_no_css,
        opt_no_js,
        opt_no_images,
        opt_silent,
        opt_no_frames,
    );

    let mut serialized: Vec<u8> = Vec::new();
    serialize(&mut serialized, &dom.document, SerializeOpts::default()).unwrap();
    let rendered: String = serialized.iter().map(|&byte| char::from(byte)).collect();

    assert_eq!(
        rendered,
        "<html>\
         <head><title>No integrity</title><link rel=\"something\"><script src=\"\"></script></head>\
         <body></body>\
         </html>"
    );
}
|
||||
|
||||
#[test]
fn test_stringify_document() {
    // With every option off, the document is serialized without any
    // extra <meta> element.
    let html = "<div><script src=\"some.js\"></script></div>";
    let dom = html_to_dom(&html);

    let opt_isolate = false;
    let opt_no_css = false;
    let opt_no_frames = false;
    let opt_no_js = false;
    let opt_no_images = false;

    let rendered = stringify_document(
        &dom.document,
        opt_no_css,
        opt_no_frames,
        opt_no_js,
        opt_no_images,
        opt_isolate,
    );

    assert_eq!(
        rendered,
        "<html><head></head><body><div><script src=\"some.js\"></script></div></body></html>"
    );
}
|
||||
|
||||
#[test]
fn test_stringify_document_isolate() {
    // Isolation is expected to put a Content-Security-Policy <meta> at the
    // start of <head>, ahead of the policy the page already carries.
    let html = "<title>Isolated document</title>\
                <link rel=\"something\" href=\"some.css\" />\
                <meta http-equiv=\"Content-Security-Policy\" content=\"default-src https:\">\
                <div><script src=\"some.js\"></script></div>";
    let dom = html_to_dom(&html);

    let opt_isolate = true;
    let opt_no_css = false;
    let opt_no_frames = false;
    let opt_no_js = false;
    let opt_no_images = false;

    let rendered = stringify_document(
        &dom.document,
        opt_no_css,
        opt_no_frames,
        opt_no_js,
        opt_no_images,
        opt_isolate,
    );

    assert_eq!(
        rendered,
        "<html>\
         <head>\
         <meta http-equiv=\"Content-Security-Policy\" content=\"default-src 'unsafe-inline' data:;\"></meta>\
         <title>Isolated document</title>\
         <link rel=\"something\" href=\"some.css\">\
         <meta http-equiv=\"Content-Security-Policy\" content=\"default-src https:\">\
         </head>\
         <body>\
         <div>\
         <script src=\"some.js\"></script>\
         </div>\
         </body>\
         </html>"
    );
}
|
||||
|
||||
#[test]
fn test_stringify_document_no_css() {
    // Excluding CSS is expected to add a `style-src 'none';` policy while
    // leaving the markup itself (link, inline style) as it was.
    let html = "<!doctype html>\
                <title>Unstyled document</title>\
                <link rel=\"stylesheet\" href=\"main.css\"/>\
                <div style=\"display: none;\"></div>";
    let dom = html_to_dom(&html);

    let opt_isolate = false;
    let opt_no_css = true;
    let opt_no_frames = false;
    let opt_no_js = false;
    let opt_no_images = false;

    let rendered = stringify_document(
        &dom.document,
        opt_no_css,
        opt_no_frames,
        opt_no_js,
        opt_no_images,
        opt_isolate,
    );

    assert_eq!(
        rendered,
        "<!DOCTYPE html>\
         <html>\
         <head>\
         <meta http-equiv=\"Content-Security-Policy\" content=\"style-src 'none';\"></meta>\
         <title>Unstyled document</title>\
         <link rel=\"stylesheet\" href=\"main.css\">\
         </head>\
         <body><div style=\"display: none;\"></div></body>\
         </html>"
    );
}
|
||||
|
||||
#[test]
fn test_stringify_document_no_frames() {
    // Excluding frames is expected to add a policy that sets both
    // `frame-src` and `child-src` to 'none'.
    let html = "<!doctype html>\
                <title>Frameless document</title>\
                <link rel=\"something\"/>\
                <div><script src=\"some.js\"></script></div>";
    let dom = html_to_dom(&html);

    let opt_isolate = false;
    let opt_no_css = false;
    let opt_no_frames = true;
    let opt_no_js = false;
    let opt_no_images = false;

    let rendered = stringify_document(
        &dom.document,
        opt_no_css,
        opt_no_frames,
        opt_no_js,
        opt_no_images,
        opt_isolate,
    );

    assert_eq!(
        rendered,
        "<!DOCTYPE html>\
         <html>\
         <head>\
         <meta http-equiv=\"Content-Security-Policy\" content=\"frame-src 'none';child-src 'none';\"></meta>\
         <title>Frameless document</title>\
         <link rel=\"something\">\
         </head>\
         <body><div><script src=\"some.js\"></script></div></body>\
         </html>"
    );
}
|
||||
|
||||
#[test]
fn test_stringify_document_isolate_no_frames_no_js_no_css_no_images() {
    // With every option enabled, all directives are expected to be combined
    // into a single Content-Security-Policy <meta> at the start of <head>.
    let html = "<!doctype html>\
                <title>no-frame no-css no-js no-image isolated document</title>\
                <meta http-equiv=\"Content-Security-Policy\" content=\"default-src https:\">\
                <link rel=\"stylesheet\" href=\"some.css\">\
                <div>\
                <script src=\"some.js\"></script>\
                <img style=\"width: 100%;\" src=\"some.png\" />\
                <iframe src=\"some.html\"></iframe>\
                </div>";
    let dom = html_to_dom(&html);

    let opt_isolate = true;
    let opt_no_css = true;
    let opt_no_frames = true;
    let opt_no_js = true;
    let opt_no_images = true;

    let rendered = stringify_document(
        &dom.document,
        opt_no_css,
        opt_no_frames,
        opt_no_js,
        opt_no_images,
        opt_isolate,
    );

    assert_eq!(
        rendered,
        "<!DOCTYPE html>\
         <html>\
         <head>\
         <meta http-equiv=\"Content-Security-Policy\" content=\"default-src \'unsafe-inline\' data:; style-src \'none\'; frame-src \'none\';child-src \'none\'; script-src \'none\'; img-src data:;\"></meta>\
         <title>no-frame no-css no-js no-image isolated document</title>\
         <meta http-equiv=\"Content-Security-Policy\" content=\"default-src https:\">\
         <link rel=\"stylesheet\" href=\"some.css\">\
         </head>\
         <body>\
         <div>\
         <script src=\"some.js\"></script>\
         <img style=\"width: 100%;\" src=\"some.png\">\
         <iframe src=\"some.html\"></iframe>\
         </div>\
         </body>\
         </html>"
    );
}
|
||||
25
src/tests/http.rs
Normal file
25
src/tests/http.rs
Normal file
@@ -0,0 +1,25 @@
|
||||
use crate::http::retrieve_asset;
|
||||
use reqwest::blocking::Client;
|
||||
use std::collections::HashMap;
|
||||
|
||||
#[test]
fn test_retrieve_asset() {
    // A data URL is expected to be handed back as-is, both as the data and
    // as the final URL, whether or not a MIME type is supplied.
    let data_url = "data:text/html;base64,...";
    let mut cache = HashMap::new();
    let client = Client::new();

    for &mime in ["", "image/png"].iter() {
        let (data, final_url) =
            retrieve_asset(&mut cache, &client, data_url, true, mime, false).unwrap();
        assert_eq!(&data, "data:text/html;base64,...");
        assert_eq!(&final_url, "data:text/html;base64,...");
    }
}
|
||||
13
src/tests/js.rs
Normal file
13
src/tests/js.rs
Normal file
@@ -0,0 +1,13 @@
|
||||
use crate::js::attr_is_event_handler;
|
||||
|
||||
#[test]
fn test_attr_is_event_handler() {
    // succeeding
    for &name in ["onBlur", "onclick", "onClick"].iter() {
        assert!(attr_is_event_handler(name));
    }

    // failing
    for &name in ["href", "", "class"].iter() {
        assert!(!attr_is_event_handler(name));
    }
}
|
||||
4
src/tests/mod.rs
Normal file
4
src/tests/mod.rs
Normal file
@@ -0,0 +1,4 @@
|
||||
mod html;
|
||||
mod http;
|
||||
mod js;
|
||||
mod utils;
|
||||
177
src/tests/utils.rs
Normal file
177
src/tests/utils.rs
Normal file
@@ -0,0 +1,177 @@
|
||||
use crate::utils::{
|
||||
clean_url, data_to_dataurl, detect_mimetype, is_data_url, is_valid_url, resolve_url,
|
||||
url_has_protocol,
|
||||
};
|
||||
use url::ParseError;
|
||||
|
||||
#[test]
fn test_data_to_dataurl() {
    // An explicit MIME type is used verbatim and the payload is base64-encoded
    let script = "var word = 'hello';\nalert(word);\n";
    let encoded = data_to_dataurl("application/javascript", script.as_bytes());

    assert_eq!(
        &encoded,
        "data:application/javascript;base64,dmFyIHdvcmQgPSAnaGVsbG8nOwphbGVydCh3b3JkKTsK"
    );
}
|
||||
|
||||
#[test]
fn test_detect_mimetype() {
    // Shorthand so every case below reads as "signature => MIME type"
    let mime_of = |signature: &[u8]| detect_mimetype(signature);

    // image
    assert_eq!(mime_of(b"GIF87a"), "image/gif");
    assert_eq!(mime_of(b"GIF89a"), "image/gif");
    assert_eq!(mime_of(b"\xFF\xD8\xFF"), "image/jpeg");
    assert_eq!(mime_of(b"\x89PNG\x0D\x0A\x1A\x0A"), "image/png");
    assert_eq!(mime_of(b"<?xml "), "image/svg+xml");
    assert_eq!(mime_of(b"<svg "), "image/svg+xml");
    assert_eq!(mime_of(b"RIFF....WEBPVP8 "), "image/webp");
    assert_eq!(mime_of(b"\x00\x00\x01\x00"), "image/x-icon");

    // audio
    assert_eq!(mime_of(b"ID3"), "audio/mpeg");
    assert_eq!(mime_of(b"\xFF\x0E"), "audio/mpeg");
    assert_eq!(mime_of(b"\xFF\x0F"), "audio/mpeg");
    assert_eq!(mime_of(b"OggS"), "audio/ogg");
    assert_eq!(mime_of(b"RIFF....WAVEfmt "), "audio/wav");
    assert_eq!(mime_of(b"fLaC"), "audio/x-flac");

    // video
    assert_eq!(mime_of(b"RIFF....AVI LIST"), "video/avi");
    assert_eq!(mime_of(b"....ftyp"), "video/mp4");
    assert_eq!(mime_of(b"\x00\x00\x01\x0B"), "video/mpeg");
    assert_eq!(mime_of(b"....moov"), "video/quicktime");
    assert_eq!(mime_of(b"\x1A\x45\xDF\xA3"), "video/webm");
}
|
||||
|
||||
#[test]
fn test_url_has_protocol() {
    // succeeding
    let with_protocol = [
        "mailto:somebody@somewhere.com?subject=hello",
        "tel:5551234567",
        "ftp:user:password@some-ftp-server.com",
        "javascript:void(0)",
        "http://news.ycombinator.com",
        "https://github.com",
        "MAILTO:somebody@somewhere.com?subject=hello",
    ];
    for &url in with_protocol.iter() {
        assert_eq!(url_has_protocol(url), true);
    }

    // failing
    let without_protocol = [
        "//some-hostname.com/some-file.html",
        "some-hostname.com/some-file.html",
        "/some-file.html",
        "",
    ];
    for &url in without_protocol.iter() {
        assert_eq!(url_has_protocol(url), false);
    }
}
|
||||
|
||||
#[test]
fn test_is_valid_url() {
    // succeeding
    for &url in ["https://www.rust-lang.org/", "http://kernel.org"].iter() {
        assert!(is_valid_url(url));
    }

    // failing
    let rejected = [
        "//kernel.org",
        "./index.html",
        "some-local-page.htm",
        "ftp://1.2.3.4/www/index.html",
        "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h",
    ];
    for &url in rejected.iter() {
        assert!(!is_valid_url(url));
    }
}
|
||||
|
||||
#[test]
fn test_resolve_url() -> Result<(), ParseError> {
    // Each case is (base URL, link to resolve, expected absolute URL)
    let cases = [
        (
            "https://www.kernel.org",
            "../category/signatures.html",
            "https://www.kernel.org/category/signatures.html",
        ),
        (
            "https://www.kernel.org",
            "category/signatures.html",
            "https://www.kernel.org/category/signatures.html",
        ),
        (
            "saved_page.htm",
            "https://www.kernel.org/category/signatures.html",
            "https://www.kernel.org/category/signatures.html",
        ),
        (
            "https://www.kernel.org",
            "//www.kernel.org/theme/images/logos/tux.png",
            "https://www.kernel.org/theme/images/logos/tux.png",
        ),
        (
            "https://www.kernel.org",
            "//another-host.org/theme/images/logos/tux.png",
            "https://another-host.org/theme/images/logos/tux.png",
        ),
        (
            "https://www.kernel.org/category/signatures.html",
            "/theme/images/logos/tux.png",
            "https://www.kernel.org/theme/images/logos/tux.png",
        ),
        (
            "https://www.w3schools.com/html/html_iframe.asp",
            "default.asp",
            "https://www.w3schools.com/html/default.asp",
        ),
    ];

    for &(from, to, expected) in cases.iter() {
        let resolved_url = resolve_url(from, to)?;
        assert_eq!(resolved_url.as_str(), expected);
    }

    Ok(())
}
|
||||
|
||||
#[test]
fn test_is_data_url() {
    // A parse error counts as "not a data URL" for the purpose of this test
    let check = |url: &str| is_data_url(url).unwrap_or(false);

    // succeeding
    assert!(check(
        "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h"
    ));

    // failing
    for &url in ["https://kernel.org", "//kernel.org", ""].iter() {
        assert!(!check(url));
    }
}
|
||||
|
||||
#[test]
fn test_clean_url() {
    // A fragment, a bare `#` and a stray `?#` must all be stripped
    let dirty_urls = [
        "https://somewhere.com/font.eot#iefix",
        "https://somewhere.com/font.eot#",
        "https://somewhere.com/font.eot?#",
    ];

    for &dirty in dirty_urls.iter() {
        assert_eq!(clean_url(dirty), "https://somewhere.com/font.eot");
    }
}
|
||||
211
src/utils.rs
211
src/utils.rs
@@ -1,8 +1,48 @@
|
||||
extern crate base64;
|
||||
use crate::http::retrieve_asset;
|
||||
use base64::encode;
|
||||
use regex::Regex;
|
||||
use reqwest::blocking::Client;
|
||||
use std::collections::HashMap;
|
||||
use url::{ParseError, Url};
|
||||
|
||||
use self::base64::encode;
|
||||
/// This monster of a regex is used to match any kind of URL found in CSS.
|
||||
///
|
||||
/// There are roughly three different categories that a found URL could fit
|
||||
/// into:
|
||||
/// - Font [found after a src: property in an @font-face rule]
|
||||
/// - Stylesheet [denoted by an @import before the url]
|
||||
/// - Image [covers all other uses of the url() function]
|
||||
///
|
||||
/// This regex aims to extract the following information:
|
||||
/// - What type of URL is it (font/image/css)
|
||||
/// - Where is the part that needs to be replaced (incl any wrapping quotes)
|
||||
/// - What is the URL (excl any wrapping quotes)
|
||||
///
|
||||
/// Essentially, the regex can be broken down into two parts:
|
||||
///
|
||||
/// `(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?`
|
||||
/// This matches the precursor to a font or CSS URL, and fills in a match under
|
||||
/// either `<stylesheet>` (if it's a CSS URL) or `<font>` (if it's a font).
|
||||
/// Determining whether or not it's an image can be done by the negation of both
|
||||
/// of these. Either zero or one of these can match.
|
||||
///
|
||||
/// `url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)`
|
||||
/// This matches the actual URL part of the url(), and must always match. It also
|
||||
/// sets `<to_repl>` and `<url>` which correspond to everything within
|
||||
/// `url(...)` and a usable URL, respectively.
|
||||
///
|
||||
/// Note, however, that this does not perform any validation of the found URL.
|
||||
/// Malformed CSS could lead to an invalid URL being present. It is therefore
|
||||
/// recommended that the URL gets manually validated.
|
||||
const CSS_URL_REGEX_STR: &str = r###"(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)"###;
|
||||
|
||||
static MAGIC: [[&[u8]; 2]; 19] = [
|
||||
lazy_static! {
|
||||
static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();
|
||||
static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();
|
||||
static ref REGEX_CSS_URL: Regex = Regex::new(CSS_URL_REGEX_STR).unwrap();
|
||||
}
|
||||
|
||||
const MAGIC: [[&[u8]; 2]; 19] = [
|
||||
// Image
|
||||
[b"GIF87a", b"image/gif"],
|
||||
[b"GIF89a", b"image/gif"],
|
||||
@@ -28,7 +68,7 @@ static MAGIC: [[&[u8]; 2]; 19] = [
|
||||
];
|
||||
|
||||
pub fn data_to_dataurl(mime: &str, data: &[u8]) -> String {
|
||||
let mimetype = if mime == "" {
|
||||
let mimetype = if mime.is_empty() {
|
||||
detect_mimetype(data)
|
||||
} else {
|
||||
mime.to_string()
|
||||
@@ -36,57 +76,132 @@ pub fn data_to_dataurl(mime: &str, data: &[u8]) -> String {
|
||||
format!("data:{};base64,{}", mimetype, encode(data))
|
||||
}
|
||||
|
||||
fn detect_mimetype(data: &[u8]) -> String {
|
||||
let mut re = String::new();
|
||||
|
||||
for item in MAGIC.iter() {
|
||||
pub fn detect_mimetype(data: &[u8]) -> String {
|
||||
for item in MAGIC.iter() {
|
||||
if data.starts_with(item[0]) {
|
||||
re = String::from_utf8(item[1].to_vec()).unwrap();
|
||||
break;
|
||||
return String::from_utf8(item[1].to_vec()).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
re
|
||||
"".to_owned()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
pub fn url_has_protocol<T: AsRef<str>>(url: T) -> bool {
|
||||
HAS_PROTOCOL.is_match(url.as_ref().to_lowercase().as_str())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_data_to_dataurl() {
|
||||
let mime = "application/javascript";
|
||||
let data = "var word = 'hello';\nalert(word);\n";
|
||||
let datauri = data_to_dataurl(mime, data.as_bytes());
|
||||
assert_eq!(
|
||||
&datauri,
|
||||
"data:application/javascript;base64,dmFyIHdvcmQgPSAnaGVsbG8nOwphbGVydCh3b3JkKTsK"
|
||||
);
|
||||
pub fn is_data_url<T: AsRef<str>>(url: T) -> Result<bool, ParseError> {
|
||||
Url::parse(url.as_ref()).and_then(|u| Ok(u.scheme() == "data"))
|
||||
}
|
||||
|
||||
pub fn is_valid_url<T: AsRef<str>>(path: T) -> bool {
|
||||
REGEX_URL.is_match(path.as_ref())
|
||||
}
|
||||
|
||||
pub fn resolve_url<T: AsRef<str>, U: AsRef<str>>(from: T, to: U) -> Result<String, ParseError> {
|
||||
let result = if is_valid_url(to.as_ref()) {
|
||||
to.as_ref().to_string()
|
||||
} else {
|
||||
Url::parse(from.as_ref())?
|
||||
.join(to.as_ref())?
|
||||
.as_ref()
|
||||
.to_string()
|
||||
};
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub fn resolve_css_imports(
|
||||
cache: &mut HashMap<String, String>,
|
||||
client: &Client,
|
||||
css_string: &str,
|
||||
as_dataurl: bool,
|
||||
href: &str,
|
||||
opt_no_images: bool,
|
||||
opt_silent: bool,
|
||||
) -> String {
|
||||
let mut resolved_css = String::from(css_string);
|
||||
|
||||
for link in REGEX_CSS_URL.captures_iter(&css_string) {
|
||||
let target_link = link.name("url").unwrap().as_str();
|
||||
|
||||
// Determine the type of link
|
||||
let is_stylesheet = link.name("stylesheet").is_some();
|
||||
let is_font = link.name("font").is_some();
|
||||
let is_image = !is_stylesheet && !is_font;
|
||||
|
||||
// Generate absolute URL for content
|
||||
let embedded_url = match resolve_url(href, target_link) {
|
||||
Ok(url) => url,
|
||||
Err(_) => continue, // Malformed URL
|
||||
};
|
||||
|
||||
// Download the asset. If it's more CSS, resolve that too
|
||||
let content = if is_stylesheet {
|
||||
// The link is an @import link
|
||||
retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&embedded_url,
|
||||
false, // Formating as data URL will be done later
|
||||
"text/css", // Expect CSS
|
||||
opt_silent,
|
||||
)
|
||||
.map(|(content, _)| {
|
||||
resolve_css_imports(
|
||||
cache,
|
||||
client,
|
||||
&content,
|
||||
true, // Finally, convert to a dataurl
|
||||
&embedded_url,
|
||||
opt_no_images,
|
||||
opt_silent,
|
||||
)
|
||||
})
|
||||
} else if (is_image && !opt_no_images) || is_font {
|
||||
// The link is some other, non-@import link
|
||||
retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&embedded_url,
|
||||
true, // Format as data URL
|
||||
"", // Unknown MIME type
|
||||
opt_silent,
|
||||
)
|
||||
.map(|(a, _)| a)
|
||||
} else {
|
||||
// If it's a datatype that has been opt_no'd out of, replace with
|
||||
// absolute URL
|
||||
|
||||
Ok(embedded_url.clone())
|
||||
}
|
||||
.unwrap_or_else(|e| {
|
||||
eprintln!("Warning: {}", e);
|
||||
|
||||
// If failed to resolve, replace with absolute URL
|
||||
embedded_url
|
||||
});
|
||||
|
||||
let replacement = format!("\"{}\"", &content);
|
||||
let dest = link.name("to_repl").unwrap();
|
||||
let offset = resolved_css.len() - css_string.len();
|
||||
let target_range = (dest.start() + offset)..(dest.end() + offset);
|
||||
|
||||
resolved_css.replace_range(target_range, &replacement);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_detect_mimetype() {
|
||||
// Image
|
||||
assert_eq!(detect_mimetype(b"GIF87a"), "image/gif");
|
||||
assert_eq!(detect_mimetype(b"GIF89a"), "image/gif");
|
||||
assert_eq!(detect_mimetype(b"\xFF\xD8\xFF"), "image/jpeg");
|
||||
assert_eq!(detect_mimetype(b"\x89PNG\x0D\x0A\x1A\x0A"), "image/png");
|
||||
assert_eq!(detect_mimetype(b"<?xml "), "image/svg+xml");
|
||||
assert_eq!(detect_mimetype(b"<svg "), "image/svg+xml");
|
||||
assert_eq!(detect_mimetype(b"RIFF....WEBPVP8 "), "image/webp");
|
||||
assert_eq!(detect_mimetype(b"\x00\x00\x01\x00"), "image/x-icon");
|
||||
// Audio
|
||||
assert_eq!(detect_mimetype(b"ID3"), "audio/mpeg");
|
||||
assert_eq!(detect_mimetype(b"\xFF\x0E"), "audio/mpeg");
|
||||
assert_eq!(detect_mimetype(b"\xFF\x0F"), "audio/mpeg");
|
||||
assert_eq!(detect_mimetype(b"OggS"), "audio/ogg");
|
||||
assert_eq!(detect_mimetype(b"RIFF....WAVEfmt "), "audio/wav");
|
||||
assert_eq!(detect_mimetype(b"fLaC"), "audio/x-flac");
|
||||
// Video
|
||||
assert_eq!(detect_mimetype(b"RIFF....AVI LIST"), "video/avi");
|
||||
assert_eq!(detect_mimetype(b"....ftyp"), "video/mp4");
|
||||
assert_eq!(detect_mimetype(b"\x00\x00\x01\x0B"), "video/mpeg");
|
||||
assert_eq!(detect_mimetype(b"....moov"), "video/quicktime");
|
||||
assert_eq!(detect_mimetype(b"\x1A\x45\xDF\xA3"), "video/webm");
|
||||
if as_dataurl {
|
||||
data_to_dataurl("text/css", resolved_css.as_bytes())
|
||||
} else {
|
||||
resolved_css
|
||||
}
|
||||
}
|
||||
|
||||
pub fn clean_url<T: AsRef<str>>(url: T) -> String {
|
||||
let mut result = Url::parse(url.as_ref()).unwrap();
|
||||
// Clear fragment
|
||||
result.set_fragment(None);
|
||||
// Get rid of stray question mark
|
||||
if result.query() == Some("") {
|
||||
result.set_query(None);
|
||||
}
|
||||
result.to_string()
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user