54 Commits

Author SHA1 Message Date
Sunshine
f876e9243c Merge pull request #109 from snshn/version-bump
version bump (2.1.1 → 2.1.2)
2020-01-21 08:39:10 -05:00
Sunshine
b6896febf1 version bump (2.1.1 → 2.1.2) 2020-01-21 02:32:29 -05:00
Sunshine
29d2ba5857 Merge pull request #107 from snshn/update-readme
Update README.md
2020-01-21 02:18:10 -05:00
Sunshine
8b1ebc7871 Update README.md 2020-01-21 02:16:36 -05:00
Sunshine
d753c83c76 Merge pull request #108 from rhysd/revert-manual-proxy-support
Revert #106 since reqwest supports system proxies by default
2020-01-21 02:15:29 -05:00
rhysd
47a825f5ed add proxies instruction in README.md 2020-01-21 13:02:45 +09:00
rhysd
0e12cecd85 Revert "Merge pull request #106 from rhysd/proxy-support"
This reverts commit d8def879b2, reversing
changes made to a9d114d04d.
2020-01-21 13:01:22 +09:00
Sunshine
d8def879b2 Merge pull request #106 from rhysd/proxy-support
Support HTTP and HTTPS proxies
2020-01-20 18:36:00 -05:00
Linda_pp
0420854ed6 remove '$' from environment variable names in README.md 2020-01-20 23:11:14 +09:00
rhysd
d47482fcd9 fix crash at setting empty values to HTTP proxies
with this patch `https_proxy=` and `http_proxy=` will work well.
2020-01-20 17:17:24 +09:00
rhysd
b68624f2f3 support HTTP and HTTPS proxies (fix #103) 2020-01-20 17:02:43 +09:00
Sunshine
a9d114d04d Merge pull request #105 from rhysd/refactor-main
Refactoring for main.rs to address several issues
2020-01-20 01:10:29 -05:00
rhysd
4e4ebe9c98 refactor main to address several issues
Addressed issues:

- when the specified URL is invalid, it exited successfully without doing
  anything. Users had no way to tell why it did not work
- it exited successfully even if invalid User-Agent value is specified
- it created file twice on `--output` option specified. It may cause an
  issue when some file watcher (e.g. FsEvents on macOS) is watching

Improvements:
- handle errors with `Result::expect` consistently, so that it correctly exits
  with non-zero status on error
- define `Output` enum for handling both stdout and file outputs
2020-01-15 16:52:20 +09:00
Sunshine
429217d8f7 Merge pull request #104 from rhysd/complete-dom-event-handlers
Use complete list of DOM event handlers for detecting JS attributes
2020-01-15 01:34:01 -05:00
rhysd
1779f4a374 better comments for JS_DOM_EVENT_ATTRS constant 2020-01-15 14:33:27 +09:00
rhysd
26e89ae6d3 use complete list of DOM event handlers 2020-01-15 13:58:09 +09:00
Sunshine
b333d19d04 Update README.md 2020-01-14 03:42:04 -05:00
Sunshine
c1dc798ded Merge pull request #101 from rhysd/ignore-preload
Improve handling preload links and white spaces in attribute values
2020-01-13 17:51:25 -05:00
rhysd
69d99b69e8 remove . in line comment 2020-01-13 23:47:07 +09:00
Sunshine
aae53d20f0 Merge pull request #102 from popey/update-snap-config
Update snapcraft configuration
2020-01-13 08:39:15 -05:00
Alan Pope
14cf2ce8a6 Update snapcraft configuration
This changes the build slightly. If snapcraft is triggered when there is a new tagged release in the project github release page, and it's newer than the version in the Snap Store beta channel, we build that stable release. If however, the latest stable release in github releases is already the same as the Snap Store beta channel, then we build the tip of master.

This gives a couple of advantages. 

  * One yaml can be used to build tip-of-git snaps, and stable releases alike
  * Closing the beta channel in the Snap Store will mean the next triggered build will re-build whatever the last stable release is. This is useful to force a rebuild of the stable version in case a dependency (not that there are many) has a security issue.

We also now set the version dynamically based on the git tags.
2020-01-13 11:14:08 +00:00
rhysd
67b79e92f9 simplify &x.into_iter() to x.iter() 2020-01-10 14:45:02 +09:00
rhysd
b51f41fe34 trim attribute values 2020-01-10 14:41:05 +09:00
rhysd
6f158dc6db compare value of 'rel' properties in case-insensitive 2020-01-10 13:52:31 +09:00
rhysd
8d7052b39c ignore preload and prefetch sources
since all resources are embedded as data URL.
2020-01-09 18:18:21 +09:00
rhysd
660511b8a0 define link type of <link> element as enum and prefer match statement
since match statement checks exhaustiveness
2020-01-09 16:55:42 +09:00
Sunshine
929512f4f5 Merge pull request #97 from rhysd/reqwest-0.10.0
Upgrade reqwest to v0.10.0 for better binary size and build time
2020-01-08 01:43:55 -05:00
Sunshine
a46d89cefc Merge pull request #98 from rhysd/fix-ci
Fix nightly and beta CI
2020-01-07 18:14:30 -05:00
rhysd
f93646e17a ignore beta channel again on AppVeyor
since rustc command crashes on combination of
channel=beta & target=i686-pc-windows-gnu
2020-01-07 17:31:36 +09:00
rhysd
9d14b6dfea rename appveyor.yml to .appveyor.yml
align to .travis.yml
2020-01-07 15:28:29 +09:00
rhysd
9783b96524 check beta channel on CI not to break this crate with next Rust version 2020-01-07 15:28:29 +09:00
rhysd
106efe58ce fix nightly and beta failing on CI
we always use stable rustfmt, so checking with nightly/beta rustfmt is not
necessary.
2020-01-07 15:28:29 +09:00
rhysd
6e99ad13e7 upgrade reqwest to v0.10.0
This will improve build time and binary size as follows:

* Before

- **Compile targets**: 220
- **Build time**: `cargo build --release  1264.95s user 39.72s system 335% cpu 6:29.14 total`
- **Binary size**: 6578568 bytes

* After

- **Compile targets**: 170
- **Build time**: `cargo build --release  1130.64s user 32.15s system 359% cpu 5:23.69 total`
- **Binary size**: 6107088 bytes

* Differences

- **Compile targets**: 1.29x smaller
- **Build time**: 1.23x faster
- **Binary size**: 1.07x smaller
2020-01-07 14:22:32 +09:00
Sunshine
413dd66886 Merge pull request #96 from rhysd/refactorings
Refactorings
2020-01-05 18:46:31 -05:00
rhysd
dc7ec6e7a8 remove more redundant type annotations 2020-01-04 16:33:11 +09:00
rhysd
ed879231af fix test code was broken by refactoring 2020-01-04 08:07:19 +09:00
rhysd
ddf4b8ac13 prefer &str to String for reducing allocations 2020-01-04 08:05:02 +09:00
rhysd
84c13f0605 prefer unwrap_or_default to unwrap_or 2020-01-04 07:58:29 +09:00
rhysd
ce03e0e487 reduce allocation on checking DOM attributes and do not hard-code number of elements of array constant
`to_lower` allocates new string but the allocation is not necessary
here.
2020-01-04 07:52:47 +09:00
rhysd
63e19998d0 reduce clones and fix some code styles and redundant code 2020-01-04 07:49:26 +09:00
Sunshine
e3321bbb07 Merge pull request #95 from rhysd/rust2018
Migrate to Rust2018 edition
2020-01-03 02:00:47 -05:00
rhysd
0a38cd0eae add rhysd to authors list 2020-01-03 15:43:25 +09:00
rhysd
75fb6961ed migrate to Rust 2018 2020-01-03 00:33:49 +09:00
Sunshine
5ba8931502 Merge pull request #92 from snshn/output-file-option
Add option for saving output to file
2019-12-26 18:13:15 -05:00
Sunshine
13d2ea1607 Merge pull request #94 from snshn/no-integrity
Get rid of integrity attributes
2019-12-26 10:11:52 -05:00
Sunshine
88ffde0c3b wipe integrity attributes 2019-12-26 09:44:01 -05:00
Sunshine
bfb97bd062 add option for saving output to file 2019-12-26 00:45:20 -05:00
Sunshine
295931041c Merge pull request #80 from Alch-Emi/lazyload
Add support for lazy loaded images
2019-12-24 17:11:21 -05:00
Sunshine
2e623dd9f8 Merge pull request #84 from snshn/ignore-hash-in-cache-url
use clean URLs as hashmap keys
2019-12-24 17:08:57 -05:00
Sunshine
169b9657e5 ignore failures for both beta and nightly in the pipeline 2019-12-24 16:07:15 -05:00
Emi Simpson
dab4ae6965 Merged Y2Z/master with Alch-Emi/lazyload 2019-12-24 10:07:56 -05:00
Sunshine
c7fc121c7c use clean URLs as hashmap keys 2019-12-18 11:49:38 -05:00
Emi Simpson
292221ea28 Lazyloaded images are now loaded at compilation, with placeholders omitted 2019-12-09 19:40:29 -05:00
Emi Simpson
feb37f5812 Added support for lazy loaded images
Note: The way this patch works is by resolving any data-src tags on images in
the same way as normal source tags are resolved.  It is assumed that most
lazy-load libraries will use this tag, and that if this tag is set, then it is a
URL that is in use.
2019-12-06 19:27:41 -05:00
16 changed files with 887 additions and 772 deletions

View File

@@ -95,6 +95,7 @@ environment:
matrix:
allow_failures:
- channel: nightly
- channel: beta
# If you only care about stable channel build failures, uncomment the following line:
#- channel: beta
@@ -127,4 +128,3 @@ build: false
# environment variable.
test_script:
- cargo test --verbose %cargoflags%
- cargo fmt --all -- --check

View File

@@ -4,21 +4,24 @@ cache: cargo
sudo: false
os:
- linux
- osx
- linux
- osx
rust:
- stable
- beta
- nightly
- stable
- beta
- nightly
before_script:
- rustup component add rustfmt
- rustup component add rustfmt
script:
- cargo build --all --locked --verbose
- cargo test --all --locked --verbose
- cargo fmt --all -- --check
- cargo build --all --locked --verbose
- cargo test --all --locked --verbose
- |
if [[ "$TRAVIS_RUST_VERSION" == "stable" ]]; then
cargo fmt --all -- --check
fi
jobs:
allow_failures:

865
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,11 +1,13 @@
[package]
name = "monolith"
version = "2.1.0"
version = "2.1.2"
edition = "2018"
authors = [
"Sunshine <sunshine@uberspace.net>",
"Mahdi Robatipoor <mahdi.robatipoor@gmail.com>",
"Emmanuel Delaborde <th3rac25@gmail.com>",
"Emi Simpson <emi@alchemi.dev>",
"rhysd <lin90162@yahoo.co.jp>",
]
description = "CLI tool for saving web pages as a single HTML file"
@@ -15,5 +17,9 @@ clap = "2.33.0"
html5ever = "0.24.1"
lazy_static = "1.4.0"
regex = "1.3.1"
reqwest = "0.9.20"
url = "2.1.0"
[dependencies.reqwest]
version = "0.10.*"
default-features = false
features = ["default-tls", "blocking", "gzip"]

View File

@@ -11,8 +11,7 @@
|___| |__________| \____________________| |___| |___| |___|
```
A data hoarder's dream come true: bundle any web page into a single HTML file.
You can finally replace that gazillion of open tabs with a gazillion of .html files stored somewhere on your precious little drive.
A data hoarder's dream come true: bundle any web page into a single HTML file. You can finally replace that gazillion of open tabs with a gazillion of .html files stored somewhere on your precious little drive.
Unlike the conventional "Save page as", `monolith` not only saves the target document, it embeds CSS, image, and JavaScript assets **all at once**, producing a single HTML5 document that is a joy to store and share.
@@ -25,11 +24,14 @@ If compared to saving websites with `wget -mpk`, this tool embeds all assets as
$ cd monolith
$ cargo install --path .
### On macOS (via Homebrew)
### With Homebrew (on macOS and GNU/Linux)
$ brew install monolith
### Using Snapcraft (on GNU/Linux)
$ snap install monolith
## Usage
$ monolith https://lyrics.github.io/db/p/portishead/dummy/roads/ > portishead-roads-lyrics.html
$ monolith https://lyrics.github.io/db/P/Portishead/Dummy/Roads/ -o portishead-roads-lyrics.html
## Options
- `-c`: Ignore styles
@@ -38,11 +40,20 @@ If compared to saving websites with `wget -mpk`, this tool embeds all assets as
- `-I`: Isolate document
- `-j`: Exclude JavaScript
- `-k`: Accept invalid X.509 (TLS) certificates
- `-o`: Write output to file
- `-s`: Silent mode
- `-u`: Specify custom User-Agent
## HTTPS and HTTP proxies
Please set `https_proxy`, `http_proxy` and `no_proxy` environment variables.
## Contributing
Please open an issue if something is wrong, that helps make this project better.
## Related projects
- `Monolith Chrome Extension`: https://github.com/rhysd/monolith-of-web
- `Pagesaver`: https://github.com/distributed-mind/pagesaver
- `Personal WayBack Machine`: https://github.com/popey/pwbm
- `SingleFile`: https://github.com/gildas-lormeau/SingleFile
## License

View File

@@ -1,6 +1,7 @@
name: monolith
base: core18
version: git
# Version data defined inside the monolith part below
adopt-info: monolith
summary: Monolith - Save HTML pages with ease
description: |
A data hoarder's dream come true: bundle any web page into a single
@@ -17,6 +18,14 @@ description: |
confinement: strict
# Building on armhf fails, so we specify all supported non-armhf architectures
architectures:
- build-on: amd64
- build-on: i386
- build-on: arm64
- build-on: ppc64el
- build-on: s390x
parts:
monolith:
plugin: rust
@@ -24,6 +33,21 @@ parts:
build-packages:
- libssl-dev
- pkg-config
override-pull: |
snapcraftctl pull
# Determine the current tag
last_committed_tag="$(git describe --tags --abbrev=0)"
last_committed_tag_ver="$(echo ${last_committed_tag} | sed 's/v//')"
# Determine the most recent version in the beta channel in the Snap Store
last_released_tag="$(snap info $SNAPCRAFT_PROJECT_NAME | awk '$1 == "beta:" { print $2 }')"
# If the latest tag from the upstream project has not been released to
# beta, build that tag instead of master.
if [ "${last_committed_tag_ver}" != "${last_released_tag}" ]; then
git fetch
git checkout "${last_committed_tag}"
fi
# set version number of the snap based on what we did above
snapcraftctl set-version $(git describe --tags --abbrev=0)
apps:
monolith:

View File

@@ -9,6 +9,7 @@ pub struct AppArgs {
pub no_js: bool,
pub insecure: bool,
pub isolate: bool,
pub output: String,
pub silent: bool,
pub user_agent: String,
}
@@ -36,6 +37,7 @@ impl AppArgs {
.args_from_usage("-I, --isolate 'Cut off from the Internet'")
.args_from_usage("-j, --no-js 'Exclude JavaScript'")
.args_from_usage("-k, --insecure 'Accept invalid X.509 (TLS) certificates'")
.args_from_usage("-o, --output=[document.html] 'Write output to <file>'")
.args_from_usage("-s, --silent 'Suppress verbosity'")
.args_from_usage("-u, --user-agent=[Iceweasel] 'Custom User-Agent string'")
// .args_from_usage("-v, --include-video 'Embed video sources'")
@@ -53,9 +55,10 @@ impl AppArgs {
app_args.insecure = app.is_present("insecure");
app_args.isolate = app.is_present("isolate");
app_args.silent = app.is_present("silent");
app_args.output = app.value_of("output").unwrap_or("").to_string();
app_args.user_agent = app
.value_of("user-agent")
.unwrap_or_else(|| DEFAULT_USER_AGENT)
.unwrap_or(DEFAULT_USER_AGENT)
.to_string();
app_args
}

View File

@@ -1,18 +1,20 @@
use crate::http::retrieve_asset;
use crate::js::attr_is_event_handler;
use crate::utils::{
data_to_dataurl, is_valid_url, resolve_css_imports, resolve_url, url_has_protocol,
};
use html5ever::interface::QualName;
use html5ever::parse_document;
use html5ever::rcdom::{Handle, NodeData, RcDom};
use html5ever::serialize::{serialize, SerializeOpts};
use html5ever::tendril::{format_tendril, TendrilSink};
use html5ever::tendril::{format_tendril, Tendril, TendrilSink};
use html5ever::tree_builder::{Attribute, TreeSink};
use html5ever::{local_name, namespace_url, ns};
use http::retrieve_asset;
use js::attr_is_event_handler;
use reqwest::Client;
use reqwest::blocking::Client;
use std::collections::HashMap;
use std::default::Default;
use utils::{data_to_dataurl, is_valid_url, resolve_css_imports, resolve_url, url_has_protocol};
const ICON_VALUES: [&str; 5] = [
const ICON_VALUES: &[&str] = &[
"icon",
"shortcut icon",
"mask-icon",
@@ -29,15 +31,18 @@ pub fn get_parent_node(node: &Handle) -> Handle {
parent.and_then(|node| node.upgrade()).unwrap()
}
pub fn get_node_name(node: &Handle) -> String {
pub fn get_node_name(node: &Handle) -> &'_ str {
match &node.data {
NodeData::Element { ref name, .. } => name.local.as_ref().to_string(),
_ => str!(),
NodeData::Element { ref name, .. } => name.local.as_ref(),
_ => "",
}
}
pub fn is_icon(attr_value: &str) -> bool {
ICON_VALUES.contains(&&*attr_value.to_lowercase())
ICON_VALUES
.iter()
.find(|a| attr_value.eq_ignore_ascii_case(a))
.is_some()
}
pub fn walk_and_embed_assets(
@@ -77,134 +82,185 @@ pub fn walk_and_embed_assets(
match name.local.as_ref() {
"link" => {
let mut link_type: &str = "";
// Remove integrity attributes
let mut i = 0;
while i < attrs_mut.len() {
let attr_name = attrs_mut[i].name.local.as_ref();
if attr_name.eq_ignore_ascii_case("integrity") {
attrs_mut.remove(i);
} else {
i += 1;
}
}
enum LinkType {
Icon,
Stylesheet,
Preload,
DnsPrefetch,
Unknown,
}
let mut link_type = LinkType::Unknown;
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "rel" {
if is_icon(&attr.value.to_string()) {
link_type = "icon";
let value = attr.value.trim();
if is_icon(value) {
link_type = LinkType::Icon;
break;
} else if attr.value.to_string() == "stylesheet" {
link_type = "stylesheet";
} else if value.eq_ignore_ascii_case("stylesheet") {
link_type = LinkType::Stylesheet;
break;
} else if value.eq_ignore_ascii_case("preload") {
link_type = LinkType::Preload;
break;
} else if value.eq_ignore_ascii_case("dns-prefetch") {
link_type = LinkType::DnsPrefetch;
break;
}
}
}
let link_type = link_type;
if link_type == "icon" {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
if opt_no_images {
attr.value.clear();
} else {
let href_full_url: String =
resolve_url(&url, &attr.value.to_string())
.unwrap_or(str!());
let (favicon_dataurl, _) = retrieve_asset(
cache,
client,
&href_full_url,
true,
"",
opt_silent,
)
.unwrap_or((str!(), str!()));
attr.value.clear();
attr.value.push_slice(favicon_dataurl.as_str());
}
}
}
} else if link_type == "stylesheet" {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
if opt_no_css {
attr.value.clear();
} else {
let href_full_url: String =
resolve_url(&url, &attr.value.to_string())
.unwrap_or(str!());
let replacement_text = match retrieve_asset(
cache,
client,
&href_full_url,
false,
"text/css",
opt_silent,
) {
// On successful retrieval, traverse CSS
Ok((css_data, _)) => resolve_css_imports(
match link_type {
LinkType::Icon => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
if opt_no_images {
attr.value.clear();
} else {
let href_full_url = resolve_url(&url, attr.value.as_ref())
.unwrap_or_default();
let (favicon_dataurl, _) = retrieve_asset(
cache,
client,
&css_data,
true,
&href_full_url,
opt_no_images,
true,
"",
opt_silent,
),
// If a network error occured, warn
Err(e) => {
eprintln!("Warning: {}", e,);
// If failed to resolve, replace with absolute URL
href_full_url
}
};
attr.value.clear();
attr.value.push_slice(&replacement_text);
)
.unwrap_or_default();
attr.value.clear();
attr.value.push_slice(favicon_dataurl.as_str());
}
}
}
}
} else {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
let href_full_url: String =
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
LinkType::Stylesheet => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
if opt_no_css {
attr.value.clear();
} else {
let href_full_url = resolve_url(&url, &attr.value.as_ref())
.unwrap_or_default();
let replacement_text = match retrieve_asset(
cache,
client,
&href_full_url,
false,
"text/css",
opt_silent,
) {
// On successful retrieval, traverse CSS
Ok((css_data, _)) => resolve_css_imports(
cache,
client,
&css_data,
true,
&href_full_url,
opt_no_images,
opt_silent,
),
// If a network error occured, warn
Err(e) => {
eprintln!("Warning: {}", e);
// If failed to resolve, replace with absolute URL
href_full_url
}
};
attr.value.clear();
attr.value.push_slice(&replacement_text);
}
}
}
}
LinkType::Preload | LinkType::DnsPrefetch => {
// Since all resources are embedded as data URL, preloading and prefetching are unnecessary
if let Some(attr) =
attrs_mut.iter_mut().find(|a| &a.name.local == "href")
{
attr.value.clear();
attr.value.push_slice(&href_full_url.as_str());
}
}
LinkType::Unknown => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
let href_full_url =
resolve_url(&url, attr.value.as_ref()).unwrap_or_default();
attr.value.clear();
attr.value.push_slice(&href_full_url.as_str());
}
}
}
}
}
"img" => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" {
let value = attr.value.to_string();
// Ignore images with empty source
if value == str!() {
continue;
}
if opt_no_images {
attr.value.clear();
attr.value.push_slice(TRANSPARENT_PIXEL);
} else {
let src_full_url: String =
resolve_url(&url, &value).unwrap_or(str!());
let (img_dataurl, _) = retrieve_asset(
cache,
client,
&src_full_url,
true,
"",
opt_silent,
)
.unwrap_or((str!(), str!()));
attr.value.clear();
attr.value.push_slice(img_dataurl.as_str());
}
// Find source tags
let mut found_src: Option<Attribute> = None;
let mut found_datasrc: Option<Attribute> = None;
let mut i = 0;
while i < attrs_mut.len() {
let name = attrs_mut[i].name.local.as_ref();
if name.eq_ignore_ascii_case("src") {
found_src = Some(attrs_mut.remove(i));
} else if name.eq_ignore_ascii_case("data-src") {
found_datasrc = Some(attrs_mut.remove(i));
} else {
i += 1;
}
}
// If images are disabled, clear both sources
if opt_no_images {
attrs_mut.push(Attribute {
name: QualName::new(None, ns!(), local_name!("src")),
value: Tendril::from_slice(TRANSPARENT_PIXEL),
});
} else if let Some((dataurl, _)) = found_datasrc
.iter()
.chain(&found_src) // Give dataurl priority
.map(|attr| attr.value.trim())
.filter(|src| !src.is_empty()) // Ignore empty srcs
.next()
.and_then(|src| resolve_url(&url, src).ok()) // Make absolute
.and_then(|abs_src| // Download and convert to dataurl
retrieve_asset(
cache,
client,
&abs_src,
true,
"",
opt_silent,
).ok())
{
// Add the new dataurl src attribute
attrs_mut.push(Attribute {
name: QualName::new(None, ns!(), local_name!("src")),
value: Tendril::from_slice(dataurl.as_ref()),
});
}
}
"source" => {
for attr in attrs_mut.iter_mut() {
let attr_name: &str = &attr.name.local;
if attr_name == "src" {
let src_full_url: String = resolve_url(&url, &attr.value.to_string())
.unwrap_or(attr.value.to_string());
let src_full_url = resolve_url(&url, attr.value.trim())
.unwrap_or_else(|_| attr.value.to_string());
attr.value.clear();
attr.value.push_slice(src_full_url.as_str());
} else if attr_name == "srcset" {
@@ -213,9 +269,8 @@ pub fn walk_and_embed_assets(
attr.value.clear();
attr.value.push_slice(TRANSPARENT_PIXEL);
} else {
let srcset_full_url: String =
resolve_url(&url, &attr.value.to_string())
.unwrap_or(str!());
let srcset_full_url =
resolve_url(&url, attr.value.trim()).unwrap_or_default();
let (source_dataurl, _) = retrieve_asset(
cache,
client,
@@ -235,19 +290,30 @@ pub fn walk_and_embed_assets(
"a" => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
let attr_value = attr.value.trim();
// Don't touch email links or hrefs which begin with a hash sign
if attr.value.starts_with('#') || url_has_protocol(&attr.value) {
if attr_value.starts_with('#') || url_has_protocol(attr_value) {
continue;
}
let href_full_url: String =
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
let href_full_url = resolve_url(&url, attr_value).unwrap_or_default();
attr.value.clear();
attr.value.push_slice(href_full_url.as_str());
}
}
}
"script" => {
// Remove integrity attributes
let mut i = 0;
while i < attrs_mut.len() {
let attr_name = attrs_mut[i].name.local.as_ref();
if attr_name.eq_ignore_ascii_case("integrity") {
attrs_mut.remove(i);
} else {
i += 1;
}
}
if opt_no_js {
// Empty src and inner content of SCRIPT tags
for attr in attrs_mut.iter_mut() {
@@ -259,8 +325,8 @@ pub fn walk_and_embed_assets(
} else {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" {
let src_full_url: String =
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
let src_full_url =
resolve_url(&url, attr.value.trim()).unwrap_or_default();
let (js_dataurl, _) = retrieve_asset(
cache,
client,
@@ -302,10 +368,11 @@ pub fn walk_and_embed_assets(
"form" => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "action" {
let attr_value = attr.value.trim();
// Modify action to be a full URL
if !is_valid_url(&attr.value) {
let href_full_url: String =
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
if !is_valid_url(attr_value) {
let href_full_url =
resolve_url(&url, attr_value).unwrap_or_default();
attr.value.clear();
attr.value.push_slice(href_full_url.as_str());
}
@@ -321,15 +388,14 @@ pub fn walk_and_embed_assets(
continue;
}
let iframe_src: String = attr.value.to_string();
let iframe_src = attr.value.trim();
// Ignore iframes with empty source (they cause infinite loops)
if iframe_src == str!() {
if iframe_src.is_empty() {
continue;
}
let src_full_url: String =
resolve_url(&url, &iframe_src).unwrap_or(str!());
let src_full_url = resolve_url(&url, iframe_src).unwrap_or_default();
let (iframe_data, iframe_final_url) = retrieve_asset(
cache,
client,
@@ -362,18 +428,18 @@ pub fn walk_and_embed_assets(
"video" => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "poster" {
let video_poster = attr.value.to_string();
let video_poster = attr.value.trim();
// Skip posters with empty source
if video_poster == str!() {
if video_poster.is_empty() {
continue;
}
if opt_no_images {
attr.value.clear();
} else {
let poster_full_url: String =
resolve_url(&url, &video_poster).unwrap_or(str!());
let poster_full_url =
resolve_url(&url, video_poster).unwrap_or_default();
let (poster_dataurl, _) = retrieve_asset(
cache,
client,
@@ -478,9 +544,7 @@ fn get_child_node_by_name(handle: &Handle, node_name: &str) -> Handle {
});
match matching_children {
Some(node) => node.clone(),
_ => {
return handle.clone();
}
_ => handle.clone(),
}
}
@@ -496,7 +560,7 @@ pub fn stringify_document(
serialize(&mut buf, handle, SerializeOpts::default())
.expect("unable to serialize DOM into buffer");
let mut result: String = String::from_utf8(buf).unwrap();
let mut result = String::from_utf8(buf).unwrap();
if opt_isolate || opt_no_css || opt_no_frames || opt_no_js || opt_no_images {
let mut buf: Vec<u8> = Vec::new();
@@ -520,7 +584,6 @@ pub fn stringify_document(
if opt_no_images {
content_attr += " img-src data:;";
}
content_attr = content_attr.trim().to_string();
let meta = dom.create_element(
QualName::new(None, ns!(), local_name!("meta")),
@@ -531,7 +594,7 @@ pub fn stringify_document(
},
Attribute {
name: QualName::new(None, ns!(), local_name!("content")),
value: format_tendril!("{}", content_attr),
value: format_tendril!("{}", content_attr.trim()),
},
],
Default::default(),

View File

@@ -1,7 +1,7 @@
use crate::utils::{clean_url, data_to_dataurl, is_data_url};
use reqwest::blocking::Client;
use reqwest::header::CONTENT_TYPE;
use reqwest::Client;
use std::collections::HashMap;
use utils::{data_to_dataurl, is_data_url};
pub fn retrieve_asset(
cache: &mut HashMap<String, String>,
@@ -11,28 +11,33 @@ pub fn retrieve_asset(
mime: &str,
opt_silent: bool,
) -> Result<(String, String), reqwest::Error> {
let cache_key = clean_url(&url);
if is_data_url(&url).unwrap() {
Ok((url.to_string(), url.to_string()))
} else {
if cache.contains_key(&url.to_string()) {
if cache.contains_key(&cache_key) {
// url is in cache
if !opt_silent {
eprintln!("{} (from cache)", &url);
}
let data = cache.get(&url.to_string()).unwrap();
let data = cache.get(&cache_key).unwrap();
Ok((data.to_string(), url.to_string()))
} else {
// url not in cache, we request it
let mut response = client.get(url).send()?;
let res_url = response.url().to_string();
if !opt_silent {
if url == response.url().as_str() {
if url == res_url {
eprintln!("{}", &url);
} else {
eprintln!("{} -> {}", &url, &response.url().as_str());
eprintln!("{} -> {}", &url, &res_url);
}
}
let new_cache_key = clean_url(&res_url);
if as_dataurl {
// Convert response into a byte array
let mut data: Vec<u8> = vec![];
@@ -50,13 +55,13 @@ pub fn retrieve_asset(
};
let dataurl = data_to_dataurl(&mimetype, &data);
// insert in cache
cache.insert(response.url().to_string(), dataurl.to_string());
Ok((dataurl, response.url().to_string()))
cache.insert(new_cache_key, dataurl.clone());
Ok((dataurl, res_url))
} else {
let content = response.text().unwrap();
// insert in cache
cache.insert(response.url().to_string(), content.clone());
Ok((content, response.url().to_string()))
cache.insert(new_cache_key, content.clone());
Ok((content, res_url))
}
}
}

111
src/js.rs
View File

@@ -1,32 +1,103 @@
const JS_DOM_EVENT_ATTRS: [&str; 21] = [
// Input
"onfocus",
const JS_DOM_EVENT_ATTRS: &[&str] = &[
// From WHATWG HTML spec 8.1.5.2 'Event handlers on elements, Document objects, and Window objects':
// https://html.spec.whatwg.org/#event-handlers-on-elements,-document-objects,-and-window-objects
// https://html.spec.whatwg.org/#attributes-3 (table 'List of event handler content attributes')
// Global event handlers
"onabort",
"onauxclick",
"onblur",
"onselect",
"oncancel",
"oncanplay",
"oncanplaythrough",
"onchange",
"onsubmit",
"onreset",
"onclick",
"onclose",
"oncontextmenu",
"oncuechange",
"ondblclick",
"ondrag",
"ondragend",
"ondragenter",
"ondragexit",
"ondragleave",
"ondragover",
"ondragstart",
"ondrop",
"ondurationchange",
"onemptied",
"onended",
"onerror",
"onfocus",
"onformdata",
"oninput",
"oninvalid",
"onkeydown",
"onkeypress",
"onkeyup",
// Mouse
"onmouseover",
"onmouseout",
"onmousedown",
"onmouseup",
"onmousemove",
// Click
"onclick",
"ondblclick",
// Load
"onload",
"onunload",
"onabort",
"onerror",
"onloadeddata",
"onloadedmetadata",
"onloadstart",
"onmousedown",
"onmouseenter",
"onmouseleave",
"onmousemove",
"onmouseout",
"onmouseover",
"onmouseup",
"onwheel",
"onpause",
"onplay",
"onplaying",
"onprogress",
"onratechange",
"onreset",
"onresize",
"onscroll",
"onsecuritypolicyviolation",
"onseeked",
"onseeking",
"onselect",
"onslotchange",
"onstalled",
"onsubmit",
"onsuspend",
"ontimeupdate",
"ontoggle",
"onvolumechange",
"onwaiting",
"onwebkitanimationend",
"onwebkitanimationiteration",
"onwebkitanimationstart",
"onwebkittransitionend",
// Event handlers for <body/> and <frameset/> elements
"onafterprint",
"onbeforeprint",
"onbeforeunload",
"onhashchange",
"onlanguagechange",
"onmessage",
"onmessageerror",
"onoffline",
"ononline",
"onpagehide",
"onpageshow",
"onpopstate",
"onrejectionhandled",
"onstorage",
"onunhandledrejection",
"onunload",
// Event handlers for <html/> element
"oncut",
"oncopy",
"onpaste",
];
// Returns true if DOM attribute name matches a native JavaScript event handler
pub fn attr_is_event_handler(attr_name: &str) -> bool {
JS_DOM_EVENT_ATTRS.contains(&attr_name.to_lowercase().as_str())
JS_DOM_EVENT_ATTRS
.iter()
.find(|a| attr_name.eq_ignore_ascii_case(a))
.is_some()
}

View File

@@ -1,9 +1,5 @@
extern crate html5ever;
#[macro_use]
extern crate lazy_static;
extern crate regex;
extern crate reqwest;
extern crate url;
#[macro_use]
mod macros;

View File

@@ -1,70 +1,111 @@
#[macro_use]
extern crate clap;
extern crate monolith;
extern crate reqwest;
mod args;
mod macros;
use args::AppArgs;
use crate::args::AppArgs;
use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets};
use monolith::http::retrieve_asset;
use monolith::utils::is_valid_url;
use reqwest::blocking::Client;
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
use std::collections::HashMap;
use std::fs::File;
use std::io::{self, Error, Write};
use std::process;
use std::time::Duration;
/// Destination for the generated HTML document: either the process's
/// standard output or a file on disk.
enum Output {
    Stdout(io::Stdout),
    File(File),
}

impl Output {
    /// Selects the destination from `file_path`.
    ///
    /// An empty path selects standard output; any other value is passed to
    /// `File::create`, and its error (if any) is returned to the caller.
    fn new(file_path: &str) -> Result<Output, Error> {
        if !file_path.is_empty() {
            return File::create(file_path).map(Output::File);
        }
        Ok(Output::Stdout(io::stdout()))
    }

    /// Writes `s` followed by a newline to the destination, then flushes it.
    ///
    /// Returns the first I/O error raised by the write or the flush.
    fn writeln_str(&mut self, s: &str) -> Result<(), Error> {
        // Both variants implement `Write`, so pick the sink once and share
        // the write-then-flush sequence.
        let sink: &mut dyn Write = match self {
            Output::Stdout(handle) => handle,
            Output::File(file) => file,
        };
        writeln!(sink, "{}", s)?;
        sink.flush()
    }
}
// Program entry point: validates the target URL, builds an HTTP client,
// retrieves the root document, embeds its assets and emits the resulting HTML.
//
// NOTE(review): this span interleaves two revisions of the body. One wraps the
// work in `if is_valid_url(..) {`, returns on a bad user agent and prints with
// `println!`; the other exits early with status 1 and writes through `Output`.
// The first `walk_and_embed_assets(` call is never closed, so the text is not
// valid Rust as it stands — confirm which lines are current before relying on it.
fn main() {
let app_args = AppArgs::get();
let cache = &mut HashMap::new();
if is_valid_url(app_args.url_target.as_str()) {
// Initialize client
let mut header_map = HeaderMap::new();
// An unparsable user agent is reported on stderr and `main` returns normally.
match HeaderValue::from_str(&app_args.user_agent) {
Ok(header) => header_map.insert(USER_AGENT, header),
Err(err) => {
eprintln!("Invalid user agent! {}", err);
return;
}
};
let client = reqwest::Client::builder()
.timeout(Duration::from_secs(10))
.danger_accept_invalid_certs(app_args.insecure)
.default_headers(header_map)
.build()
.expect("Failed to initialize HTTP client");
let (data, final_url) = retrieve_asset(
cache,
&client,
app_args.url_target.as_str(),
false,
"",
app_args.silent,
)
.unwrap();
let dom = html_to_dom(&data);
walk_and_embed_assets(
cache,
&client,
&final_url,
&dom.document,
app_args.no_css,
app_args.no_js,
app_args.no_images,
app_args.silent,
app_args.no_frames,
// Guard: a target rejected by `is_valid_url` is reported on stderr.
if !is_valid_url(app_args.url_target.as_str()) {
eprintln!(
"Only HTTP and HTTPS URLs are allowed but got: {}",
&app_args.url_target
);
let html: String = stringify_document(
&dom.document,
app_args.no_css,
app_args.no_frames,
app_args.no_js,
app_args.no_images,
app_args.isolate,
);
println!("{}", html);
// Terminate with a non-zero status so the failure is visible to the caller.
process::exit(1);
}
// Prepare the output sink before any network work; `expect` panics if it fails.
let mut output = Output::new(&app_args.output).expect("Could not prepare output");
// Initialize client
let mut cache = HashMap::new();
let mut header_map = HeaderMap::new();
// An invalid User-Agent value panics here via `expect`.
header_map.insert(
USER_AGENT,
HeaderValue::from_str(&app_args.user_agent).expect("Invalid User-Agent header specified"),
);
// 10-second timeout; certificate validation is relaxed when `insecure` is set.
let client = Client::builder()
.timeout(Duration::from_secs(10))
.danger_accept_invalid_certs(app_args.insecure)
.default_headers(header_map)
.build()
.expect("Failed to initialize HTTP client");
// Retrieve root document
let (data, final_url) = retrieve_asset(
&mut cache,
&client,
app_args.url_target.as_str(),
false,
"",
app_args.silent,
)
.expect("Could not retrieve assets in HTML");
// Parse the retrieved data into a DOM and embed assets, relative to `final_url`.
let dom = html_to_dom(&data);
walk_and_embed_assets(
&mut cache,
&client,
&final_url,
&dom.document,
app_args.no_css,
app_args.no_js,
app_args.no_images,
app_args.silent,
app_args.no_frames,
);
// Turn the processed DOM back into a single HTML string.
let html: String = stringify_document(
&dom.document,
app_args.no_css,
app_args.no_frames,
app_args.no_js,
app_args.no_images,
app_args.isolate,
);
// Emit the document through the prepared sink; a write failure panics.
output
.writeln_str(&html)
.expect("Could not write HTML output");
}

View File

@@ -3,6 +3,7 @@ use crate::html::{
};
use html5ever::rcdom::{Handle, NodeData};
use html5ever::serialize::{serialize, SerializeOpts};
use reqwest::blocking::Client;
use std::collections::HashMap;
#[test]
@@ -33,7 +34,8 @@ fn test_get_parent_node_name() {
}
NodeData::Element { ref name, .. } => {
let node_name = name.local.as_ref().to_string();
let parent_node_name = get_node_name(&get_parent_node(node));
let parent = get_parent_node(node);
let parent_node_name = get_node_name(&parent);
if node_name == "head" || node_name == "body" {
assert_eq!(parent_node_name, "html");
} else if node_name == "div" {
@@ -71,7 +73,7 @@ fn test_walk_and_embed_assets() {
let opt_no_images: bool = false;
let opt_silent = true;
let client = reqwest::Client::new();
let client = Client::new();
walk_and_embed_assets(
cache,
@@ -107,7 +109,7 @@ fn test_walk_and_embed_assets_ensure_no_recursive_iframe() {
let opt_no_images: bool = false;
let opt_silent = true;
let client = reqwest::Client::new();
let client = Client::new();
walk_and_embed_assets(
cache,
@@ -144,7 +146,7 @@ fn test_walk_and_embed_assets_no_css() {
let opt_no_js: bool = false;
let opt_no_images: bool = false;
let opt_silent = true;
let client = reqwest::Client::new();
let client = Client::new();
walk_and_embed_assets(
cache,
@@ -189,7 +191,7 @@ fn test_walk_and_embed_assets_no_images() {
let opt_no_images: bool = true;
let opt_silent = true;
let client = reqwest::Client::new();
let client = Client::new();
walk_and_embed_assets(
cache,
@@ -235,7 +237,7 @@ fn test_walk_and_embed_assets_no_frames() {
let opt_no_js: bool = false;
let opt_no_images: bool = false;
let opt_silent = true;
let client = reqwest::Client::new();
let client = Client::new();
walk_and_embed_assets(
cache,
@@ -274,7 +276,7 @@ fn test_walk_and_embed_assets_no_js() {
let opt_no_images: bool = false;
let opt_silent = true;
let client = reqwest::Client::new();
let client = Client::new();
walk_and_embed_assets(
cache,
@@ -298,6 +300,45 @@ fn test_walk_and_embed_assets_no_js() {
);
}
#[test]
fn test_walk_and_embed_with_no_integrity() {
    // Input markup: a <link> and a <script> that both carry an `integrity` attribute.
    let html = "<title>No integrity</title>\
                <link integrity=\"sha384-...\" rel=\"something\"/>\
                <script integrity=\"sha384-...\" src=\"some.js\"></script>";
    let url = "http://localhost";

    // Every "no_*" switch is on, and progress output is silenced.
    let (opt_no_css, opt_no_frames, opt_no_js, opt_no_images, opt_silent) =
        (true, true, true, true, true);

    let dom = html_to_dom(&html);
    let mut cache = HashMap::new();
    let client = Client::new();

    walk_and_embed_assets(
        &mut cache,
        &client,
        &url,
        &dom.document,
        opt_no_css,
        opt_no_js,
        opt_no_images,
        opt_silent,
        opt_no_frames,
    );

    // Serialize the processed DOM and compare it with the expected markup,
    // which contains no `integrity` attributes.
    let mut serialized: Vec<u8> = Vec::new();
    serialize(&mut serialized, &dom.document, SerializeOpts::default()).unwrap();
    let actual: String = serialized.into_iter().map(char::from).collect();

    assert_eq!(
        actual,
        "<html>\
         <head><title>No integrity</title><link rel=\"something\"><script src=\"\"></script></head>\
         <body></body>\
         </html>"
    );
}
#[test]
fn test_stringify_document() {
let html = "<div><script src=\"some.js\"></script></div>";

View File

@@ -1,9 +1,11 @@
use crate::http::retrieve_asset;
use reqwest::blocking::Client;
use std::collections::HashMap;
#[test]
fn test_retrieve_asset() {
let cache = &mut HashMap::new();
let client = reqwest::Client::new();
let client = Client::new();
let (data, final_url) =
retrieve_asset(cache, &client, "data:text/html;base64,...", true, "", false).unwrap();
assert_eq!(&data, "data:text/html;base64,...");

View File

@@ -1,5 +1,6 @@
use crate::utils::{
data_to_dataurl, detect_mimetype, is_data_url, is_valid_url, resolve_url, url_has_protocol,
clean_url, data_to_dataurl, detect_mimetype, is_data_url, is_valid_url, resolve_url,
url_has_protocol,
};
use url::ParseError;
@@ -158,3 +159,19 @@ fn test_is_data_url() {
assert!(!is_data_url("//kernel.org").unwrap_or(false));
assert!(!is_data_url("").unwrap_or(false));
}
#[test]
fn test_clean_url() {
    // (input, expected) pairs: a named fragment, a bare `#`, and a stray `?`
    // followed by a bare `#` must all collapse to the plain resource URL.
    let cases = [
        (
            "https://somewhere.com/font.eot#iefix",
            "https://somewhere.com/font.eot",
        ),
        (
            "https://somewhere.com/font.eot#",
            "https://somewhere.com/font.eot",
        ),
        (
            "https://somewhere.com/font.eot?#",
            "https://somewhere.com/font.eot",
        ),
    ];

    for (input, expected) in cases.iter() {
        assert_eq!(clean_url(*input), *expected);
    }
}

View File

@@ -1,9 +1,7 @@
extern crate base64;
use self::base64::encode;
use http::retrieve_asset;
use crate::http::retrieve_asset;
use base64::encode;
use regex::Regex;
use reqwest::Client;
use reqwest::blocking::Client;
use std::collections::HashMap;
use url::{ParseError, Url};
@@ -196,3 +194,14 @@ pub fn resolve_css_imports(
resolved_css
}
}
/// Normalises `url` by dropping its fragment and any empty query string.
///
/// For example `https://somewhere.com/font.eot?#iefix` becomes
/// `https://somewhere.com/font.eot`.
///
/// The input is parsed with `Url::parse`, so the returned string is the
/// parser's serialisation of the URL. Input that cannot be parsed as an
/// absolute URL is returned unchanged instead of panicking.
pub fn clean_url<T: AsRef<str>>(url: T) -> String {
    let raw = url.as_ref();
    let mut parsed = match Url::parse(raw) {
        Ok(parsed) => parsed,
        // Nothing can be stripped reliably from an unparsable value; hand it
        // back as-is rather than aborting the whole run.
        Err(_) => return raw.to_string(),
    };

    // Clear fragment
    parsed.set_fragment(None);

    // Get rid of stray question mark (a query that is present but empty)
    if parsed.query() == Some("") {
        parsed.set_query(None);
    }

    parsed.to_string()
}