43 Commits

Author SHA1 Message Date
Sunshine
f876e9243c Merge pull request #109 from snshn/version-bump
version bump (2.1.1 → 2.1.2)
2020-01-21 08:39:10 -05:00
Sunshine
b6896febf1 version bump (2.1.1 → 2.1.2) 2020-01-21 02:32:29 -05:00
Sunshine
29d2ba5857 Merge pull request #107 from snshn/update-readme
Update README.md
2020-01-21 02:18:10 -05:00
Sunshine
8b1ebc7871 Update README.md 2020-01-21 02:16:36 -05:00
Sunshine
d753c83c76 Merge pull request #108 from rhysd/revert-manual-proxy-support
Revert #106 since reqwest supports system proxies by default
2020-01-21 02:15:29 -05:00
rhysd
47a825f5ed add proxies instruction in README.md 2020-01-21 13:02:45 +09:00
rhysd
0e12cecd85 Revert "Merge pull request #106 from rhysd/proxy-support"
This reverts commit d8def879b2, reversing
changes made to a9d114d04d.
2020-01-21 13:01:22 +09:00
Sunshine
d8def879b2 Merge pull request #106 from rhysd/proxy-support
Support HTTP and HTTPS proxies
2020-01-20 18:36:00 -05:00
Linda_pp
0420854ed6 remove '$' from environment variable names in README.md 2020-01-20 23:11:14 +09:00
rhysd
d47482fcd9 fix crash at setting empty values to HTTP proxies
with this patch `https_proxy=` and `http_proxy=` will work well.
2020-01-20 17:17:24 +09:00
rhysd
b68624f2f3 support HTTP and HTTPS proxies (fix #103) 2020-01-20 17:02:43 +09:00
Sunshine
a9d114d04d Merge pull request #105 from rhysd/refactor-main
Refactoring for main.rs to address several issues
2020-01-20 01:10:29 -05:00
rhysd
4e4ebe9c98 refactor main to address several issues
Addressed issues:

- when specified URL is invalid, it exited successfully with doing
  nothing. There was no way for users to tell why it did not work
- it exited successfully even if invalid User-Agent value is specified
- it created file twice on `--output` option specified. It may cause an
  issue when some file watcher (e.g. FsEvents on macOS) is watching

Improvements:
- handle errors with `Result::expect` consistently so it correctly exits
  with non-zero status on error
- define `Output` enum for handling both stdout and file outputs
2020-01-15 16:52:20 +09:00
Sunshine
429217d8f7 Merge pull request #104 from rhysd/complete-dom-event-handlers
Use complete list of DOM event handlers for detecting JS attributes
2020-01-15 01:34:01 -05:00
rhysd
1779f4a374 better comments for JS_DOM_EVENT_ATTRS constant 2020-01-15 14:33:27 +09:00
rhysd
26e89ae6d3 use complete list of DOM event handlers 2020-01-15 13:58:09 +09:00
Sunshine
b333d19d04 Update README.md 2020-01-14 03:42:04 -05:00
Sunshine
c1dc798ded Merge pull request #101 from rhysd/ignore-preload
Improve handling preload links and white spaces in attribute values
2020-01-13 17:51:25 -05:00
rhysd
69d99b69e8 remove . in line comment 2020-01-13 23:47:07 +09:00
Sunshine
aae53d20f0 Merge pull request #102 from popey/update-snap-config
Update snapcraft configuration
2020-01-13 08:39:15 -05:00
Alan Pope
14cf2ce8a6 Update snapcraft configuration
This changes the build slightly. If snapcraft is triggered when there is a new tagged release in the project github release page, and it's newer than the version in the Snap Store beta channel, we build that stable release. If however, the latest stable release in github releases is already the same as the Snap Store beta channel, then we build the tip of master.

This gives a couple of advantages. 

  * One yaml can be used to build tip-of-git snaps, and stable releases alike
  * Closing the beta channel in the Snap Store will mean the next triggered build will re-build whatever the last stable release is. This is useful to force a rebuild of the stable version in case a dependency (not that there are many) has a security issue.

We also now set the version dynamically based on the git tags.
2020-01-13 11:14:08 +00:00
rhysd
67b79e92f9 simplify &x.into_iter() to x.iter() 2020-01-10 14:45:02 +09:00
rhysd
b51f41fe34 trim attribute values 2020-01-10 14:41:05 +09:00
rhysd
6f158dc6db compare value of 'rel' properties in case-insensitive 2020-01-10 13:52:31 +09:00
rhysd
8d7052b39c ignore preload and prefetch sources
since all resources are embedded as data URL.
2020-01-09 18:18:21 +09:00
rhysd
660511b8a0 define link type of <link> element as enum and prefer match statement
since match statement checks exhaustiveness
2020-01-09 16:55:42 +09:00
Sunshine
929512f4f5 Merge pull request #97 from rhysd/reqwest-0.10.0
Upgrade reqwest to v0.10.0 for better binary size and build time
2020-01-08 01:43:55 -05:00
Sunshine
a46d89cefc Merge pull request #98 from rhysd/fix-ci
Fix nightly and beta CI
2020-01-07 18:14:30 -05:00
rhysd
f93646e17a ignore beta channel again on AppVeyor
since rustc command crashes on combination of
channel=beta & target=i686-pc-windows-gnu
2020-01-07 17:31:36 +09:00
rhysd
9d14b6dfea rename appveyor.yml to .appveyor.yml
align to .travis.yml
2020-01-07 15:28:29 +09:00
rhysd
9783b96524 check beta channel on CI not to break this crate with next Rust version 2020-01-07 15:28:29 +09:00
rhysd
106efe58ce fix nightly and beta failing on CI
we always use stable rustfmt so checking with nightly/beta rustfmt is not
necessary.
2020-01-07 15:28:29 +09:00
rhysd
6e99ad13e7 upgrade reqwest to v0.10.0
This will improve build time and binary size as follows:

* Before

- **Compile targets**: 220
- **Build time**: `cargo build --release  1264.95s user 39.72s system 335% cpu 6:29.14 total`
- **Binary size**: 6578568 bytes

* After

- **Compile targets**: 170
- **Build time**: `cargo build --release  1130.64s user 32.15s system 359% cpu 5:23.69 total`
- **Binary size**: 6107088 bytes

* Differences

- **Compile targets**: 1.29x smaller
- **Build time**: 1.23x faster
- **Binary size**: 1.07x smaller
2020-01-07 14:22:32 +09:00
Sunshine
413dd66886 Merge pull request #96 from rhysd/refactorings
Refactorings
2020-01-05 18:46:31 -05:00
rhysd
dc7ec6e7a8 remove more redundant type annotations 2020-01-04 16:33:11 +09:00
rhysd
ed879231af fix test code was broken by refactoring 2020-01-04 08:07:19 +09:00
rhysd
ddf4b8ac13 prefer &str to String for reducing allocations 2020-01-04 08:05:02 +09:00
rhysd
84c13f0605 prefer unwrap_or_default to unwrap_or 2020-01-04 07:58:29 +09:00
rhysd
ce03e0e487 reduce allocation on checking DOM attributes and do not hard-code number of elements of array constant
`to_lower` allocates new string but the allocation is not necessary
here.
2020-01-04 07:52:47 +09:00
rhysd
63e19998d0 reduce clones and fix some code styles and redundant code 2020-01-04 07:49:26 +09:00
Sunshine
e3321bbb07 Merge pull request #95 from rhysd/rust2018
Migrate to Rust2018 edition
2020-01-03 02:00:47 -05:00
rhysd
0a38cd0eae add rhysd to authors list 2020-01-03 15:43:25 +09:00
rhysd
75fb6961ed migrate to Rust 2018 2020-01-03 00:33:49 +09:00
15 changed files with 731 additions and 764 deletions

View File

@@ -94,8 +94,8 @@ environment:
# or test failure in the matching channels/targets from failing the entire build.
matrix:
allow_failures:
- channel: beta
- channel: nightly
- channel: beta
# If you only care about stable channel build failures, uncomment the following line:
#- channel: beta
@@ -128,4 +128,3 @@ build: false
# environment variable.
test_script:
- cargo test --verbose %cargoflags%
- cargo fmt --all -- --check

View File

@@ -18,10 +18,12 @@ before_script:
script:
- cargo build --all --locked --verbose
- cargo test --all --locked --verbose
- cargo fmt --all -- --check
- |
if [[ "$TRAVIS_RUST_VERSION" == "stable" ]]; then
cargo fmt --all -- --check
fi
jobs:
allow_failures:
- rust: beta
- rust: nightly
fast_finish: true

865
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,11 +1,13 @@
[package]
name = "monolith"
version = "2.1.1"
version = "2.1.2"
edition = "2018"
authors = [
"Sunshine <sunshine@uberspace.net>",
"Mahdi Robatipoor <mahdi.robatipoor@gmail.com>",
"Emmanuel Delaborde <th3rac25@gmail.com>",
"Emi Simpson <emi@alchemi.dev>",
"rhysd <lin90162@yahoo.co.jp>",
]
description = "CLI tool for saving web pages as a single HTML file"
@@ -15,5 +17,9 @@ clap = "2.33.0"
html5ever = "0.24.1"
lazy_static = "1.4.0"
regex = "1.3.1"
reqwest = "0.9.20"
url = "2.1.0"
[dependencies.reqwest]
version = "0.10.*"
default-features = false
features = ["default-tls", "blocking", "gzip"]

View File

@@ -11,8 +11,7 @@
|___| |__________| \____________________| |___| |___| |___|
```
A data hoarder's dream come true: bundle any web page into a single HTML file.
You can finally replace that gazillion of open tabs with a gazillion of .html files stored somewhere on your precious little drive.
A data hoarder's dream come true: bundle any web page into a single HTML file. You can finally replace that gazillion of open tabs with a gazillion of .html files stored somewhere on your precious little drive.
Unlike the conventional "Save page as", `monolith` not only saves the target document, it embeds CSS, image, and JavaScript assets **all at once**, producing a single HTML5 document that is a joy to store and share.
@@ -25,11 +24,14 @@ If compared to saving websites with `wget -mpk`, this tool embeds all assets as
$ cd monolith
$ cargo install --path .
### On macOS (via Homebrew)
### With Homebrew (on macOS and GNU/Linux)
$ brew install monolith
### Using Snapcraft (on GNU/Linux)
$ snap install monolith
## Usage
$ monolith https://lyrics.github.io/db/p/portishead/dummy/roads/ -o portishead-roads-lyrics.html
$ monolith https://lyrics.github.io/db/P/Portishead/Dummy/Roads/ -o portishead-roads-lyrics.html
## Options
- `-c`: Ignore styles
@@ -42,8 +44,16 @@ If compared to saving websites with `wget -mpk`, this tool embeds all assets as
- `-s`: Silent mode
- `-u`: Specify custom User-Agent
## HTTPS and HTTP proxies
Please set `https_proxy`, `http_proxy` and `no_proxy` environment variables.
## Contributing
Please open an issue if something is wrong, that helps make this project better.
## Related projects
- `Monolith Chrome Extension`: https://github.com/rhysd/monolith-of-web
- `Pagesaver`: https://github.com/distributed-mind/pagesaver
- `Personal WayBack Machine`: https://github.com/popey/pwbm
- `SingleFile`: https://github.com/gildas-lormeau/SingleFile
## License

View File

@@ -1,6 +1,7 @@
name: monolith
base: core18
version: git
# Version data defined inside the monolith part below
adopt-info: monolith
summary: Monolith - Save HTML pages with ease
description: |
A data hoarder's dream come true: bundle any web page into a single
@@ -17,6 +18,14 @@ description: |
confinement: strict
# Building on armhf fails, so we specify all supported non-armhf architectures
architectures:
- build-on: amd64
- build-on: i386
- build-on: arm64
- build-on: ppc64el
- build-on: s390x
parts:
monolith:
plugin: rust
@@ -24,6 +33,21 @@ parts:
build-packages:
- libssl-dev
- pkg-config
override-pull: |
snapcraftctl pull
# Determine the current tag
last_committed_tag="$(git describe --tags --abbrev=0)"
last_committed_tag_ver="$(echo ${last_committed_tag} | sed 's/v//')"
# Determine the most recent version in the beta channel in the Snap Store
last_released_tag="$(snap info $SNAPCRAFT_PROJECT_NAME | awk '$1 == "beta:" { print $2 }')"
# If the latest tag from the upstream project has not been released to
# beta, build that tag instead of master.
if [ "${last_committed_tag_ver}" != "${last_released_tag}" ]; then
git fetch
git checkout "${last_committed_tag}"
fi
# set version number of the snap based on what we did above
snapcraftctl set-version $(git describe --tags --abbrev=0)
apps:
monolith:

View File

@@ -58,7 +58,7 @@ impl AppArgs {
app_args.output = app.value_of("output").unwrap_or("").to_string();
app_args.user_agent = app
.value_of("user-agent")
.unwrap_or_else(|| DEFAULT_USER_AGENT)
.unwrap_or(DEFAULT_USER_AGENT)
.to_string();
app_args
}

View File

@@ -1,3 +1,8 @@
use crate::http::retrieve_asset;
use crate::js::attr_is_event_handler;
use crate::utils::{
data_to_dataurl, is_valid_url, resolve_css_imports, resolve_url, url_has_protocol,
};
use html5ever::interface::QualName;
use html5ever::parse_document;
use html5ever::rcdom::{Handle, NodeData, RcDom};
@@ -5,14 +10,11 @@ use html5ever::serialize::{serialize, SerializeOpts};
use html5ever::tendril::{format_tendril, Tendril, TendrilSink};
use html5ever::tree_builder::{Attribute, TreeSink};
use html5ever::{local_name, namespace_url, ns};
use http::retrieve_asset;
use js::attr_is_event_handler;
use reqwest::Client;
use reqwest::blocking::Client;
use std::collections::HashMap;
use std::default::Default;
use utils::{data_to_dataurl, is_valid_url, resolve_css_imports, resolve_url, url_has_protocol};
const ICON_VALUES: [&str; 5] = [
const ICON_VALUES: &[&str] = &[
"icon",
"shortcut icon",
"mask-icon",
@@ -29,15 +31,18 @@ pub fn get_parent_node(node: &Handle) -> Handle {
parent.and_then(|node| node.upgrade()).unwrap()
}
pub fn get_node_name(node: &Handle) -> String {
pub fn get_node_name(node: &Handle) -> &'_ str {
match &node.data {
NodeData::Element { ref name, .. } => name.local.as_ref().to_string(),
_ => str!(),
NodeData::Element { ref name, .. } => name.local.as_ref(),
_ => "",
}
}
pub fn is_icon(attr_value: &str) -> bool {
ICON_VALUES.contains(&&*attr_value.to_lowercase())
ICON_VALUES
.iter()
.find(|a| attr_value.eq_ignore_ascii_case(a))
.is_some()
}
pub fn walk_and_embed_assets(
@@ -77,8 +82,6 @@ pub fn walk_and_embed_assets(
match name.local.as_ref() {
"link" => {
let mut link_type: &str = "";
// Remove integrity attributes
let mut i = 0;
while i < attrs_mut.len() {
@@ -90,90 +93,117 @@ pub fn walk_and_embed_assets(
}
}
enum LinkType {
Icon,
Stylesheet,
Preload,
DnsPrefetch,
Unknown,
}
let mut link_type = LinkType::Unknown;
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "rel" {
if is_icon(&attr.value.to_string()) {
link_type = "icon";
let value = attr.value.trim();
if is_icon(value) {
link_type = LinkType::Icon;
break;
} else if attr.value.to_string() == "stylesheet" {
link_type = "stylesheet";
} else if value.eq_ignore_ascii_case("stylesheet") {
link_type = LinkType::Stylesheet;
break;
} else if value.eq_ignore_ascii_case("preload") {
link_type = LinkType::Preload;
break;
} else if value.eq_ignore_ascii_case("dns-prefetch") {
link_type = LinkType::DnsPrefetch;
break;
}
}
}
let link_type = link_type;
if link_type == "icon" {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
if opt_no_images {
attr.value.clear();
} else {
let href_full_url: String =
resolve_url(&url, &attr.value.to_string())
.unwrap_or(str!());
let (favicon_dataurl, _) = retrieve_asset(
cache,
client,
&href_full_url,
true,
"",
opt_silent,
)
.unwrap_or((str!(), str!()));
attr.value.clear();
attr.value.push_slice(favicon_dataurl.as_str());
}
}
}
} else if link_type == "stylesheet" {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
if opt_no_css {
attr.value.clear();
} else {
let href_full_url: String =
resolve_url(&url, &attr.value.to_string())
.unwrap_or(str!());
let replacement_text = match retrieve_asset(
cache,
client,
&href_full_url,
false,
"text/css",
opt_silent,
) {
// On successful retrieval, traverse CSS
Ok((css_data, _)) => resolve_css_imports(
match link_type {
LinkType::Icon => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
if opt_no_images {
attr.value.clear();
} else {
let href_full_url = resolve_url(&url, attr.value.as_ref())
.unwrap_or_default();
let (favicon_dataurl, _) = retrieve_asset(
cache,
client,
&css_data,
true,
&href_full_url,
opt_no_images,
true,
"",
opt_silent,
),
// If a network error occured, warn
Err(e) => {
eprintln!("Warning: {}", e,);
// If failed to resolve, replace with absolute URL
href_full_url
}
};
attr.value.clear();
attr.value.push_slice(&replacement_text);
)
.unwrap_or_default();
attr.value.clear();
attr.value.push_slice(favicon_dataurl.as_str());
}
}
}
}
} else {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
let href_full_url: String =
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
LinkType::Stylesheet => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
if opt_no_css {
attr.value.clear();
} else {
let href_full_url = resolve_url(&url, &attr.value.as_ref())
.unwrap_or_default();
let replacement_text = match retrieve_asset(
cache,
client,
&href_full_url,
false,
"text/css",
opt_silent,
) {
// On successful retrieval, traverse CSS
Ok((css_data, _)) => resolve_css_imports(
cache,
client,
&css_data,
true,
&href_full_url,
opt_no_images,
opt_silent,
),
// If a network error occured, warn
Err(e) => {
eprintln!("Warning: {}", e);
// If failed to resolve, replace with absolute URL
href_full_url
}
};
attr.value.clear();
attr.value.push_slice(&replacement_text);
}
}
}
}
LinkType::Preload | LinkType::DnsPrefetch => {
// Since all resources are embedded as data URL, preloading and prefetching are unnecessary
if let Some(attr) =
attrs_mut.iter_mut().find(|a| &a.name.local == "href")
{
attr.value.clear();
attr.value.push_slice(&href_full_url.as_str());
}
}
LinkType::Unknown => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
let href_full_url =
resolve_url(&url, attr.value.as_ref()).unwrap_or_default();
attr.value.clear();
attr.value.push_slice(&href_full_url.as_str());
}
}
}
}
@@ -200,10 +230,10 @@ pub fn walk_and_embed_assets(
name: QualName::new(None, ns!(), local_name!("src")),
value: Tendril::from_slice(TRANSPARENT_PIXEL),
});
} else if let Some((dataurl, _)) = (&found_datasrc)
.into_iter()
} else if let Some((dataurl, _)) = found_datasrc
.iter()
.chain(&found_src) // Give dataurl priority
.map(|attr| &attr.value)
.map(|attr| attr.value.trim())
.filter(|src| !src.is_empty()) // Ignore empty srcs
.next()
.and_then(|src| resolve_url(&url, src).ok()) // Make absolute
@@ -229,8 +259,8 @@ pub fn walk_and_embed_assets(
let attr_name: &str = &attr.name.local;
if attr_name == "src" {
let src_full_url: String = resolve_url(&url, &attr.value.to_string())
.unwrap_or(attr.value.to_string());
let src_full_url = resolve_url(&url, attr.value.trim())
.unwrap_or_else(|_| attr.value.to_string());
attr.value.clear();
attr.value.push_slice(src_full_url.as_str());
} else if attr_name == "srcset" {
@@ -239,9 +269,8 @@ pub fn walk_and_embed_assets(
attr.value.clear();
attr.value.push_slice(TRANSPARENT_PIXEL);
} else {
let srcset_full_url: String =
resolve_url(&url, &attr.value.to_string())
.unwrap_or(str!());
let srcset_full_url =
resolve_url(&url, attr.value.trim()).unwrap_or_default();
let (source_dataurl, _) = retrieve_asset(
cache,
client,
@@ -261,13 +290,13 @@ pub fn walk_and_embed_assets(
"a" => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" {
let attr_value = attr.value.trim();
// Don't touch email links or hrefs which begin with a hash sign
if attr.value.starts_with('#') || url_has_protocol(&attr.value) {
if attr_value.starts_with('#') || url_has_protocol(attr_value) {
continue;
}
let href_full_url: String =
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
let href_full_url = resolve_url(&url, attr_value).unwrap_or_default();
attr.value.clear();
attr.value.push_slice(href_full_url.as_str());
}
@@ -296,8 +325,8 @@ pub fn walk_and_embed_assets(
} else {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" {
let src_full_url: String =
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
let src_full_url =
resolve_url(&url, attr.value.trim()).unwrap_or_default();
let (js_dataurl, _) = retrieve_asset(
cache,
client,
@@ -339,10 +368,11 @@ pub fn walk_and_embed_assets(
"form" => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "action" {
let attr_value = attr.value.trim();
// Modify action to be a full URL
if !is_valid_url(&attr.value) {
let href_full_url: String =
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
if !is_valid_url(attr_value) {
let href_full_url =
resolve_url(&url, attr_value).unwrap_or_default();
attr.value.clear();
attr.value.push_slice(href_full_url.as_str());
}
@@ -358,15 +388,14 @@ pub fn walk_and_embed_assets(
continue;
}
let iframe_src: String = attr.value.to_string();
let iframe_src = attr.value.trim();
// Ignore iframes with empty source (they cause infinite loops)
if iframe_src == str!() {
if iframe_src.is_empty() {
continue;
}
let src_full_url: String =
resolve_url(&url, &iframe_src).unwrap_or(str!());
let src_full_url = resolve_url(&url, iframe_src).unwrap_or_default();
let (iframe_data, iframe_final_url) = retrieve_asset(
cache,
client,
@@ -399,18 +428,18 @@ pub fn walk_and_embed_assets(
"video" => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "poster" {
let video_poster = attr.value.to_string();
let video_poster = attr.value.trim();
// Skip posters with empty source
if video_poster == str!() {
if video_poster.is_empty() {
continue;
}
if opt_no_images {
attr.value.clear();
} else {
let poster_full_url: String =
resolve_url(&url, &video_poster).unwrap_or(str!());
let poster_full_url =
resolve_url(&url, video_poster).unwrap_or_default();
let (poster_dataurl, _) = retrieve_asset(
cache,
client,
@@ -515,9 +544,7 @@ fn get_child_node_by_name(handle: &Handle, node_name: &str) -> Handle {
});
match matching_children {
Some(node) => node.clone(),
_ => {
return handle.clone();
}
_ => handle.clone(),
}
}
@@ -533,7 +560,7 @@ pub fn stringify_document(
serialize(&mut buf, handle, SerializeOpts::default())
.expect("unable to serialize DOM into buffer");
let mut result: String = String::from_utf8(buf).unwrap();
let mut result = String::from_utf8(buf).unwrap();
if opt_isolate || opt_no_css || opt_no_frames || opt_no_js || opt_no_images {
let mut buf: Vec<u8> = Vec::new();
@@ -557,7 +584,6 @@ pub fn stringify_document(
if opt_no_images {
content_attr += " img-src data:;";
}
content_attr = content_attr.trim().to_string();
let meta = dom.create_element(
QualName::new(None, ns!(), local_name!("meta")),
@@ -568,7 +594,7 @@ pub fn stringify_document(
},
Attribute {
name: QualName::new(None, ns!(), local_name!("content")),
value: format_tendril!("{}", content_attr),
value: format_tendril!("{}", content_attr.trim()),
},
],
Default::default(),

View File

@@ -1,7 +1,7 @@
use crate::utils::{clean_url, data_to_dataurl, is_data_url};
use reqwest::blocking::Client;
use reqwest::header::CONTENT_TYPE;
use reqwest::Client;
use std::collections::HashMap;
use utils::{clean_url, data_to_dataurl, is_data_url};
pub fn retrieve_asset(
cache: &mut HashMap<String, String>,
@@ -26,16 +26,17 @@ pub fn retrieve_asset(
} else {
// url not in cache, we request it
let mut response = client.get(url).send()?;
let res_url = response.url().to_string();
if !opt_silent {
if url == response.url().as_str() {
if url == res_url {
eprintln!("{}", &url);
} else {
eprintln!("{} -> {}", &url, &response.url().as_str());
eprintln!("{} -> {}", &url, &res_url);
}
}
let new_cache_key = clean_url(response.url().to_string());
let new_cache_key = clean_url(&res_url);
if as_dataurl {
// Convert response into a byte array
@@ -54,13 +55,13 @@ pub fn retrieve_asset(
};
let dataurl = data_to_dataurl(&mimetype, &data);
// insert in cache
cache.insert(new_cache_key, dataurl.to_string());
Ok((dataurl, response.url().to_string()))
cache.insert(new_cache_key, dataurl.clone());
Ok((dataurl, res_url))
} else {
let content = response.text().unwrap();
// insert in cache
cache.insert(new_cache_key, content.clone());
Ok((content, response.url().to_string()))
Ok((content, res_url))
}
}
}

111
src/js.rs
View File

@@ -1,32 +1,103 @@
const JS_DOM_EVENT_ATTRS: [&str; 21] = [
// Input
"onfocus",
const JS_DOM_EVENT_ATTRS: &[&str] = &[
// From WHATWG HTML spec 8.1.5.2 'Event handlers on elements, Document objects, and Window objects':
// https://html.spec.whatwg.org/#event-handlers-on-elements,-document-objects,-and-window-objects
// https://html.spec.whatwg.org/#attributes-3 (table 'List of event handler content attributes')
// Global event handlers
"onabort",
"onauxclick",
"onblur",
"onselect",
"oncancel",
"oncanplay",
"oncanplaythrough",
"onchange",
"onsubmit",
"onreset",
"onclick",
"onclose",
"oncontextmenu",
"oncuechange",
"ondblclick",
"ondrag",
"ondragend",
"ondragenter",
"ondragexit",
"ondragleave",
"ondragover",
"ondragstart",
"ondrop",
"ondurationchange",
"onemptied",
"onended",
"onerror",
"onfocus",
"onformdata",
"oninput",
"oninvalid",
"onkeydown",
"onkeypress",
"onkeyup",
// Mouse
"onmouseover",
"onmouseout",
"onmousedown",
"onmouseup",
"onmousemove",
// Click
"onclick",
"ondblclick",
// Load
"onload",
"onunload",
"onabort",
"onerror",
"onloadeddata",
"onloadedmetadata",
"onloadstart",
"onmousedown",
"onmouseenter",
"onmouseleave",
"onmousemove",
"onmouseout",
"onmouseover",
"onmouseup",
"onwheel",
"onpause",
"onplay",
"onplaying",
"onprogress",
"onratechange",
"onreset",
"onresize",
"onscroll",
"onsecuritypolicyviolation",
"onseeked",
"onseeking",
"onselect",
"onslotchange",
"onstalled",
"onsubmit",
"onsuspend",
"ontimeupdate",
"ontoggle",
"onvolumechange",
"onwaiting",
"onwebkitanimationend",
"onwebkitanimationiteration",
"onwebkitanimationstart",
"onwebkittransitionend",
// Event handlers for <body/> and <frameset/> elements
"onafterprint",
"onbeforeprint",
"onbeforeunload",
"onhashchange",
"onlanguagechange",
"onmessage",
"onmessageerror",
"onoffline",
"ononline",
"onpagehide",
"onpageshow",
"onpopstate",
"onrejectionhandled",
"onstorage",
"onunhandledrejection",
"onunload",
// Event handlers for <html/> element
"oncut",
"oncopy",
"onpaste",
];
// Returns true if DOM attribute name matches a native JavaScript event handler
pub fn attr_is_event_handler(attr_name: &str) -> bool {
JS_DOM_EVENT_ATTRS.contains(&attr_name.to_lowercase().as_str())
JS_DOM_EVENT_ATTRS
.iter()
.find(|a| attr_name.eq_ignore_ascii_case(a))
.is_some()
}

View File

@@ -1,9 +1,5 @@
extern crate html5ever;
#[macro_use]
extern crate lazy_static;
extern crate regex;
extern crate reqwest;
extern crate url;
#[macro_use]
mod macros;

View File

@@ -1,104 +1,111 @@
#[macro_use]
extern crate clap;
extern crate monolith;
extern crate reqwest;
mod args;
mod macros;
use args::AppArgs;
use crate::args::AppArgs;
use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets};
use monolith::http::retrieve_asset;
use monolith::utils::is_valid_url;
use reqwest::blocking::Client;
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
use std::collections::HashMap;
use std::fs::{remove_file, File};
use std::io::{Error, Write};
use std::fs::File;
use std::io::{self, Error, Write};
use std::process;
use std::time::Duration;
fn create_file(file_path: &String, content: String) -> Result<(), Error> {
let file = File::create(file_path.as_str());
enum Output {
Stdout(io::Stdout),
File(File),
}
let mut file = match file {
Ok(file) => file,
Err(error) => return Err(error),
};
if content != str!() {
file.write_all(content.as_bytes())?;
file.write_all("\n".as_bytes())?;
file.sync_all()?;
} else {
// Remove the file right away if it had no content
remove_file(file_path.as_str())?;
impl Output {
fn new(file_path: &str) -> Result<Output, Error> {
if file_path.is_empty() {
Ok(Output::Stdout(io::stdout()))
} else {
Ok(Output::File(File::create(file_path)?))
}
}
Ok(())
fn writeln_str(&mut self, s: &str) -> Result<(), Error> {
match self {
Output::Stdout(stdout) => {
writeln!(stdout, "{}", s)?;
stdout.flush()
}
Output::File(f) => {
writeln!(f, "{}", s)?;
f.flush()
}
}
}
}
fn main() {
let app_args = AppArgs::get();
let cache = &mut HashMap::new();
// Attempt to create output file
if app_args.output != str!() {
create_file(&app_args.output, str!()).unwrap();
if !is_valid_url(app_args.url_target.as_str()) {
eprintln!(
"Only HTTP and HTTPS URLs are allowed but got: {}",
&app_args.url_target
);
process::exit(1);
}
if is_valid_url(app_args.url_target.as_str()) {
// Initialize client
let mut header_map = HeaderMap::new();
match HeaderValue::from_str(&app_args.user_agent) {
Ok(header) => header_map.insert(USER_AGENT, header),
Err(err) => {
eprintln!("Invalid user agent! {}", err);
return;
}
};
let client = reqwest::Client::builder()
.timeout(Duration::from_secs(10))
.danger_accept_invalid_certs(app_args.insecure)
.default_headers(header_map)
.build()
.expect("Failed to initialize HTTP client");
let mut output = Output::new(&app_args.output).expect("Could not prepare output");
// Retrieve root document
let (data, final_url) = retrieve_asset(
cache,
&client,
app_args.url_target.as_str(),
false,
"",
app_args.silent,
)
.unwrap();
let dom = html_to_dom(&data);
// Initialize client
let mut cache = HashMap::new();
let mut header_map = HeaderMap::new();
header_map.insert(
USER_AGENT,
HeaderValue::from_str(&app_args.user_agent).expect("Invalid User-Agent header specified"),
);
walk_and_embed_assets(
cache,
&client,
&final_url,
&dom.document,
app_args.no_css,
app_args.no_js,
app_args.no_images,
app_args.silent,
app_args.no_frames,
);
let client = Client::builder()
.timeout(Duration::from_secs(10))
.danger_accept_invalid_certs(app_args.insecure)
.default_headers(header_map)
.build()
.expect("Failed to initialize HTTP client");
let html: String = stringify_document(
&dom.document,
app_args.no_css,
app_args.no_frames,
app_args.no_js,
app_args.no_images,
app_args.isolate,
);
// Retrieve root document
let (data, final_url) = retrieve_asset(
&mut cache,
&client,
app_args.url_target.as_str(),
false,
"",
app_args.silent,
)
.expect("Could not retrieve assets in HTML");
let dom = html_to_dom(&data);
if app_args.output == str!() {
println!("{}", html);
} else {
create_file(&app_args.output, html).unwrap();
}
}
walk_and_embed_assets(
&mut cache,
&client,
&final_url,
&dom.document,
app_args.no_css,
app_args.no_js,
app_args.no_images,
app_args.silent,
app_args.no_frames,
);
let html: String = stringify_document(
&dom.document,
app_args.no_css,
app_args.no_frames,
app_args.no_js,
app_args.no_images,
app_args.isolate,
);
output
.writeln_str(&html)
.expect("Could not write HTML output");
}

View File

@@ -3,6 +3,7 @@ use crate::html::{
};
use html5ever::rcdom::{Handle, NodeData};
use html5ever::serialize::{serialize, SerializeOpts};
use reqwest::blocking::Client;
use std::collections::HashMap;
#[test]
@@ -33,7 +34,8 @@ fn test_get_parent_node_name() {
}
NodeData::Element { ref name, .. } => {
let node_name = name.local.as_ref().to_string();
let parent_node_name = get_node_name(&get_parent_node(node));
let parent = get_parent_node(node);
let parent_node_name = get_node_name(&parent);
if node_name == "head" || node_name == "body" {
assert_eq!(parent_node_name, "html");
} else if node_name == "div" {
@@ -71,7 +73,7 @@ fn test_walk_and_embed_assets() {
let opt_no_images: bool = false;
let opt_silent = true;
let client = reqwest::Client::new();
let client = Client::new();
walk_and_embed_assets(
cache,
@@ -107,7 +109,7 @@ fn test_walk_and_embed_assets_ensure_no_recursive_iframe() {
let opt_no_images: bool = false;
let opt_silent = true;
let client = reqwest::Client::new();
let client = Client::new();
walk_and_embed_assets(
cache,
@@ -144,7 +146,7 @@ fn test_walk_and_embed_assets_no_css() {
let opt_no_js: bool = false;
let opt_no_images: bool = false;
let opt_silent = true;
let client = reqwest::Client::new();
let client = Client::new();
walk_and_embed_assets(
cache,
@@ -189,7 +191,7 @@ fn test_walk_and_embed_assets_no_images() {
let opt_no_images: bool = true;
let opt_silent = true;
let client = reqwest::Client::new();
let client = Client::new();
walk_and_embed_assets(
cache,
@@ -235,7 +237,7 @@ fn test_walk_and_embed_assets_no_frames() {
let opt_no_js: bool = false;
let opt_no_images: bool = false;
let opt_silent = true;
let client = reqwest::Client::new();
let client = Client::new();
walk_and_embed_assets(
cache,
@@ -274,7 +276,7 @@ fn test_walk_and_embed_assets_no_js() {
let opt_no_images: bool = false;
let opt_silent = true;
let client = reqwest::Client::new();
let client = Client::new();
walk_and_embed_assets(
cache,
@@ -306,7 +308,7 @@ fn test_walk_and_embed_with_no_integrity() {
let dom = html_to_dom(&html);
let url = "http://localhost";
let cache = &mut HashMap::new();
let client = reqwest::Client::new();
let client = Client::new();
let opt_no_css: bool = true;
let opt_no_frames: bool = true;
let opt_no_js: bool = true;

View File

@@ -1,9 +1,11 @@
use crate::http::retrieve_asset;
use reqwest::blocking::Client;
use std::collections::HashMap;
#[test]
fn test_retrieve_asset() {
let cache = &mut HashMap::new();
let client = reqwest::Client::new();
let client = Client::new();
let (data, final_url) =
retrieve_asset(cache, &client, "data:text/html;base64,...", true, "", false).unwrap();
assert_eq!(&data, "data:text/html;base64,...");

View File

@@ -1,9 +1,7 @@
extern crate base64;
use self::base64::encode;
use http::retrieve_asset;
use crate::http::retrieve_asset;
use base64::encode;
use regex::Regex;
use reqwest::Client;
use reqwest::blocking::Client;
use std::collections::HashMap;
use url::{ParseError, Url};