Compare commits
54 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f876e9243c | ||
|
|
b6896febf1 | ||
|
|
29d2ba5857 | ||
|
|
8b1ebc7871 | ||
|
|
d753c83c76 | ||
|
|
47a825f5ed | ||
|
|
0e12cecd85 | ||
|
|
d8def879b2 | ||
|
|
0420854ed6 | ||
|
|
d47482fcd9 | ||
|
|
b68624f2f3 | ||
|
|
a9d114d04d | ||
|
|
4e4ebe9c98 | ||
|
|
429217d8f7 | ||
|
|
1779f4a374 | ||
|
|
26e89ae6d3 | ||
|
|
b333d19d04 | ||
|
|
c1dc798ded | ||
|
|
69d99b69e8 | ||
|
|
aae53d20f0 | ||
|
|
14cf2ce8a6 | ||
|
|
67b79e92f9 | ||
|
|
b51f41fe34 | ||
|
|
6f158dc6db | ||
|
|
8d7052b39c | ||
|
|
660511b8a0 | ||
|
|
929512f4f5 | ||
|
|
a46d89cefc | ||
|
|
f93646e17a | ||
|
|
9d14b6dfea | ||
|
|
9783b96524 | ||
|
|
106efe58ce | ||
|
|
6e99ad13e7 | ||
|
|
413dd66886 | ||
|
|
dc7ec6e7a8 | ||
|
|
ed879231af | ||
|
|
ddf4b8ac13 | ||
|
|
84c13f0605 | ||
|
|
ce03e0e487 | ||
|
|
63e19998d0 | ||
|
|
e3321bbb07 | ||
|
|
0a38cd0eae | ||
|
|
75fb6961ed | ||
|
|
5ba8931502 | ||
|
|
13d2ea1607 | ||
|
|
88ffde0c3b | ||
|
|
bfb97bd062 | ||
|
|
295931041c | ||
|
|
2e623dd9f8 | ||
|
|
169b9657e5 | ||
|
|
dab4ae6965 | ||
|
|
c7fc121c7c | ||
|
|
292221ea28 | ||
|
|
feb37f5812 |
@@ -95,6 +95,7 @@ environment:
|
||||
matrix:
|
||||
allow_failures:
|
||||
- channel: nightly
|
||||
- channel: beta
|
||||
|
||||
# If you only care about stable channel build failures, uncomment the following line:
|
||||
#- channel: beta
|
||||
@@ -127,4 +128,3 @@ build: false
|
||||
# environment variable.
|
||||
test_script:
|
||||
- cargo test --verbose %cargoflags%
|
||||
- cargo fmt --all -- --check
|
||||
21
.travis.yml
21
.travis.yml
@@ -4,21 +4,24 @@ cache: cargo
|
||||
sudo: false
|
||||
|
||||
os:
|
||||
- linux
|
||||
- osx
|
||||
- linux
|
||||
- osx
|
||||
|
||||
rust:
|
||||
- stable
|
||||
- beta
|
||||
- nightly
|
||||
- stable
|
||||
- beta
|
||||
- nightly
|
||||
|
||||
before_script:
|
||||
- rustup component add rustfmt
|
||||
- rustup component add rustfmt
|
||||
|
||||
script:
|
||||
- cargo build --all --locked --verbose
|
||||
- cargo test --all --locked --verbose
|
||||
- cargo fmt --all -- --check
|
||||
- cargo build --all --locked --verbose
|
||||
- cargo test --all --locked --verbose
|
||||
- |
|
||||
if [[ "$TRAVIS_RUST_VERSION" == "stable" ]]; then
|
||||
cargo fmt --all -- --check
|
||||
fi
|
||||
|
||||
jobs:
|
||||
allow_failures:
|
||||
|
||||
865
Cargo.lock
generated
865
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
10
Cargo.toml
10
Cargo.toml
@@ -1,11 +1,13 @@
|
||||
[package]
|
||||
name = "monolith"
|
||||
version = "2.1.0"
|
||||
version = "2.1.2"
|
||||
edition = "2018"
|
||||
authors = [
|
||||
"Sunshine <sunshine@uberspace.net>",
|
||||
"Mahdi Robatipoor <mahdi.robatipoor@gmail.com>",
|
||||
"Emmanuel Delaborde <th3rac25@gmail.com>",
|
||||
"Emi Simpson <emi@alchemi.dev>",
|
||||
"rhysd <lin90162@yahoo.co.jp>",
|
||||
]
|
||||
description = "CLI tool for saving web pages as a single HTML file"
|
||||
|
||||
@@ -15,5 +17,9 @@ clap = "2.33.0"
|
||||
html5ever = "0.24.1"
|
||||
lazy_static = "1.4.0"
|
||||
regex = "1.3.1"
|
||||
reqwest = "0.9.20"
|
||||
url = "2.1.0"
|
||||
|
||||
[dependencies.reqwest]
|
||||
version = "0.10.*"
|
||||
default-features = false
|
||||
features = ["default-tls", "blocking", "gzip"]
|
||||
|
||||
19
README.md
19
README.md
@@ -11,8 +11,7 @@
|
||||
|___| |__________| \____________________| |___| |___| |___|
|
||||
```
|
||||
|
||||
A data hoarder's dream come true: bundle any web page into a single HTML file.
|
||||
You can finally replace that gazillion of open tabs with a gazillion of .html files stored somewhere on your precious little drive.
|
||||
A data hoarder's dream come true: bundle any web page into a single HTML file. You can finally replace that gazillion of open tabs with a gazillion of .html files stored somewhere on your precious little drive.
|
||||
|
||||
Unlike the conventional "Save page as", `monolith` not only saves the target document, it embeds CSS, image, and JavaScript assets **all at once**, producing a single HTML5 document that is a joy to store and share.
|
||||
|
||||
@@ -25,11 +24,14 @@ If compared to saving websites with `wget -mpk`, this tool embeds all assets as
|
||||
$ cd monolith
|
||||
$ cargo install --path .
|
||||
|
||||
### On macOS (via Homebrew)
|
||||
### With Homebrew (on macOS and GNU/Linux)
|
||||
$ brew install monolith
|
||||
|
||||
### Using Snapcraft (on GNU/Linux)
|
||||
$ snap install monolith
|
||||
|
||||
## Usage
|
||||
$ monolith https://lyrics.github.io/db/p/portishead/dummy/roads/ > portishead-roads-lyrics.html
|
||||
$ monolith https://lyrics.github.io/db/P/Portishead/Dummy/Roads/ -o portishead-roads-lyrics.html
|
||||
|
||||
## Options
|
||||
- `-c`: Ignore styles
|
||||
@@ -38,11 +40,20 @@ If compared to saving websites with `wget -mpk`, this tool embeds all assets as
|
||||
- `-I`: Isolate document
|
||||
- `-j`: Exclude JavaScript
|
||||
- `-k`: Accept invalid X.509 (TLS) certificates
|
||||
- `-o`: Write output to file
|
||||
- `-s`: Silent mode
|
||||
- `-u`: Specify custom User-Agent
|
||||
|
||||
## HTTPS and HTTP proxies
|
||||
Please set `https_proxy`, `http_proxy` and `no_proxy` environment variables.
|
||||
|
||||
## Contributing
|
||||
Please open an issue if something is wrong; that helps make this project better.
|
||||
|
||||
## Related projects
|
||||
- `Monolith Chrome Extension`: https://github.com/rhysd/monolith-of-web
|
||||
- `Pagesaver`: https://github.com/distributed-mind/pagesaver
|
||||
- `Personal WayBack Machine`: https://github.com/popey/pwbm
|
||||
- `SingleFile`: https://github.com/gildas-lormeau/SingleFile
|
||||
|
||||
## License
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
name: monolith
|
||||
base: core18
|
||||
version: git
|
||||
# Version data defined inside the monolith part below
|
||||
adopt-info: monolith
|
||||
summary: Monolith - Save HTML pages with ease
|
||||
description: |
|
||||
A data hoarder's dream come true: bundle any web page into a single
|
||||
@@ -17,6 +18,14 @@ description: |
|
||||
|
||||
confinement: strict
|
||||
|
||||
# Building on armhf fails, so we specify all supported non-armhf architectures
|
||||
architectures:
|
||||
- build-on: amd64
|
||||
- build-on: i386
|
||||
- build-on: arm64
|
||||
- build-on: ppc64el
|
||||
- build-on: s390x
|
||||
|
||||
parts:
|
||||
monolith:
|
||||
plugin: rust
|
||||
@@ -24,6 +33,21 @@ parts:
|
||||
build-packages:
|
||||
- libssl-dev
|
||||
- pkg-config
|
||||
override-pull: |
|
||||
snapcraftctl pull
|
||||
# Determine the current tag
|
||||
last_committed_tag="$(git describe --tags --abbrev=0)"
|
||||
last_committed_tag_ver="$(echo ${last_committed_tag} | sed 's/v//')"
|
||||
# Determine the most recent version in the beta channel in the Snap Store
|
||||
last_released_tag="$(snap info $SNAPCRAFT_PROJECT_NAME | awk '$1 == "beta:" { print $2 }')"
|
||||
# If the latest tag from the upstream project has not been released to
|
||||
# beta, build that tag instead of master.
|
||||
if [ "${last_committed_tag_ver}" != "${last_released_tag}" ]; then
|
||||
git fetch
|
||||
git checkout "${last_committed_tag}"
|
||||
fi
|
||||
# set version number of the snap based on what we did above
|
||||
snapcraftctl set-version $(git describe --tags --abbrev=0)
|
||||
|
||||
apps:
|
||||
monolith:
|
||||
|
||||
@@ -9,6 +9,7 @@ pub struct AppArgs {
|
||||
pub no_js: bool,
|
||||
pub insecure: bool,
|
||||
pub isolate: bool,
|
||||
pub output: String,
|
||||
pub silent: bool,
|
||||
pub user_agent: String,
|
||||
}
|
||||
@@ -36,6 +37,7 @@ impl AppArgs {
|
||||
.args_from_usage("-I, --isolate 'Cut off from the Internet'")
|
||||
.args_from_usage("-j, --no-js 'Exclude JavaScript'")
|
||||
.args_from_usage("-k, --insecure 'Accept invalid X.509 (TLS) certificates'")
|
||||
.args_from_usage("-o, --output=[document.html] 'Write output to <file>'")
|
||||
.args_from_usage("-s, --silent 'Suppress verbosity'")
|
||||
.args_from_usage("-u, --user-agent=[Iceweasel] 'Custom User-Agent string'")
|
||||
// .args_from_usage("-v, --include-video 'Embed video sources'")
|
||||
@@ -53,9 +55,10 @@ impl AppArgs {
|
||||
app_args.insecure = app.is_present("insecure");
|
||||
app_args.isolate = app.is_present("isolate");
|
||||
app_args.silent = app.is_present("silent");
|
||||
app_args.output = app.value_of("output").unwrap_or("").to_string();
|
||||
app_args.user_agent = app
|
||||
.value_of("user-agent")
|
||||
.unwrap_or_else(|| DEFAULT_USER_AGENT)
|
||||
.unwrap_or(DEFAULT_USER_AGENT)
|
||||
.to_string();
|
||||
app_args
|
||||
}
|
||||
|
||||
329
src/html.rs
329
src/html.rs
@@ -1,18 +1,20 @@
|
||||
use crate::http::retrieve_asset;
|
||||
use crate::js::attr_is_event_handler;
|
||||
use crate::utils::{
|
||||
data_to_dataurl, is_valid_url, resolve_css_imports, resolve_url, url_has_protocol,
|
||||
};
|
||||
use html5ever::interface::QualName;
|
||||
use html5ever::parse_document;
|
||||
use html5ever::rcdom::{Handle, NodeData, RcDom};
|
||||
use html5ever::serialize::{serialize, SerializeOpts};
|
||||
use html5ever::tendril::{format_tendril, TendrilSink};
|
||||
use html5ever::tendril::{format_tendril, Tendril, TendrilSink};
|
||||
use html5ever::tree_builder::{Attribute, TreeSink};
|
||||
use html5ever::{local_name, namespace_url, ns};
|
||||
use http::retrieve_asset;
|
||||
use js::attr_is_event_handler;
|
||||
use reqwest::Client;
|
||||
use reqwest::blocking::Client;
|
||||
use std::collections::HashMap;
|
||||
use std::default::Default;
|
||||
use utils::{data_to_dataurl, is_valid_url, resolve_css_imports, resolve_url, url_has_protocol};
|
||||
|
||||
const ICON_VALUES: [&str; 5] = [
|
||||
const ICON_VALUES: &[&str] = &[
|
||||
"icon",
|
||||
"shortcut icon",
|
||||
"mask-icon",
|
||||
@@ -29,15 +31,18 @@ pub fn get_parent_node(node: &Handle) -> Handle {
|
||||
parent.and_then(|node| node.upgrade()).unwrap()
|
||||
}
|
||||
|
||||
pub fn get_node_name(node: &Handle) -> String {
|
||||
pub fn get_node_name(node: &Handle) -> &'_ str {
|
||||
match &node.data {
|
||||
NodeData::Element { ref name, .. } => name.local.as_ref().to_string(),
|
||||
_ => str!(),
|
||||
NodeData::Element { ref name, .. } => name.local.as_ref(),
|
||||
_ => "",
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_icon(attr_value: &str) -> bool {
|
||||
ICON_VALUES.contains(&&*attr_value.to_lowercase())
|
||||
ICON_VALUES
|
||||
.iter()
|
||||
.find(|a| attr_value.eq_ignore_ascii_case(a))
|
||||
.is_some()
|
||||
}
|
||||
|
||||
pub fn walk_and_embed_assets(
|
||||
@@ -77,134 +82,185 @@ pub fn walk_and_embed_assets(
|
||||
|
||||
match name.local.as_ref() {
|
||||
"link" => {
|
||||
let mut link_type: &str = "";
|
||||
// Remove integrity attributes
|
||||
let mut i = 0;
|
||||
while i < attrs_mut.len() {
|
||||
let attr_name = attrs_mut[i].name.local.as_ref();
|
||||
if attr_name.eq_ignore_ascii_case("integrity") {
|
||||
attrs_mut.remove(i);
|
||||
} else {
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
enum LinkType {
|
||||
Icon,
|
||||
Stylesheet,
|
||||
Preload,
|
||||
DnsPrefetch,
|
||||
Unknown,
|
||||
}
|
||||
|
||||
let mut link_type = LinkType::Unknown;
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "rel" {
|
||||
if is_icon(&attr.value.to_string()) {
|
||||
link_type = "icon";
|
||||
let value = attr.value.trim();
|
||||
if is_icon(value) {
|
||||
link_type = LinkType::Icon;
|
||||
break;
|
||||
} else if attr.value.to_string() == "stylesheet" {
|
||||
link_type = "stylesheet";
|
||||
} else if value.eq_ignore_ascii_case("stylesheet") {
|
||||
link_type = LinkType::Stylesheet;
|
||||
break;
|
||||
} else if value.eq_ignore_ascii_case("preload") {
|
||||
link_type = LinkType::Preload;
|
||||
break;
|
||||
} else if value.eq_ignore_ascii_case("dns-prefetch") {
|
||||
link_type = LinkType::DnsPrefetch;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
let link_type = link_type;
|
||||
|
||||
if link_type == "icon" {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "href" {
|
||||
if opt_no_images {
|
||||
attr.value.clear();
|
||||
} else {
|
||||
let href_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(str!());
|
||||
let (favicon_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&href_full_url,
|
||||
true,
|
||||
"",
|
||||
opt_silent,
|
||||
)
|
||||
.unwrap_or((str!(), str!()));
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(favicon_dataurl.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if link_type == "stylesheet" {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "href" {
|
||||
if opt_no_css {
|
||||
attr.value.clear();
|
||||
} else {
|
||||
let href_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(str!());
|
||||
let replacement_text = match retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&href_full_url,
|
||||
false,
|
||||
"text/css",
|
||||
opt_silent,
|
||||
) {
|
||||
// On successful retrieval, traverse CSS
|
||||
Ok((css_data, _)) => resolve_css_imports(
|
||||
match link_type {
|
||||
LinkType::Icon => {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "href" {
|
||||
if opt_no_images {
|
||||
attr.value.clear();
|
||||
} else {
|
||||
let href_full_url = resolve_url(&url, attr.value.as_ref())
|
||||
.unwrap_or_default();
|
||||
let (favicon_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&css_data,
|
||||
true,
|
||||
&href_full_url,
|
||||
opt_no_images,
|
||||
true,
|
||||
"",
|
||||
opt_silent,
|
||||
),
|
||||
|
||||
// If a network error occurred, warn
|
||||
Err(e) => {
|
||||
eprintln!("Warning: {}", e,);
|
||||
|
||||
// If failed to resolve, replace with absolute URL
|
||||
href_full_url
|
||||
}
|
||||
};
|
||||
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(&replacement_text);
|
||||
)
|
||||
.unwrap_or_default();
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(favicon_dataurl.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "href" {
|
||||
let href_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
|
||||
LinkType::Stylesheet => {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "href" {
|
||||
if opt_no_css {
|
||||
attr.value.clear();
|
||||
} else {
|
||||
let href_full_url = resolve_url(&url, &attr.value.as_ref())
|
||||
.unwrap_or_default();
|
||||
let replacement_text = match retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&href_full_url,
|
||||
false,
|
||||
"text/css",
|
||||
opt_silent,
|
||||
) {
|
||||
// On successful retrieval, traverse CSS
|
||||
Ok((css_data, _)) => resolve_css_imports(
|
||||
cache,
|
||||
client,
|
||||
&css_data,
|
||||
true,
|
||||
&href_full_url,
|
||||
opt_no_images,
|
||||
opt_silent,
|
||||
),
|
||||
|
||||
// If a network error occurred, warn
|
||||
Err(e) => {
|
||||
eprintln!("Warning: {}", e);
|
||||
|
||||
// If failed to resolve, replace with absolute URL
|
||||
href_full_url
|
||||
}
|
||||
};
|
||||
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(&replacement_text);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
LinkType::Preload | LinkType::DnsPrefetch => {
|
||||
// Since all resources are embedded as data URLs, preloading and prefetching are unnecessary
|
||||
if let Some(attr) =
|
||||
attrs_mut.iter_mut().find(|a| &a.name.local == "href")
|
||||
{
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(&href_full_url.as_str());
|
||||
}
|
||||
}
|
||||
LinkType::Unknown => {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "href" {
|
||||
let href_full_url =
|
||||
resolve_url(&url, attr.value.as_ref()).unwrap_or_default();
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(&href_full_url.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
"img" => {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "src" {
|
||||
let value = attr.value.to_string();
|
||||
|
||||
// Ignore images with empty source
|
||||
if value == str!() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if opt_no_images {
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(TRANSPARENT_PIXEL);
|
||||
} else {
|
||||
let src_full_url: String =
|
||||
resolve_url(&url, &value).unwrap_or(str!());
|
||||
let (img_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&src_full_url,
|
||||
true,
|
||||
"",
|
||||
opt_silent,
|
||||
)
|
||||
.unwrap_or((str!(), str!()));
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(img_dataurl.as_str());
|
||||
}
|
||||
// Find source tags
|
||||
let mut found_src: Option<Attribute> = None;
|
||||
let mut found_datasrc: Option<Attribute> = None;
|
||||
let mut i = 0;
|
||||
while i < attrs_mut.len() {
|
||||
let name = attrs_mut[i].name.local.as_ref();
|
||||
if name.eq_ignore_ascii_case("src") {
|
||||
found_src = Some(attrs_mut.remove(i));
|
||||
} else if name.eq_ignore_ascii_case("data-src") {
|
||||
found_datasrc = Some(attrs_mut.remove(i));
|
||||
} else {
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// If images are disabled, clear both sources
|
||||
if opt_no_images {
|
||||
attrs_mut.push(Attribute {
|
||||
name: QualName::new(None, ns!(), local_name!("src")),
|
||||
value: Tendril::from_slice(TRANSPARENT_PIXEL),
|
||||
});
|
||||
} else if let Some((dataurl, _)) = found_datasrc
|
||||
.iter()
|
||||
.chain(&found_src) // Give dataurl priority
|
||||
.map(|attr| attr.value.trim())
|
||||
.filter(|src| !src.is_empty()) // Ignore empty srcs
|
||||
.next()
|
||||
.and_then(|src| resolve_url(&url, src).ok()) // Make absolute
|
||||
.and_then(|abs_src| // Download and convert to dataurl
|
||||
retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&abs_src,
|
||||
true,
|
||||
"",
|
||||
opt_silent,
|
||||
).ok())
|
||||
{
|
||||
// Add the new dataurl src attribute
|
||||
attrs_mut.push(Attribute {
|
||||
name: QualName::new(None, ns!(), local_name!("src")),
|
||||
value: Tendril::from_slice(dataurl.as_ref()),
|
||||
});
|
||||
}
|
||||
}
|
||||
"source" => {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
let attr_name: &str = &attr.name.local;
|
||||
|
||||
if attr_name == "src" {
|
||||
let src_full_url: String = resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(attr.value.to_string());
|
||||
let src_full_url = resolve_url(&url, attr.value.trim())
|
||||
.unwrap_or_else(|_| attr.value.to_string());
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(src_full_url.as_str());
|
||||
} else if attr_name == "srcset" {
|
||||
@@ -213,9 +269,8 @@ pub fn walk_and_embed_assets(
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(TRANSPARENT_PIXEL);
|
||||
} else {
|
||||
let srcset_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string())
|
||||
.unwrap_or(str!());
|
||||
let srcset_full_url =
|
||||
resolve_url(&url, attr.value.trim()).unwrap_or_default();
|
||||
let (source_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
@@ -235,19 +290,30 @@ pub fn walk_and_embed_assets(
|
||||
"a" => {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "href" {
|
||||
let attr_value = attr.value.trim();
|
||||
// Don't touch email links or hrefs which begin with a hash sign
|
||||
if attr.value.starts_with('#') || url_has_protocol(&attr.value) {
|
||||
if attr_value.starts_with('#') || url_has_protocol(attr_value) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let href_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
|
||||
let href_full_url = resolve_url(&url, attr_value).unwrap_or_default();
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(href_full_url.as_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
"script" => {
|
||||
// Remove integrity attributes
|
||||
let mut i = 0;
|
||||
while i < attrs_mut.len() {
|
||||
let attr_name = attrs_mut[i].name.local.as_ref();
|
||||
if attr_name.eq_ignore_ascii_case("integrity") {
|
||||
attrs_mut.remove(i);
|
||||
} else {
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
if opt_no_js {
|
||||
// Empty src and inner content of SCRIPT tags
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
@@ -259,8 +325,8 @@ pub fn walk_and_embed_assets(
|
||||
} else {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "src" {
|
||||
let src_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
|
||||
let src_full_url =
|
||||
resolve_url(&url, attr.value.trim()).unwrap_or_default();
|
||||
let (js_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
@@ -302,10 +368,11 @@ pub fn walk_and_embed_assets(
|
||||
"form" => {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "action" {
|
||||
let attr_value = attr.value.trim();
|
||||
// Modify action to be a full URL
|
||||
if !is_valid_url(&attr.value) {
|
||||
let href_full_url: String =
|
||||
resolve_url(&url, &attr.value.to_string()).unwrap_or(str!());
|
||||
if !is_valid_url(attr_value) {
|
||||
let href_full_url =
|
||||
resolve_url(&url, attr_value).unwrap_or_default();
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(href_full_url.as_str());
|
||||
}
|
||||
@@ -321,15 +388,14 @@ pub fn walk_and_embed_assets(
|
||||
continue;
|
||||
}
|
||||
|
||||
let iframe_src: String = attr.value.to_string();
|
||||
let iframe_src = attr.value.trim();
|
||||
|
||||
// Ignore iframes with empty source (they cause infinite loops)
|
||||
if iframe_src == str!() {
|
||||
if iframe_src.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let src_full_url: String =
|
||||
resolve_url(&url, &iframe_src).unwrap_or(str!());
|
||||
let src_full_url = resolve_url(&url, iframe_src).unwrap_or_default();
|
||||
let (iframe_data, iframe_final_url) = retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
@@ -362,18 +428,18 @@ pub fn walk_and_embed_assets(
|
||||
"video" => {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "poster" {
|
||||
let video_poster = attr.value.to_string();
|
||||
let video_poster = attr.value.trim();
|
||||
|
||||
// Skip posters with empty source
|
||||
if video_poster == str!() {
|
||||
if video_poster.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if opt_no_images {
|
||||
attr.value.clear();
|
||||
} else {
|
||||
let poster_full_url: String =
|
||||
resolve_url(&url, &video_poster).unwrap_or(str!());
|
||||
let poster_full_url =
|
||||
resolve_url(&url, video_poster).unwrap_or_default();
|
||||
let (poster_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
@@ -478,9 +544,7 @@ fn get_child_node_by_name(handle: &Handle, node_name: &str) -> Handle {
|
||||
});
|
||||
match matching_children {
|
||||
Some(node) => node.clone(),
|
||||
_ => {
|
||||
return handle.clone();
|
||||
}
|
||||
_ => handle.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -496,7 +560,7 @@ pub fn stringify_document(
|
||||
serialize(&mut buf, handle, SerializeOpts::default())
|
||||
.expect("unable to serialize DOM into buffer");
|
||||
|
||||
let mut result: String = String::from_utf8(buf).unwrap();
|
||||
let mut result = String::from_utf8(buf).unwrap();
|
||||
|
||||
if opt_isolate || opt_no_css || opt_no_frames || opt_no_js || opt_no_images {
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
@@ -520,7 +584,6 @@ pub fn stringify_document(
|
||||
if opt_no_images {
|
||||
content_attr += " img-src data:;";
|
||||
}
|
||||
content_attr = content_attr.trim().to_string();
|
||||
|
||||
let meta = dom.create_element(
|
||||
QualName::new(None, ns!(), local_name!("meta")),
|
||||
@@ -531,7 +594,7 @@ pub fn stringify_document(
|
||||
},
|
||||
Attribute {
|
||||
name: QualName::new(None, ns!(), local_name!("content")),
|
||||
value: format_tendril!("{}", content_attr),
|
||||
value: format_tendril!("{}", content_attr.trim()),
|
||||
},
|
||||
],
|
||||
Default::default(),
|
||||
|
||||
25
src/http.rs
25
src/http.rs
@@ -1,7 +1,7 @@
|
||||
use crate::utils::{clean_url, data_to_dataurl, is_data_url};
|
||||
use reqwest::blocking::Client;
|
||||
use reqwest::header::CONTENT_TYPE;
|
||||
use reqwest::Client;
|
||||
use std::collections::HashMap;
|
||||
use utils::{data_to_dataurl, is_data_url};
|
||||
|
||||
pub fn retrieve_asset(
|
||||
cache: &mut HashMap<String, String>,
|
||||
@@ -11,28 +11,33 @@ pub fn retrieve_asset(
|
||||
mime: &str,
|
||||
opt_silent: bool,
|
||||
) -> Result<(String, String), reqwest::Error> {
|
||||
let cache_key = clean_url(&url);
|
||||
|
||||
if is_data_url(&url).unwrap() {
|
||||
Ok((url.to_string(), url.to_string()))
|
||||
} else {
|
||||
if cache.contains_key(&url.to_string()) {
|
||||
if cache.contains_key(&cache_key) {
|
||||
// url is in cache
|
||||
if !opt_silent {
|
||||
eprintln!("{} (from cache)", &url);
|
||||
}
|
||||
let data = cache.get(&url.to_string()).unwrap();
|
||||
let data = cache.get(&cache_key).unwrap();
|
||||
Ok((data.to_string(), url.to_string()))
|
||||
} else {
|
||||
// url not in cache, we request it
|
||||
let mut response = client.get(url).send()?;
|
||||
let res_url = response.url().to_string();
|
||||
|
||||
if !opt_silent {
|
||||
if url == response.url().as_str() {
|
||||
if url == res_url {
|
||||
eprintln!("{}", &url);
|
||||
} else {
|
||||
eprintln!("{} -> {}", &url, &response.url().as_str());
|
||||
eprintln!("{} -> {}", &url, &res_url);
|
||||
}
|
||||
}
|
||||
|
||||
let new_cache_key = clean_url(&res_url);
|
||||
|
||||
if as_dataurl {
|
||||
// Convert response into a byte array
|
||||
let mut data: Vec<u8> = vec![];
|
||||
@@ -50,13 +55,13 @@ pub fn retrieve_asset(
|
||||
};
|
||||
let dataurl = data_to_dataurl(&mimetype, &data);
|
||||
// insert in cache
|
||||
cache.insert(response.url().to_string(), dataurl.to_string());
|
||||
Ok((dataurl, response.url().to_string()))
|
||||
cache.insert(new_cache_key, dataurl.clone());
|
||||
Ok((dataurl, res_url))
|
||||
} else {
|
||||
let content = response.text().unwrap();
|
||||
// insert in cache
|
||||
cache.insert(response.url().to_string(), content.clone());
|
||||
Ok((content, response.url().to_string()))
|
||||
cache.insert(new_cache_key, content.clone());
|
||||
Ok((content, res_url))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
111
src/js.rs
111
src/js.rs
@@ -1,32 +1,103 @@
|
||||
const JS_DOM_EVENT_ATTRS: [&str; 21] = [
|
||||
// Input
|
||||
"onfocus",
|
||||
const JS_DOM_EVENT_ATTRS: &[&str] = &[
|
||||
// From WHATWG HTML spec 8.1.5.2 'Event handlers on elements, Document objects, and Window objects':
|
||||
// https://html.spec.whatwg.org/#event-handlers-on-elements,-document-objects,-and-window-objects
|
||||
// https://html.spec.whatwg.org/#attributes-3 (table 'List of event handler content attributes')
|
||||
|
||||
// Global event handlers
|
||||
"onabort",
|
||||
"onauxclick",
|
||||
"onblur",
|
||||
"onselect",
|
||||
"oncancel",
|
||||
"oncanplay",
|
||||
"oncanplaythrough",
|
||||
"onchange",
|
||||
"onsubmit",
|
||||
"onreset",
|
||||
"onclick",
|
||||
"onclose",
|
||||
"oncontextmenu",
|
||||
"oncuechange",
|
||||
"ondblclick",
|
||||
"ondrag",
|
||||
"ondragend",
|
||||
"ondragenter",
|
||||
"ondragexit",
|
||||
"ondragleave",
|
||||
"ondragover",
|
||||
"ondragstart",
|
||||
"ondrop",
|
||||
"ondurationchange",
|
||||
"onemptied",
|
||||
"onended",
|
||||
"onerror",
|
||||
"onfocus",
|
||||
"onformdata",
|
||||
"oninput",
|
||||
"oninvalid",
|
||||
"onkeydown",
|
||||
"onkeypress",
|
||||
"onkeyup",
|
||||
// Mouse
|
||||
"onmouseover",
|
||||
"onmouseout",
|
||||
"onmousedown",
|
||||
"onmouseup",
|
||||
"onmousemove",
|
||||
// Click
|
||||
"onclick",
|
||||
"ondblclick",
|
||||
// Load
|
||||
"onload",
|
||||
"onunload",
|
||||
"onabort",
|
||||
"onerror",
|
||||
"onloadeddata",
|
||||
"onloadedmetadata",
|
||||
"onloadstart",
|
||||
"onmousedown",
|
||||
"onmouseenter",
|
||||
"onmouseleave",
|
||||
"onmousemove",
|
||||
"onmouseout",
|
||||
"onmouseover",
|
||||
"onmouseup",
|
||||
"onwheel",
|
||||
"onpause",
|
||||
"onplay",
|
||||
"onplaying",
|
||||
"onprogress",
|
||||
"onratechange",
|
||||
"onreset",
|
||||
"onresize",
|
||||
"onscroll",
|
||||
"onsecuritypolicyviolation",
|
||||
"onseeked",
|
||||
"onseeking",
|
||||
"onselect",
|
||||
"onslotchange",
|
||||
"onstalled",
|
||||
"onsubmit",
|
||||
"onsuspend",
|
||||
"ontimeupdate",
|
||||
"ontoggle",
|
||||
"onvolumechange",
|
||||
"onwaiting",
|
||||
"onwebkitanimationend",
|
||||
"onwebkitanimationiteration",
|
||||
"onwebkitanimationstart",
|
||||
"onwebkittransitionend",
|
||||
// Event handlers for <body/> and <frameset/> elements
|
||||
"onafterprint",
|
||||
"onbeforeprint",
|
||||
"onbeforeunload",
|
||||
"onhashchange",
|
||||
"onlanguagechange",
|
||||
"onmessage",
|
||||
"onmessageerror",
|
||||
"onoffline",
|
||||
"ononline",
|
||||
"onpagehide",
|
||||
"onpageshow",
|
||||
"onpopstate",
|
||||
"onrejectionhandled",
|
||||
"onstorage",
|
||||
"onunhandledrejection",
|
||||
"onunload",
|
||||
// Event handlers for <html/> element
|
||||
"oncut",
|
||||
"oncopy",
|
||||
"onpaste",
|
||||
];
|
||||
|
||||
// Returns true if DOM attribute name matches a native JavaScript event handler
|
||||
pub fn attr_is_event_handler(attr_name: &str) -> bool {
|
||||
JS_DOM_EVENT_ATTRS.contains(&attr_name.to_lowercase().as_str())
|
||||
JS_DOM_EVENT_ATTRS
|
||||
.iter()
|
||||
.find(|a| attr_name.eq_ignore_ascii_case(a))
|
||||
.is_some()
|
||||
}
|
||||
|
||||
@@ -1,9 +1,5 @@
|
||||
extern crate html5ever;
|
||||
#[macro_use]
|
||||
extern crate lazy_static;
|
||||
extern crate regex;
|
||||
extern crate reqwest;
|
||||
extern crate url;
|
||||
|
||||
#[macro_use]
|
||||
mod macros;
|
||||
|
||||
145
src/main.rs
145
src/main.rs
@@ -1,70 +1,111 @@
|
||||
#[macro_use]
|
||||
extern crate clap;
|
||||
extern crate monolith;
|
||||
extern crate reqwest;
|
||||
|
||||
mod args;
|
||||
mod macros;
|
||||
|
||||
use args::AppArgs;
|
||||
use crate::args::AppArgs;
|
||||
use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets};
|
||||
use monolith::http::retrieve_asset;
|
||||
use monolith::utils::is_valid_url;
|
||||
use reqwest::blocking::Client;
|
||||
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::io::{self, Error, Write};
|
||||
use std::process;
|
||||
use std::time::Duration;
|
||||
|
||||
enum Output {
|
||||
Stdout(io::Stdout),
|
||||
File(File),
|
||||
}
|
||||
|
||||
impl Output {
|
||||
fn new(file_path: &str) -> Result<Output, Error> {
|
||||
if file_path.is_empty() {
|
||||
Ok(Output::Stdout(io::stdout()))
|
||||
} else {
|
||||
Ok(Output::File(File::create(file_path)?))
|
||||
}
|
||||
}
|
||||
|
||||
fn writeln_str(&mut self, s: &str) -> Result<(), Error> {
|
||||
match self {
|
||||
Output::Stdout(stdout) => {
|
||||
writeln!(stdout, "{}", s)?;
|
||||
stdout.flush()
|
||||
}
|
||||
Output::File(f) => {
|
||||
writeln!(f, "{}", s)?;
|
||||
f.flush()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let app_args = AppArgs::get();
|
||||
let cache = &mut HashMap::new();
|
||||
if is_valid_url(app_args.url_target.as_str()) {
|
||||
// Initialize client
|
||||
let mut header_map = HeaderMap::new();
|
||||
match HeaderValue::from_str(&app_args.user_agent) {
|
||||
Ok(header) => header_map.insert(USER_AGENT, header),
|
||||
Err(err) => {
|
||||
eprintln!("Invalid user agent! {}", err);
|
||||
return;
|
||||
}
|
||||
};
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(10))
|
||||
.danger_accept_invalid_certs(app_args.insecure)
|
||||
.default_headers(header_map)
|
||||
.build()
|
||||
.expect("Failed to initialize HTTP client");
|
||||
|
||||
let (data, final_url) = retrieve_asset(
|
||||
cache,
|
||||
&client,
|
||||
app_args.url_target.as_str(),
|
||||
false,
|
||||
"",
|
||||
app_args.silent,
|
||||
)
|
||||
.unwrap();
|
||||
let dom = html_to_dom(&data);
|
||||
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
&client,
|
||||
&final_url,
|
||||
&dom.document,
|
||||
app_args.no_css,
|
||||
app_args.no_js,
|
||||
app_args.no_images,
|
||||
app_args.silent,
|
||||
app_args.no_frames,
|
||||
if !is_valid_url(app_args.url_target.as_str()) {
|
||||
eprintln!(
|
||||
"Only HTTP and HTTPS URLs are allowed but got: {}",
|
||||
&app_args.url_target
|
||||
);
|
||||
|
||||
let html: String = stringify_document(
|
||||
&dom.document,
|
||||
app_args.no_css,
|
||||
app_args.no_frames,
|
||||
app_args.no_js,
|
||||
app_args.no_images,
|
||||
app_args.isolate,
|
||||
);
|
||||
|
||||
println!("{}", html);
|
||||
process::exit(1);
|
||||
}
|
||||
|
||||
let mut output = Output::new(&app_args.output).expect("Could not prepare output");
|
||||
|
||||
// Initialize client
|
||||
let mut cache = HashMap::new();
|
||||
let mut header_map = HeaderMap::new();
|
||||
header_map.insert(
|
||||
USER_AGENT,
|
||||
HeaderValue::from_str(&app_args.user_agent).expect("Invalid User-Agent header specified"),
|
||||
);
|
||||
|
||||
let client = Client::builder()
|
||||
.timeout(Duration::from_secs(10))
|
||||
.danger_accept_invalid_certs(app_args.insecure)
|
||||
.default_headers(header_map)
|
||||
.build()
|
||||
.expect("Failed to initialize HTTP client");
|
||||
|
||||
// Retrieve root document
|
||||
let (data, final_url) = retrieve_asset(
|
||||
&mut cache,
|
||||
&client,
|
||||
app_args.url_target.as_str(),
|
||||
false,
|
||||
"",
|
||||
app_args.silent,
|
||||
)
|
||||
.expect("Could not retrieve assets in HTML");
|
||||
let dom = html_to_dom(&data);
|
||||
|
||||
walk_and_embed_assets(
|
||||
&mut cache,
|
||||
&client,
|
||||
&final_url,
|
||||
&dom.document,
|
||||
app_args.no_css,
|
||||
app_args.no_js,
|
||||
app_args.no_images,
|
||||
app_args.silent,
|
||||
app_args.no_frames,
|
||||
);
|
||||
|
||||
let html: String = stringify_document(
|
||||
&dom.document,
|
||||
app_args.no_css,
|
||||
app_args.no_frames,
|
||||
app_args.no_js,
|
||||
app_args.no_images,
|
||||
app_args.isolate,
|
||||
);
|
||||
|
||||
output
|
||||
.writeln_str(&html)
|
||||
.expect("Could not write HTML output");
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ use crate::html::{
|
||||
};
|
||||
use html5ever::rcdom::{Handle, NodeData};
|
||||
use html5ever::serialize::{serialize, SerializeOpts};
|
||||
use reqwest::blocking::Client;
|
||||
use std::collections::HashMap;
|
||||
|
||||
#[test]
|
||||
@@ -33,7 +34,8 @@ fn test_get_parent_node_name() {
|
||||
}
|
||||
NodeData::Element { ref name, .. } => {
|
||||
let node_name = name.local.as_ref().to_string();
|
||||
let parent_node_name = get_node_name(&get_parent_node(node));
|
||||
let parent = get_parent_node(node);
|
||||
let parent_node_name = get_node_name(&parent);
|
||||
if node_name == "head" || node_name == "body" {
|
||||
assert_eq!(parent_node_name, "html");
|
||||
} else if node_name == "div" {
|
||||
@@ -71,7 +73,7 @@ fn test_walk_and_embed_assets() {
|
||||
let opt_no_images: bool = false;
|
||||
let opt_silent = true;
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
let client = Client::new();
|
||||
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
@@ -107,7 +109,7 @@ fn test_walk_and_embed_assets_ensure_no_recursive_iframe() {
|
||||
let opt_no_images: bool = false;
|
||||
let opt_silent = true;
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
let client = Client::new();
|
||||
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
@@ -144,7 +146,7 @@ fn test_walk_and_embed_assets_no_css() {
|
||||
let opt_no_js: bool = false;
|
||||
let opt_no_images: bool = false;
|
||||
let opt_silent = true;
|
||||
let client = reqwest::Client::new();
|
||||
let client = Client::new();
|
||||
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
@@ -189,7 +191,7 @@ fn test_walk_and_embed_assets_no_images() {
|
||||
let opt_no_images: bool = true;
|
||||
let opt_silent = true;
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
let client = Client::new();
|
||||
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
@@ -235,7 +237,7 @@ fn test_walk_and_embed_assets_no_frames() {
|
||||
let opt_no_js: bool = false;
|
||||
let opt_no_images: bool = false;
|
||||
let opt_silent = true;
|
||||
let client = reqwest::Client::new();
|
||||
let client = Client::new();
|
||||
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
@@ -274,7 +276,7 @@ fn test_walk_and_embed_assets_no_js() {
|
||||
let opt_no_images: bool = false;
|
||||
let opt_silent = true;
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
let client = Client::new();
|
||||
|
||||
walk_and_embed_assets(
|
||||
cache,
|
||||
@@ -298,6 +300,45 @@ fn test_walk_and_embed_assets_no_js() {
|
||||
);
|
||||
}
|
||||
|
||||
/// Checks that `walk_and_embed_assets` leaves no `integrity` attribute on the
/// `<link>` and `<script>` elements of the serialized document when all of the
/// `no_*` options are enabled.
#[test]
fn test_walk_and_embed_with_no_integrity() {
    // Both the <link> and the <script> start out with an `integrity` attribute.
    let html = "<title>No integrity</title>\
                <link integrity=\"sha384-...\" rel=\"something\"/>\
                <script integrity=\"sha384-...\" src=\"some.js\"></script>";
    let dom = html_to_dom(html);
    let url = "http://localhost";
    let cache = &mut HashMap::new();
    let client = Client::new();
    let opt_no_css: bool = true;
    let opt_no_frames: bool = true;
    let opt_no_js: bool = true;
    let opt_no_images: bool = true;
    let opt_silent = true;

    walk_and_embed_assets(
        cache,
        &client,
        url,
        &dom.document,
        opt_no_css,
        opt_no_js,
        opt_no_images,
        opt_silent,
        opt_no_frames,
    );

    let mut buf: Vec<u8> = Vec::new();
    serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();

    // Decode the serialized bytes as UTF-8. Casting each byte to `char` would
    // reinterpret multi-byte sequences as Latin-1 and corrupt non-ASCII text.
    let serialized = String::from_utf8(buf).expect("serialized document should be valid UTF-8");

    assert_eq!(
        serialized,
        "<html>\
         <head><title>No integrity</title><link rel=\"something\"><script src=\"\"></script></head>\
         <body></body>\
         </html>"
    );
}
|
||||
|
||||
#[test]
|
||||
fn test_stringify_document() {
|
||||
let html = "<div><script src=\"some.js\"></script></div>";
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
use crate::http::retrieve_asset;
|
||||
use reqwest::blocking::Client;
|
||||
use std::collections::HashMap;
|
||||
|
||||
#[test]
|
||||
fn test_retrieve_asset() {
|
||||
let cache = &mut HashMap::new();
|
||||
let client = reqwest::Client::new();
|
||||
let client = Client::new();
|
||||
let (data, final_url) =
|
||||
retrieve_asset(cache, &client, "data:text/html;base64,...", true, "", false).unwrap();
|
||||
assert_eq!(&data, "data:text/html;base64,...");
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use crate::utils::{
|
||||
data_to_dataurl, detect_mimetype, is_data_url, is_valid_url, resolve_url, url_has_protocol,
|
||||
clean_url, data_to_dataurl, detect_mimetype, is_data_url, is_valid_url, resolve_url,
|
||||
url_has_protocol,
|
||||
};
|
||||
use url::ParseError;
|
||||
|
||||
@@ -158,3 +159,19 @@ fn test_is_data_url() {
|
||||
assert!(!is_data_url("//kernel.org").unwrap_or(false));
|
||||
assert!(!is_data_url("").unwrap_or(false));
|
||||
}
|
||||
|
||||
/// Exercises `clean_url` on URLs ending in a fragment, an empty fragment, and
/// an empty query followed by an empty fragment.
#[test]
fn test_clean_url() {
    // (input, expected) pairs; every input is expected to reduce to the bare
    // font URL.
    let cases = [
        (
            "https://somewhere.com/font.eot#iefix",
            "https://somewhere.com/font.eot",
        ),
        (
            "https://somewhere.com/font.eot#",
            "https://somewhere.com/font.eot",
        ),
        (
            "https://somewhere.com/font.eot?#",
            "https://somewhere.com/font.eot",
        ),
    ];

    for &(input, expected) in cases.iter() {
        assert_eq!(clean_url(input), expected);
    }
}
|
||||
|
||||
19
src/utils.rs
19
src/utils.rs
@@ -1,9 +1,7 @@
|
||||
extern crate base64;
|
||||
|
||||
use self::base64::encode;
|
||||
use http::retrieve_asset;
|
||||
use crate::http::retrieve_asset;
|
||||
use base64::encode;
|
||||
use regex::Regex;
|
||||
use reqwest::Client;
|
||||
use reqwest::blocking::Client;
|
||||
use std::collections::HashMap;
|
||||
use url::{ParseError, Url};
|
||||
|
||||
@@ -196,3 +194,14 @@ pub fn resolve_css_imports(
|
||||
resolved_css
|
||||
}
|
||||
}
|
||||
|
||||
pub fn clean_url<T: AsRef<str>>(url: T) -> String {
|
||||
let mut result = Url::parse(url.as_ref()).unwrap();
|
||||
// Clear fragment
|
||||
result.set_fragment(None);
|
||||
// Get rid of stray question mark
|
||||
if result.query() == Some("") {
|
||||
result.set_query(None);
|
||||
}
|
||||
result.to_string()
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user