Compare commits
11 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5ba8931502 | ||
|
|
13d2ea1607 | ||
|
|
88ffde0c3b | ||
|
|
bfb97bd062 | ||
|
|
295931041c | ||
|
|
2e623dd9f8 | ||
|
|
169b9657e5 | ||
|
|
dab4ae6965 | ||
|
|
c7fc121c7c | ||
|
|
292221ea28 | ||
|
|
feb37f5812 |
19
.travis.yml
19
.travis.yml
@@ -4,23 +4,24 @@ cache: cargo
|
||||
sudo: false
|
||||
|
||||
os:
|
||||
- linux
|
||||
- osx
|
||||
- linux
|
||||
- osx
|
||||
|
||||
rust:
|
||||
- stable
|
||||
- beta
|
||||
- nightly
|
||||
- stable
|
||||
- beta
|
||||
- nightly
|
||||
|
||||
before_script:
|
||||
- rustup component add rustfmt
|
||||
- rustup component add rustfmt
|
||||
|
||||
script:
|
||||
- cargo build --all --locked --verbose
|
||||
- cargo test --all --locked --verbose
|
||||
- cargo fmt --all -- --check
|
||||
- cargo build --all --locked --verbose
|
||||
- cargo test --all --locked --verbose
|
||||
- cargo fmt --all -- --check
|
||||
|
||||
jobs:
|
||||
allow_failures:
|
||||
- rust: beta
|
||||
- rust: nightly
|
||||
fast_finish: true
|
||||
|
||||
2
Cargo.lock
generated
2
Cargo.lock
generated
@@ -625,7 +625,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "monolith"
|
||||
version = "2.1.0"
|
||||
version = "2.1.1"
|
||||
dependencies = [
|
||||
"base64 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "monolith"
|
||||
version = "2.1.0"
|
||||
version = "2.1.1"
|
||||
authors = [
|
||||
"Sunshine <sunshine@uberspace.net>",
|
||||
"Mahdi Robatipoor <mahdi.robatipoor@gmail.com>",
|
||||
|
||||
@@ -29,7 +29,7 @@ If compared to saving websites with `wget -mpk`, this tool embeds all assets as
|
||||
$ brew install monolith
|
||||
|
||||
## Usage
|
||||
$ monolith https://lyrics.github.io/db/p/portishead/dummy/roads/ > portishead-roads-lyrics.html
|
||||
$ monolith https://lyrics.github.io/db/p/portishead/dummy/roads/ -o portishead-roads-lyrics.html
|
||||
|
||||
## Options
|
||||
- `-c`: Ignore styles
|
||||
@@ -38,6 +38,7 @@ If compared to saving websites with `wget -mpk`, this tool embeds all assets as
|
||||
- `-I`: Isolate document
|
||||
- `-j`: Exclude JavaScript
|
||||
- `-k`: Accept invalid X.509 (TLS) certificates
|
||||
- `-o`: Write output to file
|
||||
- `-s`: Silent mode
|
||||
- `-u`: Specify custom User-Agent
|
||||
|
||||
|
||||
@@ -94,6 +94,7 @@ environment:
|
||||
# or test failure in the matching channels/targets from failing the entire build.
|
||||
matrix:
|
||||
allow_failures:
|
||||
- channel: beta
|
||||
- channel: nightly
|
||||
|
||||
# If you only care about stable channel build failures, uncomment the following line:
|
||||
|
||||
@@ -9,6 +9,7 @@ pub struct AppArgs {
|
||||
pub no_js: bool,
|
||||
pub insecure: bool,
|
||||
pub isolate: bool,
|
||||
pub output: String,
|
||||
pub silent: bool,
|
||||
pub user_agent: String,
|
||||
}
|
||||
@@ -36,6 +37,7 @@ impl AppArgs {
|
||||
.args_from_usage("-I, --isolate 'Cut off from the Internet'")
|
||||
.args_from_usage("-j, --no-js 'Exclude JavaScript'")
|
||||
.args_from_usage("-k, --insecure 'Accept invalid X.509 (TLS) certificates'")
|
||||
.args_from_usage("-o, --output=[document.html] 'Write output to <file>'")
|
||||
.args_from_usage("-s, --silent 'Suppress verbosity'")
|
||||
.args_from_usage("-u, --user-agent=[Iceweasel] 'Custom User-Agent string'")
|
||||
// .args_from_usage("-v, --include-video 'Embed video sources'")
|
||||
@@ -53,6 +55,7 @@ impl AppArgs {
|
||||
app_args.insecure = app.is_present("insecure");
|
||||
app_args.isolate = app.is_present("isolate");
|
||||
app_args.silent = app.is_present("silent");
|
||||
app_args.output = app.value_of("output").unwrap_or("").to_string();
|
||||
app_args.user_agent = app
|
||||
.value_of("user-agent")
|
||||
.unwrap_or_else(|| DEFAULT_USER_AGENT)
|
||||
|
||||
93
src/html.rs
93
src/html.rs
@@ -2,7 +2,7 @@ use html5ever::interface::QualName;
|
||||
use html5ever::parse_document;
|
||||
use html5ever::rcdom::{Handle, NodeData, RcDom};
|
||||
use html5ever::serialize::{serialize, SerializeOpts};
|
||||
use html5ever::tendril::{format_tendril, TendrilSink};
|
||||
use html5ever::tendril::{format_tendril, Tendril, TendrilSink};
|
||||
use html5ever::tree_builder::{Attribute, TreeSink};
|
||||
use html5ever::{local_name, namespace_url, ns};
|
||||
use http::retrieve_asset;
|
||||
@@ -79,6 +79,17 @@ pub fn walk_and_embed_assets(
|
||||
"link" => {
|
||||
let mut link_type: &str = "";
|
||||
|
||||
// Remove integrity attributes
|
||||
let mut i = 0;
|
||||
while i < attrs_mut.len() {
|
||||
let attr_name = attrs_mut[i].name.local.as_ref();
|
||||
if attr_name.eq_ignore_ascii_case("integrity") {
|
||||
attrs_mut.remove(i);
|
||||
} else {
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "rel" {
|
||||
if is_icon(&attr.value.to_string()) {
|
||||
@@ -168,35 +179,50 @@ pub fn walk_and_embed_assets(
|
||||
}
|
||||
}
|
||||
"img" => {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
if &attr.name.local == "src" {
|
||||
let value = attr.value.to_string();
|
||||
|
||||
// Ignore images with empty source
|
||||
if value == str!() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if opt_no_images {
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(TRANSPARENT_PIXEL);
|
||||
} else {
|
||||
let src_full_url: String =
|
||||
resolve_url(&url, &value).unwrap_or(str!());
|
||||
let (img_dataurl, _) = retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&src_full_url,
|
||||
true,
|
||||
"",
|
||||
opt_silent,
|
||||
)
|
||||
.unwrap_or((str!(), str!()));
|
||||
attr.value.clear();
|
||||
attr.value.push_slice(img_dataurl.as_str());
|
||||
}
|
||||
// Find source tags
|
||||
let mut found_src: Option<Attribute> = None;
|
||||
let mut found_datasrc: Option<Attribute> = None;
|
||||
let mut i = 0;
|
||||
while i < attrs_mut.len() {
|
||||
let name = attrs_mut[i].name.local.as_ref();
|
||||
if name.eq_ignore_ascii_case("src") {
|
||||
found_src = Some(attrs_mut.remove(i));
|
||||
} else if name.eq_ignore_ascii_case("data-src") {
|
||||
found_datasrc = Some(attrs_mut.remove(i));
|
||||
} else {
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// If images are disabled, clear both sources
|
||||
if opt_no_images {
|
||||
attrs_mut.push(Attribute {
|
||||
name: QualName::new(None, ns!(), local_name!("src")),
|
||||
value: Tendril::from_slice(TRANSPARENT_PIXEL),
|
||||
});
|
||||
} else if let Some((dataurl, _)) = (&found_datasrc)
|
||||
.into_iter()
|
||||
.chain(&found_src) // Give dataurl priority
|
||||
.map(|attr| &attr.value)
|
||||
.filter(|src| !src.is_empty()) // Ignore empty srcs
|
||||
.next()
|
||||
.and_then(|src| resolve_url(&url, src).ok()) // Make absolute
|
||||
.and_then(|abs_src| // Download and convert to dataurl
|
||||
retrieve_asset(
|
||||
cache,
|
||||
client,
|
||||
&abs_src,
|
||||
true,
|
||||
"",
|
||||
opt_silent,
|
||||
).ok())
|
||||
{
|
||||
// Add the new dataurl src attribute
|
||||
attrs_mut.push(Attribute {
|
||||
name: QualName::new(None, ns!(), local_name!("src")),
|
||||
value: Tendril::from_slice(dataurl.as_ref()),
|
||||
});
|
||||
}
|
||||
}
|
||||
"source" => {
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
@@ -248,6 +274,17 @@ pub fn walk_and_embed_assets(
|
||||
}
|
||||
}
|
||||
"script" => {
|
||||
// Remove integrity attributes
|
||||
let mut i = 0;
|
||||
while i < attrs_mut.len() {
|
||||
let attr_name = attrs_mut[i].name.local.as_ref();
|
||||
if attr_name.eq_ignore_ascii_case("integrity") {
|
||||
attrs_mut.remove(i);
|
||||
} else {
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
if opt_no_js {
|
||||
// Empty src and inner content of SCRIPT tags
|
||||
for attr in attrs_mut.iter_mut() {
|
||||
|
||||
14
src/http.rs
14
src/http.rs
@@ -1,7 +1,7 @@
|
||||
use reqwest::header::CONTENT_TYPE;
|
||||
use reqwest::Client;
|
||||
use std::collections::HashMap;
|
||||
use utils::{data_to_dataurl, is_data_url};
|
||||
use utils::{clean_url, data_to_dataurl, is_data_url};
|
||||
|
||||
pub fn retrieve_asset(
|
||||
cache: &mut HashMap<String, String>,
|
||||
@@ -11,15 +11,17 @@ pub fn retrieve_asset(
|
||||
mime: &str,
|
||||
opt_silent: bool,
|
||||
) -> Result<(String, String), reqwest::Error> {
|
||||
let cache_key = clean_url(&url);
|
||||
|
||||
if is_data_url(&url).unwrap() {
|
||||
Ok((url.to_string(), url.to_string()))
|
||||
} else {
|
||||
if cache.contains_key(&url.to_string()) {
|
||||
if cache.contains_key(&cache_key) {
|
||||
// url is in cache
|
||||
if !opt_silent {
|
||||
eprintln!("{} (from cache)", &url);
|
||||
}
|
||||
let data = cache.get(&url.to_string()).unwrap();
|
||||
let data = cache.get(&cache_key).unwrap();
|
||||
Ok((data.to_string(), url.to_string()))
|
||||
} else {
|
||||
// url not in cache, we request it
|
||||
@@ -33,6 +35,8 @@ pub fn retrieve_asset(
|
||||
}
|
||||
}
|
||||
|
||||
let new_cache_key = clean_url(response.url().to_string());
|
||||
|
||||
if as_dataurl {
|
||||
// Convert response into a byte array
|
||||
let mut data: Vec<u8> = vec![];
|
||||
@@ -50,12 +54,12 @@ pub fn retrieve_asset(
|
||||
};
|
||||
let dataurl = data_to_dataurl(&mimetype, &data);
|
||||
// insert in cache
|
||||
cache.insert(response.url().to_string(), dataurl.to_string());
|
||||
cache.insert(new_cache_key, dataurl.to_string());
|
||||
Ok((dataurl, response.url().to_string()))
|
||||
} else {
|
||||
let content = response.text().unwrap();
|
||||
// insert in cache
|
||||
cache.insert(response.url().to_string(), content.clone());
|
||||
cache.insert(new_cache_key, content.clone());
|
||||
Ok((content, response.url().to_string()))
|
||||
}
|
||||
}
|
||||
|
||||
36
src/main.rs
36
src/main.rs
@@ -4,6 +4,7 @@ extern crate monolith;
|
||||
extern crate reqwest;
|
||||
|
||||
mod args;
|
||||
mod macros;
|
||||
|
||||
use args::AppArgs;
|
||||
use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets};
|
||||
@@ -11,11 +12,39 @@ use monolith::http::retrieve_asset;
|
||||
use monolith::utils::is_valid_url;
|
||||
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
|
||||
use std::collections::HashMap;
|
||||
use std::fs::{remove_file, File};
|
||||
use std::io::{Error, Write};
|
||||
use std::time::Duration;
|
||||
|
||||
/// Writes `content`, followed by a single newline, to the file at `file_path`.
///
/// The file is created (or truncated if it already exists) before anything is
/// written. When `content` is empty nothing is written and the file that was
/// just created is removed again, so a call with an empty string only attempts
/// to create the output file without leaving it behind.
///
/// # Errors
///
/// Returns the underlying I/O error if the file cannot be created, written,
/// synced to disk, or removed.
fn create_file(file_path: &str, content: String) -> Result<(), Error> {
    let mut file = File::create(file_path)?;

    if content.is_empty() {
        // Close the handle first so the removal never races with an open descriptor
        drop(file);
        // Remove the file right away if it had no content
        remove_file(file_path)?;
    } else {
        file.write_all(content.as_bytes())?;
        file.write_all(b"\n")?;
        file.sync_all()?;
    }

    Ok(())
}
|
||||
|
||||
fn main() {
|
||||
let app_args = AppArgs::get();
|
||||
let cache = &mut HashMap::new();
|
||||
|
||||
// Attempt to create output file
|
||||
if app_args.output != str!() {
|
||||
create_file(&app_args.output, str!()).unwrap();
|
||||
}
|
||||
|
||||
if is_valid_url(app_args.url_target.as_str()) {
|
||||
// Initialize client
|
||||
let mut header_map = HeaderMap::new();
|
||||
@@ -33,6 +62,7 @@ fn main() {
|
||||
.build()
|
||||
.expect("Failed to initialize HTTP client");
|
||||
|
||||
// Retrieve root document
|
||||
let (data, final_url) = retrieve_asset(
|
||||
cache,
|
||||
&client,
|
||||
@@ -65,6 +95,10 @@ fn main() {
|
||||
app_args.isolate,
|
||||
);
|
||||
|
||||
println!("{}", html);
|
||||
if app_args.output == str!() {
|
||||
println!("{}", html);
|
||||
} else {
|
||||
create_file(&app_args.output, html).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -298,6 +298,45 @@ fn test_walk_and_embed_assets_no_js() {
|
||||
);
|
||||
}
|
||||
|
||||
// Checks that `integrity` attributes are stripped from LINK and SCRIPT
// elements when the document is walked.
#[test]
fn test_walk_and_embed_with_no_integrity() {
    let cache = &mut HashMap::new();
    let client = reqwest::Client::new();
    let url = "http://localhost";

    let html = "<title>No integrity</title>\
                <link integrity=\"sha384-...\" rel=\"something\"/>\
                <script integrity=\"sha384-...\" src=\"some.js\"></script>";
    let dom = html_to_dom(&html);

    // All of the opt-out flags are switched on for this run
    let (opt_no_css, opt_no_js, opt_no_images, opt_no_frames) = (true, true, true, true);
    let opt_silent = true;

    walk_and_embed_assets(
        cache,
        &client,
        &url,
        &dom.document,
        opt_no_css,
        opt_no_js,
        opt_no_images,
        opt_silent,
        opt_no_frames,
    );

    let mut serialized: Vec<u8> = Vec::new();
    serialize(&mut serialized, &dom.document, SerializeOpts::default()).unwrap();
    let actual: String = serialized.iter().map(|&byte| char::from(byte)).collect();

    assert_eq!(
        actual,
        "<html>\
         <head><title>No integrity</title><link rel=\"something\"><script src=\"\"></script></head>\
         <body></body>\
         </html>"
    );
}
|
||||
|
||||
#[test]
|
||||
fn test_stringify_document() {
|
||||
let html = "<div><script src=\"some.js\"></script></div>";
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use crate::utils::{
|
||||
data_to_dataurl, detect_mimetype, is_data_url, is_valid_url, resolve_url, url_has_protocol,
|
||||
clean_url, data_to_dataurl, detect_mimetype, is_data_url, is_valid_url, resolve_url,
|
||||
url_has_protocol,
|
||||
};
|
||||
use url::ParseError;
|
||||
|
||||
@@ -158,3 +159,19 @@ fn test_is_data_url() {
|
||||
assert!(!is_data_url("//kernel.org").unwrap_or(false));
|
||||
assert!(!is_data_url("").unwrap_or(false));
|
||||
}
|
||||
|
||||
// `clean_url` must drop the fragment, and an empty query string along with it.
#[test]
fn test_clean_url() {
    let expected = "https://somewhere.com/font.eot";
    let inputs = [
        "https://somewhere.com/font.eot#iefix",
        "https://somewhere.com/font.eot#",
        "https://somewhere.com/font.eot?#",
    ];

    for input in inputs.iter() {
        assert_eq!(clean_url(*input), expected);
    }
}
|
||||
|
||||
11
src/utils.rs
11
src/utils.rs
@@ -196,3 +196,14 @@ pub fn resolve_css_imports(
|
||||
resolved_css
|
||||
}
|
||||
}
|
||||
|
||||
pub fn clean_url<T: AsRef<str>>(url: T) -> String {
|
||||
let mut result = Url::parse(url.as_ref()).unwrap();
|
||||
// Clear fragment
|
||||
result.set_fragment(None);
|
||||
// Get rid of stray question mark
|
||||
if result.query() == Some("") {
|
||||
result.set_query(None);
|
||||
}
|
||||
result.to_string()
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user