11 Commits

Author SHA1 Message Date
Sunshine
5ba8931502 Merge pull request #92 from snshn/output-file-option
Add option for saving output to file
2019-12-26 18:13:15 -05:00
Sunshine
13d2ea1607 Merge pull request #94 from snshn/no-integrity
Get rid of integrity attributes
2019-12-26 10:11:52 -05:00
Sunshine
88ffde0c3b wipe integrity attributes 2019-12-26 09:44:01 -05:00
Sunshine
bfb97bd062 add option for saving output to file 2019-12-26 00:45:20 -05:00
Sunshine
295931041c Merge pull request #80 from Alch-Emi/lazyload
Add support for lazy loaded images
2019-12-24 17:11:21 -05:00
Sunshine
2e623dd9f8 Merge pull request #84 from snshn/ignore-hash-in-cache-url
use clean URLs as hashmap keys
2019-12-24 17:08:57 -05:00
Sunshine
169b9657e5 ignore failures for both beta and nightly in the pipeline 2019-12-24 16:07:15 -05:00
Emi Simpson
dab4ae6965 Merged Y2Z/master with Alch-Emi/lazyload 2019-12-24 10:07:56 -05:00
Sunshine
c7fc121c7c use clean URLs as hashmap keys 2019-12-18 11:49:38 -05:00
Emi Simpson
292221ea28 Lazyloaded images are now loaded at compilation, with placeholders omitted 2019-12-09 19:40:29 -05:00
Emi Simpson
feb37f5812 Added support for lazy loaded images
Note: The way this patch works is by resolving any data-src tags on images in
the same way as normal source tags are resolved.  It is assumed that most
lazy-load libraries will use this tag, and that if this tag is set, then it is a
URL that is in use.
2019-12-06 19:27:41 -05:00
12 changed files with 195 additions and 47 deletions

View File

@@ -4,23 +4,24 @@ cache: cargo
sudo: false
os:
- linux
- osx
- linux
- osx
rust:
- stable
- beta
- nightly
- stable
- beta
- nightly
before_script:
- rustup component add rustfmt
- rustup component add rustfmt
script:
- cargo build --all --locked --verbose
- cargo test --all --locked --verbose
- cargo fmt --all -- --check
- cargo build --all --locked --verbose
- cargo test --all --locked --verbose
- cargo fmt --all -- --check
jobs:
allow_failures:
- rust: beta
- rust: nightly
fast_finish: true

2
Cargo.lock generated
View File

@@ -625,7 +625,7 @@ dependencies = [
[[package]]
name = "monolith"
version = "2.1.0"
version = "2.1.1"
dependencies = [
"base64 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)",
"clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",

View File

@@ -1,6 +1,6 @@
[package]
name = "monolith"
version = "2.1.0"
version = "2.1.1"
authors = [
"Sunshine <sunshine@uberspace.net>",
"Mahdi Robatipoor <mahdi.robatipoor@gmail.com>",

View File

@@ -29,7 +29,7 @@ If compared to saving websites with `wget -mpk`, this tool embeds all assets as
$ brew install monolith
## Usage
$ monolith https://lyrics.github.io/db/p/portishead/dummy/roads/ > portishead-roads-lyrics.html
$ monolith https://lyrics.github.io/db/p/portishead/dummy/roads/ -o portishead-roads-lyrics.html
## Options
- `-c`: Ignore styles
@@ -38,6 +38,7 @@ If compared to saving websites with `wget -mpk`, this tool embeds all assets as
- `-I`: Isolate document
- `-j`: Exclude JavaScript
- `-k`: Accept invalid X.509 (TLS) certificates
- `-o`: Write output to file
- `-s`: Silent mode
- `-u`: Specify custom User-Agent

View File

@@ -94,6 +94,7 @@ environment:
# or test failure in the matching channels/targets from failing the entire build.
matrix:
allow_failures:
- channel: beta
- channel: nightly
# If you only care about stable channel build failures, uncomment the following line:

View File

@@ -9,6 +9,7 @@ pub struct AppArgs {
pub no_js: bool,
pub insecure: bool,
pub isolate: bool,
pub output: String,
pub silent: bool,
pub user_agent: String,
}
@@ -36,6 +37,7 @@ impl AppArgs {
.args_from_usage("-I, --isolate 'Cut off from the Internet'")
.args_from_usage("-j, --no-js 'Exclude JavaScript'")
.args_from_usage("-k, --insecure 'Accept invalid X.509 (TLS) certificates'")
.args_from_usage("-o, --output=[document.html] 'Write output to <file>'")
.args_from_usage("-s, --silent 'Suppress verbosity'")
.args_from_usage("-u, --user-agent=[Iceweasel] 'Custom User-Agent string'")
// .args_from_usage("-v, --include-video 'Embed video sources'")
@@ -53,6 +55,7 @@ impl AppArgs {
app_args.insecure = app.is_present("insecure");
app_args.isolate = app.is_present("isolate");
app_args.silent = app.is_present("silent");
app_args.output = app.value_of("output").unwrap_or("").to_string();
app_args.user_agent = app
.value_of("user-agent")
.unwrap_or_else(|| DEFAULT_USER_AGENT)

View File

@@ -2,7 +2,7 @@ use html5ever::interface::QualName;
use html5ever::parse_document;
use html5ever::rcdom::{Handle, NodeData, RcDom};
use html5ever::serialize::{serialize, SerializeOpts};
use html5ever::tendril::{format_tendril, TendrilSink};
use html5ever::tendril::{format_tendril, Tendril, TendrilSink};
use html5ever::tree_builder::{Attribute, TreeSink};
use html5ever::{local_name, namespace_url, ns};
use http::retrieve_asset;
@@ -79,6 +79,17 @@ pub fn walk_and_embed_assets(
"link" => {
let mut link_type: &str = "";
// Remove integrity attributes
let mut i = 0;
while i < attrs_mut.len() {
let attr_name = attrs_mut[i].name.local.as_ref();
if attr_name.eq_ignore_ascii_case("integrity") {
attrs_mut.remove(i);
} else {
i += 1;
}
}
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "rel" {
if is_icon(&attr.value.to_string()) {
@@ -168,35 +179,50 @@ pub fn walk_and_embed_assets(
}
}
"img" => {
for attr in attrs_mut.iter_mut() {
if &attr.name.local == "src" {
let value = attr.value.to_string();
// Ignore images with empty source
if value == str!() {
continue;
}
if opt_no_images {
attr.value.clear();
attr.value.push_slice(TRANSPARENT_PIXEL);
} else {
let src_full_url: String =
resolve_url(&url, &value).unwrap_or(str!());
let (img_dataurl, _) = retrieve_asset(
cache,
client,
&src_full_url,
true,
"",
opt_silent,
)
.unwrap_or((str!(), str!()));
attr.value.clear();
attr.value.push_slice(img_dataurl.as_str());
}
// Find source tags
let mut found_src: Option<Attribute> = None;
let mut found_datasrc: Option<Attribute> = None;
let mut i = 0;
while i < attrs_mut.len() {
let name = attrs_mut[i].name.local.as_ref();
if name.eq_ignore_ascii_case("src") {
found_src = Some(attrs_mut.remove(i));
} else if name.eq_ignore_ascii_case("data-src") {
found_datasrc = Some(attrs_mut.remove(i));
} else {
i += 1;
}
}
// If images are disabled, clear both sources
if opt_no_images {
attrs_mut.push(Attribute {
name: QualName::new(None, ns!(), local_name!("src")),
value: Tendril::from_slice(TRANSPARENT_PIXEL),
});
} else if let Some((dataurl, _)) = (&found_datasrc)
.into_iter()
.chain(&found_src) // Give dataurl priority
.map(|attr| &attr.value)
.filter(|src| !src.is_empty()) // Ignore empty srcs
.next()
.and_then(|src| resolve_url(&url, src).ok()) // Make absolute
.and_then(|abs_src| // Download and convert to dataurl
retrieve_asset(
cache,
client,
&abs_src,
true,
"",
opt_silent,
).ok())
{
// Add the new dataurl src attribute
attrs_mut.push(Attribute {
name: QualName::new(None, ns!(), local_name!("src")),
value: Tendril::from_slice(dataurl.as_ref()),
});
}
}
"source" => {
for attr in attrs_mut.iter_mut() {
@@ -248,6 +274,17 @@ pub fn walk_and_embed_assets(
}
}
"script" => {
// Remove integrity attributes
let mut i = 0;
while i < attrs_mut.len() {
let attr_name = attrs_mut[i].name.local.as_ref();
if attr_name.eq_ignore_ascii_case("integrity") {
attrs_mut.remove(i);
} else {
i += 1;
}
}
if opt_no_js {
// Empty src and inner content of SCRIPT tags
for attr in attrs_mut.iter_mut() {

View File

@@ -1,7 +1,7 @@
use reqwest::header::CONTENT_TYPE;
use reqwest::Client;
use std::collections::HashMap;
use utils::{data_to_dataurl, is_data_url};
use utils::{clean_url, data_to_dataurl, is_data_url};
pub fn retrieve_asset(
cache: &mut HashMap<String, String>,
@@ -11,15 +11,17 @@ pub fn retrieve_asset(
mime: &str,
opt_silent: bool,
) -> Result<(String, String), reqwest::Error> {
let cache_key = clean_url(&url);
if is_data_url(&url).unwrap() {
Ok((url.to_string(), url.to_string()))
} else {
if cache.contains_key(&url.to_string()) {
if cache.contains_key(&cache_key) {
// url is in cache
if !opt_silent {
eprintln!("{} (from cache)", &url);
}
let data = cache.get(&url.to_string()).unwrap();
let data = cache.get(&cache_key).unwrap();
Ok((data.to_string(), url.to_string()))
} else {
// url not in cache, we request it
@@ -33,6 +35,8 @@ pub fn retrieve_asset(
}
}
let new_cache_key = clean_url(response.url().to_string());
if as_dataurl {
// Convert response into a byte array
let mut data: Vec<u8> = vec![];
@@ -50,12 +54,12 @@ pub fn retrieve_asset(
};
let dataurl = data_to_dataurl(&mimetype, &data);
// insert in cache
cache.insert(response.url().to_string(), dataurl.to_string());
cache.insert(new_cache_key, dataurl.to_string());
Ok((dataurl, response.url().to_string()))
} else {
let content = response.text().unwrap();
// insert in cache
cache.insert(response.url().to_string(), content.clone());
cache.insert(new_cache_key, content.clone());
Ok((content, response.url().to_string()))
}
}

View File

@@ -4,6 +4,7 @@ extern crate monolith;
extern crate reqwest;
mod args;
mod macros;
use args::AppArgs;
use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets};
@@ -11,11 +12,39 @@ use monolith::http::retrieve_asset;
use monolith::utils::is_valid_url;
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
use std::collections::HashMap;
use std::fs::{remove_file, File};
use std::io::{Error, Write};
use std::time::Duration;
/// Writes `content`, followed by a single newline, to the file at `file_path`.
///
/// The file is created first (an existing file at that path is truncated).
/// If `content` is empty, nothing is written and the freshly created file is
/// removed again, so no empty file is left behind.
///
/// # Errors
///
/// Returns the underlying I/O error if the file cannot be created, written,
/// synced to disk, or (for empty content) removed.
fn create_file(file_path: &String, content: String) -> Result<(), Error> {
    let mut file = File::create(file_path.as_str())?;

    if content.is_empty() {
        // Close our handle first so the removal does not depend on the
        // platform allowing deletion of a file that is still open
        drop(file);
        remove_file(file_path.as_str())?;
    } else {
        file.write_all(content.as_bytes())?;
        file.write_all(b"\n")?;
        // Make sure the data has reached the disk before reporting success
        file.sync_all()?;
    }

    Ok(())
}
fn main() {
let app_args = AppArgs::get();
let cache = &mut HashMap::new();
// Attempt to create output file
if app_args.output != str!() {
create_file(&app_args.output, str!()).unwrap();
}
if is_valid_url(app_args.url_target.as_str()) {
// Initialize client
let mut header_map = HeaderMap::new();
@@ -33,6 +62,7 @@ fn main() {
.build()
.expect("Failed to initialize HTTP client");
// Retrieve root document
let (data, final_url) = retrieve_asset(
cache,
&client,
@@ -65,6 +95,10 @@ fn main() {
app_args.isolate,
);
println!("{}", html);
if app_args.output == str!() {
println!("{}", html);
} else {
create_file(&app_args.output, html).unwrap();
}
}
}

View File

@@ -298,6 +298,45 @@ fn test_walk_and_embed_assets_no_js() {
);
}
#[test]
fn test_walk_and_embed_with_no_integrity() {
    // Input document: a LINK and a SCRIPT element, each carrying an integrity attribute
    let html = "<title>No integrity</title>\
                <link integrity=\"sha384-...\" rel=\"something\"/>\
                <script integrity=\"sha384-...\" src=\"some.js\"></script>";
    // Expected serialization: both integrity attributes are gone
    let expected = "<html>\
                    <head><title>No integrity</title><link rel=\"something\"><script src=\"\"></script></head>\
                    <body></body>\
                    </html>";

    let dom = html_to_dom(&html);
    let url = "http://localhost";
    let cache = &mut HashMap::new();
    let client = reqwest::Client::new();

    // Every option is switched on for this run
    let opt_no_css = true;
    let opt_no_js = true;
    let opt_no_images = true;
    let opt_silent = true;
    let opt_no_frames = true;

    walk_and_embed_assets(
        cache,
        &client,
        &url,
        &dom.document,
        opt_no_css,
        opt_no_js,
        opt_no_images,
        opt_silent,
        opt_no_frames,
    );

    // Serialize the processed DOM and turn the raw bytes into a string, one char per byte
    let mut serialized: Vec<u8> = Vec::new();
    serialize(&mut serialized, &dom.document, SerializeOpts::default()).unwrap();
    let actual: String = serialized.iter().map(|&byte| char::from(byte)).collect();

    assert_eq!(actual, expected);
}
#[test]
fn test_stringify_document() {
let html = "<div><script src=\"some.js\"></script></div>";

View File

@@ -1,5 +1,6 @@
use crate::utils::{
data_to_dataurl, detect_mimetype, is_data_url, is_valid_url, resolve_url, url_has_protocol,
clean_url, data_to_dataurl, detect_mimetype, is_data_url, is_valid_url, resolve_url,
url_has_protocol,
};
use url::ParseError;
@@ -158,3 +159,19 @@ fn test_is_data_url() {
assert!(!is_data_url("//kernel.org").unwrap_or(false));
assert!(!is_data_url("").unwrap_or(false));
}
#[test]
fn test_clean_url() {
    // (input, expected) pairs: the fragment and a stray trailing `?` must both be stripped
    let cases = [
        (
            "https://somewhere.com/font.eot#iefix",
            "https://somewhere.com/font.eot",
        ),
        (
            "https://somewhere.com/font.eot#",
            "https://somewhere.com/font.eot",
        ),
        (
            "https://somewhere.com/font.eot?#",
            "https://somewhere.com/font.eot",
        ),
    ];

    for (input, expected) in cases.iter() {
        assert_eq!(clean_url(*input), *expected);
    }
}

View File

@@ -196,3 +196,14 @@ pub fn resolve_css_imports(
resolved_css
}
}
/// Returns `url` with its fragment removed and a stray, empty query (`?`
/// followed by nothing) dropped, so that URLs differing only in those parts
/// normalize to the same string.
///
/// Input that cannot be parsed as an absolute URL is returned unchanged
/// instead of causing a panic.
pub fn clean_url<T: AsRef<str>>(url: T) -> String {
    let raw = url.as_ref();
    let mut parsed = match Url::parse(raw) {
        Ok(parsed) => parsed,
        // Nothing can be normalized on an unparsable URL; hand it back as-is
        Err(_) => return raw.to_string(),
    };

    // Clear fragment
    parsed.set_fragment(None);

    // Get rid of stray question mark
    if parsed.query() == Some("") {
        parsed.set_query(None);
    }

    parsed.to_string()
}