19 Commits

Author SHA1 Message Date
Sunshine
6826b59ab9 Merge pull request #272 from snshn/new-release
New release (2.6.1)
2021-07-03 19:39:32 -10:00
Sunshine
2be725eeb5 bump version number (2.6.0 -> 2.6.1) 2021-07-03 19:33:09 -10:00
Sunshine
dd2e9ca2e5 update crates 2021-07-03 19:31:55 -10:00
Sunshine
50bccae476 Merge pull request #267 from snshn/aarch64-binary
Add GNU/Linux AArch64 CD job
2021-07-03 00:15:04 -10:00
Sunshine
b3bcb1d85b add GNU/Linux AArch64 CD job 2021-07-03 00:10:14 -10:00
Sunshine
c58d044459 Merge pull request #271 from snshn/fix-charset-detection-mechanism
Fix charset detection logic
2021-07-02 21:47:56 -10:00
Sunshine
eeaea0df16 fix use of wrong charset 2021-07-02 21:35:06 -10:00
Sunshine
2539aac4c0 Merge pull request #265 from snshn/version-bump
Bump version (2.5.0 -> 2.6.0)
2021-06-08 13:16:40 -10:00
Sunshine
03b9af543a bump version (2.5.0 -> 2.6.0) 2021-06-08 13:09:50 -10:00
Sunshine
1bb8141021 Merge pull request #264 from snshn/fixes
Fixes
2021-06-08 13:04:57 -10:00
Sunshine
4bc8043f0f account for charset when creating data URLs 2021-06-08 12:54:16 -10:00
Sunshine
5effa38392 use proper charset detection for linked assets 2021-06-08 12:25:19 -10:00
Sunshine
125aeeec3b improve validation of charset found in HTML, use genuinely infinite timeout 2021-06-08 11:50:46 -10:00
Sunshine
c938ba6a2f modify proper attribute for (i)frame elements 2021-06-08 04:49:14 -10:00
Sunshine
f354affc36 Merge pull request #263 from snshn/save-with-custom-charset
Add option for saving document using custom encoding
2021-06-08 04:15:49 -10:00
Sunshine
7686b2ea64 avoid excessive parsing of HTML into DOM 2021-06-08 03:57:28 -10:00
Sunshine
b29b9a6a7c add option for saving document using custom encoding 2021-06-08 03:39:27 -10:00
Sunshine
cbda57cfa8 Merge pull request #262 from snshn/support-more-encodings
Add support for wider range of charsets
2021-06-08 02:39:24 -10:00
Sunshine
b8aa545e8c add support for wider range of charsets 2021-06-08 02:30:15 -10:00
35 changed files with 1305 additions and 613 deletions

View File

@@ -30,20 +30,20 @@ jobs:
uses: actions/checkout@v2
- name: Prepare cross-platform environment
run: |
sudo mkdir -p /cross-build-arm
sudo mkdir -p /cross-build
sudo touch /etc/apt/sources.list.d/armhf.list
echo "deb [arch=armhf] http://ports.ubuntu.com/ubuntu-ports/ bionic main" | sudo tee -a /etc/apt/sources.list.d/armhf.list
sudo apt-get update
sudo apt-get install -y gcc-arm-linux-gnueabihf libc6-armhf-cross libc6-dev-armhf-cross
sudo apt-get download libssl1.1:armhf libssl-dev:armhf
sudo dpkg -x libssl1.1*.deb /cross-build-arm
sudo dpkg -x libssl-dev*.deb /cross-build-arm
sudo dpkg -x libssl1.1*.deb /cross-build
sudo dpkg -x libssl-dev*.deb /cross-build
rustup target add arm-unknown-linux-gnueabihf
echo "C_INCLUDE_PATH=/cross-build-arm/usr/include" >> $GITHUB_ENV
echo "OPENSSL_INCLUDE_DIR=/cross-build-arm/usr/include/arm-linux-gnueabihf" >> $GITHUB_ENV
echo "OPENSSL_LIB_DIR=/cross-build-arm/usr/lib/arm-linux-gnueabihf" >> $GITHUB_ENV
echo "C_INCLUDE_PATH=/cross-build/usr/include" >> $GITHUB_ENV
echo "OPENSSL_INCLUDE_DIR=/cross-build/usr/include/arm-linux-gnueabihf" >> $GITHUB_ENV
echo "OPENSSL_LIB_DIR=/cross-build/usr/lib/arm-linux-gnueabihf" >> $GITHUB_ENV
echo "PKG_CONFIG_ALLOW_CROSS=1" >> $GITHUB_ENV
echo "RUSTFLAGS=-C linker=arm-linux-gnueabihf-gcc -L/usr/arm-linux-gnueabihf/lib -L/cross-build-arm/usr/lib/arm-linux-gnueabihf -L/cross-build-arm/lib/arm-linux-gnueabihf" >> $GITHUB_ENV
echo "RUSTFLAGS=-C linker=arm-linux-gnueabihf-gcc -L/usr/arm-linux-gnueabihf/lib -L/cross-build/usr/lib/arm-linux-gnueabihf -L/cross-build/lib/arm-linux-gnueabihf" >> $GITHUB_ENV
- name: Build the executable
run: cargo build --release --target=arm-unknown-linux-gnueabihf
- name: Attach artifact to the release
@@ -53,6 +53,36 @@ jobs:
path: target/arm-unknown-linux-gnueabihf/release/monolith
repo-token: ${{ secrets.GITHUB_TOKEN }}
gnu_linux_aarch64:
runs-on: ubuntu-20.04
steps:
- name: Checkout the repository
uses: actions/checkout@v2
- name: Prepare cross-platform environment
run: |
sudo mkdir -p /cross-build
sudo touch /etc/apt/sources.list.d/arm64.list
echo "deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ focal main" | sudo tee -a /etc/apt/sources.list.d/arm64.list
sudo apt-get update
sudo apt-get install -y gcc-aarch64-linux-gnu libc6-arm64-cross libc6-dev-arm64-cross
sudo apt-get download libssl1.1:arm64 libssl-dev:arm64
sudo dpkg -x libssl1.1*.deb /cross-build
sudo dpkg -x libssl-dev*.deb /cross-build
rustup target add aarch64-unknown-linux-gnu
echo "C_INCLUDE_PATH=/cross-build/usr/include" >> $GITHUB_ENV
echo "OPENSSL_INCLUDE_DIR=/cross-build/usr/include/aarch64-linux-gnu" >> $GITHUB_ENV
echo "OPENSSL_LIB_DIR=/cross-build/usr/lib/aarch64-linux-gnu" >> $GITHUB_ENV
echo "PKG_CONFIG_ALLOW_CROSS=1" >> $GITHUB_ENV
echo "RUSTFLAGS=-C linker=aarch64-linux-gnu-gcc -L/usr/aarch64-linux-gnu/lib -L/cross-build/usr/lib/aarch64-linux-gnu" >> $GITHUB_ENV
- name: Build the executable
run: cargo build --release --target=aarch64-unknown-linux-gnu
- name: Attach artifact to the release
uses: Shopify/upload-to-release@1.0.0
with:
name: monolith-gnu-linux-aarch64
path: target/aarch64-unknown-linux-gnu/release/monolith
repo-token: ${{ secrets.GITHUB_TOKEN }}
gnu_linux_x86_64:
runs-on: ubuntu-18.04
steps:

166
Cargo.lock generated
View File

@@ -26,9 +26,9 @@ dependencies = [
[[package]]
name = "assert_cmd"
version = "1.0.4"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f57fec1ac7e4de72dcc69811795f1a7172ed06012f80a5d1ee651b62484f588"
checksum = "3d20831bd004dda4c7c372c19cdabff369f794a95e955b3f13fe460e3e1ae95f"
dependencies = [
"bstr",
"doc-comment",
@@ -185,9 +185,9 @@ checksum = "ea221b5284a47e40033bf9b66f35f984ec0ea2931eb03505246cd27a963f981b"
[[package]]
name = "cpufeatures"
version = "0.1.4"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed00c67cb5d0a7d64a44f6ad2668db7e7530311dd53ea79bcd4fb022c64911c8"
checksum = "66c99696f6c9dd7f35d486b9d04d7e6e202aa3e8c40d553f2fdf5e7e0c6a71ef"
dependencies = [
"libc",
]
@@ -229,10 +229,10 @@ dependencies = [
]
[[package]]
name = "difference"
version = "2.0.0"
name = "difflib"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "524cbf6897b527295dff137cec09ecf3a05f4fddffd7dfcd1585403449e74198"
checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8"
[[package]]
name = "digest"
@@ -264,6 +264,12 @@ dependencies = [
"dtoa",
]
[[package]]
name = "either"
version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
[[package]]
name = "encoding_rs"
version = "0.8.28"
@@ -434,15 +440,15 @@ dependencies = [
[[package]]
name = "hashbrown"
version = "0.9.1"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04"
checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e"
[[package]]
name = "hermit-abi"
version = "0.1.18"
version = "0.1.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "322f4de77956e22ed0e5032c359a0f1273f1f7f0d79bfa3b8ffbc730d7fbcc5c"
checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
dependencies = [
"libc",
]
@@ -497,9 +503,9 @@ checksum = "6456b8a6c8f33fee7d958fcd1b60d55b11940a79e63ae87013e6d22e26034440"
[[package]]
name = "hyper"
version = "0.14.8"
version = "0.14.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3f71a7eea53a3f8257a7b4795373ff886397178cd634430ea94e12d7fe4fe34"
checksum = "07d6baa1b441335f3ce5098ac421fb6547c46dda735ca1bc6d0153c838f9dd83"
dependencies = [
"bytes",
"futures-channel",
@@ -511,7 +517,7 @@ dependencies = [
"httparse",
"httpdate",
"itoa",
"pin-project",
"pin-project-lite",
"socket2",
"tokio",
"tower-service",
@@ -545,9 +551,9 @@ dependencies = [
[[package]]
name = "indexmap"
version = "1.6.2"
version = "1.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "824845a0bf897a9042383849b02c1bc219c2383772efcd5c6f9766fa4b81aef3"
checksum = "bc633605454125dec4b66843673f01c7df2b89479b32e0ed634e43a91cff62a5"
dependencies = [
"autocfg 1.0.1",
"hashbrown",
@@ -555,9 +561,18 @@ dependencies = [
[[package]]
name = "ipnet"
version = "2.3.0"
version = "2.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "47be2f14c678be2fdcab04ab1171db51b2762ce6f0a8ee87c8dd4a04ed216135"
checksum = "68f2d64f2edebec4ce84ad108148e67e1064789bee435edc5b60ad398714a3a9"
[[package]]
name = "itertools"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69ddb889f9d0d08a67338271fa9b62996bc788c7796a5c18cf057420aaed5eaf"
dependencies = [
"either",
]
[[package]]
name = "itoa"
@@ -582,9 +597,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
version = "0.2.95"
version = "0.2.97"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "789da6d93f1b866ffe175afc5322a4d76c038605a1c3319bb57b06967ca98a36"
checksum = "12b8adadd720df158f4d70dfe7ccc6adb0472d7c55ca83445f6a5ab3e36f8fb6"
[[package]]
name = "log"
@@ -648,9 +663,9 @@ dependencies = [
[[package]]
name = "mio"
version = "0.7.11"
version = "0.7.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf80d3e903b34e0bd7282b218398aec54e082c840d9baf8339e0080a0c542956"
checksum = "8c2bdb6314ec10835cd3293dd268473a835c02b7b352e788be788b3c6ca6bb16"
dependencies = [
"libc",
"log",
@@ -670,7 +685,7 @@ dependencies = [
[[package]]
name = "monolith"
version = "2.5.0"
version = "2.6.1"
dependencies = [
"assert_cmd",
"atty",
@@ -678,6 +693,7 @@ dependencies = [
"chrono",
"clap",
"cssparser",
"encoding_rs",
"html5ever",
"regex",
"reqwest",
@@ -749,9 +765,9 @@ dependencies = [
[[package]]
name = "once_cell"
version = "1.7.2"
version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af8b08b04175473088b46763e51ee54da5f9a164bc162f615b91bc179dbf15a3"
checksum = "692fcb63b64b1758029e0a96ee63e049ce8c5948587f2f7208df04625e5f6b56"
[[package]]
name = "opaque-debug"
@@ -761,9 +777,9 @@ checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5"
[[package]]
name = "openssl"
version = "0.10.34"
version = "0.10.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d7830286ad6a3973c0f1d9b73738f69c76b739301d0229c4b96501695cbe4c8"
checksum = "549430950c79ae24e6d02e0b7404534ecf311d94cc9f861e9e4020187d13d885"
dependencies = [
"bitflags",
"cfg-if",
@@ -781,9 +797,9 @@ checksum = "28988d872ab76095a6e6ac88d99b54fd267702734fd7ffe610ca27f533ddb95a"
[[package]]
name = "openssl-sys"
version = "0.9.63"
version = "0.9.65"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6b0d6fb7d80f877617dfcb014e605e2b5ab2fb0afdf27935219bb6bd984cb98"
checksum = "7a7907e3bfa08bb85105209cdfcb6c63d109f8f6c1ed6ca318fff5c1853fbc1d"
dependencies = [
"autocfg 1.0.1",
"cc",
@@ -880,31 +896,11 @@ dependencies = [
"siphasher 0.3.5",
]
[[package]]
name = "pin-project"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7509cc106041c40a4518d2af7a61530e1eed0e6285296a3d8c5472806ccc4a4"
dependencies = [
"pin-project-internal",
]
[[package]]
name = "pin-project-internal"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48c950132583b500556b1efd71d45b319029f2b71518d979fcc208e16b42426f"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "pin-project-lite"
version = "0.2.6"
version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc0e1f259c92177c30a4c9d177246edd0a3568b25756a977d0632cf8fa37e905"
checksum = "8d31d11c69a6b52a174b42bdc0c30e5e11670f90788b2c471c31c1d17d449443"
[[package]]
name = "pin-utils"
@@ -932,11 +928,12 @@ checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "predicates"
version = "1.0.8"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f49cfaf7fdaa3bfacc6fa3e7054e65148878354a5cfddcf661df4c851f8021df"
checksum = "c6e46ca79eb4e21e2ec14430340c71250ab69332abf85521c95d3a8bc336aa76"
dependencies = [
"difference",
"difflib",
"itertools",
"predicates-core",
]
@@ -1015,14 +1012,14 @@ dependencies = [
[[package]]
name = "rand"
version = "0.8.3"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ef9e7e66b4468674bfcb0c81af8b7fa0bb154fa9f28eb840da5c447baeb8d7e"
checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8"
dependencies = [
"libc",
"rand_chacha 0.3.0",
"rand_core 0.6.2",
"rand_hc 0.3.0",
"rand_chacha 0.3.1",
"rand_core 0.6.3",
"rand_hc 0.3.1",
]
[[package]]
@@ -1047,12 +1044,12 @@ dependencies = [
[[package]]
name = "rand_chacha"
version = "0.3.0"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e12735cf05c9e10bf21534da50a147b924d555dc7a547c42e6bb2d5b6017ae0d"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core 0.6.2",
"rand_core 0.6.3",
]
[[package]]
@@ -1081,9 +1078,9 @@ dependencies = [
[[package]]
name = "rand_core"
version = "0.6.2"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34cf66eb183df1c5876e2dcf6b13d57340741e8dc255b48e40a26de954d06ae7"
checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7"
dependencies = [
"getrandom 0.2.3",
]
@@ -1108,11 +1105,11 @@ dependencies = [
[[package]]
name = "rand_hc"
version = "0.3.0"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3190ef7066a446f2e7f42e239d161e905420ccab01eb967c9eb27d21b2322a73"
checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7"
dependencies = [
"rand_core 0.6.2",
"rand_core 0.6.3",
]
[[package]]
@@ -1188,9 +1185,9 @@ dependencies = [
[[package]]
name = "redox_syscall"
version = "0.2.8"
version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "742739e41cd49414de871ea5e549afb7e2a3ac77b589bcbebe8c82fab37147fc"
checksum = "5ab49abadf3f9e1c4bc499e8845e152ad87d2ad2d30371841171169e9d75feee"
dependencies = [
"bitflags",
]
@@ -1229,9 +1226,9 @@ dependencies = [
[[package]]
name = "reqwest"
version = "0.11.3"
version = "0.11.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2296f2fac53979e8ccbc4a1136b25dcefd37be9ed7e4a1f6b05a6029c84ff124"
checksum = "246e9f61b9bb77df069a947682be06e31ac43ea37862e244a69f177694ea6d22"
dependencies = [
"async-compression",
"base64",
@@ -1281,9 +1278,9 @@ dependencies = [
[[package]]
name = "security-framework"
version = "2.2.0"
version = "2.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3670b1d2fdf6084d192bc71ead7aabe6c06aa2ea3fbd9cc3ac111fa5c2b1bd84"
checksum = "23a2ac85147a3a11d77ecf1bc7166ec0b92febfa4461c37944e180f319ece467"
dependencies = [
"bitflags",
"core-foundation",
@@ -1294,9 +1291,9 @@ dependencies = [
[[package]]
name = "security-framework-sys"
version = "2.2.0"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3676258fd3cfe2c9a0ec99ce3038798d847ce3e4bb17746373eb9f0f1ac16339"
checksum = "7e4effb91b4b8b6fb7732e670b6cee160278ff8e6bf485c7805d9e319d76e284"
dependencies = [
"core-foundation-sys",
"libc",
@@ -1431,9 +1428,9 @@ checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
[[package]]
name = "syn"
version = "1.0.72"
version = "1.0.73"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1e8cdbefb79a9a5a65e0db8b47b723ee907b7c7f8496c76a1770b5c310bab82"
checksum = "f71489ff30030d2ae598524f61326b902466f72a0fb1a8564c001cc63425bcc7"
dependencies = [
"proc-macro2",
"quote",
@@ -1448,7 +1445,7 @@ checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22"
dependencies = [
"cfg-if",
"libc",
"rand 0.8.3",
"rand 0.8.4",
"redox_syscall",
"remove_dir_all",
"winapi",
@@ -1502,9 +1499,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
[[package]]
name = "tokio"
version = "1.6.1"
version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0a38d31d7831c6ed7aad00aa4c12d9375fd225a6dd77da1d25b707346319a975"
checksum = "570c2eb13b3ab38208130eccd41be92520388791207fde783bda7c1e8ace28d4"
dependencies = [
"autocfg 1.0.1",
"bytes",
@@ -1513,6 +1510,7 @@ dependencies = [
"mio",
"num_cpus",
"pin-project-lite",
"winapi",
]
[[package]]
@@ -1594,9 +1592,9 @@ dependencies = [
[[package]]
name = "unicode-normalization"
version = "0.1.18"
version = "0.1.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33717dca7ac877f497014e10d73f3acf948c342bee31b5ca7892faf94ccc6b49"
checksum = "d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9"
dependencies = [
"tinyvec",
]
@@ -1633,9 +1631,9 @@ checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "vcpkg"
version = "0.2.13"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "025ce40a007e1907e58d5bc1a594def78e5573bb0b1160bc389634e8f12e4faa"
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
[[package]]
name = "vec_map"

View File

@@ -1,6 +1,6 @@
[package]
name = "monolith"
version = "2.5.0"
version = "2.6.1"
authors = [
"Sunshine <sunshine@uberspace.net>",
"Mahdi Robatipoor <mahdi.robatipoor@gmail.com>",
@@ -27,15 +27,16 @@ base64 = "0.13.0"
chrono = "0.4.19" # Used for formatting creation timestamp
clap = "2.33.3"
cssparser = "0.28.1"
encoding_rs = "0.8.28"
html5ever = "0.24.1"
regex = "1.5.4" # Used for parsing srcset and NOSCRIPT
sha2 = "0.9.5" # Used for calculating checksums during integrity checks
url = "2.2.2"
[dependencies.reqwest]
version = "0.11.3"
version = "0.11.4"
default-features = false
features = ["default-tls", "blocking", "gzip"]
[dev-dependencies]
assert_cmd = "1.0.4"
assert_cmd = "1.0.7"

View File

@@ -71,6 +71,7 @@ or
- `-a`: Exclude audio sources
- `-b`: Use custom `base URL`
- `-c`: Exclude CSS
- `-C`: Save document using custom `charset`
- `-e`: Ignore network errors
- `-f`: Omit frames
- `-F`: Exclude web fonts
@@ -80,7 +81,7 @@ or
- `-k`: Accept invalid X.509 (TLS) certificates
- `-M`: Don't add timestamp and URL information
- `-n`: Extract contents of NOSCRIPT elements
- `-o`: Write output to `file`
- `-o`: Write output to `file`, use “-” for STDOUT
- `-s`: Be quiet
- `-t`: Adjust `network request timeout`
- `-u`: Provide custom `User-Agent`

View File

@@ -198,9 +198,15 @@ pub fn process_css<'a>(
options,
depth + 1,
) {
Ok((import_contents, import_final_url, _import_media_type)) => {
Ok((
import_contents,
import_final_url,
import_media_type,
import_charset,
)) => {
let mut import_data_url = create_data_url(
"text/css",
&import_media_type,
&import_charset,
embed_css(
cache,
client,
@@ -247,9 +253,9 @@ pub fn process_css<'a>(
options,
depth + 1,
) {
Ok((data, final_url, media_type)) => {
Ok((data, final_url, media_type, charset)) => {
let mut data_url =
create_data_url(&media_type, &data, &final_url);
create_data_url(&media_type, &charset, &data, &final_url);
data_url.set_fragment(resolved_url.fragment());
result.push_str(
format_quoted_string(&data_url.to_string()).as_str(),
@@ -343,9 +349,10 @@ pub fn process_css<'a>(
options,
depth + 1,
) {
Ok((css, final_url, _media_type)) => {
Ok((css, final_url, media_type, charset)) => {
let mut data_url = create_data_url(
"text/css",
&media_type,
&charset,
embed_css(
cache,
client,
@@ -381,8 +388,9 @@ pub fn process_css<'a>(
options,
depth + 1,
) {
Ok((data, final_url, media_type)) => {
let mut data_url = create_data_url(&media_type, &data, &final_url);
Ok((data, final_url, media_type, charset)) => {
let mut data_url =
create_data_url(&media_type, &charset, &data, &final_url);
data_url.set_fragment(full_url.fragment());
result
.push_str(format_quoted_string(&data_url.to_string()).as_str());

View File

@@ -1,5 +1,6 @@
use base64;
use chrono::prelude::*;
use encoding_rs::Encoding;
use html5ever::interface::QualName;
use html5ever::parse_document;
use html5ever::rcdom::{Handle, NodeData, RcDom};
@@ -18,7 +19,7 @@ use crate::css::embed_css;
use crate::js::attr_is_event_handler;
use crate::opts::Options;
use crate::url::{clean_url, create_data_url, is_url_and_has_protocol, resolve_url};
use crate::utils::retrieve_asset;
use crate::utils::{parse_content_type, retrieve_asset};
struct SrcSetItem<'a> {
path: &'a str,
@@ -31,9 +32,8 @@ pub fn add_favicon(document: &Handle, favicon_data_url: String) -> RcDom {
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, document, SerializeOpts::default())
.expect("unable to serialize DOM into buffer");
let result = String::from_utf8(buf).unwrap();
let mut dom = html_to_dom(&result);
let mut dom = html_to_dom(&buf, "utf-8".to_string());
let doc = dom.get_document();
if let Some(html) = get_child_node_by_name(&doc, "html") {
if let Some(head) = get_child_node_by_name(&html, "head") {
@@ -115,7 +115,7 @@ pub fn create_metadata_tag(url: &Url) -> String {
// Prevent credentials from getting into metadata
if clean_url.scheme() == "http" || clean_url.scheme() == "https" {
// Only HTTP(S) URLs may feature credentials
// Only HTTP(S) URLs can contain credentials
clean_url.set_username("").unwrap();
clean_url.set_password(None).unwrap();
}
@@ -188,9 +188,13 @@ pub fn embed_srcset(
options,
depth + 1,
) {
Ok((image_data, image_final_url, image_media_type)) => {
let mut image_data_url =
create_data_url(&image_media_type, &image_data, &image_final_url);
Ok((image_data, image_final_url, image_media_type, image_charset)) => {
let mut image_data_url = create_data_url(
&image_media_type,
&image_charset,
&image_data,
&image_final_url,
);
// Append retrieved asset as a data URL
image_data_url.set_fragment(image_full_url.fragment());
result.push_str(image_data_url.as_ref());
@@ -253,6 +257,48 @@ pub fn find_base_node(node: &Handle) -> Option<Handle> {
None
}
/// Searches the DOM tree rooted at `node` for the META element that declares the
/// document's character encoding.
///
/// A META element qualifies when it is a direct child of a HEAD element and either
/// carries a `charset` attribute, or has an `http-equiv` attribute equal to
/// "content-type" (compared case-insensitively).
///
/// Every META child of HEAD is examined in document order, so the declaration is still
/// found when other META elements (viewport, description, etc.) precede it.
///
/// Returns the first qualifying node, or `None` if the tree contains no such element.
pub fn find_meta_charset_or_content_type_node(node: &Handle) -> Option<Handle> {
    match node.data {
        NodeData::Document => {}
        NodeData::Element { ref name, .. } => {
            match name.local.as_ref() {
                "head" => {
                    // Inspect all META children, not just the first one
                    for child in node.children.borrow().iter() {
                        let is_meta = match child.data {
                            NodeData::Element { ref name, .. } => &*name.local == "meta",
                            _ => false,
                        };
                        if !is_meta {
                            continue;
                        }

                        if get_node_attr(child, "charset").is_some() {
                            // <meta charset="..." />
                            return Some(child.clone());
                        }

                        if let Some(meta_node_http_equiv_attr_value) =
                            get_node_attr(child, "http-equiv")
                        {
                            // <meta http-equiv="content-type" content="..." />
                            if meta_node_http_equiv_attr_value.eq_ignore_ascii_case("content-type")
                            {
                                return Some(child.clone());
                            }
                        }
                    }
                }
                _ => {}
            }
        }
        // Only the document root and elements are searched
        _ => return None,
    }

    // Dig deeper
    for child in node.children.borrow().iter() {
        if let Some(meta_charset_node) = find_meta_charset_or_content_type_node(child) {
            return Some(meta_charset_node);
        }
    }

    None
}
pub fn get_base_url(handle: &Handle) -> Option<String> {
if let Some(base_node) = find_base_node(handle) {
get_node_attr(&base_node, "href")
@@ -261,6 +307,24 @@ pub fn get_base_url(handle: &Handle) -> Option<String> {
}
}
/// Extracts the character encoding declared by the document's META element, if any.
///
/// Looks up the node via `find_meta_charset_or_content_type_node`, then prefers its
/// `charset` attribute; otherwise the charset component parsed out of its `content`
/// attribute is returned. Yields `None` when no such node exists or when the node has
/// neither attribute.
pub fn get_charset(node: &Handle) -> Option<String> {
    let meta_node = find_meta_charset_or_content_type_node(node)?;

    // Processing <meta charset="..." />
    if let Some(charset_attr_value) = get_node_attr(&meta_node, "charset") {
        return Some(charset_attr_value);
    }

    // Processing <meta http-equiv="content-type" content="text/html; charset=..." />
    let content_attr_value = get_node_attr(&meta_node, "content")?;
    let (_media_type, charset, _is_base64) = parse_content_type(&content_attr_value);

    Some(charset)
}
pub fn get_child_node_by_name(parent: &Handle, node_name: &str) -> Option<Handle> {
let children = parent.children.borrow();
let matching_children = children.iter().find(|child| match child.data {
@@ -273,13 +337,6 @@ pub fn get_child_node_by_name(parent: &Handle, node_name: &str) -> Option<Handle
}
}
pub fn get_node_name(node: &Handle) -> Option<&'_ str> {
match &node.data {
NodeData::Element { ref name, .. } => Some(name.local.as_ref()),
_ => None,
}
}
pub fn get_node_attr(node: &Handle, attr_name: &str) -> Option<String> {
match &node.data {
NodeData::Element { ref attrs, .. } => {
@@ -294,6 +351,13 @@ pub fn get_node_attr(node: &Handle, attr_name: &str) -> Option<String> {
}
}
/// Returns the local tag name of `node` when it is an element, `None` for any other
/// kind of node.
pub fn get_node_name(node: &Handle) -> Option<&'_ str> {
    if let NodeData::Element { ref name, .. } = node.data {
        Some(name.local.as_ref())
    } else {
        None
    }
}
pub fn get_parent_node(child: &Handle) -> Handle {
let parent = child.parent.take().clone();
parent.and_then(|node| node.upgrade()).unwrap()
@@ -340,10 +404,19 @@ pub fn has_favicon(handle: &Handle) -> bool {
found_favicon
}
pub fn html_to_dom(data: &str) -> RcDom {
pub fn html_to_dom(data: &Vec<u8>, document_encoding: String) -> RcDom {
let s: String;
if let Some(encoding) = Encoding::for_label(document_encoding.as_bytes()) {
let (string, _, _) = encoding.decode(&data);
s = string.to_string();
} else {
s = String::from_utf8_lossy(&data).to_string();
}
parse_document(RcDom::default(), Default::default())
.from_utf8()
.read_from(&mut data.as_bytes())
.read_from(&mut s.as_bytes())
.unwrap()
}
@@ -355,9 +428,8 @@ pub fn set_base_url(document: &Handle, desired_base_href: String) -> RcDom {
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, document, SerializeOpts::default())
.expect("unable to serialize DOM into buffer");
let result = String::from_utf8(buf).unwrap();
let mut dom = html_to_dom(&result);
let mut dom = html_to_dom(&buf, "utf-8".to_string());
let doc = dom.get_document();
if let Some(html_node) = get_child_node_by_name(&doc, "html") {
if let Some(head_node) = get_child_node_by_name(&html_node, "head") {
@@ -383,6 +455,41 @@ pub fn set_base_url(document: &Handle, desired_base_href: String) -> RcDom {
dom
}
/// Makes the document declare `desired_charset` as its character encoding.
///
/// When a charset/content-type META node is already present, its `charset` attribute
/// (or, failing that, its `content` attribute) is overwritten. Otherwise a fresh
/// `<meta charset="...">` element is created and appended to HEAD; if the document has
/// no HTML > HEAD, nothing is inserted. The (possibly modified) DOM is handed back.
pub fn set_charset(mut dom: RcDom, desired_charset: String) -> RcDom {
    match find_meta_charset_or_content_type_node(&dom.document) {
        Some(existing_meta_node) => {
            if get_node_attr(&existing_meta_node, "charset").is_some() {
                set_node_attr(&existing_meta_node, "charset", Some(desired_charset));
            } else if get_node_attr(&existing_meta_node, "content").is_some() {
                let content_value: String = format!("text/html;charset={}", desired_charset);
                set_node_attr(&existing_meta_node, "content", Some(content_value));
            }
        }
        None => {
            let charset_attr = Attribute {
                name: QualName::new(None, ns!(), local_name!("charset")),
                value: format_tendril!("{}", desired_charset),
            };
            let new_meta_node = dom.create_element(
                QualName::new(None, ns!(), local_name!("meta")),
                vec![charset_attr],
                Default::default(),
            );

            // Insert newly created META charset node into HEAD
            let head_lookup = get_child_node_by_name(&dom.document, "html")
                .and_then(|html_node| get_child_node_by_name(&html_node, "head"));
            if let Some(head_node) = head_lookup {
                head_node.children.borrow_mut().push(new_meta_node);
            }
        }
    }

    dom
}
pub fn set_node_attr(node: &Handle, attr_name: &str, attr_value: Option<String>) {
match &node.data {
NodeData::Element { ref attrs, .. } => {
@@ -423,16 +530,10 @@ pub fn set_node_attr(node: &Handle, attr_name: &str, attr_value: Option<String>)
};
}
pub fn stringify_document(handle: &Handle, options: &Options) -> String {
pub fn serialize_document(mut dom: RcDom, document_encoding: String, options: &Options) -> Vec<u8> {
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, handle, SerializeOpts::default())
.expect("Unable to serialize DOM into buffer");
let doc = dom.get_document();
let mut result = String::from_utf8(buf).unwrap();
// We can't make it isolate the page right away since it may have no HEAD element,
// ergo we have to serialize, parse the DOM again, insert the CSP meta tag, and then
// finally serialize and return the resulting string
if options.isolate
|| options.no_css
|| options.no_fonts
@@ -441,9 +542,6 @@ pub fn stringify_document(handle: &Handle, options: &Options) -> String {
|| options.no_images
{
// Take care of CSP
let mut buf: Vec<u8> = Vec::new();
let mut dom = html_to_dom(&result);
let doc = dom.get_document();
if let Some(html) = get_child_node_by_name(&doc, "html") {
if let Some(head) = get_child_node_by_name(&html, "head") {
let meta = dom.create_element(
@@ -468,19 +566,27 @@ pub fn stringify_document(handle: &Handle, options: &Options) -> String {
head.children.borrow_mut().reverse();
}
}
serialize(&mut buf, &doc, SerializeOpts::default())
.expect("Unable to serialize DOM into buffer");
result = String::from_utf8(buf).unwrap();
}
serialize(&mut buf, &doc, SerializeOpts::default())
.expect("Unable to serialize DOM into buffer");
// Unwrap NOSCRIPT elements
if options.unwrap_noscript {
let s: &str = &String::from_utf8_lossy(&buf);
let noscript_re = Regex::new(r"<(?P<c>/?noscript[^>]*)>").unwrap();
result = noscript_re.replace_all(&result, "<!--$c-->").to_string();
buf = noscript_re.replace_all(&s, "<!--$c-->").as_bytes().to_vec();
}
result
if !document_encoding.is_empty() {
if let Some(encoding) = Encoding::for_label(document_encoding.as_bytes()) {
let s: &str = &String::from_utf8_lossy(&buf);
let (data, _, _) = encoding.encode(s);
buf = data.to_vec();
}
}
buf
}
pub fn retrieve_and_embed_asset(
@@ -503,7 +609,7 @@ pub fn retrieve_and_embed_asset(
options,
depth + 1,
) {
Ok((data, final_url, mut media_type)) => {
Ok((data, final_url, mut media_type, charset)) => {
let node_name: &str = get_node_name(&node).unwrap();
// Check integrity if it's a LINK or SCRIPT element
@@ -521,23 +627,25 @@ pub fn retrieve_and_embed_asset(
}
if ok_to_include {
let s: String;
if let Some(encoding) = Encoding::for_label(charset.as_bytes()) {
let (string, _, _) = encoding.decode(&data);
s = string.to_string();
} else {
s = String::from_utf8_lossy(&data).to_string();
}
if node_name == "link" && determine_link_node_type(node) == "stylesheet" {
// Stylesheet LINK elements require special treatment
let css: String = embed_css(
cache,
client,
&final_url,
&String::from_utf8_lossy(&data),
options,
depth + 1,
);
let css: String = embed_css(cache, client, &final_url, &s, options, depth + 1);
// Create and embed data URL
let css_data_url = create_data_url("text/css", css.as_bytes(), &final_url);
let css_data_url =
create_data_url(&media_type, &charset, css.as_bytes(), &final_url);
set_node_attr(&node, attr_name, Some(css_data_url.to_string()));
} else if node_name == "frame" || node_name == "iframe" {
// (I)FRAMEs are also quite different from conventional resources
let frame_dom = html_to_dom(&String::from_utf8_lossy(&data));
let frame_dom = html_to_dom(&data, charset.clone());
walk_and_embed_assets(
cache,
client,
@@ -556,7 +664,8 @@ pub fn retrieve_and_embed_asset(
.unwrap();
// Create and embed data URL
let mut frame_data_url = create_data_url(&media_type, &frame_data, &final_url);
let mut frame_data_url =
create_data_url(&media_type, &charset, &frame_data, &final_url);
frame_data_url.set_fragment(resolved_url.fragment());
set_node_attr(node, attr_name, Some(frame_data_url.to_string()));
} else {
@@ -575,7 +684,7 @@ pub fn retrieve_and_embed_asset(
}
// Create and embed data URL
let mut data_url = create_data_url(&media_type, &data, &final_url);
let mut data_url = create_data_url(&media_type, &charset, &data, &final_url);
data_url.set_fragment(resolved_url.fragment());
set_node_attr(node, attr_name, Some(data_url.to_string()));
}
@@ -621,28 +730,8 @@ pub fn walk_and_embed_assets(
|| meta_attr_http_equiv_value.eq_ignore_ascii_case("location")
{
// Remove http-equiv attributes from META nodes if they're able to control the page
set_node_attr(
&node,
"http-equiv",
Some(format!(
"disabled by monolith ({})",
meta_attr_http_equiv_value
)),
);
} else if meta_attr_http_equiv_value.eq_ignore_ascii_case("Content-Type") {
// Enforce charset to be set to UTF-8
if let Some(_attr_value) = get_node_attr(node, "content") {
set_node_attr(
&node,
"content",
Some(str!("text/html; charset=utf-8")),
);
}
set_node_attr(&node, "http-equiv", None);
}
} else if let Some(_meta_attr_http_equiv_value) = get_node_attr(node, "charset")
{
// Enforce charset to be set to UTF-8
set_node_attr(&node, "charset", Some(str!("utf-8")));
}
}
"link" => {
@@ -1003,7 +1092,7 @@ pub fn walk_and_embed_assets(
client,
&document_url,
node,
"href",
"src",
&frame_attr_src_value,
options,
depth,
@@ -1078,7 +1167,8 @@ pub fn walk_and_embed_assets(
// Get contents of NOSCRIPT node
let mut noscript_contents = contents.borrow_mut();
// Parse contents of NOSCRIPT node as DOM
let noscript_contents_dom: RcDom = html_to_dom(&noscript_contents);
let noscript_contents_dom: RcDom =
html_to_dom(&noscript_contents.as_bytes().to_vec(), str!());
// Embed assets of NOSCRIPT node contents
walk_and_embed_assets(
cache,
@@ -1098,7 +1188,7 @@ pub fn walk_and_embed_assets(
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &body, SerializeOpts::default())
.expect("Unable to serialize DOM into buffer");
let result = String::from_utf8(buf).unwrap();
let result = String::from_utf8_lossy(&buf);
noscript_contents.push_slice(&result);
}
}

View File

@@ -1,3 +1,5 @@
use encoding_rs::Encoding;
use html5ever::rcdom::RcDom;
use reqwest::blocking::Client;
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
use std::collections::HashMap;
@@ -9,11 +11,11 @@ use std::time::Duration;
use url::Url;
use monolith::html::{
add_favicon, create_metadata_tag, get_base_url, has_favicon, html_to_dom, set_base_url,
stringify_document, walk_and_embed_assets,
add_favicon, create_metadata_tag, get_base_url, get_charset, has_favicon, html_to_dom,
serialize_document, set_base_url, set_charset, walk_and_embed_assets,
};
use monolith::opts::Options;
use monolith::url::{create_data_url, parse_data_url, resolve_url};
use monolith::url::{create_data_url, resolve_url};
use monolith::utils::retrieve_asset;
mod macros;
@@ -32,29 +34,35 @@ impl Output {
}
}
fn writeln_str(&mut self, s: &str) -> Result<(), Error> {
fn write(&mut self, bytes: &Vec<u8>) -> Result<(), Error> {
match self {
Output::Stdout(stdout) => {
writeln!(stdout, "{}", s)?;
stdout.write_all(bytes)?;
// Ensure newline at end of output
if bytes.last() != Some(&b"\n"[0]) {
stdout.write(b"\n")?;
}
stdout.flush()
}
Output::File(f) => {
writeln!(f, "{}", s)?;
f.flush()
Output::File(file) => {
file.write_all(bytes)?;
// Ensure newline at end of output
if bytes.last() != Some(&b"\n"[0]) {
file.write(b"\n")?;
}
file.flush()
}
}
}
}
pub fn read_stdin() -> String {
let mut buffer = String::new();
pub fn read_stdin() -> Vec<u8> {
let mut buffer: Vec<u8> = vec![];
for line in io::stdin().lock().lines() {
buffer += line.unwrap_or_default().as_str();
buffer += "\n";
match io::stdin().lock().read_to_end(&mut buffer) {
Ok(_) => buffer,
Err(_) => buffer,
}
buffer
}
fn main() {
@@ -69,15 +77,22 @@ fn main() {
process::exit(1);
}
// Check if custom charset is valid
if let Some(custom_charset) = options.charset.clone() {
if !Encoding::for_label_no_replacement(custom_charset.as_bytes()).is_some() {
eprintln!("Unknown encoding: {}", &custom_charset);
process::exit(1);
}
}
let target_url: Url;
let mut base_url: Url;
let mut use_stdin: bool = false;
// Determine exact target URL
if target.clone() == "-" {
// Read from pipe (stdin)
use_stdin = true;
// Set default target URL to an empty data URL; the user can control it via --base-url
// Set default target URL to an empty data URL; the user can set it via --base-url
target_url = Url::parse("data:text/html,").unwrap();
} else {
match Url::parse(&target.clone()) {
@@ -96,7 +111,7 @@ fn main() {
}
Err(_err) => {
// Failed to parse given base URL,
// perhaps it's a filesystem path?
// perhaps it's a filesystem path?
let path: &Path = Path::new(&target);
if path.exists() {
@@ -123,7 +138,7 @@ fn main() {
}
} else {
// Last chance, now we do what browsers do:
// prepend "http://" and hope it points to a website
// prepend "http://" and hope it points to a website
target.insert_str(0, "http://");
target_url = Url::parse(&target).unwrap();
}
@@ -131,9 +146,6 @@ fn main() {
}
}
// Define output
let mut output = Output::new(&options.output).expect("Could not prepare output");
// Initialize client
let mut cache = HashMap::new();
let mut header_map = HeaderMap::new();
@@ -143,35 +155,47 @@ fn main() {
HeaderValue::from_str(&user_agent).expect("Invalid User-Agent header specified"),
);
}
let timeout: u64 = if options.timeout > 0 {
options.timeout
let client = if options.timeout > 0 {
Client::builder().timeout(Duration::from_secs(options.timeout))
} else {
std::u64::MAX / 4 // This is pretty close to infinity
};
let client = Client::builder()
.timeout(Duration::from_secs(timeout))
.danger_accept_invalid_certs(options.insecure)
.default_headers(header_map)
.build()
.expect("Failed to initialize HTTP client");
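// No explicit timeout set. NOTE(review): reqwest's blocking client then defaults to 30 seconds; pass .timeout(None) for no timeout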
Client::builder()
}
.danger_accept_invalid_certs(options.insecure)
.default_headers(header_map)
.build()
.expect("Failed to initialize HTTP client");
// At this stage we assume that the base URL is the same as the target URL
base_url = target_url.clone();
// At first we assume that base URL is the same as target URL
let mut base_url: Url = target_url.clone();
let mut dom;
let data: Vec<u8>;
let mut document_encoding: String = str!();
let mut dom: RcDom;
// Retrieve target document
if use_stdin {
dom = html_to_dom(&read_stdin());
data = read_stdin();
} else if target_url.scheme() == "file"
|| (target_url.scheme() == "http" || target_url.scheme() == "https")
|| target_url.scheme() == "data"
{
match retrieve_asset(&mut cache, &client, &target_url, &target_url, &options, 0) {
Ok((data, final_url, _media_type)) => {
if options.base_url.clone().unwrap_or(str!()).is_empty() {
base_url = final_url
Ok((retrieved_data, final_url, media_type, charset)) => {
// Make sure the media type is text/html
if !media_type.eq_ignore_ascii_case("text/html") {
if !options.silent {
eprintln!("Unsupported document media type");
}
process::exit(1);
}
dom = html_to_dom(&String::from_utf8_lossy(&data));
if options.base_url.clone().unwrap_or(str!()).is_empty() {
base_url = final_url;
}
data = retrieved_data;
document_encoding = charset;
}
Err(_) => {
if !options.silent {
@@ -180,36 +204,42 @@ fn main() {
process::exit(1);
}
}
} else if target_url.scheme() == "data" {
let (media_type, data): (String, Vec<u8>) = parse_data_url(&target_url);
if !media_type.eq_ignore_ascii_case("text/html") {
if !options.silent {
eprintln!("Unsupported data URL media type");
}
process::exit(1);
}
dom = html_to_dom(&String::from_utf8_lossy(&data));
} else {
process::exit(1);
}
// Initial parse
dom = html_to_dom(&data, document_encoding.clone());
// TODO: investigate if charset from filesystem/data URL/HTTP headers
// has say over what's specified in HTML
// Attempt to determine document's charset
if let Some(html_charset) = get_charset(&dom.document) {
if !html_charset.is_empty() {
// Check if the charset specified inside HTML is valid
if let Some(encoding) = Encoding::for_label_no_replacement(html_charset.as_bytes()) {
document_encoding = html_charset;
dom = html_to_dom(&data, encoding.name().to_string());
}
}
}
// Use custom base URL if specified, read and use what's in the DOM otherwise
let b: String = options.base_url.clone().unwrap_or(str!());
if b.is_empty() {
// No custom base URL is specified,
// try to see if the document has BASE tag
let custom_base_url: String = options.base_url.clone().unwrap_or(str!());
if custom_base_url.is_empty() {
// No custom base URL is specified
// Try to see if document has BASE element
if let Some(existing_base_url) = get_base_url(&dom.document) {
base_url = resolve_url(&target_url, &existing_base_url);
}
} else {
// Custom base URL provided
match Url::parse(&b) {
match Url::parse(&custom_base_url) {
Ok(parsed_url) => {
if parsed_url.scheme() == "file" {
// File base URLs can only work with
// documents saved from filesystem
// documents saved from filesystem
if target_url.scheme() == "file" {
base_url = parsed_url;
}
@@ -218,11 +248,10 @@ fn main() {
}
}
Err(_) => {
// Failed to parse given base URL,
// perhaps it's a filesystem path?
// Failed to parse given base URL, perhaps it's a filesystem path?
if target_url.scheme() == "file" {
// Relative paths could work for documents saved from filesystem
let path: &Path = Path::new(&b);
let path: &Path = Path::new(&custom_base_url);
if path.exists() {
match Url::from_file_path(fs::canonicalize(&path).unwrap()) {
Ok(file_url) => {
@@ -230,7 +259,10 @@ fn main() {
}
Err(_) => {
if !options.silent {
eprintln!("Could not map given path to base URL: {}", b);
eprintln!(
"Could not map given path to base URL: {}",
custom_base_url
);
}
process::exit(1);
}
@@ -241,11 +273,10 @@ fn main() {
}
}
// Embed remote assets
// Traverse through the document and embed remote assets
walk_and_embed_assets(&mut cache, &client, &base_url, &dom.document, &options, 0);
// Update or add new BASE tag to reroute network requests
// and hash-links in the final document
// Update or add new BASE element to reroute network requests and hash-links
if let Some(new_base_url) = options.base_url.clone() {
dom = set_base_url(&dom.document, new_base_url);
}
@@ -265,8 +296,9 @@ fn main() {
&options,
0,
) {
Ok((data, final_url, media_type)) => {
let favicon_data_url: Url = create_data_url(&media_type, &data, &final_url);
Ok((data, final_url, media_type, charset)) => {
let favicon_data_url: Url =
create_data_url(&media_type, &charset, &data, &final_url);
dom = add_favicon(&dom.document, favicon_data_url.to_string());
}
Err(_) => {
@@ -275,20 +307,25 @@ fn main() {
}
}
// Serialize DOM tree
let mut result: String = stringify_document(&dom.document, &options);
// Add metadata tag
if !options.no_metadata {
let metadata_comment: String = create_metadata_tag(&target_url);
result.insert_str(0, &metadata_comment);
if metadata_comment.len() > 0 {
result.insert_str(metadata_comment.len(), "\n");
}
// Save using specified charset, if given
if let Some(custom_charset) = options.charset.clone() {
document_encoding = custom_charset;
dom = set_charset(dom, document_encoding.clone());
}
// Serialize DOM tree
let mut result: Vec<u8> = serialize_document(dom, document_encoding, &options);
// Prepend metadata comment tag
if !options.no_metadata {
let mut metadata_comment: String = create_metadata_tag(&target_url);
metadata_comment += "\n";
result.splice(0..0, metadata_comment.as_bytes().to_vec());
}
// Define output
let mut output = Output::new(&options.output).expect("Could not prepare output");
// Write result into stdout or file
output
.writeln_str(&result)
.expect("Could not write HTML output");
output.write(&result).expect("Could not write HTML output");
}

View File

@@ -6,6 +6,7 @@ pub struct Options {
pub no_audio: bool,
pub base_url: Option<String>,
pub no_css: bool,
pub charset: Option<String>,
pub ignore_errors: bool,
pub no_frames: bool,
pub no_fonts: bool,
@@ -48,6 +49,7 @@ impl Options {
.args_from_usage("-a, --no-audio 'Removes audio sources'")
.args_from_usage("-b, --base-url=[http://localhost/] 'Sets custom base URL'")
.args_from_usage("-c, --no-css 'Removes CSS'")
.args_from_usage("-C, --charset=[UTF-8] 'Enforces custom encoding'")
.args_from_usage("-e, --ignore-errors 'Ignore network errors'")
.args_from_usage("-f, --no-frames 'Removes frames and iframes'")
.args_from_usage("-F, --no-fonts 'Removes fonts'")
@@ -59,7 +61,9 @@ impl Options {
.args_from_usage(
"-n, --unwrap-noscript 'Replaces NOSCRIPT elements with their contents'",
)
.args_from_usage("-o, --output=[document.html] 'Writes output to <file>'")
.args_from_usage(
"-o, --output=[document.html] 'Writes output to <file>, use - for STDOUT'",
)
.args_from_usage("-s, --silent 'Suppresses verbosity'")
.args_from_usage("-t, --timeout=[60] 'Adjusts network request timeout'")
.args_from_usage("-u, --user-agent=[Firefox] 'Sets custom User-Agent string'")
@@ -69,7 +73,7 @@ impl Options {
.required(true)
.takes_value(true)
.index(1)
.help("URL or file path, use - for stdin"),
.help("URL or file path, use - for STDIN"),
)
.get_matches();
let mut options: Options = Options::default();
@@ -84,6 +88,9 @@ impl Options {
options.base_url = Some(str!(base_url));
}
options.no_css = app.is_present("no-css");
if let Some(charset) = app.value_of("charset") {
options.charset = Some(str!(charset));
}
options.ignore_errors = app.is_present("ignore-errors");
options.no_frames = app.is_present("no-frames");
options.no_fonts = app.is_present("no-fonts");

View File

@@ -22,18 +22,18 @@ mod passing {
.output()
.unwrap();
// STDERR should be empty
assert_eq!(String::from_utf8_lossy(&out.stderr), "");
// STDOUT should contain newly added base URL
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
String::from_utf8_lossy(&out.stdout),
"<html><head>\
<base href=\"http://localhost:8000/\"></base>\
</head><body>Hello, World!</body></html>\n"
);
// STDERR should be empty
assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), "");
// The exit code should be 0
// Exit code should be 0
out.assert().code(0);
}
@@ -46,18 +46,18 @@ mod passing {
.output()
.unwrap();
// STDERR should be empty
assert_eq!(String::from_utf8_lossy(&out.stderr), "");
// STDOUT should contain newly added base URL
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
String::from_utf8_lossy(&out.stdout),
"<html><head>\
<base href=\"http://localhost:8000/\">\
</head><body>Hello, World!</body></html>\n"
);
// STDERR should be empty
assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), "");
// The exit code should be 0
// Exit code should be 0
out.assert().code(0);
}
@@ -72,18 +72,18 @@ mod passing {
.output()
.unwrap();
// STDERR should be empty
assert_eq!(String::from_utf8_lossy(&out.stderr), "");
// STDOUT should contain newly added base URL
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
String::from_utf8_lossy(&out.stdout),
"<html><head>\
<base href=\"http://localhost/\">\
</head><body>Hello, World!</body></html>\n"
);
// STDERR should be empty
assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), "");
// The exit code should be 0
// Exit code should be 0
out.assert().code(0);
}
@@ -98,18 +98,18 @@ mod passing {
.output()
.unwrap();
// STDERR should be empty
assert_eq!(String::from_utf8_lossy(&out.stderr), "");
// STDOUT should contain newly added base URL
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
String::from_utf8_lossy(&out.stdout),
"<html><head>\
<base href=\"\">\
</head><body>Hello, World!</body></html>\n"
);
// STDERR should be empty
assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), "");
// The exit code should be 0
// Exit code should be 0
out.assert().code(0);
}
}

View File

@@ -14,21 +14,36 @@ mod passing {
use std::process::{Command, Stdio};
use url::Url;
/// Runs the binary with `-h` and checks that STDERR is empty and the exit code is 0.
#[test]
fn print_help_information() {
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
let out = cmd.arg("-h").output().unwrap();
// STDERR should be empty
assert_eq!(String::from_utf8_lossy(&out.stderr), "");
// STDOUT should contain program name, version, and usage information
// TODO: assert on the STDOUT contents; nothing is checked here yet
// Exit code should be 0
out.assert().code(0);
}
#[test]
fn print_version() {
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
let out = cmd.arg("-V").output().unwrap();
// STDERR should be empty
assert_eq!(String::from_utf8_lossy(&out.stderr), "");
// STDOUT should contain program name and version
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
String::from_utf8_lossy(&out.stdout),
format!("{} {}\n", env!("CARGO_PKG_NAME"), env!("CARGO_PKG_VERSION"))
);
// STDERR should be empty
assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), "");
// The exit code should be 0
// Exit code should be 0
out.assert().code(0);
}
@@ -46,11 +61,17 @@ mod passing {
cmd.stdin(echo_out);
let out = cmd.arg("-M").arg("-").output().unwrap();
// STDERR should be empty
assert_eq!(String::from_utf8_lossy(&out.stderr), "");
// STDOUT should contain HTML created out of STDIN
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
String::from_utf8_lossy(&out.stdout),
"<html><head></head><body>Hello from STDIN\n</body></html>\n"
);
// Exit code should be 0
out.assert().code(0);
}
#[test]
@@ -64,15 +85,9 @@ mod passing {
let out = cmd.arg("-M").arg(path_html.as_os_str()).output().unwrap();
// STDOUT should contain embedded CSS url()'s
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
"<html><head><style>\n\n @charset \"UTF-8\";\n\n @import \"data:text/css;base64,Ym9keXtiYWNrZ3JvdW5kLWNvbG9yOiMwMDA7Y29sb3I6I2ZmZn0K\";\n\n @import url(\"data:text/css;base64,Ym9keXtiYWNrZ3JvdW5kLWNvbG9yOiMwMDA7Y29sb3I6I2ZmZn0K\");\n\n @import url(\"data:text/css;base64,Ym9keXtiYWNrZ3JvdW5kLWNvbG9yOiMwMDA7Y29sb3I6I2ZmZn0K\");\n\n</style>\n</head><body></body></html>\n"
);
// STDERR should list files that got retrieved
assert_eq!(
std::str::from_utf8(&out.stderr).unwrap(),
String::from_utf8_lossy(&out.stderr),
format!(
"\
{file_url_html}\n \
@@ -85,7 +100,13 @@ mod passing {
)
);
// The exit code should be 0
// STDOUT should contain embedded CSS url()'s
assert_eq!(
String::from_utf8_lossy(&out.stdout),
"<html><head><style>\n\n @charset \"UTF-8\";\n\n @import \"data:text/css;base64,Ym9keXtiYWNrZ3JvdW5kLWNvbG9yOiMwMDA7Y29sb3I6I2ZmZn0K\";\n\n @import url(\"data:text/css;base64,Ym9keXtiYWNrZ3JvdW5kLWNvbG9yOiMwMDA7Y29sb3I6I2ZmZn0K\");\n\n @import url(\"data:text/css;base64,Ym9keXtiYWNrZ3JvdW5kLWNvbG9yOiMwMDA7Y29sb3I6I2ZmZn0K\");\n\n</style>\n</head><body></body></html>\n"
);
// Exit code should be 0
out.assert().code(0);
}
}
@@ -108,16 +129,16 @@ mod failing {
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
let out = cmd.arg("").output().unwrap();
// STDOUT should be empty
assert_eq!(std::str::from_utf8(&out.stdout).unwrap(), "");
// STDERR should contain error description
assert_eq!(
std::str::from_utf8(&out.stderr).unwrap(),
String::from_utf8_lossy(&out.stderr),
"No target specified\n"
);
// The exit code should be 1
// STDOUT should be empty
assert_eq!(String::from_utf8_lossy(&out.stdout), "");
// Exit code should be 1
out.assert().code(1);
}
}

View File

@@ -21,18 +21,18 @@ mod passing {
.output()
.unwrap();
// STDERR should be empty
assert_eq!(String::from_utf8_lossy(&out.stderr), "");
// STDOUT should contain isolated HTML
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
String::from_utf8_lossy(&out.stdout),
"<html><head>\
<meta http-equiv=\"Content-Security-Policy\" content=\"default-src 'unsafe-inline' data:;\"></meta>\
</head><body>Hello, World!</body></html>\n"
);
// STDERR should be empty
assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), "");
// The exit code should be 0
// Exit code should be 0
out.assert().code(0);
}
@@ -46,19 +46,19 @@ mod passing {
.output()
.unwrap();
// STDERR should be empty
assert_eq!(String::from_utf8_lossy(&out.stderr), "");
// STDOUT should contain HTML with no CSS
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
String::from_utf8_lossy(&out.stdout),
"<html><head>\
<meta http-equiv=\"Content-Security-Policy\" content=\"style-src 'none';\"></meta>\
<style></style>\
</head><body>Hello</body></html>\n"
);
// STDERR should be empty
assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), "");
// The exit code should be 0
// Exit code should be 0
out.assert().code(0);
}
@@ -72,19 +72,19 @@ mod passing {
.output()
.unwrap();
// STDERR should be empty
assert_eq!(String::from_utf8_lossy(&out.stderr), "");
// STDOUT should contain HTML with no web fonts
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
String::from_utf8_lossy(&out.stdout),
"<html><head>\
<meta http-equiv=\"Content-Security-Policy\" content=\"font-src 'none';\"></meta>\
<style></style>\
</head><body>Hi</body></html>\n"
);
// STDERR should be empty
assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), "");
// The exit code should be 0
// Exit code should be 0
out.assert().code(0);
}
@@ -98,18 +98,18 @@ mod passing {
.output()
.unwrap();
// STDERR should be empty
assert_eq!(String::from_utf8_lossy(&out.stderr), "");
// STDOUT should contain HTML with no iframes
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
String::from_utf8_lossy(&out.stdout),
"<html><head>\
<meta http-equiv=\"Content-Security-Policy\" content=\"frame-src 'none'; child-src 'none';\"></meta>\
</head><body><iframe src=\"\"></iframe>Hi</body></html>\n"
);
// STDERR should be empty
assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), "");
// The exit code should be 0
// Exit code should be 0
out.assert().code(0);
}
@@ -123,9 +123,12 @@ mod passing {
.output()
.unwrap();
// STDERR should be empty
assert_eq!(String::from_utf8_lossy(&out.stderr), "");
// STDOUT should contain HTML with no images
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
String::from_utf8_lossy(&out.stdout),
format!(
"<html>\
<head>\
@@ -140,10 +143,7 @@ mod passing {
)
);
// STDERR should be empty
assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), "");
// The exit code should be 0
// Exit code should be 0
out.assert().code(0);
}
@@ -157,9 +157,12 @@ mod passing {
.output()
.unwrap();
// STDERR should be empty
assert_eq!(String::from_utf8_lossy(&out.stderr), "");
// STDOUT should contain HTML with no JS
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
String::from_utf8_lossy(&out.stdout),
"<html>\
<head>\
<meta http-equiv=\"Content-Security-Policy\" content=\"script-src 'none';\"></meta>\
@@ -168,10 +171,7 @@ mod passing {
</html>\n"
);
// STDERR should be empty
assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), "");
// The exit code should be 0
// Exit code should be 0
out.assert().code(0);
}
}
@@ -194,16 +194,16 @@ mod failing {
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
let out = cmd.arg("data:,Hello%2C%20World!").output().unwrap();
// STDOUT should be empty
assert_eq!(std::str::from_utf8(&out.stdout).unwrap(), "");
// STDERR should contain error description
assert_eq!(
std::str::from_utf8(&out.stderr).unwrap(),
"Unsupported data URL media type\n"
String::from_utf8_lossy(&out.stderr),
"Unsupported document media type\n"
);
// The exit code should be 1
// STDOUT should be empty
assert_eq!(String::from_utf8_lossy(&out.stdout), "");
// Exit code should be 1
out.assert().code(1);
}
@@ -216,16 +216,16 @@ mod failing {
.output()
.unwrap();
// STDERR should be empty
assert_eq!(String::from_utf8_lossy(&out.stderr), "");
// STDOUT should contain HTML with no JS in it
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
String::from_utf8_lossy(&out.stdout),
"<html><head><script src=\"data:application/javascript;base64,\"></script></head><body></body></html>\n"
);
// STDERR should be empty
assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), "");
// The exit code should be 0
// Exit code should be 0
out.assert().code(0);
}
}

View File

@@ -10,7 +10,7 @@ mod passing {
use assert_cmd::prelude::*;
use std::env;
use std::fs;
use std::path::Path;
use std::path::{Path, MAIN_SEPARATOR};
use std::process::Command;
use url::Url;
@@ -21,18 +21,33 @@ mod passing {
str!(env::current_dir().unwrap().to_str().unwrap()).replace("\\", "/");
let out = cmd
.arg("-M")
.arg(if cfg!(windows) {
"src\\tests\\data\\basic\\local-file.html"
} else {
"src/tests/data/basic/local-file.html"
})
.arg(format!(
"src{s}tests{s}data{s}basic{s}local-file.html",
s = MAIN_SEPARATOR
))
.output()
.unwrap();
let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" };
// STDERR should contain list of retrieved file URLs, two missing
assert_eq!(
String::from_utf8_lossy(&out.stderr),
format!(
"\
{file}{cwd}/src/tests/data/basic/local-file.html\n \
{file}{cwd}/src/tests/data/basic/local-style.css\n \
{file}{cwd}/src/tests/data/basic/local-style-does-not-exist.css (not found)\n \
{file}{cwd}/src/tests/data/basic/monolith.png (not found)\n \
{file}{cwd}/src/tests/data/basic/local-script.js\n\
",
file = file_url_protocol,
cwd = cwd_normalized
)
);
// STDOUT should contain HTML from the local file
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
String::from_utf8_lossy(&out.stdout),
"\
<!DOCTYPE html><html lang=\"en\"><head>\n \
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n \
@@ -47,23 +62,7 @@ mod passing {
"
);
// STDERR should contain list of retrieved file URLs, two missing
assert_eq!(
std::str::from_utf8(&out.stderr).unwrap(),
format!(
"\
{file}{cwd}/src/tests/data/basic/local-file.html\n \
{file}{cwd}/src/tests/data/basic/local-style.css\n \
{file}{cwd}/src/tests/data/basic/local-style-does-not-exist.css (not found)\n \
{file}{cwd}/src/tests/data/basic/monolith.png (not found)\n \
{file}{cwd}/src/tests/data/basic/local-script.js\n\
",
file = file_url_protocol,
cwd = cwd_normalized
)
);
// The exit code should be 0
// Exit code should be 0
out.assert().code(0);
}
@@ -79,9 +78,18 @@ mod passing {
.output()
.unwrap();
// STDERR should contain only the target file
assert_eq!(
String::from_utf8_lossy(&out.stderr),
format!(
"{file_url_html}\n",
file_url_html = Url::from_file_path(fs::canonicalize(&path_html).unwrap()).unwrap(),
)
);
// STDOUT should contain HTML from the local file
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
String::from_utf8_lossy(&out.stdout),
format!(
"\
<!DOCTYPE html><html lang=\"en\"><head>\
@@ -100,16 +108,7 @@ mod passing {
)
);
// STDERR should contain only the target file
assert_eq!(
std::str::from_utf8(&out.stderr).unwrap(),
format!(
"{file_url_html}\n",
file_url_html = Url::from_file_path(fs::canonicalize(&path_html).unwrap()).unwrap(),
)
);
// The exit code should be 0
// Exit code should be 0
out.assert().code(0);
}
@@ -122,25 +121,27 @@ mod passing {
let out = cmd
.arg("-M")
.arg("-cji")
.arg(if cfg!(windows) {
format!(
"{file}{cwd}/src/tests/data/basic/local-file.html",
file = file_url_protocol,
cwd = cwd_normalized,
)
} else {
format!(
"{file}{cwd}/src/tests/data/basic/local-file.html",
file = file_url_protocol,
cwd = cwd_normalized,
)
})
.arg(format!(
"{file}{cwd}/src/tests/data/basic/local-file.html",
file = file_url_protocol,
cwd = cwd_normalized,
))
.output()
.unwrap();
// STDERR should contain list of retrieved file URLs
assert_eq!(
String::from_utf8_lossy(&out.stderr),
format!(
"{file}{cwd}/src/tests/data/basic/local-file.html\n",
file = file_url_protocol,
cwd = cwd_normalized,
)
);
// STDOUT should contain HTML from the local file
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
String::from_utf8_lossy(&out.stdout),
format!(
"\
<!DOCTYPE html><html lang=\"en\"><head>\
@@ -159,17 +160,7 @@ mod passing {
)
);
// STDERR should contain list of retrieved file URLs
assert_eq!(
std::str::from_utf8(&out.stderr).unwrap(),
format!(
"{file}{cwd}/src/tests/data/basic/local-file.html\n",
file = file_url_protocol,
cwd = cwd_normalized,
)
);
// The exit code should be 0
// Exit code should be 0
out.assert().code(0);
}
@@ -181,15 +172,9 @@ mod passing {
let out = cmd.arg("-M").arg(path_html.as_os_str()).output().unwrap();
// STDOUT should contain HTML with data URL for background-image in it
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
"<html><head></head><body><div style=\"background-image: url(&quot;data:image/svg+xml;base64,PHN2ZyB2ZXJzaW9uPSIxLjEiIGJhc2VQcm9maWxlPSJmdWxsIiB3aWR0aD0iMzAwIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KICAgIDxyZWN0IHdpZHRoPSIxMDAlIiBoZWlnaHQ9IjEwMCUiIGZpbGw9InJlZCIgLz4KICAgIDxjaXJjbGUgY3g9IjE1MCIgY3k9IjEwMCIgcj0iODAiIGZpbGw9ImdyZWVuIiAvPgogICAgPHRleHQgeD0iMTUwIiB5PSIxMjUiIGZvbnQtc2l6ZT0iNjAiIHRleHQtYW5jaG9yPSJtaWRkbGUiIGZpbGw9IndoaXRlIj5TVkc8L3RleHQ+Cjwvc3ZnPgo=&quot;)\"></div>\n</body></html>\n"
);
// STDERR should list files that got retrieved
assert_eq!(
std::str::from_utf8(&out.stderr).unwrap(),
String::from_utf8_lossy(&out.stderr),
format!(
"\
{file_url_html}\n \
@@ -200,7 +185,13 @@ mod passing {
)
);
// The exit code should be 0
// STDOUT should contain HTML with data URL for background-image in it
assert_eq!(
String::from_utf8_lossy(&out.stdout),
"<html><head></head><body><div style=\"background-image: url(&quot;data:image/svg+xml;base64,PHN2ZyB2ZXJzaW9uPSIxLjEiIGJhc2VQcm9maWxlPSJmdWxsIiB3aWR0aD0iMzAwIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KICAgIDxyZWN0IHdpZHRoPSIxMDAlIiBoZWlnaHQ9IjEwMCUiIGZpbGw9InJlZCIgLz4KICAgIDxjaXJjbGUgY3g9IjE1MCIgY3k9IjEwMCIgcj0iODAiIGZpbGw9ImdyZWVuIiAvPgogICAgPHRleHQgeD0iMTUwIiB5PSIxMjUiIGZvbnQtc2l6ZT0iNjAiIHRleHQtYW5jaG9yPSJtaWRkbGUiIGZpbGw9IndoaXRlIj5TVkc8L3RleHQ+Cjwvc3ZnPgo=&quot;)\"></div>\n</body></html>\n"
);
// Exit code should be 0
out.assert().code(0);
}
@@ -229,9 +220,25 @@ mod passing {
.output()
.unwrap();
// STDERR should contain list of retrieved file URLs
assert_eq!(
String::from_utf8_lossy(&out.stderr),
format!(
"\
{file}{cwd}/src/tests/data/integrity/index.html\n \
{file}{cwd}/src/tests/data/integrity/style.css\n \
{file}{cwd}/src/tests/data/integrity/style.css\n \
{file}{cwd}/src/tests/data/integrity/script.js\n \
{file}{cwd}/src/tests/data/integrity/script.js\n\
",
file = file_url_protocol,
cwd = cwd_normalized,
)
);
// STDOUT should contain HTML from the local file; integrity attributes should be missing
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
String::from_utf8_lossy(&out.stdout),
format!(
"\
<!DOCTYPE html><html lang=\"en\"><head>\
@@ -247,23 +254,7 @@ mod passing {
)
);
// STDERR should contain list of retrieved file URLs
assert_eq!(
std::str::from_utf8(&out.stderr).unwrap(),
format!(
"\
{file}{cwd}/src/tests/data/integrity/index.html\n \
{file}{cwd}/src/tests/data/integrity/style.css\n \
{file}{cwd}/src/tests/data/integrity/style.css\n \
{file}{cwd}/src/tests/data/integrity/script.js\n \
{file}{cwd}/src/tests/data/integrity/script.js\n\
",
file = file_url_protocol,
cwd = cwd_normalized,
)
);
// The exit code should be 0
// Exit code should be 0
out.assert().code(0);
}
}

View File

@@ -22,15 +22,9 @@ mod passing {
let out = cmd.arg("-M").arg(path_html.as_os_str()).output().unwrap();
// STDOUT should contain HTML with the image inside NOSCRIPT embedded as a data URL
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
"<html><head></head><body><noscript><img src=\"data:image/svg+xml;base64,PHN2ZyB2ZXJzaW9uPSIxLjEiIGJhc2VQcm9maWxlPSJmdWxsIiB3aWR0aD0iMzAwIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KICAgIDxyZWN0IHdpZHRoPSIxMDAlIiBoZWlnaHQ9IjEwMCUiIGZpbGw9InJlZCIgLz4KICAgIDxjaXJjbGUgY3g9IjE1MCIgY3k9IjEwMCIgcj0iODAiIGZpbGw9ImdyZWVuIiAvPgogICAgPHRleHQgeD0iMTUwIiB5PSIxMjUiIGZvbnQtc2l6ZT0iNjAiIHRleHQtYW5jaG9yPSJtaWRkbGUiIGZpbGw9IndoaXRlIj5TVkc8L3RleHQ+Cjwvc3ZnPgo=\"></noscript>\n</body></html>\n"
);
// STDERR should contain target HTML and embedded SVG files
assert_eq!(
std::str::from_utf8(&out.stderr).unwrap(),
String::from_utf8_lossy(&out.stderr),
format!(
"\
{file_url_html}\n \
@@ -41,7 +35,13 @@ mod passing {
)
);
// The exit code should be 0
// STDOUT should contain HTML with the image inside NOSCRIPT embedded as a data URL
assert_eq!(
String::from_utf8_lossy(&out.stdout),
"<html><head></head><body><noscript><img src=\"data:image/svg+xml;base64,PHN2ZyB2ZXJzaW9uPSIxLjEiIGJhc2VQcm9maWxlPSJmdWxsIiB3aWR0aD0iMzAwIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KICAgIDxyZWN0IHdpZHRoPSIxMDAlIiBoZWlnaHQ9IjEwMCUiIGZpbGw9InJlZCIgLz4KICAgIDxjaXJjbGUgY3g9IjE1MCIgY3k9IjEwMCIgcj0iODAiIGZpbGw9ImdyZWVuIiAvPgogICAgPHRleHQgeD0iMTUwIiB5PSIxMjUiIGZvbnQtc2l6ZT0iNjAiIHRleHQtYW5jaG9yPSJtaWRkbGUiIGZpbGw9IndoaXRlIj5TVkc8L3RleHQ+Cjwvc3ZnPgo=\"></noscript>\n</body></html>\n"
);
// Exit code should be 0
out.assert().code(0);
}
@@ -53,15 +53,9 @@ mod passing {
let out = cmd.arg("-Mn").arg(path_html.as_os_str()).output().unwrap();
// STDOUT should contain HTML with NOSCRIPT unwrapped and its image embedded as a data URL
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
"<html><head></head><body><!--noscript--><img src=\"data:image/svg+xml;base64,PHN2ZyB2ZXJzaW9uPSIxLjEiIGJhc2VQcm9maWxlPSJmdWxsIiB3aWR0aD0iMzAwIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KICAgIDxyZWN0IHdpZHRoPSIxMDAlIiBoZWlnaHQ9IjEwMCUiIGZpbGw9InJlZCIgLz4KICAgIDxjaXJjbGUgY3g9IjE1MCIgY3k9IjEwMCIgcj0iODAiIGZpbGw9ImdyZWVuIiAvPgogICAgPHRleHQgeD0iMTUwIiB5PSIxMjUiIGZvbnQtc2l6ZT0iNjAiIHRleHQtYW5jaG9yPSJtaWRkbGUiIGZpbGw9IndoaXRlIj5TVkc8L3RleHQ+Cjwvc3ZnPgo=\"><!--/noscript-->\n</body></html>\n"
);
// STDERR should contain target HTML and embedded SVG files
assert_eq!(
std::str::from_utf8(&out.stderr).unwrap(),
String::from_utf8_lossy(&out.stderr),
format!(
"\
{file_url_html}\n \
@@ -72,7 +66,13 @@ mod passing {
)
);
// The exit code should be 0
// STDOUT should contain HTML with no CSS
assert_eq!(
String::from_utf8_lossy(&out.stdout),
"<html><head></head><body><!--noscript--><img src=\"data:image/svg+xml;base64,PHN2ZyB2ZXJzaW9uPSIxLjEiIGJhc2VQcm9maWxlPSJmdWxsIiB3aWR0aD0iMzAwIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KICAgIDxyZWN0IHdpZHRoPSIxMDAlIiBoZWlnaHQ9IjEwMCUiIGZpbGw9InJlZCIgLz4KICAgIDxjaXJjbGUgY3g9IjE1MCIgY3k9IjEwMCIgcj0iODAiIGZpbGw9ImdyZWVuIiAvPgogICAgPHRleHQgeD0iMTUwIiB5PSIxMjUiIGZvbnQtc2l6ZT0iNjAiIHRleHQtYW5jaG9yPSJtaWRkbGUiIGZpbGw9IndoaXRlIj5TVkc8L3RleHQ+Cjwvc3ZnPgo=\"><!--/noscript-->\n</body></html>\n"
);
// Exit code should be 0
out.assert().code(0);
}
@@ -84,15 +84,9 @@ mod passing {
let out = cmd.arg("-Mn").arg(path_html.as_os_str()).output().unwrap();
// STDOUT should contain HTML with no CSS
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
"<html><head></head><body><!--noscript--><h1>JS is not active</h1><!--noscript--><img src=\"data:image/svg+xml;base64,PHN2ZyB2ZXJzaW9uPSIxLjEiIGJhc2VQcm9maWxlPSJmdWxsIiB3aWR0aD0iMzAwIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KICAgIDxyZWN0IHdpZHRoPSIxMDAlIiBoZWlnaHQ9IjEwMCUiIGZpbGw9InJlZCIgLz4KICAgIDxjaXJjbGUgY3g9IjE1MCIgY3k9IjEwMCIgcj0iODAiIGZpbGw9ImdyZWVuIiAvPgogICAgPHRleHQgeD0iMTUwIiB5PSIxMjUiIGZvbnQtc2l6ZT0iNjAiIHRleHQtYW5jaG9yPSJtaWRkbGUiIGZpbGw9IndoaXRlIj5TVkc8L3RleHQ+Cjwvc3ZnPgo=\"><!--/noscript--><!--/noscript-->\n</body></html>\n"
);
// STDERR should contain target HTML and embedded SVG files
assert_eq!(
std::str::from_utf8(&out.stderr).unwrap(),
String::from_utf8_lossy(&out.stderr),
format!(
"\
{file_url_html}\n \
@@ -103,7 +97,13 @@ mod passing {
)
);
// The exit code should be 0
// STDOUT should contain HTML with no CSS
assert_eq!(
String::from_utf8_lossy(&out.stdout),
"<html><head></head><body><!--noscript--><h1>JS is not active</h1><!--noscript--><img src=\"data:image/svg+xml;base64,PHN2ZyB2ZXJzaW9uPSIxLjEiIGJhc2VQcm9maWxlPSJmdWxsIiB3aWR0aD0iMzAwIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KICAgIDxyZWN0IHdpZHRoPSIxMDAlIiBoZWlnaHQ9IjEwMCUiIGZpbGw9InJlZCIgLz4KICAgIDxjaXJjbGUgY3g9IjE1MCIgY3k9IjEwMCIgcj0iODAiIGZpbGw9ImdyZWVuIiAvPgogICAgPHRleHQgeD0iMTUwIiB5PSIxMjUiIGZvbnQtc2l6ZT0iNjAiIHRleHQtYW5jaG9yPSJtaWRkbGUiIGZpbGw9IndoaXRlIj5TVkc8L3RleHQ+Cjwvc3ZnPgo=\"><!--/noscript--><!--/noscript-->\n</body></html>\n"
);
// Exit code should be 0
out.assert().code(0);
}
@@ -115,22 +115,9 @@ mod passing {
let out = cmd.arg("-Mn").arg(path_html.as_os_str()).output().unwrap();
// STDOUT should contain HTML with no CSS
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
"<html>\
<head></head>\
<body>\
<!--noscript-->\
<img src=\"data:image/svg+xml;base64,PHN2ZyB2ZXJzaW9uPSIxLjEiIGJhc2VQcm9maWxlPSJmdWxsIiB3aWR0aD0iMzAwIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KICAgIDxyZWN0IHdpZHRoPSIxMDAlIiBoZWlnaHQ9IjEwMCUiIGZpbGw9InJlZCIgLz4KICAgIDxjaXJjbGUgY3g9IjE1MCIgY3k9IjEwMCIgcj0iODAiIGZpbGw9ImdyZWVuIiAvPgogICAgPHRleHQgeD0iMTUwIiB5PSIxMjUiIGZvbnQtc2l6ZT0iNjAiIHRleHQtYW5jaG9yPSJtaWRkbGUiIGZpbGw9IndoaXRlIj5TVkc8L3RleHQ+Cjwvc3ZnPgo=\">\
<!--/noscript-->\n\
</body>\
</html>\n"
);
// STDERR should contain target HTML and embedded SVG files
assert_eq!(
std::str::from_utf8(&out.stderr).unwrap(),
String::from_utf8_lossy(&out.stderr),
format!(
"\
{file_url_html}\n \
@@ -141,7 +128,20 @@ mod passing {
)
);
// The exit code should be 0
// STDOUT should contain HTML with no CSS
assert_eq!(
String::from_utf8_lossy(&out.stdout),
"<html>\
<head></head>\
<body>\
<!--noscript-->\
<img src=\"data:image/svg+xml;base64,PHN2ZyB2ZXJzaW9uPSIxLjEiIGJhc2VQcm9maWxlPSJmdWxsIiB3aWR0aD0iMzAwIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KICAgIDxyZWN0IHdpZHRoPSIxMDAlIiBoZWlnaHQ9IjEwMCUiIGZpbGw9InJlZCIgLz4KICAgIDxjaXJjbGUgY3g9IjE1MCIgY3k9IjEwMCIgcj0iODAiIGZpbGw9ImdyZWVuIiAvPgogICAgPHRleHQgeD0iMTUwIiB5PSIxMjUiIGZvbnQtc2l6ZT0iNjAiIHRleHQtYW5jaG9yPSJtaWRkbGUiIGZpbGw9IndoaXRlIj5TVkc8L3RleHQ+Cjwvc3ZnPgo=\">\
<!--/noscript-->\n\
</body>\
</html>\n"
);
// Exit code should be 0
out.assert().code(0);
}
@@ -155,16 +155,16 @@ mod passing {
.output()
.unwrap();
// STDERR should be empty
assert_eq!(String::from_utf8_lossy(&out.stderr), "");
// STDOUT should contain unwrapped contents of NOSCRIPT element
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
String::from_utf8_lossy(&out.stdout),
"<html><head><!--noscript class=\"\"-->test<!--/noscript--></head><body></body></html>\n"
);
// STDERR should be empty
assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), "");
// The exit code should be 0
// Exit code should be 0
out.assert().code(0);
}
}

View File

@@ -8,41 +8,211 @@
#[cfg(test)]
mod passing {
use assert_cmd::prelude::*;
use encoding_rs::Encoding;
use std::env;
use std::process::Command;
use std::path::MAIN_SEPARATOR;
use std::process::{Command, Stdio};
#[test]
fn change_encoding_to_utf_8() {
fn properly_save_document_with_gb2312() {
let cwd = env::current_dir().unwrap();
let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/");
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
let out = cmd
.arg("-M")
.arg(if cfg!(windows) {
"src\\tests\\data\\unusual_encodings\\iso-8859-1.html"
} else {
"src/tests/data/unusual_encodings/iso-8859-1.html"
})
.arg(format!(
"src{s}tests{s}data{s}unusual_encodings{s}gb2312.html",
s = MAIN_SEPARATOR
))
.output()
.unwrap();
let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" };
// STDOUT should contain newly added base URL
// STDERR should contain only the target file
assert_eq!(
std::str::from_utf8(&out.stdout).unwrap(),
String::from_utf8_lossy(&out.stderr),
format!(
"{file}{cwd}/src/tests/data/unusual_encodings/gb2312.html\n",
file = file_url_protocol,
cwd = cwd_normalized,
)
);
// STDOUT should contain original document without any modifications
let s: String;
if let Some(encoding) = Encoding::for_label(b"gb2312") {
let (string, _, _) = encoding.decode(&out.stdout);
s = string.to_string();
} else {
s = String::from_utf8_lossy(&out.stdout).to_string();
}
assert_eq!(
s,
"<html>\
<head>\n \
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n \
</head>\n \
<body>\n \
© Some Company\n \
\n\n</body>\
<head>\n \
<meta http-equiv=\"content-type\" content=\"text/html;charset=GB2312\">\n \
<title>近七成人减少线下需求\u{3000}银行数字化转型提速--经济·科技--人民网 </title>\n\
</head>\n\
<body>\n \
<h1>近七成人减少线下需求\u{3000}银行数字化转型提速</h1>\n\n\n\
</body>\
</html>\n"
);
// Exit code should be 0
out.assert().code(0);
}
#[test]
fn properly_save_document_with_gb2312_from_stdin() {
    // Feeds the GB2312 fixture to the binary through STDIN (target "-") and
    // checks that the bytes written to STDOUT, decoded as GB2312, match the
    // expected markup. Relies on a `cat` executable being available.
    let fixture_path = format!(
        "src{s}tests{s}data{s}unusual_encodings{s}gb2312.html",
        s = MAIN_SEPARATOR
    );

    // `cat` streams the raw fixture bytes into a pipe
    let mut echo = Command::new("cat")
        .arg(fixture_path)
        .stdout(Stdio::piped())
        .spawn()
        .unwrap();
    let echo_out = echo.stdout.take().unwrap();

    // Run the binary with the read end of that pipe as its STDIN. The command
    // is scoped so that its handle on the pipe is released before `cat` is reaped.
    let out = {
        let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
        cmd.stdin(echo_out);
        cmd.arg("-M").arg("-").output().unwrap()
    };

    // Wait for `cat` only after its output has had a consumer: waiting first
    // would block forever if the fixture did not fit into the pipe buffer.
    echo.wait().unwrap();

    // STDERR should be empty
    assert_eq!(String::from_utf8_lossy(&out.stderr), "");

    // STDOUT should contain HTML created out of STDIN
    // (decoded as GB2312; lossy UTF-8 is the fallback if the label is unknown)
    let s: String = match Encoding::for_label(b"gb2312") {
        Some(encoding) => encoding.decode(&out.stdout).0.into_owned(),
        None => String::from_utf8_lossy(&out.stdout).into_owned(),
    };
    assert_eq!(
        s,
        "<html>\
        <head>\n \
        <meta http-equiv=\"content-type\" content=\"text/html;charset=GB2312\">\n \
        <title>近七成人减少线下需求\u{3000}银行数字化转型提速--经济·科技--人民网 </title>\n\
        </head>\n\
        <body>\n \
        <h1>近七成人减少线下需求\u{3000}银行数字化转型提速</h1>\n\n\n\
        </body>\
        </html>\n"
    );

    // Exit code should be 0
    out.assert().code(0);
}
#[test]
fn properly_save_document_with_gb2312_custom_charset() {
let cwd = env::current_dir().unwrap();
let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/");
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
let out = cmd
.arg("-M")
.arg("-C")
.arg("utf8")
.arg(format!(
"src{s}tests{s}data{s}unusual_encodings{s}gb2312.html",
s = MAIN_SEPARATOR
))
.output()
.unwrap();
let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" };
// STDERR should contain only the target file
assert_eq!(
std::str::from_utf8(&out.stderr).unwrap(),
String::from_utf8_lossy(&out.stderr),
format!(
"{file}{cwd}/src/tests/data/unusual_encodings/gb2312.html\n",
file = file_url_protocol,
cwd = cwd_normalized,
)
);
// STDOUT should contain the document converted to UTF-8, with its META charset rewritten to utf8
assert_eq!(
String::from_utf8_lossy(&out.stdout).to_string(),
"<html>\
<head>\n \
<meta http-equiv=\"content-type\" content=\"text/html;charset=utf8\">\n \
<title>近七成人减少线下需求\u{3000}银行数字化转型提速--经济·科技--人民网 </title>\n\
</head>\n\
<body>\n \
<h1>近七成人减少线下需求\u{3000}银行数字化转型提速</h1>\n\n\n\
</body>\
</html>\n"
);
// Exit code should be 0
out.assert().code(0);
}
#[test]
fn properly_save_document_with_gb2312_custom_charset_bad() {
    // Passing an unrecognized charset label via -C must abort with an error
    // message on STDERR, nothing on STDOUT, and exit code 1.
    let fixture_path = format!(
        "src{s}tests{s}data{s}unusual_encodings{s}gb2312.html",
        s = MAIN_SEPARATOR
    );

    let out = Command::cargo_bin(env!("CARGO_PKG_NAME"))
        .unwrap()
        .args(&["-M", "-C", "utf0"])
        .arg(fixture_path)
        .output()
        .unwrap();

    let captured_stderr = String::from_utf8_lossy(&out.stderr);
    let captured_stdout = String::from_utf8_lossy(&out.stdout).to_string();

    // STDERR should contain error message
    assert_eq!(captured_stderr, "Unknown encoding: utf0\n");

    // STDOUT should be empty
    assert_eq!(captured_stdout, "");

    // Exit code should be 1
    out.assert().code(1);
}
}
// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗
// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝
// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗
// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod failing {
use assert_cmd::prelude::*;
use std::env;
use std::path::MAIN_SEPARATOR;
use std::process::Command;
#[test]
fn change_iso88591_to_utf8_to_properly_display_html_entities() {
let cwd = env::current_dir().unwrap();
let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/");
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
let out = cmd
.arg("-M")
.arg(format!(
"src{s}tests{s}data{s}unusual_encodings{s}iso-8859-1.html",
s = MAIN_SEPARATOR
))
.output()
.unwrap();
let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" };
// STDERR should contain only the target file
assert_eq!(
String::from_utf8_lossy(&out.stderr),
format!(
"{file}{cwd}/src/tests/data/unusual_encodings/iso-8859-1.html\n",
file = file_url_protocol,
@@ -50,7 +220,20 @@ mod passing {
)
);
// The exit code should be 0
// STDOUT should contain original document but with UTF-8 charset
assert_eq!(
String::from_utf8_lossy(&out.stdout),
"<html>\
<head>\n \
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">\n \
</head>\n \
<body>\n \
<20> Some Company\n \
\n\n</body>\
</html>\n"
);
// Exit code should be 0
out.assert().code(0);
}
}

View File

@@ -331,7 +331,7 @@ mod passing {
";
const CSS_OUT: &str = "\
#language a[href=\"#translations\"]:before {\n\
content: url(\"data:;base64,\") \"\\a \";\n\
content: url(\"data:text/plain;base64,\") \"\\a \";\n\
white-space: pre }\n\
";

View File

@@ -0,0 +1,9 @@
<html>
<head>
<meta http-equiv="content-type" content="text/html;charset=GB2312"/>
<title>近七成人减少线下需求 银行数字化转型提速--经济·科技--人民网 </title>
</head>
<body>
<h1>近七成人减少线下需求 银行数字化转型提速</h1>
</body>
</html>

View File

@@ -14,7 +14,7 @@ mod passing {
#[test]
fn basic() {
let html = "<div>text</div>";
let mut dom = html::html_to_dom(&html);
let mut dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
dom = html::add_favicon(&dom.document, "I_AM_A_FAVICON_DATA_URL".to_string());

View File

@@ -19,7 +19,7 @@ mod passing {
<body>
</body>
</html>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
assert_eq!(
html::get_base_url(&dom.document),
@@ -38,7 +38,7 @@ mod passing {
<body>
</body>
</html>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
assert_eq!(
html::get_base_url(&dom.document),
@@ -67,7 +67,7 @@ mod failing {
<body>
</body>
</html>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
assert_eq!(html::get_base_url(&dom.document), None);
}
@@ -82,7 +82,7 @@ mod failing {
<body>
</body>
</html>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
assert_eq!(html::get_base_url(&dom.document), None);
}
@@ -97,7 +97,7 @@ mod failing {
<body>
</body>
</html>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
assert_eq!(html::get_base_url(&dom.document), Some(str!()));
}

View File

@@ -0,0 +1,72 @@
// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗
// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝
// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗
// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod passing {
    use crate::html;

    /// Parses `markup` into a DOM (no charset hint passed) and asserts that
    /// `html::get_charset` reports `expected` for the resulting document.
    fn assert_charset(markup: &str, expected: &str) {
        let dom = html::html_to_dom(&markup.as_bytes().to_vec(), str!());

        assert_eq!(html::get_charset(&dom.document), Some(str!(expected)));
    }

    /// Charset declared through a `<meta http-equiv="content-type">` tag.
    #[test]
    fn meta_content_type() {
        assert_charset(
            "<!doctype html>
<html>
<head>
<meta http-equiv=\"content-type\" content=\"text/html;charset=GB2312\" />
</head>
<body>
</body>
</html>",
            "GB2312",
        );
    }

    /// Charset declared through a `<meta charset>` tag.
    #[test]
    fn meta_charset() {
        assert_charset(
            "<!doctype html>
<html>
<head>
<meta charset=\"GB2312\" />
</head>
<body>
</body>
</html>",
            "GB2312",
        );
    }

    /// Two conflicting declarations, `<meta charset>` first: the first one is expected.
    #[test]
    fn multiple_conflicting_meta_charset_first() {
        assert_charset(
            "<!doctype html>
<html>
<head>
<meta charset=\"utf-8\" />
<meta http-equiv=\"content-type\" content=\"text/html;charset=GB2312\" />
</head>
<body>
</body>
</html>",
            "utf-8",
        );
    }

    /// Two conflicting declarations, content-type first: the first one is expected.
    #[test]
    fn multiple_conflicting_meta_content_type_first() {
        assert_charset(
            "<!doctype html>
<html>
<head>
<meta http-equiv=\"content-type\" content=\"text/html;charset=GB2312\" />
<meta charset=\"utf-8\" />
</head>
<body>
</body>
</html>",
            "GB2312",
        );
    }
}

View File

@@ -14,7 +14,7 @@ mod passing {
#[test]
fn div_two_style_attributes() {
let html = "<!doctype html><html><head></head><body><DIV STYLE=\"color: blue;\" style=\"display: none;\"></div></body></html>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let mut count = 0;
fn test_walk(node: &Handle, i: &mut i8) {

View File

@@ -14,7 +14,7 @@ mod passing {
#[test]
fn parent_node_names() {
let html = "<!doctype html><html><HEAD></HEAD><body><div><P></P></div></body></html>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let mut count = 0;
fn test_walk(node: &Handle, i: &mut i8) {

View File

@@ -12,7 +12,7 @@ mod passing {
#[test]
fn icon() {
let html = "<link rel=\"icon\" href=\"\" /><div>text</div>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let res: bool = html::has_favicon(&dom.document);
assert!(res);
@@ -21,7 +21,7 @@ mod passing {
#[test]
fn shortcut_icon() {
let html = "<link rel=\"shortcut icon\" href=\"\" /><div>text</div>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let res: bool = html::has_favicon(&dom.document);
assert!(res);
@@ -42,7 +42,7 @@ mod failing {
#[test]
fn absent() {
let html = "<div>text</div>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let res: bool = html::has_favicon(&dom.document);
assert!(!res);

View File

@@ -4,10 +4,11 @@ mod compose_csp;
mod create_metadata_tag;
mod embed_srcset;
mod get_base_url;
mod get_charset;
mod get_node_attr;
mod get_node_name;
mod has_favicon;
mod is_icon;
mod serialize_document;
mod set_node_attr;
mod stringify_document;
mod walk_and_embed_assets;

View File

@@ -13,11 +13,11 @@ mod passing {
#[test]
fn div_as_root_element() {
let html = "<div><script src=\"some.js\"></script></div>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let options = Options::default();
assert_eq!(
html::stringify_document(&dom.document, &options),
String::from_utf8_lossy(&html::serialize_document(dom, str!(), &options)),
"<html><head></head><body><div><script src=\"some.js\"></script></div></body></html>"
);
}
@@ -28,15 +28,16 @@ mod passing {
<link rel=\"something\" href=\"some.css\" />\
<meta http-equiv=\"Content-Security-Policy\" content=\"default-src https:\">\
<div><script src=\"some.js\"></script></div>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let mut options = Options::default();
options.isolate = true;
assert_eq!(
html::stringify_document(
&dom.document,
String::from_utf8_lossy(&html::serialize_document(
dom,
str!(),
&options
),
)),
"<html>\
<head>\
<meta http-equiv=\"Content-Security-Policy\" content=\"default-src 'unsafe-inline' data:;\"></meta>\
@@ -59,12 +60,12 @@ mod passing {
<title>Unstyled document</title>\
<link rel=\"stylesheet\" href=\"main.css\"/>\
<div style=\"display: none;\"></div>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let mut options = Options::default();
options.no_css = true;
assert_eq!(
html::stringify_document(&dom.document, &options),
String::from_utf8_lossy(&html::serialize_document(dom, str!(), &options)),
"<!DOCTYPE html>\
<html>\
<head>\
@@ -83,15 +84,16 @@ mod passing {
<title>Frameless document</title>\
<link rel=\"something\"/>\
<div><script src=\"some.js\"></script></div>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let mut options = Options::default();
options.no_frames = true;
assert_eq!(
html::stringify_document(
&dom.document,
String::from_utf8_lossy(&html::serialize_document(
dom,
str!(),
&options
),
)),
"<!DOCTYPE html>\
<html>\
<head>\
@@ -115,7 +117,7 @@ mod passing {
<img style=\"width: 100%;\" src=\"some.png\" />\
<iframe src=\"some.html\"></iframe>\
</div>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let mut options = Options::default();
options.isolate = true;
options.no_css = true;
@@ -125,10 +127,11 @@ mod passing {
options.no_images = true;
assert_eq!(
html::stringify_document(
&dom.document,
String::from_utf8_lossy(&html::serialize_document(
dom,
str!(),
&options
),
)),
"<!DOCTYPE html>\
<html>\
<head>\

View File

@@ -14,7 +14,7 @@ mod passing {
#[test]
fn html_lang_and_body_style() {
let html = "<!doctype html><html lang=\"en\"><head></head><body></body></html>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let mut count = 0;
fn test_walk(node: &Handle, i: &mut i8) {
@@ -67,7 +67,7 @@ mod passing {
#[test]
fn body_background() {
let html = "<!doctype html><html lang=\"en\"><head></head><body background=\"1\" background=\"2\"></body></html>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let mut count = 0;
fn test_walk(node: &Handle, i: &mut i8) {

View File

@@ -20,7 +20,7 @@ mod passing {
let cache = &mut HashMap::new();
let html: &str = "<div><P></P></div>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let url: Url = Url::parse("http://localhost").unwrap();
let mut options = Options::default();
@@ -42,7 +42,7 @@ mod passing {
#[test]
fn ensure_no_recursive_iframe() {
let html = "<div><P></P><iframe src=\"\"></iframe></div>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let url: Url = Url::parse("http://localhost").unwrap();
let cache = &mut HashMap::new();
@@ -65,7 +65,7 @@ mod passing {
#[test]
fn ensure_no_recursive_frame() {
let html = "<frameset><frame src=\"\"></frameset>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let url: Url = Url::parse("http://localhost").unwrap();
let cache = &mut HashMap::new();
@@ -93,7 +93,7 @@ mod passing {
<style>html{background-color: #000;}</style>\
<div style=\"display: none;\"></div>\
";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let url: Url = Url::parse("http://localhost").unwrap();
let cache = &mut HashMap::new();
@@ -129,7 +129,7 @@ mod passing {
fn no_images() {
let html = "<link rel=\"icon\" href=\"favicon.ico\">\
<div><img src=\"http://localhost/assets/mono_lisa.png\" /></div>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let url: Url = Url::parse("http://localhost").unwrap();
let cache = &mut HashMap::new();
@@ -166,7 +166,7 @@ mod passing {
fn no_body_background_images() {
let html =
"<body background=\"no/such/image.png\" background=\"no/such/image2.png\"></body>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let url: Url = Url::parse("http://localhost").unwrap();
let cache = &mut HashMap::new();
@@ -190,7 +190,7 @@ mod passing {
#[test]
fn no_frames() {
let html = "<frameset><frame src=\"http://trackbook.com\"></frameset>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let url: Url = Url::parse("http://localhost").unwrap();
let cache = &mut HashMap::new();
@@ -222,7 +222,7 @@ mod passing {
#[test]
fn no_iframes() {
let html = "<iframe src=\"http://trackbook.com\"></iframe>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let url: Url = Url::parse("http://localhost").unwrap();
let cache = &mut HashMap::new();
@@ -258,7 +258,7 @@ mod passing {
<script>alert(1)</script>\
</div>\
";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let url: Url = Url::parse("http://localhost").unwrap();
let cache = &mut HashMap::new();
@@ -290,10 +290,10 @@ mod passing {
}
#[test]
fn keeps_integrity_for_linked_assets() {
fn keeps_integrity_for_unfamiliar_links() {
let html = "<title>Has integrity</title>\
<link integrity=\"sha384-12345\" rel=\"something\" href=\"https://some-site.com/some-file.ext\" />";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let url: Url = Url::parse("http://localhost").unwrap();
let cache = &mut HashMap::new();
@@ -322,13 +322,13 @@ mod passing {
}
#[test]
fn discards_integrity_for_linked_assets_nojs_nocss() {
fn discards_integrity_for_known_links_nojs_nocss() {
let html = "\
<title>No integrity</title>\
<link integrity=\"\" rel=\"stylesheet\" href=\"data:;\"/>\
<script integrity=\"\" src=\"some.js\"></script>\
";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let url: Url = Url::parse("http://localhost").unwrap();
let cache = &mut HashMap::new();
@@ -366,7 +366,7 @@ mod passing {
<link integrity=\"sha384-123\" rel=\"something\" href=\"data:;\"/>\
<script integrity=\"sha384-456\" src=\"some.js\"></script>\
";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let url: Url = Url::parse("http://localhost").unwrap();
let cache = &mut HashMap::new();
@@ -403,14 +403,14 @@ mod passing {
let html = "\
<html>\
<head>\
<meta http-equiv=\"Refresh\" value=\"20\"/>\
<meta http-equiv=\"Location\" value=\"https://freebsd.org\"/>\
<meta http-equiv=\"Refresh\" content=\"2\"/>\
<meta http-equiv=\"Location\" content=\"https://freebsd.org\"/>\
</head>\
<body>\
</body>\
</html>\
";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let url: Url = Url::parse("http://localhost").unwrap();
let cache = &mut HashMap::new();
@@ -433,8 +433,8 @@ mod passing {
"\
<html>\
<head>\
<meta http-equiv=\"disabled by monolith (Refresh)\" value=\"20\">\
<meta http-equiv=\"disabled by monolith (Location)\" value=\"https://freebsd.org\">\
<meta content=\"2\">\
<meta content=\"https://freebsd.org\">\
</head>\
<body>\
</body>\
@@ -452,7 +452,7 @@ mod passing {
</noscript>\
</body>\
</html>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let url: Url = Url::parse("http://localhost").unwrap();
let cache = &mut HashMap::new();
@@ -488,7 +488,7 @@ mod passing {
#[test]
fn preserves_script_type_json() {
let html = "<script id=\"data\" type=\"application/json\">{\"mono\":\"lith\"}</script>";
let dom = html::html_to_dom(&html);
let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!());
let url: Url = Url::parse("http://localhost").unwrap();
let cache = &mut HashMap::new();

View File

@@ -16,6 +16,7 @@ mod passing {
assert_eq!(options.no_audio, false);
assert_eq!(options.base_url, None);
assert_eq!(options.no_css, false);
assert_eq!(options.charset, None);
assert_eq!(options.no_frames, false);
assert_eq!(options.no_fonts, false);
assert_eq!(options.no_images, false);

View File

@@ -13,9 +13,14 @@ mod passing {
#[test]
fn encode_string_with_specific_media_type() {
let mime = "application/javascript";
let media_type = "application/javascript";
let data = "var word = 'hello';\nalert(word);\n";
let data_url = url::create_data_url(mime, data.as_bytes(), &Url::parse("data:,").unwrap());
let data_url = url::create_data_url(
media_type,
"",
data.as_bytes(),
&Url::parse("data:,").unwrap(),
);
assert_eq!(
data_url.as_str(),
@@ -28,6 +33,7 @@ mod passing {
let data = "<svg></svg>\n";
let data_url = url::create_data_url(
"image/svg+xml",
"",
data.as_bytes(),
&Url::parse("data:,").unwrap(),
);
@@ -37,4 +43,67 @@ mod passing {
"data:image/svg+xml;base64,PHN2Zz48L3N2Zz4K"
);
}
#[test]
fn encode_string_with_specific_media_type_and_charset() {
let media_type = "application/javascript";
let charset = "utf8";
let data = "var word = 'hello';\nalert(word);\n";
let data_url = url::create_data_url(
media_type,
charset,
data.as_bytes(),
&Url::parse("data:,").unwrap(),
);
assert_eq!(
data_url.as_str(),
"data:application/javascript;charset=utf8;base64,dmFyIHdvcmQgPSAnaGVsbG8nOwphbGVydCh3b3JkKTsK"
);
}
#[test]
fn create_data_url_with_us_ascii_charset() {
let media_type = "";
let charset = "us-ascii";
let data = "";
let data_url = url::create_data_url(
media_type,
charset,
data.as_bytes(),
&Url::parse("data:,").unwrap(),
);
assert_eq!(data_url.as_str(), "data:;base64,");
}
#[test]
fn create_data_url_with_utf8_charset() {
let media_type = "";
let charset = "utf8";
let data = "";
let data_url = url::create_data_url(
media_type,
charset,
data.as_bytes(),
&Url::parse("data:,").unwrap(),
);
assert_eq!(data_url.as_str(), "data:;charset=utf8;base64,");
}
#[test]
fn create_data_url_with_media_type_text_plain_and_utf8_charset() {
let media_type = "text/plain";
let charset = "utf8";
let data = "";
let data_url = url::create_data_url(
media_type,
charset,
data.as_bytes(),
&Url::parse("data:,").unwrap(),
);
assert_eq!(data_url.as_str(), "data:text/plain;charset=utf8;base64,");
}
}

View File

@@ -13,9 +13,10 @@ mod passing {
#[test]
fn parse_text_html_base64() {
let (media_type, data) = url::parse_data_url(&Url::parse("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg==").unwrap());
let (media_type, charset, data) = url::parse_data_url(&Url::parse("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg==").unwrap());
assert_eq!(media_type, "text/html");
assert_eq!(charset, "US-ASCII");
assert_eq!(
String::from_utf8_lossy(&data),
"Work expands so as to fill the time available for its completion"
@@ -24,11 +25,12 @@ mod passing {
#[test]
fn parse_text_html_utf8() {
let (media_type, data) = url::parse_data_url(
&Url::parse("data:text/html;utf8,Work expands so as to fill the time available for its completion").unwrap(),
let (media_type, charset, data) = url::parse_data_url(
&Url::parse("data:text/html;charset=utf8,Work expands so as to fill the time available for its completion").unwrap(),
);
assert_eq!(media_type, "text/html");
assert_eq!(charset, "utf8");
assert_eq!(
String::from_utf8_lossy(&data),
"Work expands so as to fill the time available for its completion"
@@ -37,7 +39,7 @@ mod passing {
#[test]
fn parse_text_html_plaintext() {
let (media_type, data) = url::parse_data_url(
let (media_type, charset, data) = url::parse_data_url(
&Url::parse(
"data:text/html,Work expands so as to fill the time available for its completion",
)
@@ -45,6 +47,7 @@ mod passing {
);
assert_eq!(media_type, "text/html");
assert_eq!(charset, "US-ASCII");
assert_eq!(
String::from_utf8_lossy(&data),
"Work expands so as to fill the time available for its completion"
@@ -53,26 +56,31 @@ mod passing {
#[test]
fn parse_text_css_url_encoded() {
let (media_type, data) =
let (media_type, charset, data) =
url::parse_data_url(&Url::parse("data:text/css,div{background-color:%23000}").unwrap());
assert_eq!(media_type, "text/css");
assert_eq!(charset, "US-ASCII");
assert_eq!(String::from_utf8_lossy(&data), "div{background-color:#000}");
}
#[test]
fn parse_no_media_type_base64() {
let (media_type, data) = url::parse_data_url(&Url::parse("data:;base64,dGVzdA==").unwrap());
let (media_type, charset, data) =
url::parse_data_url(&Url::parse("data:;base64,dGVzdA==").unwrap());
assert_eq!(media_type, "");
assert_eq!(media_type, "text/plain");
assert_eq!(charset, "US-ASCII");
assert_eq!(String::from_utf8_lossy(&data), "test");
}
#[test]
fn parse_no_media_type_no_encoding() {
let (media_type, data) = url::parse_data_url(&Url::parse("data:;,test%20test").unwrap());
let (media_type, charset, data) =
url::parse_data_url(&Url::parse("data:;,test%20test").unwrap());
assert_eq!(media_type, "");
assert_eq!(media_type, "text/plain");
assert_eq!(charset, "US-ASCII");
assert_eq!(String::from_utf8_lossy(&data), "test test");
}
}
@@ -92,9 +100,10 @@ mod failing {
#[test]
fn empty_data_url() {
let (media_type, data) = url::parse_data_url(&Url::parse("data:,").unwrap());
let (media_type, charset, data) = url::parse_data_url(&Url::parse("data:,").unwrap());
assert_eq!(media_type, "");
assert_eq!(media_type, "text/plain");
assert_eq!(charset, "US-ASCII");
assert_eq!(String::from_utf8_lossy(&data), "");
}
}

View File

@@ -28,4 +28,9 @@ mod passing {
fn three() {
assert_eq!(utils::indent(3), " ");
}
#[test]
fn four() {
assert_eq!(utils::indent(4), " ");
}
}

View File

@@ -1,3 +1,4 @@
mod detect_media_type;
mod indent;
mod parse_content_type;
mod retrieve_asset;

View File

@@ -0,0 +1,86 @@
// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗
// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝
// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗
// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod passing {
    use crate::utils;

    /// Runs `utils::parse_content_type` on `input` and checks every component
    /// of the returned (media type, charset, is-base64) triple.
    fn expect_parsed(input: &str, media_type: &str, charset: &str, base64: bool) {
        let (parsed_media_type, parsed_charset, parsed_is_base64) =
            utils::parse_content_type(input);

        assert_eq!(parsed_media_type, media_type);
        assert_eq!(parsed_charset, charset);
        assert_eq!(parsed_is_base64, base64);
    }

    /// Media type followed by an explicit charset parameter.
    #[test]
    fn text_plain_utf8() {
        expect_parsed("text/plain;charset=utf8", "text/plain", "utf8", false);
    }

    /// Same as above, with whitespace around the components.
    #[test]
    fn text_plain_utf8_spaces() {
        expect_parsed(" text/plain ; charset=utf8 ", "text/plain", "utf8", false);
    }

    /// Empty input is expected to yield text/plain with US-ASCII.
    #[test]
    fn empty() {
        expect_parsed("", "text/plain", "US-ASCII", false);
    }

    /// Only the base64 marker, no media type.
    #[test]
    fn base64() {
        expect_parsed(";base64", "text/plain", "US-ASCII", true);
    }

    /// Media type plus the base64 marker.
    #[test]
    fn text_html_base64() {
        expect_parsed("text/html;base64", "text/html", "US-ASCII", true);
    }

    /// Media type alone, without any parameters.
    #[test]
    fn only_media_type() {
        expect_parsed("text/html", "text/html", "US-ASCII", false);
    }

    /// Media type with a trailing `;` (the test name says "colon").
    #[test]
    fn only_media_type_colon() {
        expect_parsed("text/html;", "text/html", "US-ASCII", false);
    }

    /// Charset parameter placed before an unrelated parameter.
    #[test]
    fn media_type_gb2312_filename() {
        expect_parsed(
            "text/html;charset=GB2312;filename=index.html",
            "text/html",
            "GB2312",
            false,
        );
    }

    /// Charset parameter placed after an unrelated parameter.
    #[test]
    fn media_type_filename_gb2312() {
        expect_parsed(
            "text/html;filename=index.html;charset=GB2312",
            "text/html",
            "GB2312",
            false,
        );
    }
}

View File

@@ -26,7 +26,7 @@ mod passing {
// If both source and target are data URLs,
// ensure the result contains target data URL
let (data, final_url, media_type) = utils::retrieve_asset(
let (data, final_url, media_type, charset) = utils::retrieve_asset(
cache,
&client,
&Url::parse("data:text/html;base64,c291cmNl").unwrap(),
@@ -35,23 +35,16 @@ mod passing {
0,
)
.unwrap();
assert_eq!(&media_type, "text/html");
assert_eq!(&charset, "US-ASCII");
assert_eq!(
url::create_data_url(&media_type, &data, &final_url),
url::create_data_url(
"text/html",
"target".as_bytes(),
&Url::parse("data:text/html;base64,c291cmNl").unwrap()
)
url::create_data_url(&media_type, &charset, &data, &final_url),
Url::parse("data:text/html;base64,dGFyZ2V0").unwrap(),
);
assert_eq!(
final_url,
url::create_data_url(
"text/html",
"target".as_bytes(),
&Url::parse("data:text/html;base64,c291cmNl").unwrap()
)
Url::parse("data:text/html;base64,dGFyZ2V0").unwrap(),
);
assert_eq!(&media_type, "text/html");
}
#[test]
@@ -66,7 +59,7 @@ mod passing {
// Inclusion of local assets from local sources should be allowed
let cwd = env::current_dir().unwrap();
let (data, final_url, _media_type) = utils::retrieve_asset(
let (data, final_url, media_type, charset) = utils::retrieve_asset(
cache,
&client,
&Url::parse(&format!(
@@ -85,7 +78,9 @@ mod passing {
0,
)
.unwrap();
assert_eq!(url::create_data_url("application/javascript", &data, &final_url), Url::parse("data:application/javascript;base64,ZG9jdW1lbnQuYm9keS5zdHlsZS5iYWNrZ3JvdW5kQ29sb3IgPSAiZ3JlZW4iOwpkb2N1bWVudC5ib2R5LnN0eWxlLmNvbG9yID0gInJlZCI7Cg==").unwrap());
assert_eq!(&media_type, "application/javascript");
assert_eq!(&charset, "");
assert_eq!(url::create_data_url(&media_type, &charset, &data, &final_url), Url::parse("data:application/javascript;base64,ZG9jdW1lbnQuYm9keS5zdHlsZS5iYWNrZ3JvdW5kQ29sb3IgPSAiZ3JlZW4iOwpkb2N1bWVudC5ib2R5LnN0eWxlLmNvbG9yID0gInJlZCI7Cg==").unwrap());
assert_eq!(
final_url,
Url::parse(&format!(

View File

@@ -1,7 +1,7 @@
use base64;
use url::{form_urlencoded, Url};
use crate::utils::detect_media_type;
use crate::utils::{detect_media_type, parse_content_type};
pub fn clean_url(url: Url) -> Url {
let mut url = url.clone();
@@ -12,7 +12,8 @@ pub fn clean_url(url: Url) -> Url {
url
}
pub fn create_data_url(media_type: &str, data: &[u8], final_asset_url: &Url) -> Url {
pub fn create_data_url(media_type: &str, charset: &str, data: &[u8], final_asset_url: &Url) -> Url {
// TODO: move this block out of this function
let media_type: String = if media_type.is_empty() {
detect_media_type(data, &final_asset_url)
} else {
@@ -21,7 +22,14 @@ pub fn create_data_url(media_type: &str, data: &[u8], final_asset_url: &Url) ->
let mut data_url: Url = Url::parse("data:,").unwrap();
data_url.set_path(format!("{};base64,{}", media_type, base64::encode(data)).as_str());
let c: String =
if !charset.trim().is_empty() && !charset.trim().eq_ignore_ascii_case("US-ASCII") {
format!(";charset={}", charset.trim())
} else {
str!()
};
data_url.set_path(format!("{}{};base64,{}", media_type, c, base64::encode(data)).as_str());
data_url
}
@@ -37,42 +45,26 @@ pub fn is_url_and_has_protocol(input: &str) -> bool {
}
}
pub fn parse_data_url(url: &Url) -> (String, Vec<u8>) {
pub fn parse_data_url(url: &Url) -> (String, String, Vec<u8>) {
let path: String = url.path().to_string();
let comma_loc: usize = path.find(',').unwrap_or(path.len());
let meta_data: String = path.chars().take(comma_loc).collect();
let raw_data: String = path.chars().skip(comma_loc + 1).collect();
// Split data URL into meta data and raw data
let content_type: String = path.chars().take(comma_loc).collect();
let data: String = path.chars().skip(comma_loc + 1).collect();
let text: String = percent_decode(raw_data);
// Parse meta data
let (media_type, charset, is_base64) = parse_content_type(&content_type);
let meta_data_items: Vec<&str> = meta_data.split(';').collect();
let mut media_type: String = str!();
let mut encoding: &str = "";
let mut i: i8 = 0;
for item in &meta_data_items {
if i == 0 {
media_type = str!(item);
} else {
if item.eq_ignore_ascii_case("base64")
|| item.eq_ignore_ascii_case("utf8")
|| item.eq_ignore_ascii_case("charset=UTF-8")
{
encoding = item;
}
}
i = i + 1;
}
let data: Vec<u8> = if encoding.eq_ignore_ascii_case("base64") {
// Parse raw data into vector of bytes
let text: String = percent_decode(data);
let blob: Vec<u8> = if is_base64 {
base64::decode(&text).unwrap_or(vec![])
} else {
text.as_bytes().to_vec()
};
(media_type, data)
(media_type, charset, blob)
}
pub fn percent_decode(input: String) -> String {

View File

@@ -33,25 +33,63 @@ const MAGIC: [[&[u8]; 2]; 18] = [
[b"....moov", b"video/quicktime"],
[b"\x1A\x45\xDF\xA3", b"video/webm"],
];
const PLAINTEXT_MEDIA_TYPES: &[&str] = &["application/javascript", "image/svg+xml"];
const PLAINTEXT_MEDIA_TYPES: &[&str] = &[
"application/javascript",
"application/json",
"image/svg+xml",
];
pub fn detect_media_type(data: &[u8], url: &Url) -> String {
// At first attempt to read file's header
for magic_item in MAGIC.iter() {
if data.starts_with(magic_item[0]) {
return String::from_utf8(magic_item[1].to_vec()).unwrap();
}
}
if url.path().to_lowercase().ends_with(".svg") {
return str!("image/svg+xml");
}
str!()
// If header didn't match any known magic signatures,
// try to guess media type from file name
let parts: Vec<&str> = url.path().split('/').collect();
detect_media_type_by_file_name(parts.last().unwrap())
}
pub fn is_plaintext_media_type(media_type: &str) -> bool {
media_type.to_lowercase().as_str().starts_with("text/")
|| PLAINTEXT_MEDIA_TYPES.contains(&media_type.to_lowercase().as_str())
/// Guesses a media type from the extension of `filename`.
///
/// The extension is the text following the last `.` in `filename` and is
/// compared case-insensitively against a fixed table of known extensions.
///
/// Returns an empty string when `filename` contains no `.` at all, or when
/// its extension is not present in the table.
pub fn detect_media_type_by_file_name(filename: &str) -> String {
    // A name without any dot has no extension. Without this guard a file that
    // is literally called "js" or "png" would be mistaken for that media type.
    let extension: String = match filename.rfind('.') {
        // '.' is a single byte, so `dot_loc + 1` is always a char boundary
        Some(dot_loc) => filename[dot_loc + 1..].to_lowercase(),
        None => return String::new(),
    };

    let mime: &str = match extension.as_str() {
        "avi" => "video/avi",
        "bmp" => "image/bmp",
        "css" => "text/css",
        "flac" => "audio/flac",
        "gif" => "image/gif",
        "htm" | "html" => "text/html",
        "ico" => "image/x-icon",
        "jpeg" | "jpg" => "image/jpeg",
        "js" => "application/javascript",
        "json" => "application/json",
        "mp3" => "audio/mpeg",
        "mp4" | "m4v" => "video/mp4",
        "ogg" => "audio/ogg",
        "ogv" => "video/ogg",
        "pdf" => "application/pdf",
        "png" => "image/png",
        "svg" => "image/svg+xml",
        "swf" => "application/x-shockwave-flash",
        "tif" | "tiff" => "image/tiff",
        "txt" => "text/plain",
        "wav" => "audio/wav",
        "webp" => "image/webp",
        "woff" => "font/woff",
        "woff2" => "font/woff2",
        "xml" => "text/xml",
        // Unknown (or empty) extension
        _ => "",
    };

    mime.to_string()
}
pub fn indent(level: u32) -> String {
@@ -66,6 +104,38 @@ pub fn indent(level: u32) -> String {
result
}
/// Tells whether `media_type` denotes textual content.
///
/// The comparison is case-insensitive. A media type counts as plaintext when
/// it begins with `text/` or when it is listed in `PLAINTEXT_MEDIA_TYPES`.
pub fn is_plaintext_media_type(media_type: &str) -> bool {
    let normalized: String = media_type.to_lowercase();

    if normalized.starts_with("text/") {
        return true;
    }

    PLAINTEXT_MEDIA_TYPES
        .iter()
        .any(|known| *known == normalized.as_str())
}
/// Splits a `Content-Type`-style string into its components.
///
/// The input is a `;`-separated list whose first item is the media type and
/// whose remaining items are parameters. Surrounding whitespace of every item
/// is ignored.
///
/// Returns `(media_type, charset, is_base64)`:
/// * `media_type` — the first item, or `"text/plain"` when it is empty;
/// * `charset` — the value of the last `charset=` parameter with enclosing
///   double quotes removed, or `"US-ASCII"` when no such parameter is present;
/// * `is_base64` — `true` when a `base64` item is present.
///
/// Both the `charset` parameter name and the `base64` flag are matched
/// case-insensitively. Any other parameter (e.g. `filename=...`) is ignored.
pub fn parse_content_type(content_type: &str) -> (String, String, bool) {
    const CHARSET_PREFIX: &str = "charset=";

    let mut media_type: String = String::from("text/plain");
    let mut charset: String = String::from("US-ASCII");
    let mut is_base64: bool = false;

    let mut items = content_type.split(';').map(str::trim);

    // The first item is always the media type; keep the default if it's blank
    if let Some(first_item) = items.next() {
        if !first_item.is_empty() {
            media_type = first_item.to_string();
        }
    }

    // Everything after the media type is a parameter or the base64 flag
    for item in items {
        if item.eq_ignore_ascii_case("base64") {
            is_base64 = true;
        } else if let Some(name) = item.get(..CHARSET_PREFIX.len()) {
            // `get` yields None for items that are too short or that would be
            // cut in the middle of a multi-byte character, so the slice below
            // cannot panic.
            if name.eq_ignore_ascii_case(CHARSET_PREFIX) {
                // The value may be given as a quoted string: charset="utf-8"
                charset = item[CHARSET_PREFIX.len()..]
                    .trim_matches('"')
                    .to_string();
            }
        }
    }

    (media_type, charset, is_base64)
}
pub fn retrieve_asset(
cache: &mut HashMap<String, Vec<u8>>,
client: &Client,
@@ -73,10 +143,10 @@ pub fn retrieve_asset(
url: &Url,
options: &Options,
depth: u32,
) -> Result<(Vec<u8>, Url, String), reqwest::Error> {
) -> Result<(Vec<u8>, Url, String, String), reqwest::Error> {
if url.scheme() == "data" {
let (media_type, data) = parse_data_url(url);
Ok((data, url.clone(), media_type))
let (media_type, charset, data) = parse_data_url(url);
Ok((data, url.clone(), media_type, charset))
} else if url.scheme() == "file" {
// Check if parent_url is also file:/// (if not, then we don't embed the asset)
if parent_url.scheme() != "file" {
@@ -123,7 +193,14 @@ pub fn retrieve_asset(
eprintln!("{}{}", indent(depth).as_str(), &url);
}
Ok((fs::read(&path).expect(""), url.clone(), str!()))
let file_blob: Vec<u8> = fs::read(&path).expect("Unable to read file");
Ok((
file_blob.clone(),
url.clone(),
detect_media_type(&file_blob, url),
str!(),
))
}
} else {
if !options.silent {
@@ -147,16 +224,19 @@ pub fn retrieve_asset(
let cache_key: String = clean_url(url.clone()).as_str().to_string();
if cache.contains_key(&cache_key) {
// URL is in cache,
// we get and return it
// URL is in cache, we get and return it
if !options.silent {
eprintln!("{}{} (from cache)", indent(depth).as_str(), &url);
}
Ok((cache.get(&cache_key).unwrap().to_vec(), url.clone(), str!()))
Ok((
cache.get(&cache_key).unwrap().to_vec(),
url.clone(),
str!(),
str!(),
))
} else {
// URL not in cache,
// we retrieve the file
// URL not in cache, we retrieve the file
match client.get(url.as_str()).send() {
Ok(mut response) => {
if !options.ignore_errors && response.status() != 200 {
@@ -192,18 +272,20 @@ pub fn retrieve_asset(
let mut data: Vec<u8> = vec![];
response.copy_to(&mut data).unwrap();
// Attempt to obtain media type by reading Content-Type header
let media_type: &str = response
// Attempt to obtain media type and charset by reading Content-Type header
let content_type: &str = response
.headers()
.get(CONTENT_TYPE)
.and_then(|header| header.to_str().ok())
.unwrap_or("");
let (media_type, charset, _is_base64) = parse_content_type(&content_type);
// Add retrieved resource to cache
cache.insert(new_cache_key, data.clone());
// Return
Ok((data, response.url().clone(), media_type.to_string()))
Ok((data, response.url().clone(), media_type, charset))
}
Err(error) => {
if !options.silent {