61 Commits

Author SHA1 Message Date
Sunshine
8b0635bd84 bump version number (2.8.2 -> 2.8.3) 2024-09-05 19:00:24 -04:00
Sunshine
b685d3a46c update dependencies 2024-09-05 19:00:24 -04:00
Sunshine
82d05fc0f1 only send Referer HTTP header if the parent URL's scheme is http(s) 2024-09-05 18:38:52 -04:00
Orhun Parmaksız
65ac5c36b1 Support disabling vendored OpenSSL while building 2024-09-05 17:36:13 -04:00
Sunshine
1f60a76fcd fix version of shopify/upload-to-release 2024-09-02 18:11:48 -10:00
Sunshine
f067fc2324 bump version number (2.8.1 -> 2.8.2) 2024-09-03 00:01:37 -04:00
Sunshine
49d7585e02 get rid of useless docs 2024-09-02 23:50:09 -04:00
Caleb Hattingh
e25b7bc470 Link openssl statically - fixes #340 2024-09-02 23:40:15 -04:00
Sunshine
329c0568a4 update README.md 2024-08-19 07:21:10 -10:00
Andriy Rakhnin
f151a33c48 update authors 2024-08-16 10:52:58 -04:00
Andriy Rakhnin
ebf96bf1e5 referer_url refactoring 2024-08-16 10:52:58 -04:00
Andriy Rakhnin
c10c78a27d add HTTP referer header for resource requests 2024-08-16 10:52:58 -04:00
Andriy Rakhnin
64e84e4983 couple of cosmetic changes 2024-08-15 14:14:51 -04:00
Andriy Rakhnin
674d4085c7 fix srcset attribute parsing according to the specifications 2024-08-15 14:14:51 -04:00
Bryan Honof
e0fd5d4bb9 Update README.md 2024-07-30 14:54:01 -04:00
Bryan Honof
f4e360f09d Add flox installation option 2024-07-30 14:54:01 -04:00
Sunshine
0b4116f48a bump version in nuget manifest file 2024-06-24 12:06:22 -04:00
Orhun Parmaksız
084981a2ae Update instructions for installing on Arch Linux 2024-06-13 14:20:15 -04:00
Sunshine
a3feb7b721 update dependencies 2024-05-17 11:05:35 -04:00
Sunshine
87eb197e33 do not indent links based on depth in the output 2024-05-17 10:47:25 -04:00
Viktor Szépe
6798cad2b2 Fix typos 2024-03-29 06:24:14 -10:00
Waldir Pimenta
174cb50877 Add installation instructions for NixPkgs 2024-03-27 01:32:17 -10:00
Sunshine
e397a7532d Update README.md 2024-03-26 03:00:12 -10:00
Sunshine
91d8c146a9 Update README.md 2024-03-26 02:52:59 -10:00
Sunshine
67e07b91af provide more installation instructions 2024-03-26 02:51:44 -10:00
Sunshine
f797b8c999 get rid of outdated related projects 2024-03-26 02:05:07 -10:00
Sunshine
60251c6878 switch to new builder for Docker 2024-03-25 03:44:15 -10:00
Sunshine
b70801d55b ignore /target/ when building Docker image 2024-03-25 02:55:06 -10:00
Sunshine
2a50936990 update one of the author's email 2024-03-25 02:32:32 -10:00
Fred Weitendorf
f9e961f088 Add cargo setup to README
I did not have cargo installed on my system when I first tried to build the repo from a fresh git clone (I had not noticed it was a rust project), and it wasn't called out as a dependency for installation from source. I've added it as a noted dependency with some quick, collapsible installation instructions I followed to get it installed.
2024-03-25 01:33:23 -10:00
Thomas Kraxner
d8c3620d00 Scoop as install source 2024-03-25 01:30:17 -10:00
Thomas Merz
4aab0a64ee 🩹 fix README to prevent 'ERROR: failed to solve: invalid reference format: repository name must be lowercase'
and 'docker: invalid reference format: repository name must be lowercase.'
2024-03-25 01:06:43 -10:00
Sunshine
a2155e0af6 switch Rust edition from 2018 to 2021 2024-01-14 07:04:32 -10:00
Sunshine
f7e5527432 upgrade html5ever to 0.26.0 2024-01-14 06:41:46 -10:00
Sunshine
f7dd09d481 get rid of warnings from old version of Shopify/upload-to-release 2024-01-14 06:11:51 -10:00
Sunshine
73c0ceebd4 bump version number (2.8.0 -> 2.8.1) 2024-01-13 16:02:39 -10:00
Sunshine
727eae2e35 get rid of warnings related to new API of base64 crate 2024-01-13 15:57:26 -10:00
Sunshine
b7a38c9f4a Update README.md 2024-01-13 15:44:39 -10:00
vladislav doster
aa556094a4 fix: bump CD workflow runner to ubuntu-20.04
ubuntu-18.04 runners have been deprecated

https://docs.github.com/en/actions/using-jobs/choosing-the-runner-for-a-job#choosing-github-hosted-runners
2024-01-13 15:10:23 -10:00
Sunshine
81b304c558 optimize code 2024-01-13 05:33:54 -10:00
Sunshine
a3e82a2ad8 update dependencies 2024-01-13 05:33:54 -10:00
Sunshine
a5bf1705db bump version number (2.7.0 -> 2.8.0) 2024-01-13 05:33:54 -10:00
Sunshine
78c37958dc add support for using cookie file 2024-01-13 04:50:51 -10:00
Sunshine
20c56a5440 Create FUNDING.yml 2023-09-23 03:29:40 -10:00
Sunshine
37416f827b add installation instruction for Chocolatey 2022-12-05 10:11:24 -10:00
Sunshine
7f123e810b add installation instructions for Arch Linux and Alpine Linux 2022-12-05 10:11:24 -10:00
Sunshine
db04d11d99 add installation instructions for GUIX 2022-11-23 16:39:15 -10:00
Sunshine
1c8d4f1830 Update README.md 2022-11-13 11:16:28 -05:00
Sunshine
1c71e708e1 bump version number (2.6.2 -> 2.7.0), update dependencies 2022-11-10 06:36:32 -10:00
Sunshine
a1bb9a4b74 Update README.md 2022-11-10 06:03:30 -10:00
Sunshine
cf7e368545 Update README.md 2022-11-10 06:02:46 -10:00
Sunshine
c1edde9b3e refine CLI API for white/black-listing of domains 2022-11-10 05:37:36 -10:00
Sunshine
7c0504c4cb Update README.md 2022-11-10 04:45:37 -10:00
Sunshine
1bff2c22ba Update README.md 2022-11-10 04:41:13 -10:00
Sunshine
8113509dcf fix tests 2022-11-10 04:12:31 -10:00
Sunshine
8fc0fc155f parse XML documents, save non-HTML and non-XML targets unparsed 2022-11-10 04:12:31 -10:00
Jakub Jirutka
7c61b462dd disable unnecessary/unused regex features to reduce binary size
This will reduce the monolith binary size by ~15%.
2022-09-20 11:46:26 -04:00
Simone Mosciatti
ef3684025b move to use http instead of https 2022-09-11 14:30:44 -04:00
Simone Mosciatti
db7ee697b3 rewrite small part of the input argument handling
This commit rewrites a small part of the input argument handling, trying
to follow best Rust practices.
We get rid of a variable and of a mutable reference while keeping the
code a bit more concise.
2022-09-11 14:30:44 -04:00
Sunshine
89ce5029b9 add option to blacklist/whitelist domains 2022-09-01 13:35:52 -10:00
dependabot[bot]
54609b10e5 Bump iana-time-zone from 0.1.44 to 0.1.46 (#316)
Bumps [iana-time-zone](https://github.com/strawlab/iana-time-zone) from 0.1.44 to 0.1.46.
- [Release notes](https://github.com/strawlab/iana-time-zone/releases)
- [Changelog](https://github.com/strawlab/iana-time-zone/blob/main/CHANGELOG.md)
- [Commits](https://github.com/strawlab/iana-time-zone/compare/0.1.44...v0.1.46)

---
updated-dependencies:
- dependency-name: iana-time-zone
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2022-08-31 11:35:38 -10:00
55 changed files with 2113 additions and 1266 deletions

1
.dockerignore Normal file
View File

@@ -0,0 +1 @@
/target/

3
.github/FUNDING.yml vendored Normal file
View File

@@ -0,0 +1,3 @@
# These are supported funding model platforms
github: snshn

View File

@@ -20,14 +20,14 @@ jobs:
- name: Build the executable
run: cargo build --release
- uses: Shopify/upload-to-release@1.0.0
- uses: Shopify/upload-to-release@v2.0.0
with:
name: monolith.exe
path: target\release\monolith.exe
repo-token: ${{ secrets.GITHUB_TOKEN }}
gnu_linux_armhf:
runs-on: ubuntu-18.04
runs-on: ubuntu-20.04
steps:
- name: Checkout the repository
uses: actions/checkout@v2
@@ -53,14 +53,14 @@ jobs:
run: cargo build --release --target=arm-unknown-linux-gnueabihf
- name: Attach artifact to the release
uses: Shopify/upload-to-release@1.0.0
uses: Shopify/upload-to-release@v2.0.0
with:
name: monolith-gnu-linux-armhf
path: target/arm-unknown-linux-gnueabihf/release/monolith
repo-token: ${{ secrets.GITHUB_TOKEN }}
gnu_linux_aarch64:
runs-on: ubuntu-18.04
runs-on: ubuntu-20.04
steps:
- name: Checkout the repository
uses: actions/checkout@v2
@@ -86,14 +86,14 @@ jobs:
run: cargo build --release --target=aarch64-unknown-linux-gnu
- name: Attach artifact to the release
uses: Shopify/upload-to-release@1.0.0
uses: Shopify/upload-to-release@v2.0.0
with:
name: monolith-gnu-linux-aarch64
path: target/aarch64-unknown-linux-gnu/release/monolith
repo-token: ${{ secrets.GITHUB_TOKEN }}
gnu_linux_x86_64:
runs-on: ubuntu-18.04
runs-on: ubuntu-20.04
steps:
- name: Checkout the repository
uses: actions/checkout@v2
@@ -101,7 +101,7 @@ jobs:
- name: Build the executable
run: cargo build --release
- uses: Shopify/upload-to-release@1.0.0
- uses: Shopify/upload-to-release@v2.0.0
with:
name: monolith-gnu-linux-x86_64
path: target/release/monolith

1381
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,14 +1,15 @@
[package]
name = "monolith"
version = "2.6.2"
version = "2.8.3"
authors = [
"Sunshine <sunshine@uberspace.net>",
"Sunshine <snshn@tutanota.com>",
"Mahdi Robatipoor <mahdi.robatipoor@gmail.com>",
"Emmanuel Delaborde <th3rac25@gmail.com>",
"Emi Simpson <emi@alchemi.dev>",
"rhysd <lin90162@yahoo.co.jp>",
"Andriy Rakhnin <a@rakhnin.com>",
]
edition = "2018"
edition = "2021"
description = "CLI tool for saving web pages as a single HTML file"
homepage = "https://github.com/Y2Z/monolith"
repository = "https://github.com/Y2Z/monolith"
@@ -21,23 +22,36 @@ include = [
]
license = "CC0-1.0"
[dependencies]
atty = "0.2.14" # Used for highlighting network errors
base64 = "0.13.0" # Used for integrity attributes
chrono = "0.4.20" # Used for formatting creation timestamp
clap = "3.2.16"
cssparser = "0.29.6"
encoding_rs = "0.8.31"
html5ever = "0.24.1"
percent-encoding = "2.1.0"
regex = "1.6.0" # Used for parsing srcset and NOSCRIPT
sha2 = "0.10.2" # Used for calculating checksums during integrity checks
url = "2.2.2"
[features]
default = ["vendored-openssl"]
# Compile and statically link a copy of OpenSSL.
vendored-openssl = ["openssl/vendored"]
[dependencies]
atty = "0.2.14" # Used for highlighting network errors
base64 = "0.22.1" # Used for integrity attributes
chrono = "0.4.38" # Used for formatting output timestamp
clap = "3.2.25" # Used for processing CLI arguments
cssparser = "0.34.0" # Used for dealing with CSS
encoding_rs = "0.8.34" # Used for parsing and converting document charsets
html5ever = "0.27.0" # Used for all things DOM
markup5ever_rcdom = "0.3.0" # Used for manipulating DOM
percent-encoding = "2.3.1" # Used for encoding URLs
sha2 = "0.10.8" # Used for calculating checksums during integrity checks
url = "2.5.2" # Used for parsing URLs
openssl = "0.10.64" # Used for static linking of the OpenSSL library
# Used for parsing srcset and NOSCRIPT
[dependencies.regex]
version = "1.10.6"
default-features = false
features = ["std", "perf-dfa", "unicode-perl"]
# Used for making network requests
[dependencies.reqwest]
version = "0.11.11"
version = "0.12.7"
default-features = false
features = ["default-tls", "blocking", "gzip", "brotli", "deflate"]
[dev-dependencies]
assert_cmd = "2.0.4"
assert_cmd = "2.0.16"

View File

@@ -1,4 +1,4 @@
FROM ekidd/rust-musl-builder as builder
FROM clux/muslrust:stable as builder
RUN curl -L -o monolith.tar.gz $(curl -s https://api.github.com/repos/y2z/monolith/releases/latest \
| grep "tarball_url.*\"," \
@@ -17,6 +17,6 @@ RUN apk update && \
apk add --no-cache openssl && \
rm -rf "/var/cache/apk/*"
COPY --from=builder /home/rust/.cargo/bin/monolith /usr/bin/monolith
COPY --from=builder /root/.cargo/bin/monolith /usr/bin/monolith
WORKDIR /tmp
ENTRYPOINT ["/usr/bin/monolith"]

View File

@@ -7,23 +7,30 @@ build:
@cargo build --locked
.PHONY: build
test: build
@cargo test --locked
@cargo fmt --all -- --check
.PHONY: test
lint:
@cargo fmt --all --
.PHONY: lint
clean:
@cargo clean
.PHONY: clean
install:
@cargo install --force --locked --path .
.PHONY: install
lint:
@cargo fmt --all --
.PHONY: lint
lint_check:
@cargo fmt --all -- --check
.PHONY: lint_check
test: build
@cargo test --locked
.PHONY: test
uninstall:
@cargo uninstall
.PHONY: uninstall
clean:
@cargo clean
update-lock-file:
@cargo update
.PHONY: clean

138
README.md
View File

@@ -24,7 +24,7 @@ If compared to saving websites with `wget -mpk`, this tool embeds all assets as
## Installation
#### Using [Cargo](https://crates.io/crates/monolith)
#### Using [Cargo](https://crates.io/crates/monolith) (cross-platform)
```console
cargo install monolith
@@ -36,6 +36,24 @@ cargo install monolith
brew install monolith
```
#### Via [Chocolatey](https://community.chocolatey.org/packages/monolith) (Windows)
```console
choco install monolith
```
#### Via [Scoop](https://scoop.sh/#/apps?q=monolith) (Windows)
```console
scoop install main/monolith
```
#### Via [Winget](https://winstall.app/apps/Y2Z.Monolith) (Windows)
```console
winget install --id=Y2Z.Monolith -e
```
#### Via [MacPorts](https://ports.macports.org/port/monolith/summary) (macOS)
```console
@@ -48,6 +66,42 @@ sudo port install monolith
snap install monolith
```
#### Using [Guix](https://packages.guix.gnu.org/packages/monolith) (GNU/Linux)
```console
guix install monolith
```
#### Using [NixPkgs](https://search.nixos.org/packages?channel=unstable&show=monolith&query=monolith)
```console
nix-env -iA nixpkgs.monolith
```
#### Using [Flox](https://flox.dev)
```console
flox install monolith
```
#### Using [Pacman](https://archlinux.org/packages/extra/x86_64/monolith) (Arch Linux)
```console
pacman -S monolith
```
#### Using [aports](https://pkgs.alpinelinux.org/packages?name=monolith) (Alpine Linux)
```console
apk add monolith
```
#### Using [XBPS Package Manager](https://voidlinux.org/packages/?q=monolith) (Void Linux)
```console
xbps-install -S monolith
```
#### Using [FreeBSD packages](https://svnweb.freebsd.org/ports/head/www/monolith/) (FreeBSD)
```console
@@ -71,13 +125,31 @@ make install clean
#### Using [containers](https://www.docker.com/)
```console
docker build -t Y2Z/monolith .
docker build -t y2z/monolith .
sudo install -b dist/run-in-container.sh /usr/local/bin/monolith
```
#### From [source](https://github.com/Y2Z/monolith)
Dependency: `libssl`
Dependencies: `libssl`, `cargo`
<details>
<summary>Install cargo (GNU/Linux)</summary>
Check if cargo is installed
```console
cargo -v
```
If cargo is not already installed, install and add it to your existing ```$PATH``` (paraphrasing the [official installation instructions](https://doc.rust-lang.org/cargo/getting-started/installation.html)):
```console
curl https://sh.rustup.rs -sSf | sh
. "$HOME/.cargo/env"
```
Proceed with installing from source:
</details>
```console
git clone https://github.com/Y2Z/monolith.git
@@ -100,7 +172,7 @@ monolith https://lyrics.github.io/db/P/Portishead/Dummy/Roads/ -o portishead-roa
```
```console
cat index.html | monolith -aIiFfcMv -b https://original.site/ - > result.html
cat some-site-page.html | monolith -aIiFfcMv -b https://some.site/ - > some-site-page-with-assets.html
```
@@ -110,12 +182,16 @@ cat index.html | monolith -aIiFfcMv -b https://original.site/ - > result.html
## Options
- `-a`: Exclude audio sources
- `-b`: Use custom `base URL`
- `-b`: Use `custom base URL`
- `-B`: Forbid retrieving assets from specified domain(s)
- `-c`: Exclude CSS
- `-C`: Save document using custom `charset`
- `-C`: Read cookies from `file`
- `-d`: Allow retrieving assets only from specified `domain(s)`
- `-e`: Ignore network errors
- `-E`: Save document using `custom encoding`
- `-f`: Omit frames
- `-F`: Exclude web fonts
- `-h`: Print help information
- `-i`: Remove images
- `-I`: Isolate the document
- `-j`: Exclude JavaScript
@@ -125,13 +201,42 @@ cat index.html | monolith -aIiFfcMv -b https://original.site/ - > result.html
- `-o`: Write output to `file` (use “-” for STDOUT)
- `-s`: Be quiet
- `-t`: Adjust `network request timeout`
- `-u`: Provide custom `User-Agent`
- `-u`: Provide `custom User-Agent`
- `-v`: Exclude videos
---------------------------------------------------
## Whitelisting and blacklisting domains
Options `-d` and `-B` provide control over what domains can be used to retrieve assets from, e.g.:
```console
monolith -I -d example.com -d www.example.com https://example.com -o example-only.html
```
```console
monolith -I -B -d .googleusercontent.com -d googleanalytics.com -d .google.com https://example.com -o example-no-ads.html
```
---------------------------------------------------
## Dynamic content
Monolith doesn't feature a JavaScript engine, hence websites that retrieve and display data after initial load may require usage of additional tools.
For example, Chromium (Chrome) can be used to act as a pre-processor for such pages:
```console
chromium --headless --window-size=1920,1080 --run-all-compositor-stages-before-draw --virtual-time-budget=9000 --incognito --dump-dom https://github.com | monolith - -I -b https://github.com -o github.html
```
---------------------------------------------------
## Proxies
Please set `https_proxy`, `http_proxy`, and `no_proxy` environment variables.
@@ -148,26 +253,7 @@ Please open an issue if something is wrong, that helps make this project better.
---------------------------------------------------
## Related projects
- Monolith Chrome Extension: https://github.com/rhysd/monolith-of-web
- Pagesaver: https://github.com/distributed-mind/pagesaver
- Personal WayBack Machine: https://github.com/popey/pwbm
- Hako: https://github.com/dmpop/hako
- Monk: https://github.com/monk-dev/monk
---------------------------------------------------
## License
To the extent possible under law, the author(s) have dedicated all copyright related and neighboring rights to this software to the public domain worldwide.
This software is distributed without any warranty.
---------------------------------------------------
<!-- Microtext -->
<sub>Keep in mind that `monolith` is not aware of your browser's session</sub>

2
dist/run-in-container.sh vendored Normal file → Executable file
View File

@@ -7,4 +7,4 @@ if which podman 2>&1 > /dev/null; then
DOCKER=podman
fi
$DOCKER run --rm Y2Z/$PROG_NAME "$@"
$DOCKER run --rm y2z/$PROG_NAME "$@"

View File

@@ -1,19 +0,0 @@
# 1. Record architecture decisions
Date: 2019-12-25
## Status
Accepted
## Context
We need to record the architectural decisions made on this project.
## Decision
We will use Architecture Decision Records, as [described by Michael Nygard](http://thinkrelevance.com/blog/2011/11/15/documenting-architecture-decisions).
## Consequences
See Michael Nygard's article, linked above. For a lightweight ADR toolset, see Nat Pryce's [adr-tools](https://github.com/npryce/adr-tools).

View File

@@ -1,19 +0,0 @@
# 2. NOSCRIPT nodes
Date: 2020-04-16
## Status
Accepted
## Context
HTML pages can contain `noscript` nodes, which reveal their contents only in case when JavaScript is not available. Most of the time they contain hidden messages that inform about certain JavaScript-dependent features not being operational, however sometimes can also feature media assets or even iframes.
## Decision
When the document is being saved with or without JavaScript, each `noscript` node should be preserved while its children need to be processed exactly the same way as the rest of the document. This approach will ensure that even hidden remote assets are embedded — since those hidden elements may have to be displayed later in a browser that has JavaScript turned off. An option should be available to "unwrap" all `noscript` nodes in order to make their contents always visible in the document, complimenting the "disable JS" function of the program.
## Consequences
Saved documents will have contents of all `noscript` nodes processed as if they are part of the document's DOM, therefore properly display images encapsulated within `noscript` nodes when being viewed in browsers that have JavaScript turned off (or have no JavaScript support in the first place). The new option to "unwrap" `noscript` elements will help the user ensure that the resulting document always represents what the original web page looked like in a browser that had JavaScript turned off.

View File

@@ -1,21 +0,0 @@
# 3. Network request timeout
Date: 2020-02-15
## Status
Accepted
## Context
A slow network connection and overloaded server may negatively impact network response time.
## Decision
Make the program simulate behavior of popular web browsers and CLI tools, where the default network response timeout is most often set to 120 seconds.
Instead of featuring retries for timed out network requests, the program should have an option to adjust the timeout length, along with making it indefinite when given "0" as its value.
## Consequences
The user is able to retrieve resources that have long response time, as well as obtain full control over how soon, and if at all, network requests should time out.

View File

@@ -1,21 +0,0 @@
# 4. Asset integrity check
Date: 2020-02-23
## Status
Accepted
## Context
In HTML5, `link` and `script` nodes have an attribute named `integrity`, which lets the browser check if the remote file is valid, mostly for the purpose of enhancing page security.
## Decision
In order to replicate the browser's behavior, the program should perform integrity check the same way it does, excluding the linked asset from the final result if such check fails.
The `integrity` attribute should be removed from nodes, as it bears no benefit for resources embedded as data URLs.
## Consequences
Assets that fail to pass the check get excluded from the saved document. Meanwhile, saved documents no longer contain integrity attributes on all `link` and `script` nodes.

View File

@@ -1,19 +0,0 @@
# 5. Asset Minimization
Date: 2020-03-14
## Status
Accepted
## Context
It may look like a good idea to make monolith compress retrieved assets while saving the page for the purpose of reducing the resulting document's file size.
## Decision
Given that the main purpose of this program is to save pages in a convenient to store and share manner — it's mostly an archiving tool, aside from being able to tell monolith to exclude certain types of assets (e.g. images, CSS, JavaScript), it would be outside of scope of this program to implement code for compressing assets. Minimizing files before embedding them does not reduce the amount of data that needs to be transferred either. A separate tool can be used later to compress and minimize pages saved by monolith, if needed.
## Consequences
Monolith will not support modification of original document assets for the purpose of reducing their size, sticking to performing only minimal amount of modifications to the original web page — whatever is needed to provide security or exclude unwanted asset types.

View File

@@ -1,19 +0,0 @@
# 6. Reload and location `meta` tags
Date: 2020-06-25
## Status
Accepted
## Context
HTML documents may contain `meta` tags capable of automatically refreshing the page or redirecting to another location.
## Decision
Since the resulting document is saved to disk and generally not intended to be served over the network, it only makes sense to remove `meta` tags that have `http-equiv` attribute equal to "Refresh" or "Location", in order to prevent them from reloading the page or redirecting to another location.
## Consequences
Monolith will ensure that saved documents do not contain `meta` tags capable of changing location or reloading the page.

View File

@@ -1,19 +0,0 @@
# 7. Network errors
Date: 2020-11-22
## Status
Accepted
## Context
Servers may return information with HTTP response codes other than `200`, however those responses may still contain useful data.
## Decision
Fail by default, notifying of the network error. Add option to continue retrieving assets by treating all response codes as `200`.
## Consequences
Monolith will fail to obtain resources with status other than `200`, unless told to ignore network errors.

View File

@@ -1,40 +0,0 @@
# 8. Base Tag
Date: 2020-12-25
## Status
Accepted
## Context
HTML documents may contain `base` tag, which influences resolution of anchor links and relative URLs as well as dynamically loaded resources.
Sometimes, in order to make certain saved documents function closer to how they operate while being served from a remote server, the `base` tag specifying the source page's URL may need to be added to the document.
There can be only one such tag. If multiple `base` tags are present, only the first encountered tag ends up being used.
## Decision
Adding the `base` tag should be optional — saved documents should not contain the `base` tag unless it was specified by the user, or the document originally had the `base` tag in it.
Existing `href` attribute's value of the original `base` tag should be used for resolving the document's relative links instead of document's own URL (precisely the way browsers do it).
## Consequences
#### If the base tag does not exist in the source document
- If the base tag does not exist in the source document
- With base URL option provided
- use the specified base URL value to retrieve assets, keep original base URL value in the document
- Without base URL option provided
- download document as usual, do not add base tag
- If the base tag already exists in the source document
- With base URL option provided
- we overwrite the original base URL before retrieving assets, keep new base URL value in the document
- Without base URL option provided:
- use the base URL from the original document to retrieve assets, keep original base URL value in the document
The program will obtain the ability to retrieve remote assets for non-remote sources (such as data URLs and local files).
The program will obtain the ability to get rid of existing base tag values (by providing an empty one).

View File

@@ -1,3 +0,0 @@
# References
- https://content-security-policy.com/

View File

@@ -1,23 +0,0 @@
# Web apps that can be saved with Monolith
These apps retain all or most of their functionality when saved with Monolith:
## Converse
| Website | https://conversejs.org |
|:-----------------------|:--------------------------------------------------------------------|
| Description | An XMPP client built using web technologies |
| Functionality retained | **full** |
| Command to use | `monolith https://conversejs.org/fullscreen.html > conversejs.html` |
| Monolith version used | 2.2.7 |
## Markdown Tables generator
| Website | https://www.tablesgenerator.com |
|:--------------------------|:-----------------------------------------------------------------------------------------------|
| Description | Tool for creating tables in extended Markdown format |
| Functionality retained | **full** |
| Command to use | `monolith -I https://www.tablesgenerator.com/markdown_tables -o markdown-table-generator.html` |
| Monolith version used | 2.6.1 |

View File

@@ -2,7 +2,7 @@
<package xmlns="http://schemas.microsoft.com/packaging/2015/06/nuspec.xsd">
<metadata>
<id>monolith</id>
<version>2.4.0</version>
<version>2.8.1</version>
<title>Monolith</title>
<authors>Sunshine, Mahdi Robatipoor, Emmanuel Delaborde, Emi Simpson, rhysd</authors>
<projectUrl>https://github.com/Y2Z/monolith</projectUrl>

119
src/cookies.rs Normal file
View File

@@ -0,0 +1,119 @@
use std::time::{SystemTime, UNIX_EPOCH};
use url::Url;
/// A single cookie entry, as read from one line of a cookie file.
///
/// The fields mirror the seven tab-separated columns consumed by
/// `parse_cookie_file_contents`, in the same order.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Cookie {
    /// Domain the cookie belongs to. A leading `.` combined with
    /// `include_subdomains` makes the cookie apply to subdomains as well.
    pub domain: String,
    /// Whether hosts under `domain` (not only `domain` itself) may match.
    pub include_subdomains: bool,
    /// Path prefix a URL's path must start with for the cookie to match.
    pub path: String,
    /// When `true`, the cookie never matches plain `http` URLs.
    pub https_only: bool,
    /// Expiration time in seconds since the Unix epoch; `0` marks a session
    /// cookie that is treated as never expiring.
    pub expires: u64,
    /// Cookie name.
    pub name: String,
    /// Cookie value.
    pub value: String,
}
/// Errors that can occur while parsing the contents of a cookie file.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CookieFileContentsParseError {
    /// The first line of the file is not one of the recognized header lines.
    InvalidHeader,
}

impl std::fmt::Display for CookieFileContentsParseError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            CookieFileContentsParseError::InvalidHeader => {
                f.write_str("cookie file does not start with a recognized header line")
            }
        }
    }
}

// Implemented so the error can be boxed as `dyn Error` or propagated with `?`
// by callers that use a generic error type.
impl std::error::Error for CookieFileContentsParseError {}
impl Cookie {
    /// Returns `true` if the cookie's expiration time lies in the past.
    ///
    /// A cookie whose `expires` is `0` is a session cookie and is never
    /// reported as expired.
    ///
    /// # Panics
    ///
    /// Panics if the system clock reports a time before the Unix epoch.
    pub fn is_expired(&self) -> bool {
        if self.expires == 0 {
            return false; // Session, never expires
        }

        let now_secs = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .expect("Time went backwards")
            .as_secs();

        self.expires < now_secs
    }

    /// Returns `true` if this cookie applies to `url`.
    ///
    /// The URL must parse, use the `http` or `https` scheme (`http` only when
    /// the cookie is not `https_only`), have a host that matches the cookie's
    /// domain, and have a path that starts with the cookie's path.
    pub fn matches_url(&self, url: &str) -> bool {
        let url = match Url::parse(url) {
            Ok(parsed) => parsed,
            Err(_) => return false,
        };

        // Check protocol scheme
        match url.scheme() {
            "http" => {
                if self.https_only {
                    return false;
                }
            }
            "https" => {}
            // Should never match URLs of protocols other than HTTP(S)
            _ => return false,
        }

        // Check host
        let url_host = match url.host_str() {
            Some(host) => host,
            None => return false,
        };
        if self.domain.starts_with('.') && self.include_subdomains {
            // Strip only the leading dot so that ".example.com" also matches
            // the bare host "example.com". The previous slice also cut off
            // the last character, which made that comparison always fail.
            let bare_domain = &self.domain[1..];
            let is_subdomain = url_host.to_lowercase().ends_with(self.domain.as_str());
            if !is_subdomain && !url_host.eq_ignore_ascii_case(bare_domain) {
                return false;
            }
        } else if !url_host.eq_ignore_ascii_case(&self.domain) {
            return false;
        }

        // Check path
        let url_path = url.path();
        url_path.eq_ignore_ascii_case(&self.path) || url_path.starts_with(self.path.as_str())
    }
}
/// Parses the textual contents of a cookie file into a list of [`Cookie`]s.
///
/// The first line must be either `# HTTP Cookie File` or
/// `# Netscape HTTP Cookie File`. Every following line that does not start
/// with `#` is expected to hold seven tab-separated fields: domain,
/// include-subdomains flag, path, HTTPS-only flag, expiration time, name and
/// value. The two flags are `true` only when spelled exactly `TRUE`, and the
/// domain is lowercased.
///
/// Lines with a different number of fields, or whose expiration field is not
/// a valid unsigned integer, are skipped rather than aborting the whole parse.
/// Empty input yields an empty list.
///
/// # Errors
///
/// Returns [`CookieFileContentsParseError::InvalidHeader`] if the first line
/// is not one of the recognized headers.
pub fn parse_cookie_file_contents(
    cookie_file_contents: &str,
) -> Result<Vec<Cookie>, CookieFileContentsParseError> {
    let mut lines = cookie_file_contents.lines();

    // Parsing first line
    if let Some(header) = lines.next() {
        if header != "# HTTP Cookie File" && header != "# Netscape HTTP Cookie File" {
            return Err(CookieFileContentsParseError::InvalidHeader);
        }
    }

    let mut cookies: Vec<Cookie> = Vec::new();

    for line in lines {
        // Ignore comment lines
        if line.starts_with('#') {
            continue;
        }

        // Attempt to parse values; anything that isn't exactly 7 fields is skipped
        let fields: Vec<&str> = line.split('\t').collect();
        let (domain, include_subdomains, path, https_only, expires, name, value) =
            match fields[..] {
                [a, b, c, d, e, f, g] => (a, b, c, d, e, f, g),
                _ => continue,
            };

        // The file is external input: a malformed expiry must not crash the program
        let expires = match expires.parse::<u64>() {
            Ok(seconds) => seconds,
            Err(_) => continue,
        };

        cookies.push(Cookie {
            domain: domain.to_lowercase(),
            include_subdomains: include_subdomains == "TRUE",
            path: path.to_string(),
            https_only: https_only == "TRUE",
            expires,
            name: name.to_string(),
            value: value.to_string(),
        });
    }

    Ok(cookies)
}

View File

@@ -36,7 +36,6 @@ pub fn embed_css(
document_url: &Url,
css: &str,
options: &Options,
depth: u32,
) -> String {
let mut input = ParserInput::new(&css);
let mut parser = Parser::new(&mut input);
@@ -47,7 +46,6 @@ pub fn embed_css(
document_url,
&mut parser,
options,
depth,
"",
"",
"",
@@ -81,15 +79,14 @@ pub fn process_css<'a>(
document_url: &Url,
parser: &mut Parser,
options: &Options,
depth: u32,
rule_name: &str,
prop_name: &str,
func_name: &str,
) -> Result<String, ParseError<'a, String>> {
let mut result: String = "".to_string();
let mut curr_rule: String = rule_name.clone().to_string();
let mut curr_prop: String = prop_name.clone().to_string();
let mut curr_rule: String = rule_name.to_string();
let mut curr_prop: String = prop_name.to_string();
let mut token: &Token;
let mut token_offset: SourcePosition;
@@ -135,7 +132,6 @@ pub fn process_css<'a>(
document_url,
parser,
options,
depth,
rule_name,
curr_prop.as_str(),
func_name,
@@ -190,14 +186,7 @@ pub fn process_css<'a>(
}
let import_full_url: Url = resolve_url(&document_url, value);
match retrieve_asset(
cache,
client,
&document_url,
&import_full_url,
options,
depth + 1,
) {
match retrieve_asset(cache, client, &document_url, &import_full_url, options) {
Ok((
import_contents,
import_final_url,
@@ -213,7 +202,6 @@ pub fn process_css<'a>(
&import_final_url,
&String::from_utf8_lossy(&import_contents),
options,
depth + 1,
)
.as_bytes(),
&import_final_url,
@@ -251,7 +239,6 @@ pub fn process_css<'a>(
&document_url,
&resolved_url,
options,
depth + 1,
) {
Ok((data, final_url, media_type, charset)) => {
let mut data_url =
@@ -341,14 +328,7 @@ pub fn process_css<'a>(
result.push_str("url(");
if is_import {
let full_url: Url = resolve_url(&document_url, value);
match retrieve_asset(
cache,
client,
&document_url,
&full_url,
options,
depth + 1,
) {
match retrieve_asset(cache, client, &document_url, &full_url, options) {
Ok((css, final_url, media_type, charset)) => {
let mut data_url = create_data_url(
&media_type,
@@ -359,7 +339,6 @@ pub fn process_css<'a>(
&final_url,
&String::from_utf8_lossy(&css),
options,
depth + 1,
)
.as_bytes(),
&final_url,
@@ -380,14 +359,7 @@ pub fn process_css<'a>(
result.push_str(format_quoted_string(EMPTY_IMAGE_DATA_URL).as_str());
} else {
let full_url: Url = resolve_url(&document_url, value);
match retrieve_asset(
cache,
client,
&document_url,
&full_url,
options,
depth + 1,
) {
match retrieve_asset(cache, client, &document_url, &full_url, options) {
Ok((data, final_url, media_type, charset)) => {
let mut data_url =
create_data_url(&media_type, &charset, &data, &final_url);
@@ -423,7 +395,6 @@ pub fn process_css<'a>(
document_url,
parser,
options,
depth,
curr_rule.as_str(),
curr_prop.as_str(),
function_name,

View File

@@ -1,13 +1,13 @@
use base64;
use base64::prelude::*;
use chrono::prelude::*;
use encoding_rs::Encoding;
use html5ever::interface::QualName;
use html5ever::parse_document;
use html5ever::rcdom::{Handle, NodeData, RcDom};
use html5ever::serialize::{serialize, SerializeOpts};
use html5ever::tendril::{format_tendril, TendrilSink};
use html5ever::tree_builder::{Attribute, TreeSink};
use html5ever::{local_name, namespace_url, ns, LocalName};
use markup5ever_rcdom::{Handle, NodeData, RcDom, SerializableHandle};
use regex::Regex;
use reqwest::blocking::Client;
use reqwest::Url;
@@ -30,10 +30,16 @@ struct SrcSetItem<'a> {
const ICON_VALUES: &'static [&str] = &["icon", "shortcut icon"];
const WHITESPACES: &'static [char] = &['\t', '\n', '\x0c', '\r', ' '];
pub fn add_favicon(document: &Handle, favicon_data_url: String) -> RcDom {
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, document, SerializeOpts::default())
.expect("unable to serialize DOM into buffer");
serialize(
&mut buf,
&SerializableHandle::from(document.clone()),
SerializeOpts::default(),
)
.expect("unable to serialize DOM into buffer");
let mut dom = html_to_dom(&buf, "utf-8".to_string());
let doc = dom.get_document();
@@ -65,15 +71,15 @@ pub fn check_integrity(data: &[u8], integrity: &str) -> bool {
if integrity.starts_with("sha256-") {
let mut hasher = Sha256::new();
hasher.update(data);
base64::encode(hasher.finalize()) == integrity[7..]
BASE64_STANDARD.encode(hasher.finalize()) == integrity[7..]
} else if integrity.starts_with("sha384-") {
let mut hasher = Sha384::new();
hasher.update(data);
base64::encode(hasher.finalize()) == integrity[7..]
BASE64_STANDARD.encode(hasher.finalize()) == integrity[7..]
} else if integrity.starts_with("sha512-") {
let mut hasher = Sha512::new();
hasher.update(data);
base64::encode(hasher.finalize()) == integrity[7..]
BASE64_STANDARD.encode(hasher.finalize()) == integrity[7..]
} else {
false
}
@@ -161,15 +167,44 @@ pub fn embed_srcset(
document_url: &Url,
srcset: &str,
options: &Options,
depth: u32,
) -> String {
let mut array: Vec<SrcSetItem> = vec![];
let re = Regex::new(r",\s+").unwrap();
for srcset_item in re.split(srcset) {
let parts: Vec<&str> = srcset_item.trim().split_whitespace().collect();
if parts.len() > 0 {
let path = parts[0].trim();
let descriptor = if parts.len() > 1 { parts[1].trim() } else { "" };
// Parse srcset attribute according to the specs
// https://html.spec.whatwg.org/multipage/images.html#srcset-attribute
let mut offset = 0;
let size = srcset.chars().count();
while offset < size {
let mut has_descriptor = true;
// Zero or more whitespaces + skip leading comma
let url_start = offset
+ srcset[offset..]
.chars()
.take_while(|&c| WHITESPACES.contains(&c) || c == ',')
.count();
if url_start >= size {
break;
}
// A valid non-empty URL that does not start or end with comma
let mut url_end = url_start
+ srcset[url_start..]
.chars()
.take_while(|&c| !WHITESPACES.contains(&c))
.count();
while (url_end - 1) > url_start && srcset.chars().nth(url_end - 1).unwrap() == ',' {
has_descriptor = false;
url_end -= 1;
}
offset = url_end;
// If the URL wasn't terminated by comma there may also be a descriptor
if has_descriptor {
offset += srcset[url_end..].chars().take_while(|&c| c != ',').count();
}
// Collect SrcSetItem
if url_end > url_start {
let path = &srcset[url_start..url_end];
let descriptor = &srcset[url_end..offset].trim();
let srcset_real_item = SrcSetItem { path, descriptor };
array.push(srcset_real_item);
}
@@ -182,14 +217,7 @@ pub fn embed_srcset(
result.push_str(EMPTY_IMAGE_DATA_URL);
} else {
let image_full_url: Url = resolve_url(&document_url, part.path);
match retrieve_asset(
cache,
client,
&document_url,
&image_full_url,
options,
depth + 1,
) {
match retrieve_asset(cache, client, &document_url, &image_full_url, options) {
Ok((image_data, image_final_url, image_media_type, image_charset)) => {
let mut image_data_url = create_data_url(
&image_media_type,
@@ -197,7 +225,7 @@ pub fn embed_srcset(
&image_data,
&image_final_url,
);
// Append retreved asset as a data URL
// Append retrieved asset as a data URL
image_data_url.set_fragment(image_full_url.fragment());
result.push_str(image_data_url.as_ref());
}
@@ -428,8 +456,12 @@ pub fn is_icon(attr_value: &str) -> bool {
pub fn set_base_url(document: &Handle, desired_base_href: String) -> RcDom {
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, document, SerializeOpts::default())
.expect("unable to serialize DOM into buffer");
serialize(
&mut buf,
&SerializableHandle::from(document.clone()),
SerializeOpts::default(),
)
.expect("unable to serialize DOM into buffer");
let mut dom = html_to_dom(&buf, "utf-8".to_string());
let doc = dom.get_document();
@@ -534,7 +566,7 @@ pub fn set_node_attr(node: &Handle, attr_name: &str, attr_value: Option<String>)
pub fn serialize_document(mut dom: RcDom, document_encoding: String, options: &Options) -> Vec<u8> {
let mut buf: Vec<u8> = Vec::new();
let doc = dom.get_document();
let document = dom.get_document();
if options.isolate
|| options.no_css
@@ -544,7 +576,7 @@ pub fn serialize_document(mut dom: RcDom, document_encoding: String, options: &O
|| options.no_images
{
// Take care of CSP
if let Some(html) = get_child_node_by_name(&doc, "html") {
if let Some(html) = get_child_node_by_name(&document, "html") {
if let Some(head) = get_child_node_by_name(&html, "head") {
let meta = dom.create_element(
QualName::new(None, ns!(), local_name!("meta")),
@@ -570,8 +602,12 @@ pub fn serialize_document(mut dom: RcDom, document_encoding: String, options: &O
}
}
serialize(&mut buf, &doc, SerializeOpts::default())
.expect("Unable to serialize DOM into buffer");
serialize(
&mut buf,
&SerializableHandle::from(document.clone()),
SerializeOpts::default(),
)
.expect("Unable to serialize DOM into buffer");
// Unwrap NOSCRIPT elements
if options.unwrap_noscript {
@@ -599,18 +635,10 @@ pub fn retrieve_and_embed_asset(
attr_name: &str,
attr_value: &str,
options: &Options,
depth: u32,
) {
let resolved_url: Url = resolve_url(document_url, attr_value.clone());
let resolved_url: Url = resolve_url(document_url, attr_value);
match retrieve_asset(
cache,
client,
&document_url.clone(),
&resolved_url,
options,
depth + 1,
) {
match retrieve_asset(cache, client, &document_url.clone(), &resolved_url, options) {
Ok((data, final_url, mut media_type, charset)) => {
let node_name: &str = get_node_name(&node).unwrap();
@@ -639,7 +667,7 @@ pub fn retrieve_and_embed_asset(
if node_name == "link" && determine_link_node_type(node) == "stylesheet" {
// Stylesheet LINK elements require special treatment
let css: String = embed_css(cache, client, &final_url, &s, options, depth + 1);
let css: String = embed_css(cache, client, &final_url, &s, options);
// Create and embed data URL
let css_data_url =
@@ -648,19 +676,12 @@ pub fn retrieve_and_embed_asset(
} else if node_name == "frame" || node_name == "iframe" {
// (I)FRAMEs are also quite different from conventional resources
let frame_dom = html_to_dom(&data, charset.clone());
walk_and_embed_assets(
cache,
client,
&final_url,
&frame_dom.document,
&options,
depth + 1,
);
walk_and_embed_assets(cache, client, &final_url, &frame_dom.document, &options);
let mut frame_data: Vec<u8> = Vec::new();
serialize(
&mut frame_data,
&frame_dom.document,
&SerializableHandle::from(frame_dom.document.clone()),
SerializeOpts::default(),
)
.unwrap();
@@ -710,13 +731,12 @@ pub fn walk_and_embed_assets(
document_url: &Url,
node: &Handle,
options: &Options,
depth: u32,
) {
match node.data {
NodeData::Document => {
// Dig deeper
for child in node.children.borrow().iter() {
walk_and_embed_assets(cache, client, &document_url, child, options, depth);
walk_and_embed_assets(cache, client, &document_url, child, options);
}
}
NodeData::Element {
@@ -751,7 +771,6 @@ pub fn walk_and_embed_assets(
"href",
&link_attr_href_value,
options,
depth,
);
} else {
set_node_attr(node, "href", None);
@@ -774,7 +793,6 @@ pub fn walk_and_embed_assets(
"href",
&link_attr_href_value,
options,
depth,
);
}
}
@@ -816,7 +834,6 @@ pub fn walk_and_embed_assets(
"background",
&body_attr_background_value,
options,
depth,
);
}
}
@@ -862,7 +879,6 @@ pub fn walk_and_embed_assets(
"src",
&img_full_url,
options,
depth,
);
}
}
@@ -870,14 +886,8 @@ pub fn walk_and_embed_assets(
// Resolve srcset attribute
if let Some(img_srcset) = get_node_attr(node, "srcset") {
if !img_srcset.is_empty() {
let resolved_srcset: String = embed_srcset(
cache,
client,
&document_url,
&img_srcset,
options,
depth,
);
let resolved_srcset: String =
embed_srcset(cache, client, &document_url, &img_srcset, options);
set_node_attr(node, "srcset", Some(resolved_srcset));
}
}
@@ -907,7 +917,6 @@ pub fn walk_and_embed_assets(
"src",
&input_attr_src_value,
options,
depth,
);
}
}
@@ -940,7 +949,6 @@ pub fn walk_and_embed_assets(
"href",
&image_href,
options,
depth,
);
}
}
@@ -961,7 +969,6 @@ pub fn walk_and_embed_assets(
"src",
&source_attr_src_value,
options,
depth,
);
}
} else if parent_node_name == "video" {
@@ -976,7 +983,6 @@ pub fn walk_and_embed_assets(
"src",
&source_attr_src_value,
options,
depth,
);
}
}
@@ -998,7 +1004,6 @@ pub fn walk_and_embed_assets(
&document_url,
&source_attr_srcset_value,
options,
depth,
);
set_node_attr(node, "srcset", Some(resolved_srcset));
}
@@ -1051,7 +1056,6 @@ pub fn walk_and_embed_assets(
"src",
&script_attr_src.unwrap_or_default(),
options,
depth,
);
}
}
@@ -1069,7 +1073,6 @@ pub fn walk_and_embed_assets(
&document_url,
tendril.as_ref(),
options,
depth,
);
tendril.clear();
tendril.push_slice(&replacement);
@@ -1101,7 +1104,6 @@ pub fn walk_and_embed_assets(
"src",
&frame_attr_src_value,
options,
depth,
);
}
}
@@ -1121,7 +1123,6 @@ pub fn walk_and_embed_assets(
"src",
&audio_attr_src_value,
options,
depth,
);
}
}
@@ -1140,7 +1141,6 @@ pub fn walk_and_embed_assets(
"src",
&video_attr_src_value,
options,
depth,
);
}
}
@@ -1164,7 +1164,6 @@ pub fn walk_and_embed_assets(
"poster",
&video_attr_poster_value,
options,
depth,
);
}
}
@@ -1188,7 +1187,6 @@ pub fn walk_and_embed_assets(
&document_url,
&noscript_contents_dom.document,
&options,
depth,
);
// Get rid of original contents
noscript_contents.clear();
@@ -1198,8 +1196,12 @@ pub fn walk_and_embed_assets(
{
if let Some(body) = get_child_node_by_name(&html, "body") {
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &body, SerializeOpts::default())
.expect("Unable to serialize DOM into buffer");
serialize(
&mut buf,
&SerializableHandle::from(body.clone()),
SerializeOpts::default(),
)
.expect("Unable to serialize DOM into buffer");
let result = String::from_utf8_lossy(&buf);
noscript_contents.push_slice(&result);
}
@@ -1225,7 +1227,6 @@ pub fn walk_and_embed_assets(
&document_url,
&node_attr_style_value,
options,
depth,
);
set_node_attr(node, "style", Some(embedded_style));
}
@@ -1249,7 +1250,7 @@ pub fn walk_and_embed_assets(
// Dig deeper
for child in node.children.borrow().iter() {
walk_and_embed_assets(cache, client, &document_url, child, options, depth);
walk_and_embed_assets(cache, client, &document_url, child, options);
}
}
_ => {

View File

@@ -1,3 +1,4 @@
pub mod cookies;
pub mod css;
pub mod html;
pub mod js;

View File

@@ -1,5 +1,5 @@
use encoding_rs::Encoding;
use html5ever::rcdom::RcDom;
use markup5ever_rcdom::RcDom;
use reqwest::blocking::Client;
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
use std::collections::HashMap;
@@ -10,6 +10,7 @@ use std::process;
use std::time::Duration;
use url::Url;
use monolith::cookies::parse_cookie_file_contents;
use monolith::html::{
add_favicon, create_metadata_tag, get_base_url, get_charset, has_favicon, html_to_dom,
serialize_document, set_base_url, set_charset, walk_and_embed_assets,
@@ -64,82 +65,102 @@ pub fn read_stdin() -> Vec<u8> {
}
fn main() {
let options = Options::from_args();
let mut target: String = options.target.clone();
let mut options = Options::from_args();
// Check if target was provided
if target.len() == 0 {
if options.target.len() == 0 {
if !options.silent {
eprintln!("No target specified");
}
process::exit(1);
}
// Check if custom charset is valid
if let Some(custom_charset) = options.charset.clone() {
if !Encoding::for_label_no_replacement(custom_charset.as_bytes()).is_some() {
eprintln!("Unknown encoding: {}", &custom_charset);
// Check if custom encoding is valid
if let Some(custom_encoding) = options.encoding.clone() {
if !Encoding::for_label_no_replacement(custom_encoding.as_bytes()).is_some() {
eprintln!("Unknown encoding: {}", &custom_encoding);
process::exit(1);
}
}
let target_url: Url;
let mut use_stdin: bool = false;
// Determine exact target URL
if target.clone() == "-" {
// Read from pipe (stdin)
use_stdin = true;
// Set default target URL to an empty data URL; the user can set it via --base-url
target_url = Url::parse("data:text/html,").unwrap();
} else {
match Url::parse(&target.clone()) {
Ok(parsed_url) => {
if parsed_url.scheme() == "data"
|| parsed_url.scheme() == "file"
|| (parsed_url.scheme() == "http" || parsed_url.scheme() == "https")
{
target_url = parsed_url;
} else {
let target_url = match options.target.as_str() {
"-" => {
// Read from pipe (stdin)
use_stdin = true;
// Set default target URL to an empty data URL; the user can set it via --base-url
Url::parse("data:text/html,").unwrap()
}
target => match Url::parse(&target) {
Ok(url) => match url.scheme() {
"data" | "file" | "http" | "https" => url,
unsupported_scheme => {
if !options.silent {
eprintln!("Unsupported target URL type: {}", &parsed_url.scheme());
eprintln!("Unsupported target URL type: {}", unsupported_scheme);
}
process::exit(1);
process::exit(1)
}
}
Err(_err) => {
},
Err(_) => {
// Failed to parse given target as a URL (perhaps it's a filesystem path?)
let path: &Path = Path::new(&target);
if path.exists() {
if path.is_file() {
match Url::from_file_path(fs::canonicalize(&path).unwrap()) {
Ok(file_url) => {
target_url = file_url;
}
Err(_err) => {
if !options.silent {
eprintln!(
"Could not generate file URL out of given path: {}",
"err"
);
match path.exists() {
true => match path.is_file() {
true => {
let canonical_path = fs::canonicalize(&path).unwrap();
match Url::from_file_path(canonical_path) {
Ok(url) => url,
Err(_) => {
if !options.silent {
eprintln!(
"Could not generate file URL out of given path: {}",
&target
);
}
process::exit(1);
}
process::exit(1);
}
}
} else {
if !options.silent {
eprintln!("Local target is not a file: {}", &options.target);
false => {
if !options.silent {
eprintln!("Local target is not a file: {}", &target);
}
process::exit(1);
}
process::exit(1);
},
false => {
// It is not a FS path, now we do what browsers do:
// prepend "http://" and hope it points to a website
Url::parse(&format!("http://{hopefully_url}", hopefully_url = &target))
.unwrap()
}
} else {
// Last chance, now we do what browsers do:
// prepend "http://" and hope it points to a website
target.insert_str(0, "http://");
target_url = Url::parse(&target).unwrap();
}
}
},
};
// Read and parse cookie file
if let Some(opt_cookie_file) = options.cookie_file.clone() {
match fs::read_to_string(opt_cookie_file) {
Ok(str) => match parse_cookie_file_contents(&str) {
Ok(cookies) => {
options.cookies = cookies;
// for c in &cookies {
// // if !cookie.is_expired() {
// // options.cookies.append(c);
// // }
// }
}
Err(_) => {
eprintln!("Could not parse specified cookie file");
process::exit(1);
}
},
Err(_) => {
eprintln!("Could not read specified cookie file");
process::exit(1);
}
}
}
@@ -177,14 +198,23 @@ fn main() {
|| (target_url.scheme() == "http" || target_url.scheme() == "https")
|| target_url.scheme() == "data"
{
match retrieve_asset(&mut cache, &client, &target_url, &target_url, &options, 0) {
match retrieve_asset(&mut cache, &client, &target_url, &target_url, &options) {
Ok((retrieved_data, final_url, media_type, charset)) => {
// Make sure the media type is text/html
if !media_type.eq_ignore_ascii_case("text/html") {
if !options.silent {
eprintln!("Unsupported document media type");
}
process::exit(1);
// Provide output as text without processing it, the way browsers do
if !media_type.eq_ignore_ascii_case("text/html")
&& !media_type.eq_ignore_ascii_case("application/xhtml+xml")
{
// Define output
let mut output =
Output::new(&options.output).expect("Could not prepare output");
// Write retrieved data into STDOUT or file
output
.write(&retrieved_data)
.expect("Could not write output");
// Nothing else to do past this point
process::exit(0);
}
if options
@@ -276,7 +306,7 @@ fn main() {
}
// Traverse through the document and embed remote assets
walk_and_embed_assets(&mut cache, &client, &base_url, &dom.document, &options, 0);
walk_and_embed_assets(&mut cache, &client, &base_url, &dom.document, &options);
// Update or add new BASE element to reroute network requests and hash-links
if let Some(new_base_url) = options.base_url.clone() {
@@ -290,14 +320,7 @@ fn main() {
{
let favicon_ico_url: Url = resolve_url(&base_url, "/favicon.ico");
match retrieve_asset(
&mut cache,
&client,
&target_url,
&favicon_ico_url,
&options,
0,
) {
match retrieve_asset(&mut cache, &client, &target_url, &favicon_ico_url, &options) {
Ok((data, final_url, media_type, charset)) => {
let favicon_data_url: Url =
create_data_url(&media_type, &charset, &data, &final_url);
@@ -310,8 +333,8 @@ fn main() {
}
// Save using specified charset, if given
if let Some(custom_charset) = options.charset.clone() {
document_encoding = custom_charset;
if let Some(custom_encoding) = options.encoding.clone() {
document_encoding = custom_encoding;
dom = set_charset(dom, document_encoding.clone());
}
@@ -328,6 +351,6 @@ fn main() {
// Define output
let mut output = Output::new(&options.output).expect("Could not prepare output");
// Write result into stdout or file
output.write(&result).expect("Could not write HTML output");
// Write result into STDOUT or file
output.write(&result).expect("Could not write output");
}

View File

@@ -1,13 +1,19 @@
use clap::{App, Arg};
use clap::{App, Arg, ArgAction};
use std::env;
use crate::cookies::Cookie;
#[derive(Default)]
pub struct Options {
pub no_audio: bool,
pub base_url: Option<String>,
pub blacklist_domains: bool,
pub no_css: bool,
pub charset: Option<String>,
pub cookie_file: Option<String>,
pub cookies: Vec<Cookie>,
pub domains: Option<Vec<String>>,
pub ignore_errors: bool,
pub encoding: Option<String>,
pub no_frames: bool,
pub no_fonts: bool,
pub no_images: bool,
@@ -46,28 +52,41 @@ impl Options {
.version(env!("CARGO_PKG_VERSION"))
.author(format!("\n{}\n\n", env!("CARGO_PKG_AUTHORS").replace(':', "\n")).as_str())
.about(format!("{}\n{}", ASCII, env!("CARGO_PKG_DESCRIPTION")).as_str())
.args_from_usage("-a, --no-audio 'Removes audio sources'")
.args_from_usage("-b, --base-url=[http://localhost/] 'Sets custom base URL'")
.args_from_usage("-c, --no-css 'Removes CSS'")
.args_from_usage("-C, --charset=[UTF-8] 'Enforces custom encoding'")
.args_from_usage("-a, --no-audio 'Remove audio sources'")
.args_from_usage("-b, --base-url=[http://localhost/] 'Set custom base URL'")
.args_from_usage(
"-B, --blacklist-domains 'Treat list of specified domains as blacklist'",
)
.args_from_usage("-c, --no-css 'Remove CSS'")
.args_from_usage("-C, --cookies=[cookies.txt] 'Specify cookie file'")
.arg(
Arg::with_name("domains")
.short('d')
.long("domain")
.takes_value(true)
.value_name("example.com")
.action(ArgAction::Append)
.help("Specify domains to use for white/black-listing"),
)
.args_from_usage("-e, --ignore-errors 'Ignore network errors'")
.args_from_usage("-f, --no-frames 'Removes frames and iframes'")
.args_from_usage("-F, --no-fonts 'Removes fonts'")
.args_from_usage("-i, --no-images 'Removes images'")
.args_from_usage("-I, --isolate 'Cuts off document from the Internet'")
.args_from_usage("-j, --no-js 'Removes JavaScript'")
.args_from_usage("-k, --insecure 'Allows invalid X.509 (TLS) certificates'")
.args_from_usage("-M, --no-metadata 'Excludes timestamp and source information'")
.args_from_usage("-E, --encoding=[UTF-8] 'Enforce custom charset'")
.args_from_usage("-f, --no-frames 'Remove frames and iframes'")
.args_from_usage("-F, --no-fonts 'Remove fonts'")
.args_from_usage("-i, --no-images 'Remove images'")
.args_from_usage("-I, --isolate 'Cut off document from the Internet'")
.args_from_usage("-j, --no-js 'Remove JavaScript'")
.args_from_usage("-k, --insecure 'Allow invalid X.509 (TLS) certificates'")
.args_from_usage("-M, --no-metadata 'Exclude timestamp and source information'")
.args_from_usage(
"-n, --unwrap-noscript 'Replaces NOSCRIPT elements with their contents'",
"-n, --unwrap-noscript 'Replace NOSCRIPT elements with their contents'",
)
.args_from_usage(
"-o, --output=[document.html] 'Writes output to <file>, use - for STDOUT'",
"-o, --output=[document.html] 'Write output to <file>, use - for STDOUT'",
)
.args_from_usage("-s, --silent 'Suppresses verbosity'")
.args_from_usage("-t, --timeout=[60] 'Adjusts network request timeout'")
.args_from_usage("-u, --user-agent=[Firefox] 'Sets custom User-Agent string'")
.args_from_usage("-v, --no-video 'Removes video sources'")
.args_from_usage("-s, --silent 'Suppress verbosity'")
.args_from_usage("-t, --timeout=[60] 'Adjust network request timeout'")
.args_from_usage("-u, --user-agent=[Firefox] 'Set custom User-Agent string'")
.args_from_usage("-v, --no-video 'Remove video sources'")
.arg(
Arg::with_name("target")
.required(true)
@@ -87,9 +106,17 @@ impl Options {
if let Some(base_url) = app.value_of("base-url") {
options.base_url = Some(base_url.to_string());
}
options.blacklist_domains = app.is_present("blacklist-domains");
options.no_css = app.is_present("no-css");
if let Some(charset) = app.value_of("charset") {
options.charset = Some(charset.to_string());
if let Some(cookie_file) = app.value_of("cookies") {
options.cookie_file = Some(cookie_file.to_string());
}
if let Some(encoding) = app.value_of("encoding") {
options.encoding = Some(encoding.to_string());
}
if let Some(domains) = app.get_many::<String>("domains") {
let list_of_domains: Vec<String> = domains.map(|v| v.clone()).collect::<Vec<_>>();
options.domains = Some(list_of_domains);
}
options.ignore_errors = app.is_present("ignore-errors");
options.no_frames = app.is_present("no-frames");

View File

@@ -1,4 +1,4 @@
use base64;
use base64::prelude::*;
use percent_encoding::percent_decode_str;
use url::Url;
@@ -33,7 +33,15 @@ pub fn create_data_url(media_type: &str, charset: &str, data: &[u8], final_asset
"".to_string()
};
data_url.set_path(format!("{}{};base64,{}", media_type, c, base64::encode(data)).as_str());
data_url.set_path(
format!(
"{}{};base64,{}",
media_type,
c,
BASE64_STANDARD.encode(data)
)
.as_str(),
);
data_url
}
@@ -63,7 +71,7 @@ pub fn parse_data_url(url: &Url) -> (String, String, Vec<u8>) {
// Parse raw data into vector of bytes
let text: String = percent_decode_str(&data).decode_utf8_lossy().to_string();
let blob: Vec<u8> = if is_base64 {
base64::decode(&text).unwrap_or(vec![])
BASE64_STANDARD.decode(&text).unwrap_or(vec![])
} else {
text.as_bytes().to_vec()
};
@@ -71,6 +79,17 @@ pub fn parse_data_url(url: &Url) -> (String, String, Vec<u8>) {
(media_type, charset, blob)
}
/// Builds the URL to send in the HTTP `Referer` header for a request that
/// originates from `url`.
///
/// Spec: https://httpwg.org/specs/rfc9110.html#field.referer
/// The value must not include the fragment and userinfo components of the
/// URI, so both are stripped from the returned URL.
pub fn get_referer_url(mut url: Url) -> Url {
    url.set_fragment(None);
    // The userinfo setters report an error for URLs that cannot carry
    // credentials at all (no host / cannot-be-a-base); in that case there is
    // nothing to strip, so the error is deliberately ignored instead of
    // panicking.
    let _ = url.set_username("");
    let _ = url.set_password(None);
    url
}
pub fn resolve_url(from: &Url, to: &str) -> Url {
match Url::parse(&to) {
Ok(parsed_url) => parsed_url,

View File

@@ -1,12 +1,12 @@
use reqwest::blocking::Client;
use reqwest::header::CONTENT_TYPE;
use reqwest::header::{HeaderMap, HeaderValue, CONTENT_TYPE, COOKIE, REFERER};
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
use url::Url;
use crate::opts::Options;
use crate::url::{clean_url, parse_data_url};
use crate::url::{clean_url, get_referer_url, parse_data_url};
const ANSI_COLOR_RED: &'static str = "\x1b[31m";
const ANSI_COLOR_RESET: &'static str = "\x1b[0m";
@@ -92,16 +92,60 @@ pub fn detect_media_type_by_file_name(filename: &str) -> String {
mime.to_string()
}
pub fn indent(level: u32) -> String {
let mut result: String = String::new();
let mut l: u32 = level;
while l > 0 {
result += " ";
l -= 1;
/// Tells whether `domain` is covered by `domain_to_match_against`.
///
/// Both arguments are split into dot-separated labels and compared from the
/// right-most label (the TLD) leftwards, ignoring ASCII case and any trailing
/// dots.
///
/// * An empty `domain_to_match_against` never matches.
/// * A lone `"."` matches every domain.
/// * A pattern with a leading dot (e.g. `".example.com"`) matches the domain
///   itself as well as any of its subdomains.
/// * A pattern without a leading dot (e.g. `"example.com"`) matches only when
///   `domain` has no additional labels to the left of the pattern.
/// * An empty label inside the pattern matches any label at that position.
pub fn domain_is_within_domain(domain: &str, domain_to_match_against: &str) -> bool {
    if domain_to_match_against.is_empty() {
        return false;
    }

    if domain_to_match_against == "." {
        return true;
    }

    // Walk both names label by label, right to left
    let mut domain_labels = domain.trim_end_matches('.').rsplit('.');
    let pattern_labels = domain_to_match_against.trim_end_matches('.').rsplit('.');

    for pattern_label in pattern_labels {
        // A domain that ran out of labels is compared as an empty label
        let domain_label = domain_labels.next().unwrap_or("");

        if !pattern_label.is_empty() && !pattern_label.eq_ignore_ascii_case(domain_label) {
            return false;
        }
    }

    // Any labels left over in `domain` are subdomains of the pattern; those
    // are only acceptable when the pattern started with a dot
    domain_to_match_against.starts_with('.') || domain_labels.next().is_none()
}
pub fn is_plaintext_media_type(media_type: &str) -> bool {
@@ -142,18 +186,16 @@ pub fn retrieve_asset(
parent_url: &Url,
url: &Url,
options: &Options,
depth: u32,
) -> Result<(Vec<u8>, Url, String, String), reqwest::Error> {
if url.scheme() == "data" {
let (media_type, charset, data) = parse_data_url(url);
Ok((data, url.clone(), media_type, charset))
} else if url.scheme() == "file" {
// Check if parent_url is also file:/// (if not, then we don't embed the asset)
// Check if parent_url is also a file: URL (if not, then we don't embed the asset)
if parent_url.scheme() != "file" {
if !options.silent {
eprintln!(
"{}{}{} ({}){}",
indent(depth).as_str(),
"{}{} ({}){}",
if options.no_color { "" } else { ANSI_COLOR_RED },
&url,
"Security Error",
@@ -174,8 +216,7 @@ pub fn retrieve_asset(
if path.is_dir() {
if !options.silent {
eprintln!(
"{}{}{} (is a directory){}",
indent(depth).as_str(),
"{}{} (is a directory){}",
if options.no_color { "" } else { ANSI_COLOR_RED },
&url,
if options.no_color {
@@ -190,7 +231,7 @@ pub fn retrieve_asset(
Err(client.get("").send().unwrap_err())
} else {
if !options.silent {
eprintln!("{}{}", indent(depth).as_str(), &url);
eprintln!("{}", &url);
}
let file_blob: Vec<u8> = fs::read(&path).expect("Unable to read file");
@@ -205,8 +246,7 @@ pub fn retrieve_asset(
} else {
if !options.silent {
eprintln!(
"{}{}{} (not found){}",
indent(depth).as_str(),
"{}{} (not found){}",
if options.no_color { "" } else { ANSI_COLOR_RED },
&url,
if options.no_color {
@@ -226,7 +266,7 @@ pub fn retrieve_asset(
if cache.contains_key(&cache_key) {
// URL is in cache, we get and return it
if !options.silent {
eprintln!("{}{} (from cache)", indent(depth).as_str(), &url);
eprintln!("{} (from cache)", &url);
}
Ok((
@@ -236,14 +276,41 @@ pub fn retrieve_asset(
"".to_string(),
))
} else {
if let Some(domains) = &options.domains {
let domain_matches = domains
.iter()
.any(|d| domain_is_within_domain(url.host_str().unwrap(), &d.trim()));
if (options.blacklist_domains && domain_matches)
|| (!options.blacklist_domains && !domain_matches)
{
return Err(client.get("").send().unwrap_err());
}
}
// URL not in cache, we retrieve the file
match client.get(url.as_str()).send() {
let mut headers = HeaderMap::new();
if options.cookies.len() > 0 {
for cookie in &options.cookies {
if !cookie.is_expired() && cookie.matches_url(url.as_str()) {
let cookie_header_value: String = cookie.name.clone() + "=" + &cookie.value;
headers
.insert(COOKIE, HeaderValue::from_str(&cookie_header_value).unwrap());
}
}
}
// Add referer header for page resource requests
if ["https", "http"].contains(&parent_url.scheme()) && parent_url != url {
headers.insert(
REFERER,
HeaderValue::from_str(get_referer_url(parent_url.clone()).as_str()).unwrap(),
);
}
match client.get(url.as_str()).headers(headers).send() {
Ok(response) => {
if !options.ignore_errors && response.status() != reqwest::StatusCode::OK {
if !options.silent {
eprintln!(
"{}{}{} ({}){}",
indent(depth).as_str(),
"{}{} ({}){}",
if options.no_color { "" } else { ANSI_COLOR_RED },
&url,
response.status(),
@@ -262,9 +329,9 @@ pub fn retrieve_asset(
if !options.silent {
if url.as_str() == response_url.as_str() {
eprintln!("{}{}", indent(depth).as_str(), &url);
eprintln!("{}", &url);
} else {
eprintln!("{}{} -> {}", indent(depth).as_str(), &url, &response_url);
eprintln!("{} -> {}", &url, &response_url);
}
}
@@ -288,8 +355,7 @@ pub fn retrieve_asset(
Err(error) => {
if !options.silent {
eprintln!(
"{}{}{}{}",
indent(depth).as_str(),
"{}{}{}",
if options.no_color { "" } else { ANSI_COLOR_RED },
error,
if options.no_color {
@@ -311,8 +377,7 @@ pub fn retrieve_asset(
Err(error) => {
if !options.silent {
eprintln!(
"{}{}{} ({}){}",
indent(depth).as_str(),
"{}{} ({}){}",
if options.no_color { "" } else { ANSI_COLOR_RED },
&url,
error,

View File

@@ -90,9 +90,9 @@ mod passing {
String::from_utf8_lossy(&out.stderr),
format!(
"\
{file_url_html}\n \
{file_url_css}\n \
{file_url_css}\n \
{file_url_html}\n\
{file_url_css}\n\
{file_url_css}\n\
{file_url_css}\n\
",
file_url_html = Url::from_file_path(fs::canonicalize(&path_html).unwrap()).unwrap(),

View File

@@ -196,17 +196,14 @@ mod failing {
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
let out = cmd.arg("data:,Hello%2C%20World!").output().unwrap();
// STDERR should contain error description
assert_eq!(
String::from_utf8_lossy(&out.stderr),
"Unsupported document media type\n"
);
// STDERR should be empty
assert_eq!(String::from_utf8_lossy(&out.stderr), "");
// STDOUT should contain HTML
assert_eq!(String::from_utf8_lossy(&out.stdout), "");
// STDOUT should contain text
assert_eq!(String::from_utf8_lossy(&out.stdout), "Hello, World!\n");
// Exit code should be 1
out.assert().code(1);
// Exit code should be 0
out.assert().code(0);
}
#[test]
@@ -221,7 +218,7 @@ mod failing {
// STDERR should be empty
assert_eq!(String::from_utf8_lossy(&out.stderr), "");
// STDOUT should contain HTML with no JS in it
// STDOUT should contain HTML without contents of local JS file
assert_eq!(
String::from_utf8_lossy(&out.stdout),
"<html><head><script src=\"data:application/javascript;base64,\"></script></head><body></body></html>\n"

View File

@@ -39,10 +39,10 @@ mod passing {
String::from_utf8_lossy(&out.stderr),
format!(
"\
{file}{cwd}/tests/_data_/basic/local-file.html\n \
{file}{cwd}/tests/_data_/basic/local-style.css\n \
{file}{cwd}/tests/_data_/basic/local-style-does-not-exist.css (not found)\n \
{file}{cwd}/tests/_data_/basic/monolith.png (not found)\n \
{file}{cwd}/tests/_data_/basic/local-file.html\n\
{file}{cwd}/tests/_data_/basic/local-style.css\n\
{file}{cwd}/tests/_data_/basic/local-style-does-not-exist.css (not found)\n\
{file}{cwd}/tests/_data_/basic/monolith.png (not found)\n\
{file}{cwd}/tests/_data_/basic/local-script.js\n\
",
file = file_url_protocol,
@@ -185,7 +185,7 @@ mod passing {
String::from_utf8_lossy(&out.stderr),
format!(
"\
{file_url_html}\n \
{file_url_html}\n\
{file_url_svg}\n\
",
file_url_html = Url::from_file_path(fs::canonicalize(&path_html).unwrap()).unwrap(),
@@ -236,10 +236,10 @@ mod passing {
String::from_utf8_lossy(&out.stderr),
format!(
"\
{file}{cwd}/tests/_data_/integrity/index.html\n \
{file}{cwd}/tests/_data_/integrity/style.css\n \
{file}{cwd}/tests/_data_/integrity/style.css\n \
{file}{cwd}/tests/_data_/integrity/script.js\n \
{file}{cwd}/tests/_data_/integrity/index.html\n\
{file}{cwd}/tests/_data_/integrity/style.css\n\
{file}{cwd}/tests/_data_/integrity/style.css\n\
{file}{cwd}/tests/_data_/integrity/script.js\n\
{file}{cwd}/tests/_data_/integrity/script.js\n\
",
file = file_url_protocol,

View File

@@ -27,7 +27,7 @@ mod passing {
String::from_utf8_lossy(&out.stderr),
format!(
"\
{file_url_html}\n \
{file_url_html}\n\
{file_url_svg}\n\
",
file_url_html = Url::from_file_path(fs::canonicalize(&path_html).unwrap()).unwrap(),
@@ -58,7 +58,7 @@ mod passing {
String::from_utf8_lossy(&out.stderr),
format!(
"\
{file_url_html}\n \
{file_url_html}\n\
{file_url_svg}\n\
",
file_url_html = Url::from_file_path(fs::canonicalize(&path_html).unwrap()).unwrap(),
@@ -89,7 +89,7 @@ mod passing {
String::from_utf8_lossy(&out.stderr),
format!(
"\
{file_url_html}\n \
{file_url_html}\n\
{file_url_svg}\n\
",
file_url_html = Url::from_file_path(fs::canonicalize(&path_html).unwrap()).unwrap(),
@@ -120,7 +120,7 @@ mod passing {
String::from_utf8_lossy(&out.stderr),
format!(
"\
{file_url_html}\n \
{file_url_html}\n\
{file_url_svg}\n\
",
file_url_html = Url::from_file_path(fs::canonicalize(&path_html).unwrap()).unwrap(),

View File

@@ -38,7 +38,7 @@ mod passing {
)
);
// STDOUT should contain original document without any modificatons
// STDOUT should contain original document without any modifications
let s: String;
if let Some(encoding) = Encoding::for_label(b"gb2312") {
let (string, _, _) = encoding.decode(&out.stdout);
@@ -115,7 +115,7 @@ mod passing {
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
let out = cmd
.arg("-M")
.arg("-C")
.arg("-E")
.arg("utf8")
.arg(format!(
"tests{s}_data_{s}unusual_encodings{s}gb2312.html",
@@ -135,7 +135,7 @@ mod passing {
)
);
// STDOUT should contain original document without any modificatons
// STDOUT should contain original document without any modifications
assert_eq!(
String::from_utf8_lossy(&out.stdout).to_string(),
"<html>\
@@ -158,7 +158,7 @@ mod passing {
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
let out = cmd
.arg("-M")
.arg("-C")
.arg("-E")
.arg("utf0")
.arg(format!(
"tests{s}_data_{s}unusual_encodings{s}gb2312.html",

View File

@@ -0,0 +1,68 @@
// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗
// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝
// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗
// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod passing {
    use monolith::cookies::Cookie;

    /// Builds the cookie shared by the tests in this module; its `expires`
    /// field starts out as 0.
    fn localhost_cookie() -> Cookie {
        Cookie {
            name: String::new(),
            value: String::new(),
            domain: "127.0.0.1".to_string(),
            path: "/".to_string(),
            include_subdomains: true,
            https_only: false,
            expires: 0,
        }
    }

    /// A cookie whose `expires` is 0 must not be reported as expired.
    #[test]
    fn never_expires() {
        let session_cookie = localhost_cookie();

        assert!(!session_cookie.is_expired());
    }

    /// A cookie whose `expires` is 9999999999 must not be reported as expired.
    #[test]
    fn expires_long_from_now() {
        let mut far_future_cookie = localhost_cookie();
        far_future_cookie.expires = 9999999999;

        assert!(!far_future_cookie.is_expired());
    }
}
// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗
// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝
// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗
// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod failing {
    use monolith::cookies::Cookie;

    /// A cookie whose `expires` is 1 must be reported as expired.
    #[test]
    fn expired() {
        let stale_cookie = Cookie {
            name: String::new(),
            value: String::new(),
            domain: "127.0.0.1".to_string(),
            path: "/".to_string(),
            include_subdomains: true,
            https_only: false,
            expires: 1,
        };

        let is_stale = stale_cookie.is_expired();

        assert!(is_stale);
    }
}

View File

@@ -0,0 +1,107 @@
// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗
// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝
// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗
// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod passing {
    use monolith::cookies::Cookie;

    /// Builds a cookie for `domain` with the settings every test in this
    /// module shares: subdomains included, path `/`, `expires` 0, and an
    /// empty name and value.
    fn cookie_for(domain: &str, https_only: bool) -> Cookie {
        Cookie {
            name: String::new(),
            value: String::new(),
            domain: domain.to_string(),
            path: "/".to_string(),
            include_subdomains: true,
            https_only,
            expires: 0,
        }
    }

    /// An HTTPS-only cookie matches an `https://` URL on its own host.
    #[test]
    fn secure_url() {
        let secure_cookie = cookie_for("127.0.0.1", true);

        assert!(secure_cookie.matches_url("https://127.0.0.1/something"));
    }

    /// A cookie that is not HTTPS-only matches a plain `http://` URL on its own host.
    #[test]
    fn non_secure_url() {
        let plain_cookie = cookie_for("127.0.0.1", false);

        assert!(plain_cookie.matches_url("http://127.0.0.1/something"));
    }

    /// A dotted-domain cookie that includes subdomains matches a URL on a subdomain.
    #[test]
    fn subdomain() {
        let wildcard_cookie = cookie_for(".somethingsomething.com", true);

        assert!(wildcard_cookie.matches_url("https://cdn.somethingsomething.com/something"));
    }
}
// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗
// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝
// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗
// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod failing {
    use monolith::cookies::Cookie;

    /// Builds a cookie for host `127.0.0.1` that is not HTTPS-only, has
    /// `expires` 0, and an empty name and value.
    fn localhost_cookie(include_subdomains: bool, path: &str) -> Cookie {
        Cookie {
            domain: String::from("127.0.0.1"),
            include_subdomains,
            path: String::from(path),
            https_only: false,
            expires: 0,
            name: String::new(),
            value: String::new(),
        }
    }

    /// An empty string must never match a cookie.
    #[test]
    fn empty_url() {
        let cookie = localhost_cookie(true, "/");

        assert!(!cookie.matches_url(""));
    }

    /// A URL on a different host must not match.
    #[test]
    fn wrong_hostname() {
        let cookie = localhost_cookie(true, "/");

        assert!(!cookie.matches_url("http://0.0.0.0/"));
    }

    /// A URL on the cookie's own host whose path lies outside the cookie's
    /// path must not match.
    #[test]
    fn wrong_path() {
        // The host is deliberately the cookie's own so that the path is the
        // only thing that can cause the mismatch. The earlier version used
        // cookie path "/" against host 0.0.0.0, which could only fail on the
        // hostname and therefore never exercised path matching.
        let cookie = localhost_cookie(false, "/something");

        assert!(!cookie.matches_url("http://127.0.0.1/path"));
    }
}

View File

@@ -0,0 +1,2 @@
mod is_expired;
mod matches_url;

2
tests/cookies/mod.rs Normal file
View File

@@ -0,0 +1,2 @@
mod cookie;
mod parse_cookie_file_contents;

View File

@@ -0,0 +1,87 @@
// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗
// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝
// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗
// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod passing {
    use monolith::cookies;

    /// A header line followed by one tab-separated record yields exactly one
    /// cookie with every field populated from that record.
    #[test]
    fn parse_file() {
        let file_contents =
            "# Netscape HTTP Cookie File\n127.0.0.1\tFALSE\t/\tFALSE\t0\tUSER_TOKEN\tin";

        let result = cookies::parse_cookie_file_contents(file_contents).unwrap();

        assert_eq!(result.len(), 1);

        assert_eq!(result[0].domain, "127.0.0.1");
        assert!(!result[0].include_subdomains);
        assert_eq!(result[0].path, "/");
        assert!(!result[0].https_only);
        assert_eq!(result[0].expires, 0);
        assert_eq!(result[0].name, "USER_TOKEN");
        assert_eq!(result[0].value, "in");
    }

    /// Two records followed by trailing blank lines yield exactly two
    /// cookies, in file order.
    #[test]
    fn parse_multiline_file() {
        let file_contents = "# HTTP Cookie File\n127.0.0.1\tFALSE\t/\tFALSE\t0\tUSER_TOKEN\tin\n127.0.0.1\tTRUE\t/\tTRUE\t9\tUSER_TOKEN\tout\n\n";

        let result = cookies::parse_cookie_file_contents(file_contents).unwrap();

        assert_eq!(result.len(), 2);

        // First record: both flags FALSE.
        assert_eq!(result[0].domain, "127.0.0.1");
        assert!(!result[0].include_subdomains);
        assert_eq!(result[0].path, "/");
        assert!(!result[0].https_only);
        assert_eq!(result[0].expires, 0);
        assert_eq!(result[0].name, "USER_TOKEN");
        assert_eq!(result[0].value, "in");

        // Second record: both flags TRUE.
        assert_eq!(result[1].domain, "127.0.0.1");
        assert!(result[1].include_subdomains);
        assert_eq!(result[1].path, "/");
        assert!(result[1].https_only);
        assert_eq!(result[1].expires, 9);
        assert_eq!(result[1].name, "USER_TOKEN");
        assert_eq!(result[1].value, "out");
    }
}
// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗
// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝
// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗
// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod failing {
    use monolith::cookies;

    /// Empty input parses successfully but produces no cookies.
    #[test]
    fn empty() {
        let result = cookies::parse_cookie_file_contents("").unwrap();

        assert!(result.is_empty());
    }

    /// Input that does not begin with a header line is rejected with an error.
    #[test]
    fn no_header() {
        let file_contents = "127.0.0.1 FALSE / FALSE 0 USER_TOKEN in";

        assert!(cookies::parse_cookie_file_contents(file_contents).is_err());
    }

    /// A record whose fields are separated by spaces rather than tabs is not
    /// turned into a cookie: parsing succeeds but yields nothing.
    #[test]
    fn spaces_instead_of_tabs() {
        let file_contents =
            "# HTTP Cookie File\n127.0.0.1 FALSE / FALSE 0 USER_TOKEN in";

        let result = cookies::parse_cookie_file_contents(file_contents).unwrap();

        assert!(result.is_empty());
    }
}

View File

@@ -23,7 +23,7 @@ mod passing {
let options = Options::default();
assert_eq!(
css::embed_css(cache, &client, &document_url, "", &options, 0),
css::embed_css(cache, &client, &document_url, "", &options),
""
);
}
@@ -36,7 +36,7 @@ mod passing {
let options = Options::default();
assert_eq!(
css::embed_css(cache, &client, &document_url, "\t \t ", &options, 0,),
css::embed_css(cache, &client, &document_url, "\t \t ", &options),
""
);
}
@@ -59,7 +59,7 @@ mod passing {
height: calc(100vh - 10pt)";
assert_eq!(
css::embed_css(cache, &client, &document_url, &STYLE, &options, 0,),
css::embed_css(cache, &client, &document_url, &STYLE, &options),
format!(
"/* border: none;*/\
background-image: url(\"{empty_image}\"); \
@@ -91,7 +91,7 @@ mod passing {
height: calc(100vh - 10pt)";
assert_eq!(
css::embed_css(cache, &client, &document_url, &STYLE, &options, 0),
css::embed_css(cache, &client, &document_url, &STYLE, &options),
format!(
"/* border: none;*/\
background-image: url(\"{empty_image}\"); \
@@ -122,7 +122,7 @@ mod passing {
html > body {}";
assert_eq!(
css::embed_css(cache, &client, &document_url, &CSS, &options, 0),
css::embed_css(cache, &client, &document_url, &CSS, &options),
CSS
);
}
@@ -166,7 +166,7 @@ mod passing {
";
assert_eq!(
css::embed_css(cache, &client, &document_url, &CSS, &options, 0),
css::embed_css(cache, &client, &document_url, &CSS, &options),
CSS
);
}
@@ -188,7 +188,7 @@ mod passing {
";
assert_eq!(
css::embed_css(cache, &client, &document_url, &CSS, &options, 0,),
css::embed_css(cache, &client, &document_url, &CSS, &options),
"\
@charset \"UTF-8\";\n\
\n\
@@ -218,7 +218,7 @@ mod passing {
";
assert_eq!(
css::embed_css(cache, &client, &document_url, &CSS, &options, 0,),
css::embed_css(cache, &client, &document_url, &CSS, &options),
CSS
);
}
@@ -240,7 +240,7 @@ mod passing {
";
assert_eq!(
css::embed_css(cache, &client, &document_url, &CSS, &options, 0,),
css::embed_css(cache, &client, &document_url, &CSS, &options),
CSS
);
}
@@ -264,7 +264,7 @@ mod passing {
";
assert_eq!(
css::embed_css(cache, &client, &document_url, &CSS, &options, 0,),
css::embed_css(cache, &client, &document_url, &CSS, &options),
CSS
);
}
@@ -312,7 +312,7 @@ mod passing {
";
assert_eq!(
css::embed_css(cache, &client, &document_url, &CSS, &options, 0,),
css::embed_css(cache, &client, &document_url, &CSS, &options),
CSS_OUT
);
}
@@ -337,7 +337,7 @@ mod passing {
";
assert_eq!(
css::embed_css(cache, &client, &document_url, &CSS, &options, 0,),
css::embed_css(cache, &client, &document_url, &CSS, &options),
CSS_OUT
);
}
@@ -364,7 +364,7 @@ mod passing {
";
assert_eq!(
css::embed_css(cache, &client, &document_url, &CSS, &options, 0,),
css::embed_css(cache, &client, &document_url, &CSS, &options),
CSS_OUT
);
}

View File

@@ -10,17 +10,17 @@ mod passing {
use monolith::css;
#[test]
fn backrgound() {
fn background() {
assert!(css::is_image_url_prop("background"));
}
#[test]
fn backrgound_image() {
fn background_image() {
assert!(css::is_image_url_prop("background-image"));
}
#[test]
fn backrgound_image_uppercase() {
fn background_image_uppercase() {
assert!(css::is_image_url_prop("BACKGROUND-IMAGE"));
}

View File

@@ -8,6 +8,7 @@
#[cfg(test)]
mod passing {
use html5ever::serialize::{serialize, SerializeOpts};
use markup5ever_rcdom::SerializableHandle;
use monolith::html;
@@ -19,7 +20,12 @@ mod passing {
dom = html::add_favicon(&dom.document, "I_AM_A_FAVICON_DATA_URL".to_string());
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
serialize(
&mut buf,
&SerializableHandle::from(dom.document.clone()),
SerializeOpts::default(),
)
.unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),

View File

@@ -29,7 +29,6 @@ mod passing {
&Url::parse("data:,").unwrap(),
&srcset_value,
&options,
0,
);
assert_eq!(
@@ -55,7 +54,6 @@ mod passing {
&Url::parse("data:,").unwrap(),
&srcset_value,
&options,
0,
);
assert_eq!(
@@ -78,7 +76,6 @@ mod passing {
&Url::parse("data:,").unwrap(),
&srcset_value,
&options,
0,
);
assert_eq!(
@@ -101,7 +98,6 @@ mod passing {
&Url::parse("data:,").unwrap(),
&srcset_value,
&options,
0,
);
assert_eq!(
@@ -112,6 +108,56 @@ mod passing {
),
);
}
#[test]
fn no_whitespace_after_commas() {
let cache = &mut HashMap::new();
let client = Client::new();
let srcset_value = "small,s.png 1x,medium,m.png 2x,large,l.png 3x";
let mut options = Options::default();
options.no_images = true;
options.silent = true;
let embedded_css = html::embed_srcset(
cache,
&client,
&Url::parse("data:,").unwrap(),
&srcset_value,
&options,
);
assert_eq!(
embedded_css,
format!(
"{} 1x, {} 2x, {} 3x",
EMPTY_IMAGE_DATA_URL, EMPTY_IMAGE_DATA_URL, EMPTY_IMAGE_DATA_URL
),
);
}
#[test]
fn last_without_descriptor() {
let cache = &mut HashMap::new();
let client = Client::new();
let srcset_value = "small,s.png 1x, medium,m.png 2x, large,l.png";
let mut options = Options::default();
options.no_images = true;
options.silent = true;
let embedded_css = html::embed_srcset(
cache,
&client,
&Url::parse("data:,").unwrap(),
&srcset_value,
&options,
);
assert_eq!(
embedded_css,
format!(
"{} 1x, {} 2x, {}",
EMPTY_IMAGE_DATA_URL, EMPTY_IMAGE_DATA_URL, EMPTY_IMAGE_DATA_URL
),
);
}
}
// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗
@@ -145,12 +191,11 @@ mod failing {
&Url::parse("data:,").unwrap(),
&srcset_value,
&options,
0,
);
assert_eq!(
embedded_css,
format!("{} 1x, {} 2x,", EMPTY_IMAGE_DATA_URL, EMPTY_IMAGE_DATA_URL),
format!("{} 1x, {} 2x", EMPTY_IMAGE_DATA_URL, EMPTY_IMAGE_DATA_URL),
);
}
}

View File

@@ -7,7 +7,7 @@
#[cfg(test)]
mod passing {
use html5ever::rcdom::{Handle, NodeData};
use markup5ever_rcdom::{Handle, NodeData};
use monolith::html;

View File

@@ -7,7 +7,7 @@
#[cfg(test)]
mod passing {
use html5ever::rcdom::{Handle, NodeData};
use markup5ever_rcdom::{Handle, NodeData};
use monolith::html;

View File

@@ -7,7 +7,7 @@
#[cfg(test)]
mod passing {
use html5ever::rcdom::{Handle, NodeData};
use markup5ever_rcdom::{Handle, NodeData};
use monolith::html;

View File

@@ -8,6 +8,7 @@
#[cfg(test)]
mod passing {
use html5ever::serialize::{serialize, SerializeOpts};
use markup5ever_rcdom::SerializableHandle;
use reqwest::blocking::Client;
use std::collections::HashMap;
use url::Url;
@@ -29,10 +30,15 @@ mod passing {
let client = Client::new();
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0);
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
serialize(
&mut buf,
&SerializableHandle::from(dom.document.clone()),
SerializeOpts::default(),
)
.unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
@@ -52,10 +58,15 @@ mod passing {
let client = Client::new();
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0);
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
serialize(
&mut buf,
&SerializableHandle::from(dom.document.clone()),
SerializeOpts::default(),
)
.unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
@@ -75,10 +86,15 @@ mod passing {
let client = Client::new();
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0);
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
serialize(
&mut buf,
&SerializableHandle::from(dom.document.clone()),
SerializeOpts::default(),
)
.unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
@@ -104,10 +120,15 @@ mod passing {
let client = Client::new();
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0);
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
serialize(
&mut buf,
&SerializableHandle::from(dom.document.clone()),
SerializeOpts::default(),
)
.unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
@@ -140,10 +161,15 @@ mod passing {
let client = Client::new();
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0);
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
serialize(
&mut buf,
&SerializableHandle::from(dom.document.clone()),
SerializeOpts::default(),
)
.unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
@@ -177,10 +203,15 @@ mod passing {
let client = Client::new();
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0);
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
serialize(
&mut buf,
&SerializableHandle::from(dom.document.clone()),
SerializeOpts::default(),
)
.unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
@@ -201,10 +232,15 @@ mod passing {
let client = Client::new();
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0);
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
serialize(
&mut buf,
&SerializableHandle::from(dom.document.clone()),
SerializeOpts::default(),
)
.unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
@@ -233,10 +269,15 @@ mod passing {
let client = Client::new();
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0);
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
serialize(
&mut buf,
&SerializableHandle::from(dom.document.clone()),
SerializeOpts::default(),
)
.unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
@@ -269,10 +310,15 @@ mod passing {
let client = Client::new();
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0);
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
serialize(
&mut buf,
&SerializableHandle::from(dom.document.clone()),
SerializeOpts::default(),
)
.unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
@@ -303,10 +349,15 @@ mod passing {
let client = Client::new();
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0);
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
serialize(
&mut buf,
&SerializableHandle::from(dom.document.clone()),
SerializeOpts::default(),
)
.unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
@@ -340,10 +391,15 @@ mod passing {
let client = Client::new();
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0);
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
serialize(
&mut buf,
&SerializableHandle::from(dom.document.clone()),
SerializeOpts::default(),
)
.unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
@@ -378,10 +434,15 @@ mod passing {
let client = Client::new();
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0);
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
serialize(
&mut buf,
&SerializableHandle::from(dom.document.clone()),
SerializeOpts::default(),
)
.unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
@@ -424,10 +485,15 @@ mod passing {
let client = Client::new();
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0);
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
serialize(
&mut buf,
&SerializableHandle::from(dom.document.clone()),
SerializeOpts::default(),
)
.unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
@@ -463,10 +529,15 @@ mod passing {
let client = Client::new();
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0);
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
serialize(
&mut buf,
&SerializableHandle::from(dom.document.clone()),
SerializeOpts::default(),
)
.unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),
@@ -498,10 +569,15 @@ mod passing {
let client = Client::new();
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0);
html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options);
let mut buf: Vec<u8> = Vec::new();
serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap();
serialize(
&mut buf,
&SerializableHandle::from(dom.document.clone()),
SerializeOpts::default(),
)
.unwrap();
assert_eq!(
buf.iter().map(|&c| c as char).collect::<String>(),

View File

@@ -1,4 +1,5 @@
mod cli;
mod cookies;
mod css;
mod html;
mod js;

View File

@@ -16,7 +16,8 @@ mod passing {
assert_eq!(options.no_audio, false);
assert_eq!(options.base_url, None);
assert_eq!(options.no_css, false);
assert_eq!(options.charset, None);
assert_eq!(options.cookie_file, None);
assert_eq!(options.encoding, None);
assert_eq!(options.no_frames, false);
assert_eq!(options.no_fonts, false);
assert_eq!(options.no_images, false);

View File

@@ -46,7 +46,7 @@ mod passing {
}
#[test]
fn removesempty_fragment_and_keeps_empty_query() {
fn removes_empty_fragment_and_keeps_query() {
assert_eq!(
url::clean_url(Url::parse("https://somewhere.com/font.eot?a=b&#").unwrap()).as_str(),
"https://somewhere.com/font.eot?a=b&"

View File

@@ -0,0 +1,91 @@
// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗
// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝
// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗
// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod passing {
    use reqwest::Url;

    use monolith::url;

    /// Parses `input`, runs it through `url::get_referer_url`, and returns
    /// the resulting URL as an owned string.
    fn referer_for(input: &str) -> String {
        let parsed = Url::parse(input).unwrap();
        url::get_referer_url(parsed).as_str().to_string()
    }

    /// The URL handed over as a clone stays untouched, while the returned
    /// URL has its fragment stripped.
    #[test]
    fn preserve_original() {
        const INPUT: &str = "https://somewhere.com/font.eot#iefix";

        let source: Url = Url::parse(INPUT).unwrap();
        let referer: Url = url::get_referer_url(source.clone());

        assert_eq!(referer.as_str(), "https://somewhere.com/font.eot");
        assert_eq!(source.as_str(), INPUT);
    }

    #[test]
    fn removes_fragment() {
        let referer = referer_for("https://somewhere.com/font.eot#iefix");

        assert_eq!(referer, "https://somewhere.com/font.eot");
    }

    #[test]
    fn removes_empty_fragment() {
        let referer = referer_for("https://somewhere.com/font.eot#");

        assert_eq!(referer, "https://somewhere.com/font.eot");
    }

    #[test]
    fn removes_empty_fragment_and_keeps_empty_query() {
        let referer = referer_for("https://somewhere.com/font.eot?#");

        assert_eq!(referer, "https://somewhere.com/font.eot?");
    }

    #[test]
    fn removes_empty_fragment_and_keeps_query() {
        let referer = referer_for("https://somewhere.com/font.eot?a=b&#");

        assert_eq!(referer, "https://somewhere.com/font.eot?a=b&");
    }

    #[test]
    fn removes_credentials() {
        let referer = referer_for("https://cookie:monster@gibson.lan/path");

        assert_eq!(referer, "https://gibson.lan/path");
    }

    #[test]
    fn removes_empty_credentials() {
        let referer = referer_for("https://@gibson.lan/path");

        assert_eq!(referer, "https://gibson.lan/path");
    }

    #[test]
    fn removes_empty_username_credentials() {
        let referer = referer_for("https://:monster@gibson.lan/path");

        assert_eq!(referer, "https://gibson.lan/path");
    }

    #[test]
    fn removes_empty_password_credentials() {
        let referer = referer_for("https://cookie@gibson.lan/path");

        assert_eq!(referer, "https://gibson.lan/path");
    }
}

View File

@@ -1,5 +1,6 @@
mod clean_url;
mod create_data_url;
mod get_referer_url;
mod is_url_and_has_protocol;
mod parse_data_url;
mod resolve_url;

View File

@@ -0,0 +1,154 @@
// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗
// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝
// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗
// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod passing {
    use monolith::utils::domain_is_within_domain;

    // Each test passes the candidate domain first and the enclosing domain
    // second, and expects the check to succeed.

    #[test]
    fn sub_domain_is_within_dotted_sub_domain() {
        let (candidate, enclosing) = ("news.ycombinator.com", ".news.ycombinator.com");

        assert!(domain_is_within_domain(candidate, enclosing));
    }

    #[test]
    fn domain_is_within_dotted_domain() {
        let (candidate, enclosing) = ("ycombinator.com", ".ycombinator.com");

        assert!(domain_is_within_domain(candidate, enclosing));
    }

    #[test]
    fn sub_domain_is_within_dotted_domain() {
        let (candidate, enclosing) = ("news.ycombinator.com", ".ycombinator.com");

        assert!(domain_is_within_domain(candidate, enclosing));
    }

    #[test]
    fn sub_domain_is_within_dotted_top_level_domain() {
        let (candidate, enclosing) = ("news.ycombinator.com", ".com");

        assert!(domain_is_within_domain(candidate, enclosing));
    }

    #[test]
    fn domain_is_within_itself() {
        let (candidate, enclosing) = ("ycombinator.com", "ycombinator.com");

        assert!(domain_is_within_domain(candidate, enclosing));
    }

    #[test]
    fn domain_with_trailing_dot_is_within_itself() {
        let (candidate, enclosing) = ("ycombinator.com.", "ycombinator.com");

        assert!(domain_is_within_domain(candidate, enclosing));
    }

    #[test]
    fn domain_with_trailing_dot_is_within_single_dot() {
        let (candidate, enclosing) = ("ycombinator.com.", ".");

        assert!(domain_is_within_domain(candidate, enclosing));
    }

    #[test]
    fn domain_matches_single_dot() {
        let (candidate, enclosing) = ("ycombinator.com", ".");

        assert!(domain_is_within_domain(candidate, enclosing));
    }

    #[test]
    fn dotted_domain_must_be_within_dotted_domain() {
        let (candidate, enclosing) = (".ycombinator.com", ".ycombinator.com");

        assert!(domain_is_within_domain(candidate, enclosing));
    }

    #[test]
    fn empty_is_within_dot() {
        let (candidate, enclosing) = ("", ".");

        assert!(domain_is_within_domain(candidate, enclosing));
    }

    #[test]
    fn both_dots() {
        let (candidate, enclosing) = (".", ".");

        assert!(domain_is_within_domain(candidate, enclosing));
    }
}
// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗
// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝
// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗
// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod failing {
    use monolith::utils::domain_is_within_domain;

    // Each test passes the candidate domain first and the enclosing domain
    // second, and expects the check to be rejected.

    #[test]
    fn sub_domain_must_not_be_within_domain() {
        let (candidate, enclosing) = ("news.ycombinator.com", "ycombinator.com");

        assert!(!domain_is_within_domain(candidate, enclosing));
    }

    #[test]
    fn domain_must_not_be_within_top_level_domain() {
        let (candidate, enclosing) = ("ycombinator.com", "com");

        assert!(!domain_is_within_domain(candidate, enclosing));
    }

    #[test]
    fn different_domains_must_not_be_within_one_another() {
        let (candidate, enclosing) = ("news.ycombinator.com", "kernel.org");

        assert!(!domain_is_within_domain(candidate, enclosing));
    }

    #[test]
    fn sub_domain_is_not_within_wrong_top_level_domain() {
        let (candidate, enclosing) = ("news.ycombinator.com", "org");

        assert!(!domain_is_within_domain(candidate, enclosing));
    }

    #[test]
    fn dotted_domain_is_not_within_domain() {
        let (candidate, enclosing) = (".ycombinator.com", "ycombinator.com");

        assert!(!domain_is_within_domain(candidate, enclosing));
    }

    #[test]
    fn different_domain_is_not_within_dotted_domain() {
        let (candidate, enclosing) = ("www.doodleoptimize.com", ".ycombinator.com");

        assert!(!domain_is_within_domain(candidate, enclosing));
    }

    #[test]
    fn no_domain_can_be_within_empty_domain() {
        let (candidate, enclosing) = ("ycombinator.com", "");

        assert!(!domain_is_within_domain(candidate, enclosing));
    }

    #[test]
    fn both_can_not_be_empty() {
        let (candidate, enclosing) = ("", "");

        assert!(!domain_is_within_domain(candidate, enclosing));
    }
}

View File

@@ -1,36 +0,0 @@
// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗
// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝
// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗
// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
// Tests that compare the string returned by `utils::indent` against an
// expected literal for arguments 0 through 4.
// NOTE(review): the expected literals for arguments 1 through 4 are identical
// as shown here (a single space each); the whitespace inside them may have
// been collapsed when this text was captured — confirm against the original
// file before relying on these values.
#[cfg(test)]
mod passing {
use monolith::utils;
// An argument of 0 is expected to yield the empty string.
#[test]
fn zero() {
assert_eq!(utils::indent(0), "");
}
#[test]
fn one() {
assert_eq!(utils::indent(1), " ");
}
#[test]
fn two() {
assert_eq!(utils::indent(2), " ");
}
#[test]
fn three() {
assert_eq!(utils::indent(3), " ");
}
#[test]
fn four() {
assert_eq!(utils::indent(4), " ");
}
}

View File

@@ -1,4 +1,4 @@
mod detect_media_type;
mod indent;
mod domain_is_within_domain;
mod parse_content_type;
mod retrieve_asset;

View File

@@ -32,7 +32,6 @@ mod passing {
&Url::parse("data:text/html;base64,c291cmNl").unwrap(),
&Url::parse("data:text/html;base64,dGFyZ2V0").unwrap(),
&options,
0,
)
.unwrap();
assert_eq!(&media_type, "text/html");
@@ -75,7 +74,6 @@ mod passing {
))
.unwrap(),
&options,
0,
)
.unwrap();
assert_eq!(&media_type, "application/javascript");
@@ -124,7 +122,6 @@ mod failing {
&Url::parse("data:text/html;base64,SoUrCe").unwrap(),
&Url::parse("file:///etc/passwd").unwrap(),
&options,
0,
) {
Ok((..)) => {
assert!(false);
@@ -150,7 +147,6 @@ mod failing {
&Url::parse("https://kernel.org/").unwrap(),
&Url::parse("file:///etc/passwd").unwrap(),
&options,
0,
) {
Ok((..)) => {
assert!(false);