From 2e86ee67a507d654f44c8b222f440365d69357cf Mon Sep 17 00:00:00 2001 From: Sunshine Date: Thu, 11 Mar 2021 12:44:02 -1000 Subject: [PATCH] revamp codebase --- Cargo.lock | 59 +- Cargo.toml | 1 - src/css.rs | 194 ++-- src/html.rs | 999 ++++++------------ src/main.rs | 193 ++-- src/tests/cli/basic.rs | 54 +- src/tests/cli/data_url.rs | 2 +- src/tests/cli/local_files.rs | 158 ++- src/tests/css/embed_css.rs | 133 ++- src/tests/data/css/index.html | 11 + src/tests/data/css/style.css | 1 + src/tests/data/integrity/index.html | 2 - src/tests/data/svg/image.svg | 5 + src/tests/data/svg/index.html | 1 + src/tests/html/create_metadata_tag.rs | 30 +- src/tests/html/embed_srcset.rs | 47 +- src/tests/html/walk_and_embed_assets.rs | 87 +- src/tests/url/clean_url.rs | 29 +- src/tests/url/data_to_data_url.rs | 17 +- src/tests/url/file_url_to_fs_path.rs | 41 - src/tests/url/get_url_fragment.rs | 48 - src/tests/url/is_data_url.rs | 52 - src/tests/url/is_file_url.rs | 83 -- ...is_http_url.rs => is_http_or_https_url.rs} | 22 +- ...protocol.rs => is_url_and_has_protocol.rs} | 30 +- src/tests/url/mod.rs | 11 +- src/tests/url/parse_data_url.rs | 35 +- .../url/{decode_url.rs => percent_decode.rs} | 6 +- ...url_with_fragment.rs => percent_encode.rs} | 28 +- src/tests/url/resolve_url.rs | 106 +- src/tests/utils/detect_media_type.rs | 94 +- src/tests/utils/retrieve_asset.rs | 47 +- src/url.rs | 152 +-- src/utils.rs | 164 ++- 34 files changed, 1267 insertions(+), 1675 deletions(-) create mode 100644 src/tests/data/css/index.html create mode 100644 src/tests/data/css/style.css create mode 100644 src/tests/data/svg/image.svg create mode 100644 src/tests/data/svg/index.html delete mode 100644 src/tests/url/file_url_to_fs_path.rs delete mode 100644 src/tests/url/get_url_fragment.rs delete mode 100644 src/tests/url/is_data_url.rs delete mode 100644 src/tests/url/is_file_url.rs rename src/tests/url/{is_http_url.rs => is_http_or_https_url.rs} (72%) rename src/tests/url/{url_has_protocol.rs => is_url_and_has_protocol.rs} (74%) rename src/tests/url/{decode_url.rs => percent_decode.rs} (91%) rename src/tests/url/{url_with_fragment.rs => percent_encode.rs} (52%) diff --git a/Cargo.lock b/Cargo.lock index 56c609f..5490c65 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -43,7 +43,7 @@ dependencies = [ "futures-core 0.3.13 (registry+https://github.com/rust-lang/crates.io-index)", "memchr 2.3.4 (registry+https://github.com/rust-lang/crates.io-index)", "pin-project-lite 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", - "tokio 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "tokio 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -101,7 +101,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "byteorder" -version = "1.4.2" +version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -193,7 +193,7 @@ dependencies = [ "proc-macro2 1.0.24 (registry+https://github.com/rust-lang/crates.io-index)", "quote 1.0.9 (registry+https://github.com/rust-lang/crates.io-index)", "smallvec 1.6.1 (registry+https://github.com/rust-lang/crates.io-index)", - "syn 1.0.62 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 1.0.63 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -202,7 +202,7 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "quote 1.0.9 (registry+https://github.com/rust-lang/crates.io-index)", - "syn 1.0.62 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 1.0.63 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -380,8 +380,8 @@ dependencies = [ "http 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", "indexmap 1.6.2 (registry+https://github.com/rust-lang/crates.io-index)", "slab 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", - "tokio 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", - "tokio-util 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", + "tokio 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", + "tokio-util 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", "tracing 0.1.25 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -408,7 +408,7 @@ dependencies = [ "markup5ever 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", "proc-macro2 1.0.24 (registry+https://github.com/rust-lang/crates.io-index)", "quote 1.0.9 (registry+https://github.com/rust-lang/crates.io-index)", - "syn 1.0.62 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 1.0.63 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -457,7 +457,7 @@ dependencies = [ "itoa 0.4.7 (registry+https://github.com/rust-lang/crates.io-index)", "pin-project 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)", "socket2 0.3.19 (registry+https://github.com/rust-lang/crates.io-index)", - "tokio 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "tokio 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "tower-service 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", "tracing 0.1.25 (registry+https://github.com/rust-lang/crates.io-index)", "want 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -471,7 +471,7 @@ dependencies = [ "bytes 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)", "hyper 0.14.4 (registry+https://github.com/rust-lang/crates.io-index)", "native-tls 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)", - "tokio 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "tokio 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "tokio-native-tls 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -608,9 +608,8 @@ dependencies = [ "cssparser 0.28.1 (registry+https://github.com/rust-lang/crates.io-index)", "html5ever 0.24.1 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.4.3 (registry+https://github.com/rust-lang/crates.io-index)", - "reqwest 0.11.1 (registry+https://github.com/rust-lang/crates.io-index)", + "reqwest 0.11.2 (registry+https://github.com/rust-lang/crates.io-index)", "sha2 0.9.3 (registry+https://github.com/rust-lang/crates.io-index)", - "tempfile 3.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "url 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -770,7 +769,7 @@ dependencies = [ "proc-macro-hack 0.5.19 (registry+https://github.com/rust-lang/crates.io-index)", "proc-macro2 1.0.24 (registry+https://github.com/rust-lang/crates.io-index)", "quote 1.0.9 (registry+https://github.com/rust-lang/crates.io-index)", - "syn 1.0.62 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 1.0.63 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -804,7 +803,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "proc-macro2 1.0.24 (registry+https://github.com/rust-lang/crates.io-index)", "quote 1.0.9 (registry+https://github.com/rust-lang/crates.io-index)", - "syn 1.0.62 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 1.0.63 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -1086,7 +1085,7 @@ name = "regex-automata" version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "byteorder 1.4.2 (registry+https://github.com/rust-lang/crates.io-index)", + "byteorder 1.4.3 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -1104,7 +1103,7 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.11.1" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "async-compression 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", @@ -1127,9 +1126,9 @@ dependencies = [ "pin-project-lite 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.124 (registry+https://github.com/rust-lang/crates.io-index)", "serde_urlencoded 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", - "tokio 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "tokio 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "tokio-native-tls 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", - "tokio-util 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", + "tokio-util 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", "url 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)", "wasm-bindgen 0.2.71 (registry+https://github.com/rust-lang/crates.io-index)", "wasm-bindgen-futures 0.4.21 (registry+https://github.com/rust-lang/crates.io-index)", @@ -1184,7 +1183,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "proc-macro2 1.0.24 (registry+https://github.com/rust-lang/crates.io-index)", "quote 1.0.9 (registry+https://github.com/rust-lang/crates.io-index)", - "syn 1.0.62 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 1.0.63 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -1288,7 +1287,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "syn" -version = "1.0.62" +version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "proc-macro2 1.0.24 (registry+https://github.com/rust-lang/crates.io-index)", @@ -1360,7 +1359,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "tokio" -version = "1.2.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "autocfg 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)", @@ -1378,12 +1377,12 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "native-tls 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)", - "tokio 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "tokio 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "tokio-util" -version = "0.6.3" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "bytes 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)", @@ -1391,7 +1390,7 @@ dependencies = [ "futures-sink 0.3.13 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.4.14 (registry+https://github.com/rust-lang/crates.io-index)", "pin-project-lite 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", - "tokio 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "tokio 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -1537,7 +1536,7 @@ dependencies = [ "log 0.4.14 (registry+https://github.com/rust-lang/crates.io-index)", "proc-macro2 1.0.24 (registry+https://github.com/rust-lang/crates.io-index)", "quote 1.0.9 (registry+https://github.com/rust-lang/crates.io-index)", - "syn 1.0.62 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 1.0.63 (registry+https://github.com/rust-lang/crates.io-index)", "wasm-bindgen-shared 0.2.71 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -1568,7 +1567,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "proc-macro2 1.0.24 (registry+https://github.com/rust-lang/crates.io-index)", "quote 1.0.9 (registry+https://github.com/rust-lang/crates.io-index)", - "syn 1.0.62 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 1.0.63 (registry+https://github.com/rust-lang/crates.io-index)", "wasm-bindgen-backend 0.2.71 (registry+https://github.com/rust-lang/crates.io-index)", "wasm-bindgen-shared 0.2.71 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -1628,7 +1627,7 @@ dependencies = [ "checksum block-buffer 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" "checksum bstr 0.2.15 (registry+https://github.com/rust-lang/crates.io-index)" = "a40b47ad93e1a5404e6c18dec46b628214fee441c70f4ab5d6942142cc268a3d" "checksum bumpalo 3.6.1 (registry+https://github.com/rust-lang/crates.io-index)" = "63396b8a4b9de3f4fdfb320ab6080762242f66a8ef174c49d8e19b674db4cdbe" -"checksum byteorder 1.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "ae44d1a3d5a19df61dd0c8beb138458ac2a53a7ac09eba97d55592540004306b" +"checksum byteorder 1.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" "checksum bytes 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b700ce4376041dcd0a327fd0097c41095743c4c8af8887265942faf1100bd040" "checksum cc 1.0.67 (registry+https://github.com/rust-lang/crates.io-index)" = "e3c69b077ad434294d3ce9f1f6143a2a4b89a8a2d54ef813d85003a4fd1137fd" "checksum cfg-if 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" @@ -1747,7 +1746,7 @@ dependencies = [ "checksum regex-automata 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4" "checksum regex-syntax 0.6.22 (registry+https://github.com/rust-lang/crates.io-index)" = "b5eb417147ba9860a96cfe72a0b93bf88fee1744b5636ec99ab20c1aa9376581" "checksum remove_dir_all 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" -"checksum reqwest 0.11.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0460542b551950620a3648c6aa23318ac6b3cd779114bd873209e6e8b5eb1c34" +"checksum reqwest 0.11.2 (registry+https://github.com/rust-lang/crates.io-index)" = "bf12057f289428dbf5c591c74bf10392e4a8003f993405a902f20117019022d4" "checksum ryu 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" "checksum schannel 0.1.19 (registry+https://github.com/rust-lang/crates.io-index)" = "8f05ba609c234e60bee0d547fe94a4c7e9da733d1c962cf6e59efa4cd9c8bc75" "checksum security-framework 2.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "d493c5f39e02dfb062cd8f33301f90f9b13b650e8c1b1d0fd75c19dd64bff69d" @@ -1766,7 +1765,7 @@ dependencies = [ "checksum string_cache_codegen 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "f0f45ed1b65bf9a4bf2f7b7dc59212d1926e9eaf00fa998988e420fd124467c6" "checksum string_cache_shared 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b1884d1bc09741d466d9b14e6d37ac89d6909cbcac41dd9ae982d4d063bbedfc" "checksum strsim 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" -"checksum syn 1.0.62 (registry+https://github.com/rust-lang/crates.io-index)" = "123a78a3596b24fee53a6464ce52d8ecbf62241e6294c7e7fe12086cd161f512" +"checksum syn 1.0.63 (registry+https://github.com/rust-lang/crates.io-index)" = "8fd9bc7ccc2688b3344c2f48b9b546648b25ce0b20fc717ee7fa7981a8ca9717" "checksum tempfile 3.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22" "checksum tendril 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a9ef557cb397a4f0a5a3a628f06515f78563f2209e64d47055d9dc6052bf5e33" "checksum textwrap 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" @@ -1774,9 +1773,9 @@ dependencies = [ "checksum time 0.1.44 (registry+https://github.com/rust-lang/crates.io-index)" = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" "checksum tinyvec 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "317cca572a0e89c3ce0ca1f1bdc9369547fe318a683418e42ac8f59d14701023" "checksum tinyvec_macros 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" -"checksum tokio 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e8190d04c665ea9e6b6a0dc45523ade572c088d2e6566244c1122671dbf4ae3a" +"checksum tokio 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8d56477f6ed99e10225f38f9f75f872f29b8b8bd8c0b946f63345bb144e9eeda" "checksum tokio-native-tls 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f7d995660bd2b7f8c1568414c1126076c13fbb725c40112dc0120b78eb9b717b" -"checksum tokio-util 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ebb7cb2f00c5ae8df755b252306272cd1790d39728363936e01827e11f0b017b" +"checksum tokio-util 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "ec31e5cc6b46e653cf57762f36f71d5e6386391d88a72fd6db4508f8f676fb29" "checksum tower-service 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" "checksum tracing 0.1.25 (registry+https://github.com/rust-lang/crates.io-index)" = "01ebdc2bb4498ab1ab5f5b73c5803825e60199229ccba0698170e3be0e7f959f" "checksum tracing-core 0.1.17 (registry+https://github.com/rust-lang/crates.io-index)" = "f50de3927f93d202783f4513cda820ab47ef17f624b03c096e86ef00c67e6b5f" diff --git a/Cargo.toml b/Cargo.toml index fc47af9..b0ef40c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,4 +39,3 @@ features = ["default-tls", "blocking", "gzip"] [dev-dependencies] assert_cmd = "1.0.2" -tempfile = "3.2.0" diff --git a/src/css.rs b/src/css.rs index 5492b88..a1205a9 100644 --- a/src/css.rs +++ b/src/css.rs @@ -1,9 +1,12 @@ -use cssparser::{ParseError, Parser, ParserInput, SourcePosition, Token}; +use cssparser::{ + serialize_identifier, serialize_string, ParseError, Parser, ParserInput, SourcePosition, Token, +}; use reqwest::blocking::Client; use std::collections::HashMap; +use url::Url; use crate::opts::Options; -use crate::url::{data_to_data_url, get_url_fragment, is_http_url, resolve_url, url_with_fragment}; +use crate::url::{data_to_data_url, resolve_url}; use crate::utils::retrieve_asset; const CSS_PROPS_WITH_IMAGE_URLS: &[&str] = &[ @@ -26,13 +29,30 @@ const CSS_PROPS_WITH_IMAGE_URLS: &[&str] = &[ "suffix", "symbols", ]; -const CSS_SPECIAL_CHARS: &'static str = "~!@$%^&*()+=,./'\";:?><[]{}|`#"; -pub fn is_image_url_prop(prop_name: &str) -> bool { - CSS_PROPS_WITH_IMAGE_URLS - .iter() - .find(|p| prop_name.eq_ignore_ascii_case(p)) - .is_some() +pub fn embed_css( + cache: &mut HashMap>, + client: &Client, + document_url: &Url, + css: &str, + options: &Options, + depth: u32, +) -> String { + let mut input = ParserInput::new(&css); + let mut parser = Parser::new(&mut input); + + process_css( + cache, + client, + document_url, + &mut parser, + options, + depth, + "", + "", + "", + ) + .unwrap() } pub fn enquote(input: String, double: bool) -> String { @@ -43,22 +63,29 @@ pub fn enquote(input: String, double: bool) -> String { } } -pub fn escape(value: &str) -> String { - let mut res = str!(&value); - - res = res.replace("\\", "\\\\"); - - for c in CSS_SPECIAL_CHARS.chars() { - res = res.replace(c, format!("\\{}", c).as_str()); - } +pub fn format_ident(ident: &str) -> String { + let mut res: String = String::new(); + let _ = serialize_identifier(ident, &mut res); + res +} +pub fn format_quoted_string(string: &str) -> String { + let mut res: String = String::new(); + let _ = serialize_string(string, &mut res); res } +pub fn is_image_url_prop(prop_name: &str) -> bool { + CSS_PROPS_WITH_IMAGE_URLS + .iter() + .find(|p| prop_name.eq_ignore_ascii_case(p)) + .is_some() +} + pub fn process_css<'a>( cache: &mut HashMap>, client: &Client, - parent_url: &str, + document_url: &Url, parser: &mut Parser, options: &Options, depth: u32, @@ -112,7 +139,7 @@ pub fn process_css<'a>( process_css( cache, client, - parent_url, + document_url, parser, options, depth, @@ -143,7 +170,7 @@ pub fn process_css<'a>( Token::Ident(ref value) => { curr_rule = str!(); curr_prop = str!(value); - result.push_str(&escape(value)); + result.push_str(&format_ident(value)); } // @import, @font-face, @charset, @media... Token::AtKeyword(ref value) => { @@ -164,23 +191,22 @@ pub fn process_css<'a>( curr_rule = str!(); // Skip empty import values - if value.len() < 1 { + if value.len() == 0 { result.push_str("''"); continue; } - let import_full_url = resolve_url(&parent_url, value).unwrap_or_default(); - let import_url_fragment = get_url_fragment(import_full_url.clone()); + let import_full_url: Url = resolve_url(&document_url, value); match retrieve_asset( cache, client, - &parent_url, + &document_url, &import_full_url, options, depth + 1, ) { Ok((import_contents, import_final_url, _import_media_type)) => { - let import_data_url = data_to_data_url( + let mut import_data_url = data_to_data_url( "text/css", embed_css( cache, @@ -193,63 +219,58 @@ pub fn process_css<'a>( .as_bytes(), &import_final_url, ); - let assembled_url: String = url_with_fragment( - import_data_url.as_str(), - import_url_fragment.as_str(), - ); - result.push_str(enquote(assembled_url, false).as_str()); + import_data_url.set_fragment(import_full_url.fragment()); + result.push_str(enquote(import_data_url.to_string(), false).as_str()); } Err(_) => { // Keep remote reference if unable to retrieve the asset - if is_http_url(import_full_url.clone()) { - let assembled_url: String = url_with_fragment( - import_full_url.as_str(), - import_url_fragment.as_str(), - ); - result.push_str(enquote(assembled_url, false).as_str()); + if import_full_url.scheme() == "http" + || import_full_url.scheme() == "https" + { + result + .push_str(enquote(import_full_url.to_string(), false).as_str()); } } } } else { if func_name == "url" { // Skip empty url()'s - if value.len() < 1 { + if value.len() == 0 { continue; } if options.no_images && is_image_url_prop(curr_prop.as_str()) { result.push_str(enquote(str!(empty_image!()), false).as_str()); } else { - let resolved_url = resolve_url(&parent_url, value).unwrap_or_default(); - let url_fragment = get_url_fragment(resolved_url.clone()); + let resolved_url: Url = resolve_url(&document_url, value); match retrieve_asset( cache, client, - &parent_url, + &document_url, &resolved_url, options, depth + 1, ) { Ok((data, final_url, media_type)) => { - let data_url = data_to_data_url(&media_type, &data, &final_url); - let assembled_url: String = - url_with_fragment(data_url.as_str(), url_fragment.as_str()); - result.push_str(enquote(assembled_url, false).as_str()); + let mut data_url = + data_to_data_url(&media_type, &data, &final_url); + data_url.set_fragment(resolved_url.fragment()); + result.push_str(enquote(data_url.to_string(), false).as_str()); } Err(_) => { // Keep remote reference if unable to retrieve the asset - if is_http_url(resolved_url.clone()) { - let assembled_url: String = url_with_fragment( - resolved_url.as_str(), - url_fragment.as_str(), + if resolved_url.scheme() == "http" + || resolved_url.scheme() == "https" + { + result.push_str( + enquote(resolved_url.to_string(), false).as_str(), ); - result.push_str(enquote(assembled_url, false).as_str()); } } } } } else { - result.push_str(enquote(str!(value), false).as_str()); + result.push_str(format_quoted_string(value).as_str()); } } } @@ -290,8 +311,9 @@ pub fn process_css<'a>( Token::IDHash(ref value) => { curr_rule = str!(); result.push_str("#"); - result.push_str(&escape(value)); + result.push_str(&format_ident(value)); } + // url() Token::UnquotedUrl(ref value) => { let is_import: bool = curr_rule == "import"; @@ -313,12 +335,17 @@ pub fn process_css<'a>( result.push_str("url("); if is_import { - let full_url = resolve_url(&parent_url, value).unwrap_or_default(); - let url_fragment = get_url_fragment(full_url.clone()); - match retrieve_asset(cache, client, &parent_url, &full_url, options, depth + 1) - { + let full_url: Url = resolve_url(&document_url, value); + match retrieve_asset( + cache, + client, + &document_url, + &full_url, + options, + depth + 1, + ) { Ok((css, final_url, _media_type)) => { - let data_url = data_to_data_url( + let mut data_url = data_to_data_url( "text/css", embed_css( cache, @@ -331,16 +358,13 @@ pub fn process_css<'a>( .as_bytes(), &final_url, ); - let assembled_url: String = - url_with_fragment(data_url.as_str(), url_fragment.as_str()); - result.push_str(enquote(assembled_url, false).as_str()); + data_url.set_fragment(full_url.fragment()); + result.push_str(enquote(data_url.to_string(), false).as_str()); } Err(_) => { // Keep remote reference if unable to retrieve the asset - if is_http_url(full_url.clone()) { - let assembled_url: String = - url_with_fragment(full_url.as_str(), url_fragment.as_str()); - result.push_str(enquote(assembled_url, false).as_str()); + if full_url.scheme() == "http" || full_url.scheme() == "https" { + result.push_str(enquote(full_url.to_string(), false).as_str()); } } } @@ -348,28 +372,24 @@ pub fn process_css<'a>( if is_image_url_prop(curr_prop.as_str()) && options.no_images { result.push_str(enquote(str!(empty_image!()), false).as_str()); } else { - let full_url = resolve_url(&parent_url, value).unwrap_or_default(); - let url_fragment = get_url_fragment(full_url.clone()); + let full_url: Url = resolve_url(&document_url, value); match retrieve_asset( cache, client, - &parent_url, + &document_url, &full_url, options, depth + 1, ) { Ok((data, final_url, media_type)) => { - let data_url = data_to_data_url(&media_type, &data, &final_url); - let assembled_url: String = - url_with_fragment(data_url.as_str(), url_fragment.as_str()); - result.push_str(enquote(assembled_url, false).as_str()); + let mut data_url = data_to_data_url(&media_type, &data, &final_url); + data_url.set_fragment(full_url.fragment()); + result.push_str(enquote(data_url.to_string(), false).as_str()); } Err(_) => { // Keep remote reference if unable to retrieve the asset - if is_http_url(full_url.clone()) { - let assembled_url: String = - url_with_fragment(full_url.as_str(), url_fragment.as_str()); - result.push_str(enquote(assembled_url, false).as_str()); + if full_url.scheme() == "http" || full_url.scheme() == "https" { + result.push_str(enquote(full_url.to_string(), false).as_str()); } } } @@ -377,6 +397,7 @@ pub fn process_css<'a>( } result.push_str(")"); } + // = Token::Delim(ref value) => result.push_str(&value.to_string()), Token::Function(ref name) => { let function_name: &str = &name.clone(); @@ -388,7 +409,7 @@ pub fn process_css<'a>( process_css( cache, client, - parent_url, + document_url, parser, options, depth, @@ -413,28 +434,3 @@ pub fn process_css<'a>( Ok(result) } - -pub fn embed_css( - cache: &mut HashMap>, - client: &Client, - parent_url: &str, - css: &str, - options: &Options, - depth: u32, -) -> String { - let mut input = ParserInput::new(&css); - let mut parser = Parser::new(&mut input); - - process_css( - cache, - client, - parent_url, - &mut parser, - options, - depth, - "", - "", - "", - ) - .unwrap() -} diff --git a/src/html.rs b/src/html.rs index 7449d0d..3f5e34d 100644 --- a/src/html.rs +++ b/src/html.rs @@ -17,10 +17,7 @@ use std::default::Default; use crate::css::embed_css; use crate::js::attr_is_event_handler; use crate::opts::Options; -use crate::url::{ - data_to_data_url, get_url_fragment, is_http_url, resolve_url, url_has_protocol, - url_with_fragment, -}; +use crate::url::{clean_url, data_to_data_url, is_url_and_has_protocol, resolve_url}; use crate::utils::retrieve_asset; struct SrcSetItem<'a> { @@ -112,41 +109,54 @@ pub fn compose_csp(options: &Options) -> String { string_list.join(" ") } -pub fn create_metadata_tag(url: &str) -> String { +pub fn create_metadata_tag(url: &Url) -> String { let timestamp = Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true); + let mut clean_url: Url = clean_url(url.clone()); - // Safe to unwrap (we just put this through an HTTP request) - match Url::parse(url) { - Ok(mut clean_url) => { - clean_url.set_fragment(None); + // Prevent credentials from getting into metadata + if clean_url.scheme() == "http" || clean_url.scheme() == "https" { + // Only HTTP(S) URLs may feature credentials + clean_url.set_username("").unwrap(); + clean_url.set_password(None).unwrap(); + } - // Prevent credentials from getting into metadata - if is_http_url(url) { - // Only HTTP(S) URLs may feature credentials - clean_url.set_username("").unwrap(); - clean_url.set_password(None).unwrap(); - } + format!( + "", + if clean_url.scheme() == "http" || clean_url.scheme() == "https" { + &clean_url.as_str() + } else { + "local source" + }, + timestamp, + env!("CARGO_PKG_NAME"), + env!("CARGO_PKG_VERSION"), + ) +} - format!( - "", - if is_http_url(url) { - &clean_url.as_str() - } else { - "local source" - }, - timestamp, - env!("CARGO_PKG_NAME"), - env!("CARGO_PKG_VERSION"), - ) +pub fn determine_link_node_type(node: &Handle) -> &str { + let mut link_type: &str = "unknown"; + + if let Some(link_attr_rel_value) = get_node_attr(node, "rel") { + if is_icon(&link_attr_rel_value) { + link_type = "icon"; + } else if link_attr_rel_value.eq_ignore_ascii_case("stylesheet") + || link_attr_rel_value.eq_ignore_ascii_case("alternate stylesheet") + { + link_type = "stylesheet"; + } else if link_attr_rel_value.eq_ignore_ascii_case("preload") { + link_type = "preload"; + } else if link_attr_rel_value.eq_ignore_ascii_case("dns-prefetch") { + link_type = "dns-prefetch"; } - Err(_) => str!(), } + + link_type } pub fn embed_srcset( cache: &mut HashMap>, client: &Client, - parent_url: &str, + document_url: &Url, srcset: &str, options: &Options, depth: u32, @@ -169,30 +179,26 @@ pub fn embed_srcset( if options.no_images { result.push_str(empty_image!()); } else { - let image_full_url = resolve_url(&parent_url, part.path).unwrap_or_default(); - let image_url_fragment = get_url_fragment(image_full_url.clone()); + let image_full_url: Url = resolve_url(&document_url, part.path); match retrieve_asset( cache, client, - &parent_url, + &document_url, &image_full_url, options, depth + 1, ) { Ok((image_data, image_final_url, image_media_type)) => { - let image_data_url = + let mut image_data_url = data_to_data_url(&image_media_type, &image_data, &image_final_url); // Append retreved asset as a data URL - let assembled_url: String = - url_with_fragment(image_data_url.as_str(), image_url_fragment.as_str()); - result.push_str(assembled_url.as_ref()); + image_data_url.set_fragment(image_full_url.fragment()); + result.push_str(image_data_url.as_ref()); } Err(_) => { // Keep remote reference if unable to retrieve the asset - if is_http_url(image_full_url.clone()) { - let assembled_url: String = - url_with_fragment(image_full_url.as_str(), image_url_fragment.as_str()); - result.push_str(assembled_url.as_ref()); + if image_full_url.scheme() == "http" || image_full_url.scheme() == "https" { + result.push_str(image_full_url.as_ref()); } else { // Avoid breaking the structure in case if not an HTTP(S) URL result.push_str(empty_image!()); @@ -454,9 +460,9 @@ pub fn stringify_document(handle: &Handle, options: &Options) -> String { ], Default::default(), ); - // Note: the CSP meta-tag has to be prepended, never appended, - // since there already may be one defined in the original document, - // and browsers don't allow re-defining them (for obvious reasons) + // The CSP meta-tag has to be prepended, never appended, + // since there already may be one defined in the original document, + // and browsers don't allow re-defining them (for obvious reasons) head.children.borrow_mut().reverse(); head.children.borrow_mut().push(meta.clone()); head.children.borrow_mut().reverse(); @@ -471,10 +477,117 @@ pub fn stringify_document(handle: &Handle, options: &Options) -> String { result } +pub fn retrieve_and_embed_asset( + cache: &mut HashMap>, + client: &Client, + document_url: &Url, + node: &Handle, + attr_name: &str, + attr_value: &str, + options: &Options, + depth: u32, +) { + let resolved_url: Url = resolve_url(document_url, attr_value.clone()); + + match retrieve_asset( + cache, + client, + &document_url.clone(), + &resolved_url, + options, + depth + 1, + ) { + Ok((data, final_url, mut media_type)) => { + // Check integrity if it's a LINK or SCRIPT tag + let node_name: &str = get_node_name(&node).unwrap(); + let mut ok_to_include: bool = true; + + if node_name == "link" || node_name == "script" { + let node_integrity_attr_value: Option = get_node_attr(node, "integrity"); + + // Check integrity + if let Some(node_integrity_attr_value) = node_integrity_attr_value { + if !node_integrity_attr_value.is_empty() { + ok_to_include = check_integrity(&data, &node_integrity_attr_value); + } + } + + // Wipe integrity attribute + set_node_attr(node, "integrity", None); + } + + if ok_to_include { + if node_name == "link" { + let link_type: &str = determine_link_node_type(node); + // CSS LINK nodes requires special treatment + if link_type == "stylesheet" { + let css: String = embed_css( + cache, + client, + &final_url, + &String::from_utf8_lossy(&data), + options, + depth + 1, + ); + let css_data_url = data_to_data_url("text/css", css.as_bytes(), &final_url); + + set_node_attr(&node, attr_name, Some(css_data_url.to_string())); + + return; // Do not fall through + } + } else if node_name == "frame" || node_name == "iframe" { + let frame_dom = html_to_dom(&String::from_utf8_lossy(&data)); + walk_and_embed_assets( + cache, + client, + &final_url, + &frame_dom.document, + &options, + depth + 1, + ); + + let mut frame_data: Vec = Vec::new(); + serialize( + &mut frame_data, + &frame_dom.document, + SerializeOpts::default(), + ) + .unwrap(); + + let mut frame_data_url = data_to_data_url(&media_type, &frame_data, &final_url); + + frame_data_url.set_fragment(resolved_url.fragment()); + + set_node_attr(node, attr_name, Some(frame_data_url.to_string())); + + return; // Do not fall through + } + + // Everything else + if node_name == "script" { + media_type = "application/javascript".to_string(); + } + let mut data_url = data_to_data_url(&media_type, &data, &final_url); + data_url.set_fragment(resolved_url.fragment()); + set_node_attr(node, attr_name, Some(data_url.to_string())); + } + } + Err(_) => { + if resolved_url.scheme() == "http" || resolved_url.scheme() == "https" { + // Keep remote reference if unable to retrieve the asset + set_node_attr(node, attr_name, Some(resolved_url.to_string())); + } else { + // Exclude non-remote URLs + set_node_attr(node, attr_name, None); + } + } + } +} + pub fn walk_and_embed_assets( cache: &mut HashMap>, client: &Client, - url: &str, + document_url: &Url, node: &Handle, options: &Options, depth: u32, @@ -483,7 +596,7 @@ pub fn walk_and_embed_assets( NodeData::Document => { // Dig deeper for child in node.children.borrow().iter() { - walk_and_embed_assets(cache, client, &url, child, options, depth); + walk_and_embed_assets(cache, client, &document_url, child, options, depth); } } NodeData::Element { @@ -524,198 +637,65 @@ pub fn walk_and_embed_assets( } } "link" => { - // Read and remember integrity attribute value of this LINK node - let link_attr_integrity_value: Option = - get_node_attr(node, "integrity"); - - // Remove integrity attribute from the LINK node - if link_attr_integrity_value != None { - set_node_attr(node, "integrity", None); - } - - enum LinkType { - Icon, - Stylesheet, - Preload, - DnsPrefetch, - Unknown, - } + let link_type: &str = determine_link_node_type(node); - let mut link_type = LinkType::Unknown; - if let Some(link_attr_rel_value) = get_node_attr(node, "rel") { - if is_icon(&link_attr_rel_value) { - link_type = LinkType::Icon; - } else if link_attr_rel_value.eq_ignore_ascii_case("stylesheet") - || link_attr_rel_value.eq_ignore_ascii_case("alternate stylesheet") - { - link_type = LinkType::Stylesheet; - } else if link_attr_rel_value.eq_ignore_ascii_case("preload") { - link_type = LinkType::Preload; - } else if link_attr_rel_value.eq_ignore_ascii_case("dns-prefetch") { - link_type = LinkType::DnsPrefetch; - } - } - // Shadow the variable (to make it non-mutable) - let link_type = link_type; - - match link_type { - LinkType::Icon => { - // Find and resolve this LINK node's href attribute - if let Some(link_attr_href_value) = get_node_attr(node, "href") { - if !options.no_images && !link_attr_href_value.is_empty() { - let link_href_full_url = - resolve_url(&url, link_attr_href_value).unwrap_or_default(); - let link_href_url_fragment = - get_url_fragment(link_href_full_url.clone()); - match retrieve_asset( - cache, - client, - &url, - &link_href_full_url, - options, - depth + 1, - ) { - Ok(( - link_href_data, - link_href_final_url, - link_href_media_type, - )) => { - let mut ok_to_include = true; - - // Check integrity - if let Some(link_attr_integrity_value) = - link_attr_integrity_value - { - if !link_attr_integrity_value.is_empty() { - ok_to_include = check_integrity( - &link_href_data, - &link_attr_integrity_value, - ); - } - } - - if ok_to_include { - let link_href_data_url = data_to_data_url( - &link_href_media_type, - &link_href_data, - &link_href_final_url, - ); - // Add new data URL href attribute - let assembled_url: String = url_with_fragment( - link_href_data_url.as_str(), - link_href_url_fragment.as_str(), - ); - set_node_attr(&node, "href", Some(assembled_url)); - } - } - Err(_) => { - // Keep remote reference if unable to retrieve the asset - if is_http_url(link_href_full_url.clone()) { - let assembled_url: String = url_with_fragment( - link_href_full_url.as_str(), - link_href_url_fragment.as_str(), - ); - set_node_attr(node, "href", Some(assembled_url)); - } - } - } - } else { - set_node_attr(node, "href", None); - } + if link_type == "icon" { + // Find and resolve this LINK node's href attribute + if let Some(link_attr_href_value) = get_node_attr(node, "href") { + if !options.no_images && !link_attr_href_value.is_empty() { + retrieve_and_embed_asset( + cache, + client, + &document_url, + node, + "href", + &link_attr_href_value, + options, + depth, + ); + } else { + set_node_attr(node, "href", None); } } - LinkType::Stylesheet => { - // Find and resolve this LINK node's href attribute - if let Some(link_attr_href_value) = get_node_attr(node, "href") { + } else if link_type == "stylesheet" { + // Find and resolve this LINK node's href attribute + if let Some(link_attr_href_value) = get_node_attr(node, "href") { + if options.no_css { set_node_attr(node, "href", None); - - if !options.no_css && !link_attr_href_value.is_empty() { - let link_href_full_url = - resolve_url(&url, link_attr_href_value).unwrap_or_default(); - match retrieve_asset( + } else { + if !link_attr_href_value.is_empty() { + retrieve_and_embed_asset( cache, client, - &url, - &link_href_full_url, + &document_url, + node, + "href", + &link_attr_href_value, options, - depth + 1, - ) { - Ok(( - link_href_data, - link_href_final_url, - _link_href_media_type, - )) => { - let mut ok_to_include = true; - - // Check integrity - if let Some(link_attr_integrity_value) = - link_attr_integrity_value - { - if !link_attr_integrity_value.is_empty() { - ok_to_include = check_integrity( - &link_href_data, - &link_attr_integrity_value, - ); - } - } - - if ok_to_include { - let css: String = embed_css( - cache, - client, - &link_href_final_url, - &String::from_utf8_lossy(&link_href_data), - options, - depth + 1, - ); - let link_href_data_url = data_to_data_url( - "text/css", - css.as_bytes(), - &link_href_final_url, - ); - // Add new data URL href attribute - set_node_attr( - &node, - "href", - Some(link_href_data_url), - ); - } - } - Err(_) => { - // Keep remote reference if unable to retrieve the asset - if is_http_url(link_href_full_url.clone()) { - set_node_attr( - &node, - "href", - Some(link_href_full_url), - ); - } - } - } + depth, + ); } } } - LinkType::Preload | LinkType::DnsPrefetch => { - // Since all resources are embedded as data URLs, preloading and prefetching are not necessary - set_node_attr(node, "rel", None); - } - LinkType::Unknown => { - // Make sure that all other LINKs' href attributes are full URLs - if let Some(link_attr_href_value) = get_node_attr(node, "href") { - let href_full_url = - resolve_url(&url, link_attr_href_value).unwrap_or_default(); - set_node_attr(node, "href", Some(href_full_url)); - } + } else if link_type == "preload" || link_type == "dns-prefetch" { + // Since all resources are embedded as data URLs, preloading and prefetching are not necessary + set_node_attr(node, "rel", None); + } else { + // Make sure that all other LINKs' href attributes are full URLs + if let Some(link_attr_href_value) = get_node_attr(node, "href") { + let href_full_url: Url = + resolve_url(&document_url, &link_attr_href_value); + set_node_attr(node, "href", Some(href_full_url.to_string())); } } } "base" => { - if is_http_url(url) { + if document_url.scheme() == "http" || document_url.scheme() == "https" { // Ensure the BASE node doesn't have a relative URL if let Some(base_attr_href_value) = get_node_attr(node, "href") { - let href_full_url = - resolve_url(&url, base_attr_href_value).unwrap_or_default(); - set_node_attr(node, "href", Some(href_full_url)); + let href_full_url: Url = + resolve_url(document_url, &base_attr_href_value); + set_node_attr(node, "href", Some(href_full_url.to_string())); } } } @@ -726,46 +706,16 @@ pub fn walk_and_embed_assets( set_node_attr(node, "background", None); if !options.no_images && !body_attr_background_value.is_empty() { - let background_full_url = - resolve_url(&url, body_attr_background_value).unwrap_or_default(); - let background_url_fragment = - get_url_fragment(background_full_url.clone()); - match retrieve_asset( + retrieve_and_embed_asset( cache, client, - &url, - &background_full_url, + document_url, + node, + "background", + &body_attr_background_value, options, - depth + 1, - ) { - Ok(( - background_data, - background_final_url, - background_media_type, - )) => { - let background_data_url = data_to_data_url( - &background_media_type, - &background_data, - &background_final_url, - ); - // Convert background attribute to data URL - let assembled_url: String = url_with_fragment( - background_data_url.as_str(), - background_url_fragment.as_str(), - ); - set_node_attr(node, "background", Some(assembled_url)); - } - Err(_) => { - // Keep remote reference if unable to retrieve the asset - if is_http_url(background_full_url.clone()) { - let assembled_url: String = url_with_fragment( - background_full_url.as_str(), - background_url_fragment.as_str(), - ); - set_node_attr(node, "background", Some(assembled_url)); - } - } - } + depth, + ); } } } @@ -793,63 +743,39 @@ pub fn walk_and_embed_assets( set_node_attr(node, "src", Some(str!())); } else { // Add data URL src attribute - let img_full_url = resolve_url( - &url, - if !img_attr_data_src_value - .clone() - .unwrap_or_default() - .is_empty() - { - img_attr_data_src_value.unwrap_or_default() - } else { - img_attr_src_value.unwrap_or_default() - }, - ) - .unwrap_or_default(); - let img_url_fragment = get_url_fragment(img_full_url.clone()); - - match retrieve_asset( + let img_full_url: String = if !img_attr_data_src_value + .clone() + .unwrap_or_default() + .is_empty() + { + img_attr_data_src_value.unwrap_or_default() + } else { + img_attr_src_value.unwrap_or_default() + }; + retrieve_and_embed_asset( cache, client, - &url, + document_url, + node, + "src", &img_full_url, options, - depth + 1, - ) { - Ok((img_data, img_final_url, img_media_type)) => { - let img_data_url = data_to_data_url( - &img_media_type, - &img_data, - &img_final_url, - ); - let assembled_url: String = url_with_fragment( - img_data_url.as_str(), - img_url_fragment.as_str(), - ); - set_node_attr(node, "src", Some(assembled_url)); - } - Err(_) => { - if is_http_url(img_full_url.clone()) { - // Keep remote reference if unable to retrieve the asset - let assembled_url: String = url_with_fragment( - img_full_url.as_str(), - img_url_fragment.as_str(), - ); - set_node_attr(node, "src", Some(assembled_url)); - } else { - // Don't keep original reference if it's not a remote target - set_node_attr(node, "src", None); - } - } - } + depth, + ); } } // Resolve srcset attribute if let Some(img_srcset) = get_node_attr(node, "srcset") { if !img_srcset.is_empty() { - let resolved_srcset: String = - embed_srcset(cache, client, &url, &img_srcset, options, depth); + let resolved_srcset: String = embed_srcset( + cache, + client, + &document_url, + &img_srcset, + options, + depth, + ); set_node_attr(node, "srcset", Some(resolved_srcset)); } } @@ -871,46 +797,16 @@ pub fn walk_and_embed_assets( }; set_node_attr(node, "src", Some(value)); } else { - let input_image_full_url = - resolve_url(&url, input_attr_src_value).unwrap_or_default(); - let input_image_url_fragment = - get_url_fragment(input_image_full_url.clone()); - match retrieve_asset( + retrieve_and_embed_asset( cache, client, - &url, - &input_image_full_url, + document_url, + node, + "src", + &input_attr_src_value, options, - depth + 1, - ) { - Ok(( - input_image_data, - input_image_final_url, - input_image_media_type, - )) => { - let input_image_data_url = data_to_data_url( - &input_image_media_type, - &input_image_data, - &input_image_final_url, - ); - // Add data URL src attribute - let assembled_url: String = url_with_fragment( - input_image_data_url.as_str(), - input_image_url_fragment.as_str(), - ); - set_node_attr(node, "src", Some(assembled_url)); - } - Err(_) => { - // Keep remote reference if unable to retrieve the asset - if is_http_url(input_image_full_url.clone()) { - let assembled_url: String = url_with_fragment( - input_image_full_url.as_str(), - input_image_url_fragment.as_str(), - ); - set_node_attr(node, "src", Some(assembled_url)); - } - } - } + depth, + ); } } } @@ -933,40 +829,16 @@ pub fn walk_and_embed_assets( } if !options.no_images && !image_href.is_empty() { - let image_full_url = resolve_url(&url, image_href).unwrap_or_default(); - let image_url_fragment = get_url_fragment(image_full_url.clone()); - match retrieve_asset( + retrieve_and_embed_asset( cache, client, - &url, - &image_full_url, + document_url, + node, + "href", + &image_href, options, - depth + 1, - ) { - Ok((image_data, image_final_url, image_media_type)) => { - let image_data_url = data_to_data_url( - &image_media_type, - &image_data, - &image_final_url, - ); - // Add new data URL href attribute - let assembled_url: String = url_with_fragment( - image_data_url.as_str(), - image_url_fragment.as_str(), - ); - set_node_attr(node, "href", Some(assembled_url)); - } - Err(_) => { - // Keep remote reference if unable to retrieve the asset - if is_http_url(image_full_url.clone()) { - let assembled_url: String = url_with_fragment( - image_full_url.as_str(), - image_url_fragment.as_str(), - ); - set_node_attr(node, "href", Some(assembled_url)); - } - } - } + depth, + ); } } "source" => { @@ -978,87 +850,31 @@ pub fn walk_and_embed_assets( if options.no_audio { set_node_attr(node, "src", None); } else { - let src_full_url: String = - resolve_url(&url, source_attr_src_value.clone()) - .unwrap_or_else(|_| source_attr_src_value.to_string()); - let src_url_fragment = get_url_fragment(src_full_url.clone()); - match retrieve_asset( + retrieve_and_embed_asset( cache, client, - &url, - &src_full_url, + document_url, + node, + "src", + &source_attr_src_value, options, - depth + 1, - ) { - Ok((src_data, src_final_url, src_media_type)) => { - let src_data_url = data_to_data_url( - &src_media_type, - &src_data, - &src_final_url, - ); - let assembled_url: String = url_with_fragment( - src_data_url.as_str(), - src_url_fragment.as_str(), - ); - set_node_attr(node, "src", Some(assembled_url)); - } - Err(_) => { - if is_http_url(src_full_url.clone()) { - // Keep remote reference if unable to retrieve the asset - let assembled_url: String = url_with_fragment( - src_full_url.as_str(), - src_url_fragment.as_str(), - ); - set_node_attr(node, "src", Some(assembled_url)); - } else { - // Exclude non-remote URLs - set_node_attr(node, "src", None); - } - } - } + depth, + ); } } else if parent_node_name == "video" { if options.no_video { set_node_attr(node, "src", None); } else { - let src_full_url: String = - resolve_url(&url, source_attr_src_value.clone()) - .unwrap_or_else(|_| source_attr_src_value.to_string()); - let src_url_fragment = get_url_fragment(src_full_url.clone()); - match retrieve_asset( + retrieve_and_embed_asset( cache, client, - &url, - &src_full_url, + document_url, + node, + "src", + &source_attr_src_value, options, - depth + 1, - ) { - Ok((src_data, src_final_url, src_media_type)) => { - let src_data_url = data_to_data_url( - &src_media_type, - &src_data, - &src_final_url, - ); - let assembled_url: String = url_with_fragment( - src_data_url.as_str(), - src_url_fragment.as_str(), - ); - set_node_attr(node, "src", Some(assembled_url)); - } - Err(_) => { - if is_http_url(src_full_url.clone()) { - // Keep remote reference if unable to retrieve the asset - let assembled_url: String = url_with_fragment( - src_full_url.as_str(), - src_url_fragment.as_str(), - ); - set_node_attr(node, "src", Some(assembled_url)); - } else { - // Exclude non-remote URLs - set_node_attr(node, "src", None); - } - } - } + depth, + ); } } } @@ -1072,7 +888,7 @@ pub fn walk_and_embed_assets( let resolved_srcset: String = embed_srcset( cache, client, - &url, + &document_url, &source_attr_srcset_value, options, depth, @@ -1085,35 +901,30 @@ pub fn walk_and_embed_assets( } "a" | "area" => { if let Some(anchor_attr_href_value) = get_node_attr(node, "href") { - if options.no_js - && anchor_attr_href_value - .clone() - .trim() - .starts_with("javascript:") + if anchor_attr_href_value + .clone() + .trim() + .starts_with("javascript:") { - // Replace with empty JS call to preserve original behavior - set_node_attr(node, "href", Some(str!("javascript:;"))); + if options.no_js { + // Replace with empty JS call to preserve original behavior + set_node_attr(node, "href", Some(str!("javascript:;"))); + } } else if anchor_attr_href_value.clone().starts_with('#') - || url_has_protocol(anchor_attr_href_value.clone()) + || is_url_and_has_protocol(&anchor_attr_href_value.clone()) { - // Don't touch email links or hrefs which begin with a hash + // Don't touch mailto: links or hrefs which begin with a hash sign } else { - let href_full_url = - resolve_url(&url, anchor_attr_href_value).unwrap_or_default(); - set_node_attr(node, "href", Some(href_full_url)); + let href_full_url: Url = + resolve_url(document_url, &anchor_attr_href_value); + set_node_attr(node, "href", Some(href_full_url.to_string())); } } } "script" => { // Read values of integrity and src attributes - let script_attr_integrity: Option = get_node_attr(node, "integrity"); let script_attr_src: Option = get_node_attr(node, "src"); - // Wipe integrity attribute - if script_attr_integrity != None { - set_node_attr(node, "integrity", None); - } - if options.no_js { // Empty inner content node.children.borrow_mut().clear(); @@ -1122,52 +933,16 @@ pub fn walk_and_embed_assets( set_node_attr(node, "src", None); } } else if !script_attr_src.clone().unwrap_or_default().is_empty() { - let script_full_url = - resolve_url(&url, script_attr_src.unwrap_or_default()) - .unwrap_or_default(); - match retrieve_asset( + retrieve_and_embed_asset( cache, client, - &url, - &script_full_url, + document_url, + node, + "src", + &script_attr_src.unwrap_or_default(), options, - depth + 1, - ) { - Ok((script_data, script_final_url, _script_media_type)) => { - let mut ok_to_include = true; - - // Check integrity - if let Some(script_attr_integrity_value) = script_attr_integrity { - if !script_attr_integrity_value.is_empty() { - ok_to_include = check_integrity( - &script_data, - &script_attr_integrity_value, - ); - } - } - - if ok_to_include { - // Only embed if we're able to validate integrity - let script_data_url = data_to_data_url( - "application/javascript", - &script_data, - &script_final_url, - ); - set_node_attr(node, "src", Some(script_data_url)); - } else { - set_node_attr(node, "src", None); - } - } - Err(_) => { - if is_http_url(script_full_url.clone()) { - // Keep remote reference if unable to retrieve the asset - set_node_attr(node, "src", Some(script_full_url)); - } else { - // Remove src attribute if target is not remote - set_node_attr(node, "src", None); - } - } - }; + depth, + ); } } "style" => { @@ -1181,7 +956,7 @@ pub fn walk_and_embed_assets( let replacement = embed_css( cache, client, - &url, + &document_url, tendril.as_ref(), options, depth, @@ -1195,11 +970,9 @@ pub fn walk_and_embed_assets( "form" => { if let Some(form_attr_action_value) = get_node_attr(node, "action") { // Modify action property to ensure it's a full URL - if !is_http_url(form_attr_action_value.clone()) { - let form_action_full_url = - resolve_url(&url, form_attr_action_value).unwrap_or_default(); - set_node_attr(node, "action", Some(form_action_full_url)); - } + let form_action_full_url: Url = + resolve_url(document_url, &form_attr_action_value); + set_node_attr(node, "action", Some(form_action_full_url.to_string())); } } "frame" | "iframe" => { @@ -1208,154 +981,57 @@ pub fn walk_and_embed_assets( // Empty the src attribute set_node_attr(node, "src", Some(str!())); } else { - let frame_src = frame_attr_src_value.trim(); - // Ignore (i)frames with empty source (they cause infinite loops) - if !frame_src.is_empty() { - let frame_full_url = - resolve_url(&url, frame_src).unwrap_or_default(); - let frame_url_fragment = get_url_fragment(frame_full_url.clone()); - match retrieve_asset( + if !frame_attr_src_value.trim().is_empty() { + retrieve_and_embed_asset( cache, client, - &url, - &frame_full_url, + &document_url, + node, + "href", + &frame_attr_src_value, options, - depth + 1, - ) { - Ok((frame_data, frame_final_url, frame_media_type)) => { - let frame_dom = - html_to_dom(&String::from_utf8_lossy(&frame_data)); - walk_and_embed_assets( - cache, - client, - &frame_final_url, - &frame_dom.document, - &options, - depth + 1, - ); - let mut frame_data: Vec = Vec::new(); - serialize( - &mut frame_data, - &frame_dom.document, - SerializeOpts::default(), - ) - .unwrap(); - let frame_data_url = data_to_data_url( - &frame_media_type, - &frame_data, - &frame_final_url, - ); - let assembled_url: String = url_with_fragment( - frame_data_url.as_str(), - frame_url_fragment.as_str(), - ); - set_node_attr(node, "src", Some(assembled_url)); - } - Err(_) => { - // Keep remote reference if unable to retrieve the asset - if is_http_url(frame_full_url.clone()) { - let assembled_url: String = url_with_fragment( - frame_full_url.as_str(), - frame_url_fragment.as_str(), - ); - set_node_attr(node, "src", Some(assembled_url)); - } - } - } + depth, + ); } } } } "audio" => { + // Embed audio source if let Some(audio_attr_src_value) = get_node_attr(node, "src") { if options.no_audio { set_node_attr(node, "src", None); } else { - let src_full_url: String = - resolve_url(&url, audio_attr_src_value.clone()) - .unwrap_or_else(|_| audio_attr_src_value.to_string()); - let src_url_fragment = get_url_fragment(src_full_url.clone()); - match retrieve_asset( + retrieve_and_embed_asset( cache, client, - &url, - &src_full_url, + document_url, + node, + "src", + &audio_attr_src_value, options, - depth + 1, - ) { - Ok((src_data, src_final_url, src_media_type)) => { - let src_data_url = data_to_data_url( - &src_media_type, - &src_data, - &src_final_url, - ); - let assembled_url: String = url_with_fragment( - src_data_url.as_str(), - src_url_fragment.as_str(), - ); - set_node_attr(node, "src", Some(assembled_url)); - } - Err(_) => { - if is_http_url(src_full_url.clone()) { - // Keep remote reference if unable to retrieve the asset - let assembled_url: String = url_with_fragment( - src_full_url.as_str(), - src_url_fragment.as_str(), - ); - set_node_attr(node, "src", Some(assembled_url)); - } else { - // Exclude non-remote URLs - set_node_attr(node, "src", None); - } - } - } + depth, + ); } } } "video" => { + // Embed video source if let Some(video_attr_src_value) = get_node_attr(node, "src") { if options.no_video { set_node_attr(node, "src", None); } else { - let src_full_url: String = - resolve_url(&url, video_attr_src_value.clone()) - .unwrap_or_else(|_| video_attr_src_value.to_string()); - let src_url_fragment = get_url_fragment(src_full_url.clone()); - match retrieve_asset( + retrieve_and_embed_asset( cache, client, - &url, - &src_full_url, + document_url, + node, + "src", + &video_attr_src_value, options, - depth + 1, - ) { - Ok((src_data, src_final_url, src_media_type)) => { - let src_data_url = data_to_data_url( - &src_media_type, - &src_data, - &src_final_url, - ); - let assembled_url: String = url_with_fragment( - src_data_url.as_str(), - src_url_fragment.as_str(), - ); - set_node_attr(node, "src", Some(assembled_url)); - } - Err(_) => { - if is_http_url(src_full_url.clone()) { - // Keep remote reference if unable to retrieve the asset - let assembled_url: String = url_with_fragment( - src_full_url.as_str(), - src_url_fragment.as_str(), - ); - set_node_attr(node, "src", Some(assembled_url)); - } else { - // Exclude non-remote URLs - set_node_attr(node, "src", None); - } - } - } + depth, + ); } } @@ -1366,48 +1042,16 @@ pub fn walk_and_embed_assets( if options.no_images { set_node_attr(node, "poster", Some(str!(empty_image!()))); } else { - let video_poster_full_url = - resolve_url(&url, video_attr_poster_value).unwrap_or_default(); - let video_poster_url_fragment = - get_url_fragment(video_poster_full_url.clone()); - match retrieve_asset( + retrieve_and_embed_asset( cache, client, - &url, - &video_poster_full_url, + document_url, + node, + "poster", + &video_attr_poster_value, options, - depth + 1, - ) { - Ok(( - video_poster_data, - video_poster_final_url, - video_poster_media_type, - )) => { - let video_poster_data_url = data_to_data_url( - &video_poster_media_type, - &video_poster_data, - &video_poster_final_url, - ); - let assembled_url: String = url_with_fragment( - video_poster_data_url.as_str(), - video_poster_url_fragment.as_str(), - ); - set_node_attr(node, "poster", Some(assembled_url)); - } - Err(_) => { - if is_http_url(video_poster_full_url.clone()) { - // Keep remote reference if unable to retrieve the asset - let assembled_url: String = url_with_fragment( - video_poster_full_url.as_str(), - video_poster_url_fragment.as_str(), - ); - set_node_attr(node, "poster", Some(assembled_url)); - } else { - // Get rid of poster attribute if the URL is not remote - set_node_attr(node, "poster", None); - } - } - } + depth, + ); } } } @@ -1424,7 +1068,7 @@ pub fn walk_and_embed_assets( walk_and_embed_assets( cache, client, - &url, + &document_url, &noscript_contents_dom.document, &options, depth, @@ -1458,12 +1102,19 @@ pub fn walk_and_embed_assets( } else { // Embed URLs found within the style attribute of this node if let Some(node_attr_style_value) = get_node_attr(node, "style") { - let embedded_style = - embed_css(cache, client, &url, &node_attr_style_value, options, depth); + let embedded_style = embed_css( + cache, + client, + &document_url, + &node_attr_style_value, + options, + depth, + ); set_node_attr(node, "style", Some(embedded_style)); } } + // Strip all JS from document if options.no_js { let attrs_mut = &mut attrs.borrow_mut(); // Get rid of JS event attributes @@ -1481,7 +1132,7 @@ pub fn walk_and_embed_assets( // Dig deeper for child in node.children.borrow().iter() { - walk_and_embed_assets(cache, client, &url, child, options, depth); + walk_and_embed_assets(cache, client, &document_url, child, options, depth); } } _ => { diff --git a/src/main.rs b/src/main.rs index 0f1e243..d04fba6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,21 +1,19 @@ use reqwest::blocking::Client; use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT}; use std::collections::HashMap; -use std::env; use std::fs; use std::io::{self, prelude::*, Error, Write}; use std::path::Path; use std::process; use std::time::Duration; +use url::Url; use monolith::html::{ add_favicon, create_metadata_tag, get_base_url, has_favicon, html_to_dom, set_base_url, stringify_document, walk_and_embed_assets, }; use monolith::opts::Options; -use monolith::url::{ - data_to_data_url, is_data_url, is_file_url, is_http_url, parse_data_url, resolve_url, -}; +use monolith::url::{data_to_data_url, parse_data_url, resolve_url}; use monolith::utils::retrieve_asset; mod macros; @@ -50,62 +48,87 @@ impl Output { pub fn read_stdin() -> String { let mut buffer = String::new(); + for line in io::stdin().lock().lines() { buffer += line.unwrap_or_default().as_str(); buffer += "\n"; } + buffer } fn main() { let options = Options::from_args(); - let original_target: &str = &options.target; - let target_url: &str; - let mut base_url: String; - let mut dom; - let mut use_stdin: bool = false; + let mut target: String = str!(&options.target.clone()); - // Pre-process the input - let cwd_normalized: String = - str!(env::current_dir().unwrap().to_str().unwrap()).replace("\\", "/"); - let path = Path::new(original_target); - let mut target: String = str!(original_target.clone()).replace("\\", "/"); - let path_is_relative: bool = path.is_relative(); - - // Determine exact target URL - if target.clone().len() == 0 { + // Check if target was provided + if target.len() == 0 { if !options.silent { eprintln!("No target specified"); } process::exit(1); - } else if target.clone() == "-" { + } + + let target_url: Url; + let mut base_url: Url; + let mut use_stdin: bool = false; + + // Determine exact target URL + if target.clone() == "-" { // Read from pipe (stdin) use_stdin = true; - // Default target URL to empty data URL; the user can control it via --base-url - target_url = "data:text/html," - } else if is_http_url(target.clone()) || is_data_url(target.clone()) { - target_url = target.as_str(); - } else if is_file_url(target.clone()) { - target_url = target.as_str(); - } else if path.exists() { - if !path.is_file() { - if !options.silent { - eprintln!("Local target is not a file: {}", original_target); + // Set default target URL to an empty data URL; the user can control it via --base-url + target_url = Url::parse("data:text/html,").unwrap(); + } else { + match Url::parse(&target.clone()) { + Ok(parsed_url) => { + if parsed_url.scheme() == "data" + || parsed_url.scheme() == "file" + || (parsed_url.scheme() == "http" || parsed_url.scheme() == "https") + { + target_url = parsed_url; + } else { + if !options.silent { + eprintln!("Unsupported target URL type: {}", &parsed_url.scheme()); + } + process::exit(1); + } + } + Err(_err) => { + // Failed to parse given base URL, + // perhaps it's a filesystem path? + let path: &Path = Path::new(&target); + + if path.exists() { + if path.is_file() { + match Url::from_file_path(fs::canonicalize(&path).unwrap()) { + Ok(file_url) => { + target_url = file_url; + } + Err(_err) => { + if !options.silent { + eprintln!( + "Could not generate file URL out of given path: {}", + "err" + ); + } + process::exit(1); + } + } + } else { + if !options.silent { + eprintln!("Local target is not a file: {}", &options.target); + } + process::exit(1); + } + } else { + // Last chance, now we do what browsers do: + // prepend "http://" and hope it points to a website + target.insert_str(0, "http://"); + target_url = Url::parse(&target).unwrap(); + } } - process::exit(1); - } - target.insert_str(0, if cfg!(windows) { "file:///" } else { "file://" }); - if path_is_relative { - target.insert_str(if cfg!(windows) { 8 } else { 7 }, &cwd_normalized); - target.insert_str( - if cfg!(windows) { 8 } else { 7 } + &cwd_normalized.len(), - "/", - ); } - target_url = target.as_str(); - } else { - target.insert_str(0, "http://"); - target_url = target.as_str(); } // Define output @@ -123,7 +146,7 @@ fn main() { let timeout: u64 = if options.timeout > 0 { options.timeout } else { - std::u64::MAX / 4 + std::u64::MAX / 4 // This is pretty close to infinity }; let client = Client::builder() .timeout(Duration::from_secs(timeout)) @@ -133,13 +156,17 @@ fn main() { .expect("Failed to initialize HTTP client"); // At this stage we assume that the base URL is the same as the target URL - base_url = str!(target_url); + base_url = target_url.clone(); + + let mut dom; // Retrieve target document if use_stdin { dom = html_to_dom(&read_stdin()); - } else if is_file_url(target_url) || is_http_url(target_url) { - match retrieve_asset(&mut cache, &client, target_url, target_url, &options, 0) { + } else if target_url.scheme() == "file" + || (target_url.scheme() == "http" || target_url.scheme() == "https") + { + match retrieve_asset(&mut cache, &client, &target_url, &target_url, &options, 0) { Ok((data, final_url, _media_type)) => { if options.base_url.clone().unwrap_or(str!()).is_empty() { base_url = final_url @@ -153,61 +180,97 @@ fn main() { process::exit(1); } } - } else if is_data_url(target_url) { - let (media_type, data): (String, Vec) = parse_data_url(target_url); + } else if target_url.scheme() == "data" { + let (media_type, data): (String, Vec) = parse_data_url(&target_url); + if !media_type.eq_ignore_ascii_case("text/html") { if !options.silent { eprintln!("Unsupported data URL media type"); } process::exit(1); } + dom = html_to_dom(&String::from_utf8_lossy(&data)); } else { process::exit(1); } // Use custom base URL if specified, read and use what's in the DOM otherwise - if !options.base_url.clone().unwrap_or(str!()).is_empty() { - if is_data_url(options.base_url.clone().unwrap()) { - if !options.silent { - eprintln!("Data URLs cannot be used as base URL"); - } - process::exit(1); - } else { - base_url = options.base_url.clone().unwrap(); + let b: String = options.base_url.clone().unwrap_or(str!()); + if b.is_empty() { + // No custom base URL is specified, + // try to see if the document has BASE tag + if let Some(existing_base_url) = get_base_url(&dom.document) { + base_url = resolve_url(&target_url, &existing_base_url); } } else { - if let Some(existing_base_url) = get_base_url(&dom.document) { - base_url = resolve_url(target_url, existing_base_url).unwrap(); + // Custom base URL provided + match Url::parse(&b) { + Ok(parsed_url) => { + if parsed_url.scheme() == "file" { + // File base URLs can only work with + // documents saved from filesystem + if target_url.scheme() == "file" { + base_url = parsed_url; + } + } else { + base_url = parsed_url; + } + } + Err(_) => { + // Failed to parse given base URL, + // perhaps it's a filesystem path? + if target_url.scheme() == "file" { + // Relative paths could work for documents saved from filesystem + let path: &Path = Path::new(&b); + if path.exists() { + match Url::from_file_path(fs::canonicalize(&path).unwrap()) { + Ok(file_url) => { + base_url = file_url; + } + Err(_) => { + if !options.silent { + eprintln!("Could not map given path to base URL: {}", b); + } + process::exit(1); + } + } + } + } + } } } // Embed remote assets walk_and_embed_assets(&mut cache, &client, &base_url, &dom.document, &options, 0); - // Update or add new BASE tag to reroute network requests and hash-links in the final document + // Update or add new BASE tag to reroute network requests + // and hash-links in the final document if let Some(new_base_url) = options.base_url.clone() { dom = set_base_url(&dom.document, new_base_url); } // Request and embed /favicon.ico (unless it's already linked in the document) - if !options.no_images && is_http_url(target_url) && !has_favicon(&dom.document) { - let favicon_ico_url: String = resolve_url(&base_url, "/favicon.ico").unwrap(); + if !options.no_images + && (target_url.scheme() == "http" || target_url.scheme() == "https") + && !has_favicon(&dom.document) + { + let favicon_ico_url: Url = resolve_url(&base_url, "/favicon.ico"); match retrieve_asset( &mut cache, &client, - &base_url, + &target_url, &favicon_ico_url, &options, 0, ) { Ok((data, final_url, media_type)) => { - let favicon_data_url: String = data_to_data_url(&media_type, &data, &final_url); - dom = add_favicon(&dom.document, favicon_data_url); + let favicon_data_url: Url = data_to_data_url(&media_type, &data, &final_url); + dom = add_favicon(&dom.document, favicon_data_url.to_string()); } Err(_) => { - // Failed to retrieve favicon.ico + // Failed to retrieve /favicon.ico } } } diff --git a/src/tests/cli/basic.rs b/src/tests/cli/basic.rs index be22213..60ae9da 100644 --- a/src/tests/cli/basic.rs +++ b/src/tests/cli/basic.rs @@ -9,9 +9,10 @@ mod passing { use assert_cmd::prelude::*; use std::env; - use std::io::Write; + use std::fs; + use std::path::Path; use std::process::{Command, Stdio}; - use tempfile::NamedTempFile; + use url::Url; #[test] fn print_version() -> Result<(), Box> { @@ -58,48 +59,37 @@ mod passing { #[test] fn css_import_string() -> Result<(), Box> { - let file_url_prefix: &str = if cfg!(windows) { "file:///" } else { "file://" }; let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; - let mut file_css = NamedTempFile::new()?; - writeln!(file_css, "body{{background-color:#000;color:#fff}}")?; - let mut file_html = NamedTempFile::new()?; - writeln!( - file_html, - "\ - \n\ - ", - file = file_url_prefix, - css_path = str!(file_css.path().to_str().unwrap()).replace("\\", "/"), - )?; - let out = cmd.arg("-M").arg(file_html.path()).output().unwrap(); + let path_html: &Path = Path::new("src/tests/data/css/index.html"); + let path_css: &Path = Path::new("src/tests/data/css/style.css"); + + assert!(path_html.is_file()); + assert!(path_css.is_file()); + + let out = cmd.arg("-M").arg(path_html.as_os_str()).output().unwrap(); // STDOUT should contain embedded CSS url()'s assert_eq!( std::str::from_utf8(&out.stdout).unwrap(), - "\n\n\n" + "\n\n" ); - // STDERR should list temporary files that got retrieved + // STDERR should list files that got retrieved assert_eq!( std::str::from_utf8(&out.stderr).unwrap(), format!( "\ - {file}{html_path}\n \ - {file}{css_path}\n \ - {file}{css_path}\n \ - {file}{css_path}\n\ + {file_url_html}\n \ + {file_url_css}\n \ + {file_url_css}\n \ + {file_url_css}\n\ ", - file = file_url_prefix, - html_path = str!(file_html.path().to_str().unwrap()).replace("\\", "/"), - css_path = str!(file_css.path().to_str().unwrap()).replace("\\", "/"), + file_url_html = Url::from_file_path(fs::canonicalize(&path_html).unwrap()) + .unwrap() + .into_string(), + file_url_css = Url::from_file_path(fs::canonicalize(&path_css).unwrap()) + .unwrap() + .into_string(), ) ); diff --git a/src/tests/cli/data_url.rs b/src/tests/cli/data_url.rs index 2c23283..d1b255a 100644 --- a/src/tests/cli/data_url.rs +++ b/src/tests/cli/data_url.rs @@ -220,7 +220,7 @@ mod passing { // STDOUT should contain HTML with no JS in it assert_eq!( std::str::from_utf8(&out.stdout).unwrap(), - "\n" + "\n" ); // STDERR should be empty diff --git a/src/tests/cli/local_files.rs b/src/tests/cli/local_files.rs index fc0c4b1..9603325 100644 --- a/src/tests/cli/local_files.rs +++ b/src/tests/cli/local_files.rs @@ -9,12 +9,13 @@ mod passing { use assert_cmd::prelude::*; use std::env; - use std::io::Write; + use std::fs; + use std::path::Path; use std::process::Command; - use tempfile::NamedTempFile; + use url::Url; #[test] - fn local_file_target_input() -> Result<(), Box> { + fn local_file_target_input_relative_target_path() -> Result<(), Box> { let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; let cwd_normalized: String = str!(env::current_dir().unwrap().to_str().unwrap()).replace("\\", "/"); @@ -36,7 +37,7 @@ mod passing { \n \ \n \ Local HTML file\n \ - \n \ + \n \ \n\n\n\n \ \"\"\n \ Tricky href\n \ @@ -46,13 +47,15 @@ mod passing { " ); - // STDERR should contain list of retrieved file URLs + // STDERR should contain list of retrieved file URLs, two missing assert_eq!( std::str::from_utf8(&out.stderr).unwrap(), format!( "\ {file}{cwd}/src/tests/data/basic/local-file.html\n \ {file}{cwd}/src/tests/data/basic/local-style.css\n \ + {file}{cwd}/src/tests/data/basic/local-style-does-not-exist.css (not found)\n \ + {file}{cwd}/src/tests/data/basic/monolith.png (not found)\n \ {file}{cwd}/src/tests/data/basic/local-script.js\n\ ", file = file_url_protocol, @@ -68,26 +71,15 @@ mod passing { #[test] fn local_file_target_input_absolute_target_path() -> Result<(), Box> { - let cwd = env::current_dir().unwrap(); - let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/"); let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let path_html: &Path = Path::new("src/tests/data/basic/local-file.html"); + let out = cmd .arg("-M") - .arg("-jciI") - .arg(if cfg!(windows) { - format!( - "{cwd}\\src\\tests\\data\\basic\\local-file.html", - cwd = cwd.to_str().unwrap() - ) - } else { - format!( - "{cwd}/src/tests/data/basic/local-file.html", - cwd = cwd.to_str().unwrap() - ) - }) + .arg("-Ijci") + .arg(path_html.as_os_str()) .output() .unwrap(); - let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" }; // STDOUT should contain HTML from the local file assert_eq!( @@ -114,9 +106,10 @@ mod passing { assert_eq!( std::str::from_utf8(&out.stderr).unwrap(), format!( - "{file}{cwd}/src/tests/data/basic/local-file.html\n", - file = file_url_protocol, - cwd = cwd_normalized, + "{file_url_html}\n", + file_url_html = Url::from_file_path(fs::canonicalize(&path_html).unwrap()) + .unwrap() + .into_string(), ) ); @@ -175,19 +168,11 @@ mod passing { // STDERR should contain list of retrieved file URLs assert_eq!( std::str::from_utf8(&out.stderr).unwrap(), - if cfg!(windows) { - format!( - "{file}{cwd}/src/tests/data/basic/local-file.html\n", - file = file_url_protocol, - cwd = cwd_normalized, - ) - } else { - format!( - "{file}{cwd}/src/tests/data/basic/local-file.html\n", - file = file_url_protocol, - cwd = cwd_normalized, - ) - } + format!( + "{file}{cwd}/src/tests/data/basic/local-file.html\n", + file = file_url_protocol, + cwd = cwd_normalized, + ) ); // The exit code should be 0 @@ -199,40 +184,97 @@ mod passing { #[test] fn embed_file_url_local_asset_within_style_attribute() -> Result<(), Box> { - let file_url_prefix: &str = if cfg!(windows) { "file:///" } else { "file://" }; let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; - let mut file_svg = NamedTempFile::new()?; - writeln!(file_svg, "\ - \ - \ - SVG\ - \n")?; - let mut file_html = NamedTempFile::new()?; - writeln!( - file_html, - "
\n", - file = file_url_prefix, - path = str!(file_svg.path().to_str().unwrap()).replace("\\", "/"), - )?; - let out = cmd.arg("-M").arg(file_html.path()).output().unwrap(); + let path_html: &Path = Path::new("src/tests/data/svg/index.html"); + let path_svg: &Path = Path::new("src/tests/data/svg/image.svg"); + + let out = cmd.arg("-M").arg(path_html.as_os_str()).output().unwrap(); // STDOUT should contain HTML with date URL for background-image in it assert_eq!( std::str::from_utf8(&out.stdout).unwrap(), - "
body {}"; assert_eq!( - css::embed_css(cache, &client, "file:///", &CSS, &options, 0), + css::embed_css(cache, &client, &document_url, &CSS, &options, 0), CSS ); } @@ -135,6 +130,7 @@ mod passing { fn attribute_selectors() { let cache = &mut HashMap::new(); let client = Client::new(); + let document_url: Url = Url::parse("https://doesntmatter.local/").unwrap(); let mut options = Options::default(); options.silent = true; @@ -143,38 +139,42 @@ mod passing { /* Attribute exists */ } - [data-value='foo'] { + [data-value=\"foo\"] { /* Attribute has this exact value */ } - [data-value*='foo'] { + [data-value*=\"foo\"] { /* Attribute value contains this value somewhere in it */ } - [data-value~='foo'] { + [data-value~=\"foo\"] { /* Attribute has this value in a space-separated list somewhere */ } - [data-value^='foo'] { + [data-value^=\"foo\"] { /* Attribute value starts with this */ } - [data-value|='foo'] { + [data-value|=\"foo\"] { /* Attribute value starts with this in a dash-separated list */ } - [data-value$='foo'] { + [data-value$=\"foo\"] { /* Attribute value ends with this */ } "; - assert_eq!(css::embed_css(cache, &client, "", &CSS, &options, 0), CSS); + assert_eq!( + css::embed_css(cache, &client, &document_url, &CSS, &options, 0), + CSS + ); } #[test] fn import_string() { let cache = &mut HashMap::new(); let client = Client::new(); + let document_url: Url = Url::parse("https://doesntmatter.local/").unwrap(); let mut options = Options::default(); options.silent = true; @@ -187,16 +187,9 @@ mod passing { "; assert_eq!( - css::embed_css( - cache, - &client, - "https://doesntmatter.local/", - &CSS, - &options, - 0, - ), + css::embed_css(cache, &client, &document_url, &CSS, &options, 0,), "\ - @charset 'UTF-8';\n\ + @charset \"UTF-8\";\n\ \n\ @import 'data:text/css;base64,aHRtbHtiYWNrZ3JvdW5kLWNvbG9yOiMwMDB9';\n\ \n\ @@ -209,6 +202,7 @@ mod passing { fn hash_urls() { let cache = &mut HashMap::new(); let client = Client::new(); + let document_url: Url = Url::parse("https://doesntmatter.local/").unwrap(); let mut options = Options::default(); options.silent = true; @@ -223,14 +217,7 @@ mod passing { "; assert_eq!( - css::embed_css( - cache, - &client, - "https://doesntmatter.local/", - &CSS, - &options, - 0, - ), + css::embed_css(cache, &client, &document_url, &CSS, &options, 0,), CSS ); } @@ -239,6 +226,7 @@ mod passing { fn transform_percentages_and_degrees() { let cache = &mut HashMap::new(); let client = Client::new(); + let document_url: Url = Url::parse("https://doesntmatter.local/").unwrap(); let mut options = Options::default(); options.silent = true; @@ -251,14 +239,7 @@ mod passing { "; assert_eq!( - css::embed_css( - cache, - &client, - "https://doesntmatter.local/", - &CSS, - &options, - 0, - ), + css::embed_css(cache, &client, &document_url, &CSS, &options, 0,), CSS ); } @@ -267,6 +248,7 @@ mod passing { fn unusual_indents() { let cache = &mut HashMap::new(); let client = Client::new(); + let document_url: Url = Url::parse("https://doesntmatter.local/").unwrap(); let mut options = Options::default(); options.silent = true; @@ -281,14 +263,7 @@ mod passing { "; assert_eq!( - css::embed_css( - cache, - &client, - "https://doesntmatter.local/", - &CSS, - &options, - 0, - ), + css::embed_css(cache, &client, &document_url, &CSS, &options, 0,), CSS ); } @@ -297,6 +272,7 @@ mod passing { fn exclude_fonts() { let cache = &mut HashMap::new(); let client = Client::new(); + let document_url: Url = Url::parse("https://doesntmatter.local/").unwrap(); let mut options = Options::default(); options.no_fonts = true; options.silent = true; @@ -320,30 +296,47 @@ mod passing { font-family: 'My Font' Verdana\n\ }\n\ "; - const CSS_OUT: &str = " \ \n\ \n\ #identifier {\n \ - font-family: 'My Font' Arial\n\ + font-family: \"My Font\" Arial\n\ }\n\ \n \ \n\ \n\ div {\n \ - font-family: 'My Font' Verdana\n\ + font-family: \"My Font\" Verdana\n\ }\n\ "; assert_eq!( - css::embed_css( - cache, - &client, - "https://doesntmatter.local/", - &CSS, - &options, - 0, - ), + css::embed_css(cache, &client, &document_url, &CSS, &options, 0,), + CSS_OUT + ); + } + + #[test] + fn content() { + let cache = &mut HashMap::new(); + let client = Client::new(); + let document_url: Url = Url::parse("data:,").unwrap(); + let mut options = Options::default(); + options.silent = true; + + const CSS: &str = "\ + #language a[href=\"#translations\"]:before {\n\ + content: url(data:,) \"\\A\";\n\ + white-space: pre }\n\ + "; + const CSS_OUT: &str = "\ + #language a[href=\"#translations\"]:before {\n\ + content: url('data:;base64,') \"\\a \";\n\ + white-space: pre }\n\ + "; + + assert_eq!( + css::embed_css(cache, &client, &document_url, &CSS, &options, 0,), CSS_OUT ); } diff --git a/src/tests/data/css/index.html b/src/tests/data/css/index.html new file mode 100644 index 0000000..973b232 --- /dev/null +++ b/src/tests/data/css/index.html @@ -0,0 +1,11 @@ + diff --git a/src/tests/data/css/style.css b/src/tests/data/css/style.css new file mode 100644 index 0000000..4838b18 --- /dev/null +++ b/src/tests/data/css/style.css @@ -0,0 +1 @@ +body{background-color:#000;color:#fff} diff --git a/src/tests/data/integrity/index.html b/src/tests/data/integrity/index.html index 080bc6c..e104c60 100644 --- a/src/tests/data/integrity/index.html +++ b/src/tests/data/integrity/index.html @@ -3,8 +3,6 @@ - - Local HTML file diff --git a/src/tests/data/svg/image.svg b/src/tests/data/svg/image.svg new file mode 100644 index 0000000..e181299 --- /dev/null +++ b/src/tests/data/svg/image.svg @@ -0,0 +1,5 @@ + + + + SVG + diff --git a/src/tests/data/svg/index.html b/src/tests/data/svg/index.html new file mode 100644 index 0000000..fb47711 --- /dev/null +++ b/src/tests/data/svg/index.html @@ -0,0 +1 @@ +
diff --git a/src/tests/html/create_metadata_tag.rs b/src/tests/html/create_metadata_tag.rs index ea59731..12a11d2 100644 --- a/src/tests/html/create_metadata_tag.rs +++ b/src/tests/html/create_metadata_tag.rs @@ -8,14 +8,15 @@ #[cfg(test)] mod passing { use chrono::prelude::*; + use reqwest::Url; use crate::html; #[test] fn http_url() { - let url = "http://192.168.1.1/"; + let url: Url = Url::parse("http://192.168.1.1/").unwrap(); let timestamp = Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true); - let metadata_comment: String = html::create_metadata_tag(url); + let metadata_comment: String = html::create_metadata_tag(&url); assert_eq!( metadata_comment, @@ -31,9 +32,9 @@ mod passing { #[test] fn file_url() { - let url = "file:///home/monolith/index.html"; + let url: Url = Url::parse("file:///home/monolith/index.html").unwrap(); let timestamp = Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true); - let metadata_comment: String = html::create_metadata_tag(url); + let metadata_comment: String = html::create_metadata_tag(&url); assert_eq!( metadata_comment, @@ -48,9 +49,9 @@ mod passing { #[test] fn data_url() { - let url = "data:text/html,Hello%2C%20World!"; + let url: Url = Url::parse("data:text/html,Hello%2C%20World!").unwrap(); let timestamp = Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true); - let metadata_comment: String = html::create_metadata_tag(url); + let metadata_comment: String = html::create_metadata_tag(&url); assert_eq!( metadata_comment, @@ -63,20 +64,3 @@ mod passing { ); } } - -// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗ -// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝ -// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗ -// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║ -// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝ -// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ - -#[cfg(test)] -mod failing { - use crate::html; - - #[test] - fn empty_string() { - assert_eq!(html::create_metadata_tag(""), ""); - } -} diff --git a/src/tests/html/embed_srcset.rs b/src/tests/html/embed_srcset.rs index 704bdfa..1f920cd 100644 --- a/src/tests/html/embed_srcset.rs +++ b/src/tests/html/embed_srcset.rs @@ -8,6 +8,7 @@ #[cfg(test)] mod passing { use reqwest::blocking::Client; + use reqwest::Url; use std::collections::HashMap; use crate::html; @@ -21,7 +22,14 @@ mod passing { let mut options = Options::default(); options.no_images = true; options.silent = true; - let embedded_css = html::embed_srcset(cache, &client, "", &srcset_value, &options, 0); + let embedded_css = html::embed_srcset( + cache, + &client, + &Url::parse("data:,").unwrap(), + &srcset_value, + &options, + 0, + ); assert_eq!( embedded_css, @@ -42,7 +50,14 @@ mod passing { let mut options = Options::default(); options.no_images = true; options.silent = true; - let embedded_css = html::embed_srcset(cache, &client, "", &srcset_value, &options, 0); + let embedded_css = html::embed_srcset( + cache, + &client, + &Url::parse("data:,").unwrap(), + &srcset_value, + &options, + 0, + ); assert_eq!( embedded_css, @@ -58,7 +73,14 @@ mod passing { let mut options = Options::default(); options.no_images = true; options.silent = true; - let embedded_css = html::embed_srcset(cache, &client, "", &srcset_value, &options, 0); + let embedded_css = html::embed_srcset( + cache, + &client, + &Url::parse("data:,").unwrap(), + &srcset_value, + &options, + 0, + ); assert_eq!( embedded_css, @@ -74,7 +96,14 @@ mod passing { let mut options = Options::default(); options.no_images = true; options.silent = true; - let embedded_css = html::embed_srcset(cache, &client, "", &srcset_value, &options, 0); + let embedded_css = html::embed_srcset( + cache, + &client, + &Url::parse("data:,").unwrap(), + &srcset_value, + &options, + 0, + ); assert_eq!( embedded_css, @@ -98,6 +127,7 @@ mod passing { #[cfg(test)] mod failing { use reqwest::blocking::Client; + use reqwest::Url; use std::collections::HashMap; use crate::html; @@ -111,7 +141,14 @@ mod failing { let mut options = Options::default(); options.no_images = true; options.silent = true; - let embedded_css = html::embed_srcset(cache, &client, "", &srcset_value, &options, 0); + let embedded_css = html::embed_srcset( + cache, + &client, + &Url::parse("data:,").unwrap(), + &srcset_value, + &options, + 0, + ); assert_eq!( embedded_css, diff --git a/src/tests/html/walk_and_embed_assets.rs b/src/tests/html/walk_and_embed_assets.rs index d16b438..855cc37 100644 --- a/src/tests/html/walk_and_embed_assets.rs +++ b/src/tests/html/walk_and_embed_assets.rs @@ -10,6 +10,7 @@ mod passing { use html5ever::serialize::{serialize, SerializeOpts}; use reqwest::blocking::Client; use std::collections::HashMap; + use url::Url; use crate::html; use crate::opts::Options; @@ -18,9 +19,9 @@ mod passing { fn basic() { let cache = &mut HashMap::new(); - let html = "

"; + let html: &str = "

"; let dom = html::html_to_dom(&html); - let url = "http://localhost"; + let url: Url = Url::parse("http://localhost").unwrap(); let mut options = Options::default(); options.silent = true; @@ -42,7 +43,7 @@ mod passing { fn ensure_no_recursive_iframe() { let html = "

"; let dom = html::html_to_dom(&html); - let url = "http://localhost"; + let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); let mut options = Options::default(); @@ -65,7 +66,7 @@ mod passing { fn ensure_no_recursive_frame() { let html = ""; let dom = html::html_to_dom(&html); - let url = "http://localhost"; + let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); let mut options = Options::default(); @@ -91,7 +92,7 @@ mod passing { \
"; let dom = html::html_to_dom(&html); - let url = "http://localhost"; + let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); let mut options = Options::default(); @@ -125,7 +126,7 @@ mod passing { let html = "\
"; let dom = html::html_to_dom(&html); - let url = "http://localhost"; + let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); let mut options = Options::default(); @@ -162,7 +163,7 @@ mod passing { let html = ""; let dom = html::html_to_dom(&html); - let url = "http://localhost"; + let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); let mut options = Options::default(); @@ -186,7 +187,7 @@ mod passing { fn no_frames() { let html = ""; let dom = html::html_to_dom(&html); - let url = "http://localhost"; + let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); let mut options = Options::default(); @@ -210,7 +211,7 @@ mod passing { fn no_iframes() { let html = ""; let dom = html::html_to_dom(&html); - let url = "http://localhost"; + let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); let mut options = Options::default(); @@ -237,7 +238,7 @@ mod passing { \
"; let dom = html::html_to_dom(&html); - let url = "http://localhost"; + let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); let mut options = Options::default(); @@ -258,37 +259,37 @@ mod passing { ); } - #[test] - fn discards_integrity() { - let html = "No integrity\ - \ - "; - let dom = html::html_to_dom(&html); - let url = "http://localhost"; - let cache = &mut HashMap::new(); - - let mut options = Options::default(); - options.no_css = true; - options.no_frames = true; - options.no_js = true; - options.no_images = true; - options.silent = true; - - let client = Client::new(); - - html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0); - - let mut buf: Vec = Vec::new(); - serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); - - assert_eq!( - buf.iter().map(|&c| c as char).collect::(), - "\ - No integrity\ - \ - " - ); - } + // #[test] + // fn discards_integrity() { + // let html = "No integrity\ + // \ + // "; + // let dom = html::html_to_dom(&html); + // let url: Url = Url::parse("http://localhost").unwrap(); + // let cache = &mut HashMap::new(); + + // let mut options = Options::default(); + // options.no_css = true; + // options.no_frames = true; + // options.no_js = true; + // options.no_images = true; + // options.silent = true; + + // let client = Client::new(); + + // html::walk_and_embed_assets(cache, &client, &url, &dom.document, &options, 0); + + // let mut buf: Vec = Vec::new(); + // serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); + + // assert_eq!( + // buf.iter().map(|&c| c as char).collect::(), + // "\ + // No integrity\ + // \ + // " + // ); + // } #[test] fn removes_unwanted_meta_tags() { @@ -300,7 +301,7 @@ mod passing { \ "; let dom = html::html_to_dom(&html); - let url = "http://localhost"; + let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); let mut options = Options::default(); @@ -339,7 +340,7 @@ mod passing { \ "; let dom = html::html_to_dom(&html); - let url = "http://localhost"; + let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); let mut options = Options::default(); diff --git a/src/tests/url/clean_url.rs b/src/tests/url/clean_url.rs index 4c1de79..1f31bd7 100644 --- a/src/tests/url/clean_url.rs +++ b/src/tests/url/clean_url.rs @@ -7,12 +7,23 @@ #[cfg(test)] mod passing { + use reqwest::Url; + use crate::url; + #[test] + fn preserve_original() { + let u: Url = Url::parse("https://somewhere.com/font.eot#iefix").unwrap(); + + url::clean_url(u.clone()); + + assert_eq!(u.as_str(), "https://somewhere.com/font.eot#iefix"); + } + #[test] fn removes_fragment() { assert_eq!( - url::clean_url("https://somewhere.com/font.eot#iefix"), + url::clean_url(Url::parse("https://somewhere.com/font.eot#iefix").unwrap()).as_str(), "https://somewhere.com/font.eot" ); } @@ -20,31 +31,31 @@ mod passing { #[test] fn removes_empty_fragment() { assert_eq!( - url::clean_url("https://somewhere.com/font.eot#"), + url::clean_url(Url::parse("https://somewhere.com/font.eot#").unwrap()).as_str(), "https://somewhere.com/font.eot" ); } #[test] - fn removes_empty_query_and_empty_fragment() { + fn removes_empty_fragment_and_keeps_empty_query() { assert_eq!( - url::clean_url("https://somewhere.com/font.eot?#"), - "https://somewhere.com/font.eot" + url::clean_url(Url::parse("https://somewhere.com/font.eot?#").unwrap()).as_str(), + "https://somewhere.com/font.eot?" ); } #[test] - fn removes_empty_query_amp_and_empty_fragment() { + fn removesempty_fragment_and_keeps_empty_query() { assert_eq!( - url::clean_url("https://somewhere.com/font.eot?a=b&#"), - "https://somewhere.com/font.eot?a=b" + url::clean_url(Url::parse("https://somewhere.com/font.eot?a=b&#").unwrap()).as_str(), + "https://somewhere.com/font.eot?a=b&" ); } #[test] fn keeps_credentials() { assert_eq!( - url::clean_url("https://cookie:monster@gibson.internet/"), + url::clean_url(Url::parse("https://cookie:monster@gibson.internet/").unwrap()).as_str(), "https://cookie:monster@gibson.internet/" ); } diff --git a/src/tests/url/data_to_data_url.rs b/src/tests/url/data_to_data_url.rs index f10e4a8..bac0126 100644 --- a/src/tests/url/data_to_data_url.rs +++ b/src/tests/url/data_to_data_url.rs @@ -7,16 +7,18 @@ #[cfg(test)] mod passing { + use reqwest::Url; + use crate::url; #[test] fn encode_string_with_specific_media_type() { let mime = "application/javascript"; let data = "var word = 'hello';\nalert(word);\n"; - let data_url = url::data_to_data_url(mime, data.as_bytes(), ""); + let data_url = url::data_to_data_url(mime, data.as_bytes(), &Url::parse("data:,").unwrap()); assert_eq!( - &data_url, + data_url.as_str(), "data:application/javascript;base64,dmFyIHdvcmQgPSAnaGVsbG8nOwphbGVydCh3b3JkKTsK" ); } @@ -24,8 +26,15 @@ mod passing { #[test] fn encode_append_fragment() { let data = "\n"; - let data_url = url::data_to_data_url("image/svg+xml", data.as_bytes(), ""); + let data_url = url::data_to_data_url( + "image/svg+xml", + data.as_bytes(), + &Url::parse("data:,").unwrap(), + ); - assert_eq!(&data_url, "data:image/svg+xml;base64,PHN2Zz48L3N2Zz4K"); + assert_eq!( + data_url.as_str(), + "data:image/svg+xml;base64,PHN2Zz48L3N2Zz4K" + ); } } diff --git a/src/tests/url/file_url_to_fs_path.rs b/src/tests/url/file_url_to_fs_path.rs deleted file mode 100644 index 6194e3f..0000000 --- a/src/tests/url/file_url_to_fs_path.rs +++ /dev/null @@ -1,41 +0,0 @@ -// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗ -// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝ -// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗ -// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║ -// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝ -// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ - -#[cfg(test)] -mod passing { - use crate::url; - - #[test] - fn remove_protocl_and_fragment() { - if cfg!(windows) { - assert_eq!( - url::file_url_to_fs_path("file:///C:/documents/some-path/some-file.svg#fragment"), - "C:\\documents\\some-path\\some-file.svg" - ); - } else { - assert_eq!( - url::file_url_to_fs_path("file:///tmp/some-path/some-file.svg#fragment"), - "/tmp/some-path/some-file.svg" - ); - } - } - - #[test] - fn decodes_urls() { - if cfg!(windows) { - assert_eq!( - url::file_url_to_fs_path("file:///C:/Documents%20and%20Settings/some-file.html"), - "C:\\Documents and Settings\\some-file.html" - ); - } else { - assert_eq!( - url::file_url_to_fs_path("file:///home/user/My%20Documents"), - "/home/user/My Documents" - ); - } - } -} diff --git a/src/tests/url/get_url_fragment.rs b/src/tests/url/get_url_fragment.rs deleted file mode 100644 index b1b130b..0000000 --- a/src/tests/url/get_url_fragment.rs +++ /dev/null @@ -1,48 +0,0 @@ -// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗ -// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝ -// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗ -// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║ -// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝ -// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ - -#[cfg(test)] -mod passing { - use crate::url; - - #[test] - fn data_url() { - assert_eq!( - url::get_url_fragment( - "data:image/svg+xml;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h#test" - ), - "test" - ); - } -} - -// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗ -// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝ -// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗ -// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║ -// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝ -// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ - -#[cfg(test)] -mod failing { - use crate::url; - - #[test] - fn https_empty() { - assert_eq!(url::get_url_fragment("https://kernel.org#"), ""); - } - - #[test] - fn no_fragment() { - assert_eq!(url::get_url_fragment("https://kernel.org"), ""); - } - - #[test] - fn dummy_data_url() { - assert_eq!(url::get_url_fragment("data:text/html,"), ""); - } -} diff --git a/src/tests/url/is_data_url.rs b/src/tests/url/is_data_url.rs deleted file mode 100644 index efd059c..0000000 --- a/src/tests/url/is_data_url.rs +++ /dev/null @@ -1,52 +0,0 @@ -// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗ -// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝ -// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗ -// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║ -// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝ -// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ - -#[cfg(test)] -mod passing { - use crate::url; - - #[test] - fn data_url_text_html() { - assert!(url::is_data_url( - "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h" - )); - } - - #[test] - fn data_url_no_media_type() { - assert!(url::is_data_url( - "data:;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h" - )); - } -} - -// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗ -// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝ -// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗ -// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║ -// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝ -// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ - -#[cfg(test)] -mod failing { - use crate::url; - - #[test] - fn https_url() { - assert!(!url::is_data_url("https://kernel.org")); - } - - #[test] - fn no_protocol_url() { - assert!(!url::is_data_url("//kernel.org")); - } - - #[test] - fn empty_string() { - assert!(!url::is_data_url("")); - } -} diff --git a/src/tests/url/is_file_url.rs b/src/tests/url/is_file_url.rs deleted file mode 100644 index 927b793..0000000 --- a/src/tests/url/is_file_url.rs +++ /dev/null @@ -1,83 +0,0 @@ -// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗ -// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝ -// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗ -// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║ -// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝ -// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ - -#[cfg(test)] -mod passing { - use crate::url; - - #[test] - fn unix_file_url() { - assert!(url::is_file_url( - "file:///home/user/Websites/my-website/index.html" - )); - } - - #[test] - fn windows_file_url() { - assert!(url::is_file_url( - "file:///C:/Documents%20and%20Settings/user/Websites/my-website/assets/images/logo.png" - )); - } - - #[test] - fn unix_url_with_backslashes() { - assert!(url::is_file_url( - "file:\\\\\\home\\user\\Websites\\my-website\\index.html" - )); - } - - #[test] - fn windows_file_url_with_backslashes() { - assert!(url::is_file_url( - "file:\\\\\\C:\\Documents%20and%20Settings\\user\\Websites\\my-website\\assets\\images\\logo.png" - )); - } -} - -// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗ -// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝ -// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗ -// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║ -// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝ -// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ - -#[cfg(test)] -mod failing { - use crate::url; - - #[test] - fn url_with_no_protocl() { - assert!(!url::is_file_url("//kernel.org")); - } - - #[test] - fn dot_slash_filename() { - assert!(!url::is_file_url("./index.html")); - } - - #[test] - fn just_filename() { - assert!(!url::is_file_url("some-local-page.htm")); - } - - #[test] - fn https_ip_port_url() { - assert!(!url::is_file_url("https://1.2.3.4:80/www/index.html")); - } - - #[test] - fn data_url() { - assert!(!url::is_file_url( - "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h" - )); - } - - #[test] - fn just_word_file() { - assert!(!url::is_file_url("file")); - } -} diff --git a/src/tests/url/is_http_url.rs b/src/tests/url/is_http_or_https_url.rs similarity index 72% rename from src/tests/url/is_http_url.rs rename to src/tests/url/is_http_or_https_url.rs index 622d340..1e0a579 100644 --- a/src/tests/url/is_http_url.rs +++ b/src/tests/url/is_http_or_https_url.rs @@ -7,21 +7,23 @@ #[cfg(test)] mod passing { + use reqwest::Url; + use crate::url; #[test] fn http_url() { - assert!(url::is_http_url("http://kernel.org")); + assert!(url::is_http_or_https_url(&Url::parse("http://kernel.org").unwrap())); } #[test] fn https_url() { - assert!(url::is_http_url("https://www.rust-lang.org/")); + assert!(url::is_http_or_https_url(&Url::parse("https://www.rust-lang.org/").unwrap())); } #[test] fn http_url_with_backslashes() { - assert!(url::is_http_url("http:\\\\freebsd.org\\")); + assert!(url::is_http_or_https_url(&Url::parse("http:\\\\freebsd.org\\").unwrap())); } } @@ -34,32 +36,34 @@ mod passing { #[cfg(test)] mod failing { + use reqwest::Url; + use crate::url; #[test] fn url_with_no_protocol() { - assert!(!url::is_http_url("//kernel.org")); + assert!(!url::is_http_or_https_url(&Url::parse("//kernel.org").unwrap())); } #[test] fn dot_slash_filename() { - assert!(!url::is_http_url("./index.html")); + assert!(!url::is_http_or_https_url(&Url::parse("./index.html").unwrap())); } #[test] fn just_filename() { - assert!(!url::is_http_url("some-local-page.htm")); + assert!(!url::is_http_or_https_url(&Url::parse("some-local-page.htm").unwrap())); } #[test] fn https_ip_port_url() { - assert!(!url::is_http_url("ftp://1.2.3.4/www/index.html")); + assert!(!url::is_http_or_https_url(&Url::parse("ftp://1.2.3.4/www/index.html").unwrap())); } #[test] fn data_url() { - assert!(!url::is_http_url( - "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h" + assert!(!url::is_http_or_https_url( + &Url::parse("data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h").unwrap() )); } } diff --git a/src/tests/url/url_has_protocol.rs b/src/tests/url/is_url_and_has_protocol.rs similarity index 74% rename from src/tests/url/url_has_protocol.rs rename to src/tests/url/is_url_and_has_protocol.rs index b111a07..cae497a 100644 --- a/src/tests/url/url_has_protocol.rs +++ b/src/tests/url/is_url_and_has_protocol.rs @@ -11,53 +11,53 @@ mod passing { #[test] fn mailto() { - assert!(url::url_has_protocol( + assert!(url::is_url_and_has_protocol( "mailto:somebody@somewhere.com?subject=hello" )); } #[test] fn tel() { - assert!(url::url_has_protocol("tel:5551234567")); + assert!(url::is_url_and_has_protocol("tel:5551234567")); } #[test] fn ftp_no_slashes() { - assert!(url::url_has_protocol("ftp:some-ftp-server.com")); + assert!(url::is_url_and_has_protocol("ftp:some-ftp-server.com")); } #[test] fn ftp_with_credentials() { - assert!(url::url_has_protocol( + assert!(url::is_url_and_has_protocol( "ftp://user:password@some-ftp-server.com" )); } #[test] fn javascript() { - assert!(url::url_has_protocol("javascript:void(0)")); + assert!(url::is_url_and_has_protocol("javascript:void(0)")); } #[test] fn http() { - assert!(url::url_has_protocol("http://news.ycombinator.com")); + assert!(url::is_url_and_has_protocol("http://news.ycombinator.com")); } #[test] fn https() { - assert!(url::url_has_protocol("https://github.com")); + assert!(url::is_url_and_has_protocol("https://github.com")); } #[test] fn mailto_uppercase() { - assert!(url::url_has_protocol( + assert!(url::is_url_and_has_protocol( "MAILTO:somebody@somewhere.com?subject=hello" )); } #[test] fn empty_data_url() { - assert!(url::url_has_protocol("data:text/html,")); + assert!(url::is_url_and_has_protocol("data:text/html,")); } } @@ -74,21 +74,25 @@ mod failing { #[test] fn url_with_no_protocol() { - assert!(!url::url_has_protocol("//some-hostname.com/some-file.html")); + assert!(!url::is_url_and_has_protocol( + "//some-hostname.com/some-file.html" + )); } #[test] fn relative_path() { - assert!(!url::url_has_protocol("some-hostname.com/some-file.html")); + assert!(!url::is_url_and_has_protocol( + "some-hostname.com/some-file.html" + )); } #[test] fn relative_to_root_path() { - assert!(!url::url_has_protocol("/some-file.html")); + assert!(!url::is_url_and_has_protocol("/some-file.html")); } #[test] fn empty_string() { - assert!(!url::url_has_protocol("")); + assert!(!url::is_url_and_has_protocol("")); } } diff --git a/src/tests/url/mod.rs b/src/tests/url/mod.rs index fe06cda..50efbc6 100644 --- a/src/tests/url/mod.rs +++ b/src/tests/url/mod.rs @@ -1,12 +1,7 @@ mod clean_url; mod data_to_data_url; -mod decode_url; -mod file_url_to_fs_path; -mod get_url_fragment; -mod is_data_url; -mod is_file_url; -mod is_http_url; +mod is_url_and_has_protocol; mod parse_data_url; +mod percent_decode; +mod percent_encode; mod resolve_url; -mod url_has_protocol; -mod url_with_fragment; diff --git a/src/tests/url/parse_data_url.rs b/src/tests/url/parse_data_url.rs index c383bb8..e39ce97 100644 --- a/src/tests/url/parse_data_url.rs +++ b/src/tests/url/parse_data_url.rs @@ -7,11 +7,13 @@ #[cfg(test)] mod passing { + use reqwest::Url; + use crate::url; #[test] fn parse_text_html_base64() { - let (media_type, data) = url::parse_data_url("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg=="); + let (media_type, data) = url::parse_data_url(&Url::parse("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg==").unwrap()); assert_eq!(media_type, "text/html"); assert_eq!( @@ -23,7 +25,7 @@ mod passing { #[test] fn parse_text_html_utf8() { let (media_type, data) = url::parse_data_url( - "data:text/html;utf8,Work expands so as to fill the time available for its completion", + &Url::parse("data:text/html;utf8,Work expands so as to fill the time available for its completion").unwrap(), ); assert_eq!(media_type, "text/html"); @@ -36,7 +38,10 @@ mod passing { #[test] fn parse_text_html_plaintext() { let (media_type, data) = url::parse_data_url( - "data:text/html,Work expands so as to fill the time available for its completion", + &Url::parse( + "data:text/html,Work expands so as to fill the time available for its completion", + ) + .unwrap(), ); assert_eq!(media_type, "text/html"); @@ -46,20 +51,10 @@ mod passing { ); } - #[test] - fn parse_text_html_charset_utf_8_between_two_whitespaces() { - let (media_type, data) = url::parse_data_url(" data:text/html;charset=utf-8,Work expands so as to fill the time available for its completion "); - - assert_eq!(media_type, "text/html"); - assert_eq!( - String::from_utf8_lossy(&data), - "Work expands so as to fill the time available for its completion" - ); - } - #[test] fn parse_text_css_url_encoded() { - let (media_type, data) = url::parse_data_url("data:text/css,div{background-color:%23000}"); + let (media_type, data) = + url::parse_data_url(&Url::parse("data:text/css,div{background-color:%23000}").unwrap()); assert_eq!(media_type, "text/css"); assert_eq!(String::from_utf8_lossy(&data), "div{background-color:#000}"); @@ -67,7 +62,7 @@ mod passing { #[test] fn parse_no_media_type_base64() { - let (media_type, data) = url::parse_data_url("data:;base64,dGVzdA=="); + let (media_type, data) = url::parse_data_url(&Url::parse("data:;base64,dGVzdA==").unwrap()); assert_eq!(media_type, ""); assert_eq!(String::from_utf8_lossy(&data), "test"); @@ -75,7 +70,7 @@ mod passing { #[test] fn parse_no_media_type_no_encoding() { - let (media_type, data) = url::parse_data_url("data:;,test%20test"); + let (media_type, data) = url::parse_data_url(&Url::parse("data:;,test%20test").unwrap()); assert_eq!(media_type, ""); assert_eq!(String::from_utf8_lossy(&data), "test test"); @@ -91,11 +86,13 @@ mod passing { #[cfg(test)] mod failing { + use reqwest::Url; + use crate::url; #[test] - fn just_word_data() { - let (media_type, data) = url::parse_data_url("data"); + fn empty_data_url() { + let (media_type, data) = url::parse_data_url(&Url::parse("data:,").unwrap()); assert_eq!(media_type, ""); assert_eq!(String::from_utf8_lossy(&data), ""); diff --git a/src/tests/url/decode_url.rs b/src/tests/url/percent_decode.rs similarity index 91% rename from src/tests/url/decode_url.rs rename to src/tests/url/percent_decode.rs index 5cec664..8972f58 100644 --- a/src/tests/url/decode_url.rs +++ b/src/tests/url/percent_decode.rs @@ -12,7 +12,7 @@ mod passing { #[test] fn decode_unicode_characters() { assert_eq!( - url::decode_url(str!( + url::percent_decode(str!( "%E6%A4%9C%E3%83%92%E3%83%A0%E8%A7%A3%E5%A1%97%E3%82%83%E3%83%83%20%3D%20%E3%82%B5" )), "検ヒム解塗ゃッ = サ" @@ -22,7 +22,7 @@ mod passing { #[test] fn decode_file_url() { assert_eq!( - url::decode_url(str!("file:///tmp/space%20here/test%231.html")), + url::percent_decode(str!("file:///tmp/space%20here/test%231.html")), "file:///tmp/space here/test#1.html" ); } @@ -30,7 +30,7 @@ mod passing { #[test] fn plus_sign() { assert_eq!( - url::decode_url(str!( + url::percent_decode(str!( "fonts.somewhere.com/css?family=Open+Sans:300,400,400italic,600,600italic" )), "fonts.somewhere.com/css?family=Open+Sans:300,400,400italic,600,600italic" diff --git a/src/tests/url/url_with_fragment.rs b/src/tests/url/percent_encode.rs similarity index 52% rename from src/tests/url/url_with_fragment.rs rename to src/tests/url/percent_encode.rs index 955acf3..3fa1c21 100644 --- a/src/tests/url/url_with_fragment.rs +++ b/src/tests/url/percent_encode.rs @@ -10,31 +10,7 @@ mod passing { use crate::url; #[test] - fn url_with_fragment_url() { - let url = "https://localhost.localdomain/path/"; - let fragment = "test"; - let assembled_url = url::url_with_fragment(url, fragment); - - assert_eq!(&assembled_url, "https://localhost.localdomain/path/#test"); - } - #[test] - fn url_with_fragment_empty_url() { - let url = "https://localhost.localdomain/path/"; - let fragment = ""; - let assembled_url = url::url_with_fragment(url, fragment); - - assert_eq!(&assembled_url, "https://localhost.localdomain/path/"); - } - - #[test] - fn url_with_fragment_data_url() { - let url = "data:image/svg+xml;base64,PHN2Zz48L3N2Zz4K"; - let fragment = "fragment"; - let assembled_url = url::url_with_fragment(url, fragment); - - assert_eq!( - &assembled_url, - "data:image/svg+xml;base64,PHN2Zz48L3N2Zz4K#fragment" - ); + fn apostrophe() { + assert_eq!(url::percent_encode(str!("'")), "%27"); } } diff --git a/src/tests/url/resolve_url.rs b/src/tests/url/resolve_url.rs index dbce125..edfe773 100644 --- a/src/tests/url/resolve_url.rs +++ b/src/tests/url/resolve_url.rs @@ -7,26 +7,21 @@ #[cfg(test)] mod passing { + use reqwest::Url; + use crate::url; #[test] fn from_https_to_level_up_relative() { - assert_eq!( - url::resolve_url("https://www.kernel.org", "../category/signatures.html") - .unwrap_or_default(), - "https://www.kernel.org/category/signatures.html" - ); - } - - #[test] - fn from_just_filename_to_full_https_url() { assert_eq!( url::resolve_url( - "saved_page.htm", - "https://www.kernel.org/category/signatures.html", + &Url::parse("https://www.kernel.org").unwrap(), + "../category/signatures.html" ) - .unwrap_or_default(), - "https://www.kernel.org/category/signatures.html" + .as_str(), + Url::parse("https://www.kernel.org/category/signatures.html") + .unwrap() + .as_str() ); } @@ -34,10 +29,10 @@ mod passing { fn from_https_url_to_url_with_no_protocol() { assert_eq!( url::resolve_url( - "https://www.kernel.org", + &Url::parse("https://www.kernel.org").unwrap(), "//www.kernel.org/theme/images/logos/tux.png", ) - .unwrap_or_default(), + .as_str(), "https://www.kernel.org/theme/images/logos/tux.png" ); } @@ -46,10 +41,10 @@ mod passing { fn from_https_url_to_url_with_no_protocol_and_on_different_hostname() { assert_eq!( url::resolve_url( - "https://www.kernel.org", + &Url::parse("https://www.kernel.org").unwrap(), "//another-host.org/theme/images/logos/tux.png", ) - .unwrap_or_default(), + .as_str(), "https://another-host.org/theme/images/logos/tux.png" ); } @@ -58,10 +53,10 @@ mod passing { fn from_https_url_to_relative_root_path() { assert_eq!( url::resolve_url( - "https://www.kernel.org/category/signatures.html", + &Url::parse("https://www.kernel.org/category/signatures.html").unwrap(), "/theme/images/logos/tux.png", ) - .unwrap_or_default(), + .as_str(), "https://www.kernel.org/theme/images/logos/tux.png" ); } @@ -70,10 +65,10 @@ mod passing { fn from_https_to_just_filename() { assert_eq!( url::resolve_url( - "https://www.w3schools.com/html/html_iframe.asp", + &Url::parse("https://www.w3schools.com/html/html_iframe.asp").unwrap(), "default.asp", ) - .unwrap_or_default(), + .as_str(), "https://www.w3schools.com/html/default.asp" ); } @@ -82,10 +77,11 @@ mod passing { fn from_data_url_to_https() { assert_eq!( url::resolve_url( - "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h", + &Url::parse("data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h") + .unwrap(), "https://www.kernel.org/category/signatures.html", ) - .unwrap_or_default(), + .as_str(), "https://www.kernel.org/category/signatures.html" ); } @@ -94,10 +90,11 @@ mod passing { fn from_data_url_to_data_url() { assert_eq!( url::resolve_url( - "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h", + &Url::parse("data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h") + .unwrap(), "data:text/html;base64,PGEgaHJlZj0iaW5kZXguaHRtbCI+SG9tZTwvYT4K", ) - .unwrap_or_default(), + .as_str(), "data:text/html;base64,PGEgaHJlZj0iaW5kZXguaHRtbCI+SG9tZTwvYT4K" ); } @@ -106,10 +103,10 @@ mod passing { fn from_file_url_to_relative_path() { assert_eq!( url::resolve_url( - "file:///home/user/Websites/my-website/index.html", + &Url::parse("file:///home/user/Websites/my-website/index.html").unwrap(), "assets/images/logo.png", ) - .unwrap_or_default(), + .as_str(), "file:///home/user/Websites/my-website/assets/images/logo.png" ); } @@ -118,10 +115,10 @@ mod passing { fn from_file_url_to_relative_path_with_backslashes() { assert_eq!( url::resolve_url( - "file:\\\\\\home\\user\\Websites\\my-website\\index.html", + &Url::parse("file:\\\\\\home\\user\\Websites\\my-website\\index.html").unwrap(), "assets\\images\\logo.png", ) - .unwrap_or_default(), + .as_str(), "file:///home/user/Websites/my-website/assets/images/logo.png" ); } @@ -130,10 +127,11 @@ mod passing { fn from_data_url_to_file_url() { assert_eq!( url::resolve_url( - "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h", + &Url::parse("data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h") + .unwrap(), "file:///etc/passwd", ) - .unwrap_or_default(), + .as_str(), "file:///etc/passwd" ); } @@ -142,31 +140,30 @@ mod passing { fn preserve_fragment() { assert_eq!( url::resolve_url( - "http://doesnt-matter.local/", + &Url::parse("http://doesnt-matter.local/").unwrap(), "css/fonts/fontmarvelous.svg#fontmarvelous", ) - .unwrap_or_default(), + .as_str(), "http://doesnt-matter.local/css/fonts/fontmarvelous.svg#fontmarvelous" ); } - #[test] - fn resolve_from_file_url_to_file_url() { - assert_eq!( - if cfg!(windows) { - url::resolve_url("file:///c:/index.html", "file:///c:/image.png") - .unwrap_or_default() - } else { - url::resolve_url("file:///tmp/index.html", "file:///tmp/image.png") - .unwrap_or_default() - }, - if cfg!(windows) { - "file:///c:/image.png" - } else { - "file:///tmp/image.png" - } - ); - } + // #[test] + // fn resolve_from_file_url_to_file_url() { + // assert_eq!( + // if cfg!(windows) { + // url::resolve_url(&Url::parse("file:///c:/index.html").unwrap(), "file:///c:/image.png").as_str() + // } else { + // url::resolve_url(&Url::parse("file:///tmp/index.html").unwrap(), "file:///tmp/image.png") + // .as_str() + // }, + // if cfg!(windows) { + // "file:///c:/image.png" + // } else { + // "file:///tmp/image.png" + // } + // ); + // } } // ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗ @@ -178,17 +175,20 @@ mod passing { #[cfg(test)] mod failing { + use reqwest::Url; + use crate::url; #[test] fn from_data_url_to_url_with_no_protocol() { assert_eq!( url::resolve_url( - "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h", + &Url::parse("data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h") + .unwrap(), "//www.w3schools.com/html/html_iframe.asp", ) - .unwrap_or_default(), - "" + .as_str(), + "data:," ); } } diff --git a/src/tests/utils/detect_media_type.rs b/src/tests/utils/detect_media_type.rs index b024525..970af13 100644 --- a/src/tests/utils/detect_media_type.rs +++ b/src/tests/utils/detect_media_type.rs @@ -7,131 +7,171 @@ #[cfg(test)] mod passing { + use reqwest::Url; + use crate::utils; #[test] fn image_gif87() { - assert_eq!(utils::detect_media_type(b"GIF87a", ""), "image/gif"); + let dummy_url: Url = Url::parse("data:,").unwrap(); + assert_eq!(utils::detect_media_type(b"GIF87a", &dummy_url), "image/gif"); } #[test] fn image_gif89() { - assert_eq!(utils::detect_media_type(b"GIF89a", ""), "image/gif"); + let dummy_url: Url = Url::parse("data:,").unwrap(); + assert_eq!(utils::detect_media_type(b"GIF89a", &dummy_url), "image/gif"); } #[test] fn image_jpeg() { - assert_eq!(utils::detect_media_type(b"\xFF\xD8\xFF", ""), "image/jpeg"); + let dummy_url: Url = Url::parse("data:,").unwrap(); + assert_eq!( + utils::detect_media_type(b"\xFF\xD8\xFF", &dummy_url), + "image/jpeg" + ); } #[test] fn image_png() { + let dummy_url: Url = Url::parse("data:,").unwrap(); assert_eq!( - utils::detect_media_type(b"\x89PNG\x0D\x0A\x1A\x0A", ""), + utils::detect_media_type(b"\x89PNG\x0D\x0A\x1A\x0A", &dummy_url), "image/png" ); } #[test] fn image_svg() { - assert_eq!(utils::detect_media_type(b">(input: T) -> String { - let mut url = Url::parse(input.as_ref()).unwrap(); +pub fn clean_url(url: Url) -> Url { + let mut url = url.clone(); - // Clear fragment + // Clear fragment (if any) url.set_fragment(None); - // Get rid of stray question mark - if url.query() == Some("") { - url.set_query(None); - } - - // Remove empty trailing ampersand(s) - let mut result: String = url.to_string(); - while result.ends_with("&") { - result.pop(); - } - - result + url } -pub fn data_to_data_url(media_type: &str, data: &[u8], url: &str) -> String { +pub fn data_to_data_url(media_type: &str, data: &[u8], final_asset_url: &Url) -> Url { let media_type: String = if media_type.is_empty() { - detect_media_type(data, &url) + detect_media_type(data, &final_asset_url) } else { media_type.to_string() }; - format!("data:{};base64,{}", media_type, base64::encode(data)) -} - -pub fn decode_url(input: String) -> String { - let input: String = input.replace("+", "%2B"); - - form_urlencoded::parse(input.as_bytes()) - .map(|(key, val)| { - [ - key.to_string(), - if val.to_string().len() == 0 { - str!() - } else { - str!('=') - }, - val.to_string(), - ] - .concat() - }) - .collect() -} - -pub fn file_url_to_fs_path(url: &str) -> String { - if !is_file_url(url) { - return str!(); - } + let mut data_url: Url = Url::parse("data:,").unwrap(); - let cutoff_l = if cfg!(windows) { 8 } else { 7 }; - let mut fs_file_path: String = decode_url(url.to_string()[cutoff_l..].to_string()); - let url_fragment = get_url_fragment(url); - if url_fragment != "" { - let max_len = fs_file_path.len() - 1 - url_fragment.len(); - fs_file_path = fs_file_path[0..max_len].to_string(); - } - - if cfg!(windows) { - fs_file_path = fs_file_path.replace("/", "\\"); - } + data_url.set_path(format!("{};base64,{}", media_type, base64::encode(data)).as_str()); - // File paths should not be %-encoded - decode_url(fs_file_path) + data_url } -pub fn get_url_fragment>(url: T) -> String { - match Url::parse(url.as_ref()) { - Ok(parsed_url) => parsed_url.fragment().unwrap_or("").to_string(), - Err(_err) => str!(), +pub fn is_url_and_has_protocol(input: &str) -> bool { + match Url::parse(&input) { + Ok(parsed_url) => { + return parsed_url.scheme().len() > 0; + } + Err(_) => { + return false; + } } } -pub fn is_data_url>(url: T) -> bool { - Url::parse(url.as_ref()) - .and_then(|u| Ok(u.scheme() == "data")) - .unwrap_or(false) -} - -pub fn is_file_url>(url: T) -> bool { - Url::parse(url.as_ref()) - .and_then(|u| Ok(u.scheme() == "file")) - .unwrap_or(false) -} - -pub fn is_http_url>(url: T) -> bool { - Url::parse(url.as_ref()) - .and_then(|u| Ok(u.scheme() == "http" || u.scheme() == "https")) - .unwrap_or(false) -} - -pub fn parse_data_url>(url: T) -> (String, Vec) { - let parsed_url: Url = Url::parse(url.as_ref()).unwrap_or(Url::parse("data:,").unwrap()); - let path: String = parsed_url.path().to_string(); +pub fn parse_data_url(url: &Url) -> (String, Vec) { + let path: String = url.path().to_string(); let comma_loc: usize = path.find(',').unwrap_or(path.len()); let meta_data: String = path.chars().take(comma_loc).collect(); let raw_data: String = path.chars().skip(comma_loc + 1).collect(); - let text: String = decode_url(raw_data); + let text: String = percent_decode(raw_data); let meta_data_items: Vec<&str> = meta_data.split(';').collect(); let mut media_type: String = str!(); @@ -137,31 +75,35 @@ pub fn parse_data_url>(url: T) -> (String, Vec) { (media_type, data) } -pub fn resolve_url, U: AsRef>(from: T, to: U) -> Result { - let result = if is_http_url(to.as_ref()) { - to.as_ref().to_string() - } else { - Url::parse(from.as_ref())? - .join(to.as_ref())? - .as_ref() - .to_string() - }; - Ok(result) -} +pub fn percent_decode(input: String) -> String { + let input: String = input.replace("+", "%2B"); -pub fn url_has_protocol>(url: T) -> bool { - Url::parse(url.as_ref()) - .and_then(|u| Ok(u.scheme().len() > 0)) - .unwrap_or(false) + form_urlencoded::parse(input.as_bytes()) + .map(|(key, val)| { + [ + key.to_string(), + if val.to_string().len() == 0 { + str!() + } else { + str!('=') + }, + val.to_string(), + ] + .concat() + }) + .collect() } -pub fn url_with_fragment(url: &str, fragment: &str) -> String { - let mut result = str!(&url); +pub fn percent_encode(input: String) -> String { + form_urlencoded::byte_serialize(input.as_bytes()).collect() +} - if !fragment.is_empty() { - result += "#"; - result += fragment; +pub fn resolve_url(from: &Url, to: &str) -> Url { + match Url::parse(&to) { + Ok(parsed_url) => parsed_url, + Err(_) => match from.join(to) { + Ok(joined) => joined, + Err(_) => Url::parse("data:,").unwrap(), + }, } - - result } diff --git a/src/utils.rs b/src/utils.rs index 90030d7..2be4d28 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -2,15 +2,14 @@ use reqwest::blocking::Client; use reqwest::header::CONTENT_TYPE; use std::collections::HashMap; use std::fs; -use std::path::Path; +use std::path::{Path, PathBuf}; +use url::Url; use crate::opts::Options; -use crate::url::{clean_url, file_url_to_fs_path, is_data_url, is_file_url, parse_data_url}; +use crate::url::{clean_url, parse_data_url}; const ANSI_COLOR_RED: &'static str = "\x1b[31m"; const ANSI_COLOR_RESET: &'static str = "\x1b[0m"; -const INDENT: &'static str = " "; - const MAGIC: [[&[u8]; 2]; 18] = [ // Image [b"GIF87a", b"image/gif"], @@ -34,24 +33,16 @@ const MAGIC: [[&[u8]; 2]; 18] = [ [b"....moov", b"video/quicktime"], [b"\x1A\x45\xDF\xA3", b"video/webm"], ]; -const PLAINTEXT_MEDIA_TYPES: &[&str] = &[ - "application/javascript", - "image/svg+xml", - // "text/css", - // "text/csv", - // "text/html", - // "text/javascript", - // "text/plain", -]; +const PLAINTEXT_MEDIA_TYPES: &[&str] = &["application/javascript", "image/svg+xml"]; -pub fn detect_media_type(data: &[u8], url: &str) -> String { - for item in MAGIC.iter() { - if data.starts_with(item[0]) { - return String::from_utf8(item[1].to_vec()).unwrap(); +pub fn detect_media_type(data: &[u8], url: &Url) -> String { + for magic_item in MAGIC.iter() { + if data.starts_with(magic_item[0]) { + return String::from_utf8(magic_item[1].to_vec()).unwrap(); } } - if url.to_lowercase().ends_with(".svg") { + if url.path().to_lowercase().ends_with(".svg") { return str!("image/svg+xml"); } @@ -64,68 +55,109 @@ pub fn is_plaintext_media_type(media_type: &str) -> bool { } pub fn indent(level: u32) -> String { - let mut result = str!(); + let mut result: String = String::new(); let mut l: u32 = level; + while l > 0 { - result += INDENT; + result += " "; l -= 1; } + result } pub fn retrieve_asset( cache: &mut HashMap>, client: &Client, - parent_url: &str, - url: &str, + parent_url: &Url, + url: &Url, options: &Options, depth: u32, -) -> Result<(Vec, String, String), reqwest::Error> { - if url.len() == 0 { - // Provoke error - client.get("").send()?; - } - - if is_data_url(&url) { +) -> Result<(Vec, Url, String), reqwest::Error> { + if url.scheme() == "data" { let (media_type, data) = parse_data_url(url); - Ok((data, url.to_string(), media_type)) - } else if is_file_url(&url) { - // Check if parent_url is also file:/// - // (if not, then we don't embed the asset) - if !is_file_url(&parent_url) { + Ok((data, url.clone(), media_type)) + } else if url.scheme() == "file" { + // Check if parent_url is also file:/// (if not, then we don't embed the asset) + if parent_url.scheme() != "file" { + if !options.silent { + eprintln!( + "{}{}{} ({}){}", + indent(depth).as_str(), + if options.no_color { "" } else { ANSI_COLOR_RED }, + &url, + "Security Error", + if options.no_color { + "" + } else { + ANSI_COLOR_RESET + }, + ); + } // Provoke error client.get("").send()?; } - let fs_file_path: String = file_url_to_fs_path(url); - let path = Path::new(&fs_file_path); + let path_buf: PathBuf = url.to_file_path().unwrap().clone(); + let path: &Path = path_buf.as_path(); if path.exists() { + if path.is_dir() { + if !options.silent { + eprintln!( + "{}{}{} (is a directory){}", + indent(depth).as_str(), + if options.no_color { "" } else { ANSI_COLOR_RED }, + &url, + if options.no_color { + "" + } else { + ANSI_COLOR_RESET + }, + ); + } + + // Provoke error + Err(client.get("").send().unwrap_err()) + } else { + if !options.silent { + eprintln!("{}{}", indent(depth).as_str(), &url); + } + + Ok((fs::read(&path).expect(""), url.clone(), str!())) + } + } else { if !options.silent { - eprintln!("{}{}", indent(depth).as_str(), &url); + eprintln!( + "{}{}{} (not found){}", + indent(depth).as_str(), + if options.no_color { "" } else { ANSI_COLOR_RED }, + &url, + if options.no_color { + "" + } else { + ANSI_COLOR_RESET + }, + ); } - Ok((fs::read(&fs_file_path).expect(""), url.to_string(), str!())) - } else { // Provoke error Err(client.get("").send().unwrap_err()) } } else { - let cache_key: String = clean_url(&url); + let cache_key: String = clean_url(url.clone()).as_str().to_string(); if cache.contains_key(&cache_key) { - // URL is in cache, we get and return it + // URL is in cache, + // we get and return it if !options.silent { eprintln!("{}{} (from cache)", indent(depth).as_str(), &url); } - Ok(( - cache.get(&cache_key).unwrap().to_vec(), - url.to_string(), - str!(), - )) + Ok((cache.get(&cache_key).unwrap().to_vec(), url.clone(), str!())) } else { - // URL not in cache, we retrieve the file - match client.get(url).send() { + // URL not in cache, + // we retrieve the file + match client.get(url.as_str()).send() { Ok(mut response) => { if !options.ignore_errors && response.status() != 200 { if !options.silent { @@ -146,24 +178,22 @@ pub fn retrieve_asset( return Err(client.get("").send().unwrap_err()); } - let res_url = response.url().to_string(); - if !options.silent { - if url == res_url { + if url.as_str() == response.url().as_str() { eprintln!("{}{}", indent(depth).as_str(), &url); } else { - eprintln!("{}{} -> {}", indent(depth).as_str(), &url, &res_url); + eprintln!("{}{} -> {}", indent(depth).as_str(), &url, &response.url()); } } - let new_cache_key: String = clean_url(&res_url); + let new_cache_key: String = clean_url(response.url().clone()).to_string(); // Convert response into a byte array let mut data: Vec = vec![]; - response.copy_to(&mut data)?; + response.copy_to(&mut data).unwrap(); - // Attempt to obtain media type by reading the Content-Type header - let media_type = response + // Attempt to obtain media type by reading Content-Type header + let media_type: &str = response .headers() .get(CONTENT_TYPE) .and_then(|header| header.to_str().ok()) @@ -172,9 +202,27 @@ pub fn retrieve_asset( // Add retrieved resource to cache cache.insert(new_cache_key, data.clone()); - Ok((data, res_url, media_type.to_string())) + // Return + Ok((data, response.url().clone(), media_type.to_string())) + } + Err(error) => { + if !options.silent { + eprintln!( + "{}{}{} ({}){}", + indent(depth).as_str(), + if options.no_color { "" } else { ANSI_COLOR_RED }, + &url, + error, + if options.no_color { + "" + } else { + ANSI_COLOR_RESET + }, + ); + } + + Err(client.get("").send().unwrap_err()) } - Err(error) => Err(error), } } }