diff --git a/Cargo.lock b/Cargo.lock index 4cf4156..1da2872 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -26,9 +26,9 @@ dependencies = [ [[package]] name = "assert_cmd" -version = "1.0.4" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f57fec1ac7e4de72dcc69811795f1a7172ed06012f80a5d1ee651b62484f588" +checksum = "a88b6bd5df287567ffdf4ddf4d33060048e1068308e5f62d81c6f9824a045a48" dependencies = [ "bstr", "doc-comment", @@ -497,9 +497,9 @@ checksum = "6456b8a6c8f33fee7d958fcd1b60d55b11940a79e63ae87013e6d22e26034440" [[package]] name = "hyper" -version = "0.14.8" +version = "0.14.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3f71a7eea53a3f8257a7b4795373ff886397178cd634430ea94e12d7fe4fe34" +checksum = "07d6baa1b441335f3ce5098ac421fb6547c46dda735ca1bc6d0153c838f9dd83" dependencies = [ "bytes", "futures-channel", @@ -511,7 +511,7 @@ dependencies = [ "httparse", "httpdate", "itoa", - "pin-project", + "pin-project-lite", "socket2", "tokio", "tower-service", @@ -678,6 +678,7 @@ dependencies = [ "chrono", "clap", "cssparser", + "encoding_rs", "html5ever", "regex", "reqwest", @@ -880,26 +881,6 @@ dependencies = [ "siphasher 0.3.5", ] -[[package]] -name = "pin-project" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7509cc106041c40a4518d2af7a61530e1eed0e6285296a3d8c5472806ccc4a4" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48c950132583b500556b1efd71d45b319029f2b71518d979fcc208e16b42426f" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "pin-project-lite" version = "0.2.6" @@ -1281,9 +1262,9 @@ dependencies = [ [[package]] name = "security-framework" -version = "2.2.0" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"3670b1d2fdf6084d192bc71ead7aabe6c06aa2ea3fbd9cc3ac111fa5c2b1bd84" +checksum = "23a2ac85147a3a11d77ecf1bc7166ec0b92febfa4461c37944e180f319ece467" dependencies = [ "bitflags", "core-foundation", @@ -1294,9 +1275,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3676258fd3cfe2c9a0ec99ce3038798d847ce3e4bb17746373eb9f0f1ac16339" +checksum = "7e4effb91b4b8b6fb7732e670b6cee160278ff8e6bf485c7805d9e319d76e284" dependencies = [ "core-foundation-sys", "libc", @@ -1594,9 +1575,9 @@ dependencies = [ [[package]] name = "unicode-normalization" -version = "0.1.18" +version = "0.1.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33717dca7ac877f497014e10d73f3acf948c342bee31b5ca7892faf94ccc6b49" +checksum = "d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9" dependencies = [ "tinyvec", ] diff --git a/Cargo.toml b/Cargo.toml index cc85506..2460b69 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,6 +27,7 @@ base64 = "0.13.0" chrono = "0.4.19" # Used for formatting creation timestamp clap = "2.33.3" cssparser = "0.28.1" +encoding_rs = "0.8.28" html5ever = "0.24.1" regex = "1.5.4" # Used for parsing srcset and NOSCRIPT sha2 = "0.9.5" # Used for calculating checksums during integrity checks diff --git a/src/css.rs b/src/css.rs index c266dd3..f19ac0f 100644 --- a/src/css.rs +++ b/src/css.rs @@ -198,9 +198,14 @@ pub fn process_css<'a>( options, depth + 1, ) { - Ok((import_contents, import_final_url, _import_media_type)) => { + Ok(( + import_contents, + import_final_url, + import_media_type, + _import_charset, + )) => { let mut import_data_url = create_data_url( - "text/css", + &import_media_type, embed_css( cache, client, @@ -247,7 +252,7 @@ pub fn process_css<'a>( options, depth + 1, ) { - Ok((data, final_url, media_type)) => { + Ok((data, final_url, media_type, _charset)) => { let mut data_url = 
create_data_url(&media_type, &data, &final_url); data_url.set_fragment(resolved_url.fragment()); @@ -343,9 +348,9 @@ pub fn process_css<'a>( options, depth + 1, ) { - Ok((css, final_url, _media_type)) => { + Ok((css, final_url, media_type, _charset)) => { let mut data_url = create_data_url( - "text/css", + &media_type, embed_css( cache, client, @@ -381,7 +386,7 @@ pub fn process_css<'a>( options, depth + 1, ) { - Ok((data, final_url, media_type)) => { + Ok((data, final_url, media_type, _charset)) => { let mut data_url = create_data_url(&media_type, &data, &final_url); data_url.set_fragment(full_url.fragment()); result diff --git a/src/html.rs b/src/html.rs index 836f5cc..fcc99fb 100644 --- a/src/html.rs +++ b/src/html.rs @@ -1,5 +1,6 @@ use base64; use chrono::prelude::*; +use encoding_rs::Encoding; use html5ever::interface::QualName; use html5ever::parse_document; use html5ever::rcdom::{Handle, NodeData, RcDom}; @@ -18,7 +19,7 @@ use crate::css::embed_css; use crate::js::attr_is_event_handler; use crate::opts::Options; use crate::url::{clean_url, create_data_url, is_url_and_has_protocol, resolve_url}; -use crate::utils::retrieve_asset; +use crate::utils::{parse_content_type, retrieve_asset}; struct SrcSetItem<'a> { path: &'a str, @@ -31,9 +32,8 @@ pub fn add_favicon(document: &Handle, favicon_data_url: String) -> RcDom { let mut buf: Vec = Vec::new(); serialize(&mut buf, document, SerializeOpts::default()) .expect("unable to serialize DOM into buffer"); - let result = String::from_utf8(buf).unwrap(); - let mut dom = html_to_dom(&result); + let mut dom = html_to_dom(&buf, "utf-8".to_string()); let doc = dom.get_document(); if let Some(html) = get_child_node_by_name(&doc, "html") { if let Some(head) = get_child_node_by_name(&html, "head") { @@ -115,7 +115,7 @@ pub fn create_metadata_tag(url: &Url) -> String { // Prevent credentials from getting into metadata if clean_url.scheme() == "http" || clean_url.scheme() == "https" { - // Only HTTP(S) URLs may feature 
credentials + // Only HTTP(S) URLs can contain credentials clean_url.set_username("").unwrap(); clean_url.set_password(None).unwrap(); } @@ -188,7 +188,8 @@ pub fn embed_srcset( options, depth + 1, ) { - Ok((image_data, image_final_url, image_media_type)) => { + Ok((image_data, image_final_url, image_media_type, _image_charset)) => { + // TODO: use image_charset let mut image_data_url = create_data_url(&image_media_type, &image_data, &image_final_url); // Append retreved asset as a data URL @@ -253,6 +254,48 @@ pub fn find_base_node(node: &Handle) -> Option { None } +pub fn find_meta_charset_or_content_type_node(node: &Handle) -> Option { + match node.data { + NodeData::Document => { + // Dig deeper + for child in node.children.borrow().iter() { + if let Some(meta_charset_node) = find_meta_charset_or_content_type_node(child) { + return Some(meta_charset_node); + } + } + } + NodeData::Element { ref name, .. } => { + match name.local.as_ref() { + "head" => { + if let Some(meta_node) = get_child_node_by_name(node, "meta") { + if let Some(_) = get_node_attr(&meta_node, "charset") { + return Some(meta_node); + } else if let Some(meta_node_http_equiv_attr_value) = + get_node_attr(&meta_node, "http-equiv") + { + if meta_node_http_equiv_attr_value.eq_ignore_ascii_case("content-type") + { + return Some(meta_node); + } + } + } + } + _ => {} + } + + // Dig deeper + for child in node.children.borrow().iter() { + if let Some(meta_charset_node) = find_meta_charset_or_content_type_node(child) { + return Some(meta_charset_node); + } + } + } + _ => {} + } + + None +} + pub fn get_base_url(handle: &Handle) -> Option { if let Some(base_node) = find_base_node(handle) { get_node_attr(&base_node, "href") @@ -261,6 +304,24 @@ pub fn get_base_url(handle: &Handle) -> Option { } } +pub fn get_charset(node: &Handle) -> Option { + if let Some(meta_charset_node) = find_meta_charset_or_content_type_node(node) { + if let Some(meta_charset_node_attr_value) = get_node_attr(&meta_charset_node, 
"charset") { + // Processing + return Some(meta_charset_node_attr_value); + } else if let Some(meta_content_type_node_attr_value) = + get_node_attr(&meta_charset_node, "content") + { + // Processing + let (_media_type, charset, _is_base64) = + parse_content_type(&meta_content_type_node_attr_value); + return Some(charset); + } + } + + return None; +} + pub fn get_child_node_by_name(parent: &Handle, node_name: &str) -> Option { let children = parent.children.borrow(); let matching_children = children.iter().find(|child| match child.data { @@ -273,13 +334,6 @@ pub fn get_child_node_by_name(parent: &Handle, node_name: &str) -> Option Option<&'_ str> { - match &node.data { - NodeData::Element { ref name, .. } => Some(name.local.as_ref()), - _ => None, - } -} - pub fn get_node_attr(node: &Handle, attr_name: &str) -> Option { match &node.data { NodeData::Element { ref attrs, .. } => { @@ -294,6 +348,13 @@ pub fn get_node_attr(node: &Handle, attr_name: &str) -> Option { } } +pub fn get_node_name(node: &Handle) -> Option<&'_ str> { + match &node.data { + NodeData::Element { ref name, .. 
} => Some(name.local.as_ref()), + _ => None, + } +} + pub fn get_parent_node(child: &Handle) -> Handle { let parent = child.parent.take().clone(); parent.and_then(|node| node.upgrade()).unwrap() @@ -340,10 +401,19 @@ pub fn has_favicon(handle: &Handle) -> bool { found_favicon } -pub fn html_to_dom(data: &str) -> RcDom { +pub fn html_to_dom(data: &Vec, document_encoding: String) -> RcDom { + let s: String; + + if let Some(encoding) = Encoding::for_label(document_encoding.as_bytes()) { + let (string, _, _) = encoding.decode(&data); + s = string.to_string(); + } else { + s = String::from_utf8_lossy(&data).to_string(); + } + parse_document(RcDom::default(), Default::default()) .from_utf8() - .read_from(&mut data.as_bytes()) + .read_from(&mut s.as_bytes()) .unwrap() } @@ -355,9 +425,8 @@ pub fn set_base_url(document: &Handle, desired_base_href: String) -> RcDom { let mut buf: Vec = Vec::new(); serialize(&mut buf, document, SerializeOpts::default()) .expect("unable to serialize DOM into buffer"); - let result = String::from_utf8(buf).unwrap(); - let mut dom = html_to_dom(&result); + let mut dom = html_to_dom(&buf, "utf-8".to_string()); let doc = dom.get_document(); if let Some(html_node) = get_child_node_by_name(&doc, "html") { if let Some(head_node) = get_child_node_by_name(&html_node, "head") { @@ -383,6 +452,41 @@ pub fn set_base_url(document: &Handle, desired_base_href: String) -> RcDom { dom } +pub fn set_charset(mut dom: RcDom, desired_charset: String) -> RcDom { + if let Some(meta_charset_node) = find_meta_charset_or_content_type_node(&dom.document) { + if let Some(_) = get_node_attr(&meta_charset_node, "charset") { + set_node_attr(&meta_charset_node, "charset", Some(desired_charset)); + } else if let Some(_) = get_node_attr(&meta_charset_node, "content") { + set_node_attr( + &meta_charset_node, + "content", + Some(format!("text/html;charset={}", desired_charset)), + ); + } + } else { + let meta_charset_node = dom.create_element( + QualName::new(None, ns!(), 
local_name!("meta")), + vec![Attribute { + name: QualName::new(None, ns!(), local_name!("charset")), + value: format_tendril!("{}", desired_charset), + }], + Default::default(), + ); + + // Insert newly created META charset node into HEAD + if let Some(html_node) = get_child_node_by_name(&dom.document, "html") { + if let Some(head_node) = get_child_node_by_name(&html_node, "head") { + head_node + .children + .borrow_mut() + .push(meta_charset_node.clone()); + } + } + } + + dom +} + pub fn set_node_attr(node: &Handle, attr_name: &str, attr_value: Option) { match &node.data { NodeData::Element { ref attrs, .. } => { @@ -423,16 +527,10 @@ pub fn set_node_attr(node: &Handle, attr_name: &str, attr_value: Option) }; } -pub fn stringify_document(handle: &Handle, options: &Options) -> String { +pub fn serialize_document(mut dom: RcDom, document_encoding: String, options: &Options) -> Vec { let mut buf: Vec = Vec::new(); - serialize(&mut buf, handle, SerializeOpts::default()) - .expect("Unable to serialize DOM into buffer"); - - let mut result = String::from_utf8(buf).unwrap(); + let doc = dom.get_document(); - // We can't make it isolate the page right away since it may have no HEAD element, - // ergo we have to serialize, parse the DOM again, insert the CSP meta tag, and then - // finally serialize and return the resulting string if options.isolate || options.no_css || options.no_fonts @@ -441,9 +539,6 @@ pub fn stringify_document(handle: &Handle, options: &Options) -> String { || options.no_images { // Take care of CSP - let mut buf: Vec = Vec::new(); - let mut dom = html_to_dom(&result); - let doc = dom.get_document(); if let Some(html) = get_child_node_by_name(&doc, "html") { if let Some(head) = get_child_node_by_name(&html, "head") { let meta = dom.create_element( @@ -468,19 +563,27 @@ pub fn stringify_document(handle: &Handle, options: &Options) -> String { head.children.borrow_mut().reverse(); } } - - serialize(&mut buf, &doc, SerializeOpts::default()) - 
.expect("Unable to serialize DOM into buffer"); - result = String::from_utf8(buf).unwrap(); } + serialize(&mut buf, &doc, SerializeOpts::default()) + .expect("Unable to serialize DOM into buffer"); + // Unwrap NOSCRIPT elements if options.unwrap_noscript { + let s: &str = &String::from_utf8_lossy(&buf); let noscript_re = Regex::new(r"<(?P/?noscript[^>]*)>").unwrap(); - result = noscript_re.replace_all(&result, "").to_string(); + buf = noscript_re.replace_all(&s, "").as_bytes().to_vec(); } - result + if !document_encoding.is_empty() { + if let Some(encoding) = Encoding::for_label(document_encoding.as_bytes()) { + let s: &str = &String::from_utf8_lossy(&buf); + let (data, _, _) = encoding.encode(s); + buf = data.to_vec(); + } + } + + buf } pub fn retrieve_and_embed_asset( @@ -503,7 +606,7 @@ pub fn retrieve_and_embed_asset( options, depth + 1, ) { - Ok((data, final_url, mut media_type)) => { + Ok((data, final_url, mut media_type, _charset)) => { let node_name: &str = get_node_name(&node).unwrap(); // Check integrity if it's a LINK or SCRIPT element @@ -537,7 +640,7 @@ pub fn retrieve_and_embed_asset( set_node_attr(&node, attr_name, Some(css_data_url.to_string())); } else if node_name == "frame" || node_name == "iframe" { // (I)FRAMEs are also quite different from conventional resources - let frame_dom = html_to_dom(&String::from_utf8_lossy(&data)); + let frame_dom = html_to_dom(&data, "utf-8".to_string()); walk_and_embed_assets( cache, client, @@ -556,6 +659,7 @@ pub fn retrieve_and_embed_asset( .unwrap(); // Create and embed data URL + // TODO: use charset let mut frame_data_url = create_data_url(&media_type, &frame_data, &final_url); frame_data_url.set_fragment(resolved_url.fragment()); set_node_attr(node, attr_name, Some(frame_data_url.to_string())); @@ -629,20 +733,7 @@ pub fn walk_and_embed_assets( meta_attr_http_equiv_value )), ); - } else if meta_attr_http_equiv_value.eq_ignore_ascii_case("Content-Type") { - // Enforce charset to be set to UTF-8 - if let 
Some(_attr_value) = get_node_attr(node, "content") { - set_node_attr( - &node, - "content", - Some(str!("text/html; charset=utf-8")), - ); - } } - } else if let Some(_meta_attr_http_equiv_value) = get_node_attr(node, "charset") - { - // Enforce charset to be set to UTF-8 - set_node_attr(&node, "charset", Some(str!("utf-8"))); } } "link" => { @@ -1078,7 +1169,8 @@ pub fn walk_and_embed_assets( // Get contents of NOSCRIPT node let mut noscript_contents = contents.borrow_mut(); // Parse contents of NOSCRIPT node as DOM - let noscript_contents_dom: RcDom = html_to_dom(&noscript_contents); + let noscript_contents_dom: RcDom = + html_to_dom(&noscript_contents.as_bytes().to_vec(), str!()); // Embed assets of NOSCRIPT node contents walk_and_embed_assets( cache, @@ -1098,7 +1190,7 @@ pub fn walk_and_embed_assets( let mut buf: Vec = Vec::new(); serialize(&mut buf, &body, SerializeOpts::default()) .expect("Unable to serialize DOM into buffer"); - let result = String::from_utf8(buf).unwrap(); + let result = String::from_utf8_lossy(&buf); noscript_contents.push_slice(&result); } } diff --git a/src/main.rs b/src/main.rs index 23e245b..7b9ac16 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,4 @@ +use html5ever::rcdom::RcDom; use reqwest::blocking::Client; use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT}; use std::collections::HashMap; @@ -9,11 +10,11 @@ use std::time::Duration; use url::Url; use monolith::html::{ - add_favicon, create_metadata_tag, get_base_url, has_favicon, html_to_dom, set_base_url, - stringify_document, walk_and_embed_assets, + add_favicon, create_metadata_tag, get_base_url, get_charset, has_favicon, html_to_dom, + serialize_document, set_base_url, set_charset, walk_and_embed_assets, }; use monolith::opts::Options; -use monolith::url::{create_data_url, parse_data_url, resolve_url}; +use monolith::url::{create_data_url, resolve_url}; use monolith::utils::retrieve_asset; mod macros; @@ -32,29 +33,35 @@ impl Output { } } - fn writeln_str(&mut self, 
s: &str) -> Result<(), Error> { + fn write(&mut self, bytes: &Vec) -> Result<(), Error> { match self { Output::Stdout(stdout) => { - writeln!(stdout, "{}", s)?; + stdout.write_all(bytes)?; + // Ensure newline at end of output + if bytes.last() != Some(&b"\n"[0]) { + stdout.write(b"\n")?; + } stdout.flush() } - Output::File(f) => { - writeln!(f, "{}", s)?; - f.flush() + Output::File(file) => { + file.write_all(bytes)?; + // Ensure newline at end of output + if bytes.last() != Some(&b"\n"[0]) { + file.write(b"\n")?; + } + file.flush() } } } } -pub fn read_stdin() -> String { - let mut buffer = String::new(); +pub fn read_stdin() -> Vec { + let mut buffer: Vec = vec![]; - for line in io::stdin().lock().lines() { - buffer += line.unwrap_or_default().as_str(); - buffer += "\n"; + match io::stdin().lock().read_to_end(&mut buffer) { + Ok(_) => buffer, + Err(_) => buffer, } - - buffer } fn main() { @@ -77,7 +84,7 @@ fn main() { if target.clone() == "-" { // Read from pipe (stdin) use_stdin = true; - // Set default target URL to an empty data URL; the user can control it via --base-url + // Set default target URL to an empty data URL; the user can set it via --base-url target_url = Url::parse("data:text/html,").unwrap(); } else { match Url::parse(&target.clone()) { @@ -96,7 +103,7 @@ fn main() { } Err(_err) => { // Failed to parse given base URL, - // perhaps it's a filesystem path? + // perhaps it's a filesystem path? 
let path: &Path = Path::new(&target); if path.exists() { @@ -123,7 +130,7 @@ fn main() { } } else { // Last chance, now we do what browsers do: - // prepend "http://" and hope it points to a website + // prepend "http://" and hope it points to a website target.insert_str(0, "http://"); target_url = Url::parse(&target).unwrap(); } @@ -131,9 +138,6 @@ fn main() { } } - // Define output - let mut output = Output::new(&options.output).expect("Could not prepare output"); - // Initialize client let mut cache = HashMap::new(); let mut header_map = HeaderMap::new(); @@ -158,20 +162,33 @@ fn main() { // At this stage we assume that the base URL is the same as the target URL base_url = target_url.clone(); - let mut dom; + let data: Vec; + let mut document_encoding: String = str!(); + let mut dom: RcDom; // Retrieve target document if use_stdin { - dom = html_to_dom(&read_stdin()); + data = read_stdin(); } else if target_url.scheme() == "file" || (target_url.scheme() == "http" || target_url.scheme() == "https") + || target_url.scheme() == "data" { match retrieve_asset(&mut cache, &client, &target_url, &target_url, &options, 0) { - Ok((data, final_url, _media_type)) => { + Ok((retrieved_data, final_url, media_type, charset)) => { + // Make sure the media type is text/html + if !media_type.eq_ignore_ascii_case("text/html") { + if !options.silent { + eprintln!("Unsupported document media type"); + } + process::exit(1); + } + if options.base_url.clone().unwrap_or(str!()).is_empty() { - base_url = final_url + base_url = final_url; } - dom = html_to_dom(&String::from_utf8_lossy(&data)); + + data = retrieved_data; + document_encoding = charset; } Err(_) => { if !options.silent { @@ -180,36 +197,37 @@ fn main() { process::exit(1); } } - } else if target_url.scheme() == "data" { - let (media_type, data): (String, Vec) = parse_data_url(&target_url); - - if !media_type.eq_ignore_ascii_case("text/html") { - if !options.silent { - eprintln!("Unsupported data URL media type"); - } - 
process::exit(1); - } - - dom = html_to_dom(&String::from_utf8_lossy(&data)); } else { process::exit(1); } + // Initial parse to read document's charset from META tag + dom = html_to_dom(&data, document_encoding.clone()); + + // Attempt to determine document's charset + if let Some(charset) = get_charset(&dom.document) { + if !charset.is_empty() { + // TODO && label(charset) != UTF_8 + document_encoding = charset; + dom = html_to_dom(&data, document_encoding.clone()); + } + } + // Use custom base URL if specified, read and use what's in the DOM otherwise - let b: String = options.base_url.clone().unwrap_or(str!()); - if b.is_empty() { + let custom_base_url: String = options.base_url.clone().unwrap_or(str!()); + if custom_base_url.is_empty() { // No custom base URL is specified, - // try to see if the document has BASE tag + // try to see if the document has BASE tag if let Some(existing_base_url) = get_base_url(&dom.document) { base_url = resolve_url(&target_url, &existing_base_url); } } else { // Custom base URL provided - match Url::parse(&b) { + match Url::parse(&custom_base_url) { Ok(parsed_url) => { if parsed_url.scheme() == "file" { // File base URLs can only work with - // documents saved from filesystem + // documents saved from filesystem if target_url.scheme() == "file" { base_url = parsed_url; } @@ -219,10 +237,10 @@ fn main() { } Err(_) => { // Failed to parse given base URL, - // perhaps it's a filesystem path? + // perhaps it's a filesystem path? 
if target_url.scheme() == "file" { // Relative paths could work for documents saved from filesystem - let path: &Path = Path::new(&b); + let path: &Path = Path::new(&custom_base_url); if path.exists() { match Url::from_file_path(fs::canonicalize(&path).unwrap()) { Ok(file_url) => { @@ -230,7 +248,10 @@ fn main() { } Err(_) => { if !options.silent { - eprintln!("Could not map given path to base URL: {}", b); + eprintln!( + "Could not map given path to base URL: {}", + custom_base_url + ); } process::exit(1); } @@ -241,11 +262,10 @@ fn main() { } } - // Embed remote assets + // Traverse through the document and embed remote assets walk_and_embed_assets(&mut cache, &client, &base_url, &dom.document, &options, 0); - // Update or add new BASE tag to reroute network requests - // and hash-links in the final document + // Update or add new BASE element to reroute network requests and hash-links if let Some(new_base_url) = options.base_url.clone() { dom = set_base_url(&dom.document, new_base_url); } @@ -265,7 +285,8 @@ fn main() { &options, 0, ) { - Ok((data, final_url, media_type)) => { + Ok((data, final_url, media_type, _charset)) => { + // TODO: use charset let favicon_data_url: Url = create_data_url(&media_type, &data, &final_url); dom = add_favicon(&dom.document, favicon_data_url.to_string()); } @@ -275,20 +296,26 @@ fn main() { } } + // Enforce UTF-8 encoding for documents that may end up having garbled html entities + // due to html5ever forcefully converting them into UTF-8 byte sequences. 
+ if document_encoding.eq_ignore_ascii_case("iso-8859-1") { + document_encoding = str!("utf-8"); + dom = set_charset(dom, document_encoding.clone()); + } + // Serialize DOM tree - let mut result: String = stringify_document(&dom.document, &options); + let mut result: Vec = serialize_document(dom, document_encoding, &options); // Add metadata tag if !options.no_metadata { - let metadata_comment: String = create_metadata_tag(&target_url); - result.insert_str(0, &metadata_comment); - if metadata_comment.len() > 0 { - result.insert_str(metadata_comment.len(), "\n"); - } + let mut metadata_comment: String = create_metadata_tag(&target_url); + metadata_comment += "\n"; + result.splice(0..0, metadata_comment.as_bytes().to_vec()); } + // Define output + let mut output = Output::new(&options.output).expect("Could not prepare output"); + // Write result into stdout or file - output - .writeln_str(&result) - .expect("Could not write HTML output"); + output.write(&result).expect("Could not write HTML output"); } diff --git a/src/tests/cli/base_url.rs b/src/tests/cli/base_url.rs index 192e1e7..d84932f 100644 --- a/src/tests/cli/base_url.rs +++ b/src/tests/cli/base_url.rs @@ -22,18 +22,18 @@ mod passing { .output() .unwrap(); + // STDERR should be empty + assert_eq!(String::from_utf8_lossy(&out.stderr), ""); + // STDOUT should contain newly added base URL assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), + String::from_utf8_lossy(&out.stdout), "\ \ Hello, World!\n" ); - // STDERR should be empty - assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); - - // The exit code should be 0 + // Exit code should be 0 out.assert().code(0); } @@ -46,18 +46,18 @@ mod passing { .output() .unwrap(); + // STDERR should be empty + assert_eq!(String::from_utf8_lossy(&out.stderr), ""); + // STDOUT should contain newly added base URL assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), + String::from_utf8_lossy(&out.stdout), "\ \ Hello, World!\n" ); - // STDERR should be empty - 
assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); - - // The exit code should be 0 + // Exit code should be 0 out.assert().code(0); } @@ -72,18 +72,18 @@ mod passing { .output() .unwrap(); + // STDERR should be empty + assert_eq!(String::from_utf8_lossy(&out.stderr), ""); + // STDOUT should contain newly added base URL assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), + String::from_utf8_lossy(&out.stdout), "\ \ Hello, World!\n" ); - // STDERR should be empty - assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); - - // The exit code should be 0 + // Exit code should be 0 out.assert().code(0); } @@ -98,18 +98,18 @@ mod passing { .output() .unwrap(); + // STDERR should be empty + assert_eq!(String::from_utf8_lossy(&out.stderr), ""); + // STDOUT should contain newly added base URL assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), + String::from_utf8_lossy(&out.stdout), "\ \ Hello, World!\n" ); - // STDERR should be empty - assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); - - // The exit code should be 0 + // Exit code should be 0 out.assert().code(0); } } diff --git a/src/tests/cli/basic.rs b/src/tests/cli/basic.rs index 53b2be9..d3d7b93 100644 --- a/src/tests/cli/basic.rs +++ b/src/tests/cli/basic.rs @@ -19,16 +19,16 @@ mod passing { let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); let out = cmd.arg("-V").output().unwrap(); + // STDERR should be empty + assert_eq!(String::from_utf8_lossy(&out.stderr), ""); + // STDOUT should contain program name and version assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), + String::from_utf8_lossy(&out.stdout), format!("{} {}\n", env!("CARGO_PKG_NAME"), env!("CARGO_PKG_VERSION")) ); - // STDERR should be empty - assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); - - // The exit code should be 0 + // Exit code should be 0 out.assert().code(0); } @@ -46,11 +46,17 @@ mod passing { cmd.stdin(echo_out); let out = cmd.arg("-M").arg("-").output().unwrap(); + // STDERR 
should be empty + assert_eq!(String::from_utf8_lossy(&out.stderr), ""); + // STDOUT should contain HTML created out of STDIN assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), + String::from_utf8_lossy(&out.stdout), "Hello from STDIN\n\n" ); + + // Exit code should be 0 + out.assert().code(0); } #[test] @@ -64,15 +70,9 @@ mod passing { let out = cmd.arg("-M").arg(path_html.as_os_str()).output().unwrap(); - // STDOUT should contain embedded CSS url()'s - assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), - "\n\n" - ); - // STDERR should list files that got retrieved assert_eq!( - std::str::from_utf8(&out.stderr).unwrap(), + String::from_utf8_lossy(&out.stderr), format!( "\ {file_url_html}\n \ @@ -85,7 +85,13 @@ mod passing { ) ); - // The exit code should be 0 + // STDOUT should contain embedded CSS url()'s + assert_eq!( + String::from_utf8_lossy(&out.stdout), + "\n\n" + ); + + // Exit code should be 0 out.assert().code(0); } } @@ -108,16 +114,16 @@ mod failing { let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); let out = cmd.arg("").output().unwrap(); - // STDOUT should be empty - assert_eq!(std::str::from_utf8(&out.stdout).unwrap(), ""); - // STDERR should contain error description assert_eq!( - std::str::from_utf8(&out.stderr).unwrap(), + String::from_utf8_lossy(&out.stderr), "No target specified\n" ); - // The exit code should be 1 + // STDOUT should be empty + assert_eq!(String::from_utf8_lossy(&out.stdout), ""); + + // Exit code should be 1 out.assert().code(1); } } diff --git a/src/tests/cli/data_url.rs b/src/tests/cli/data_url.rs index 280ed40..c89c9d4 100644 --- a/src/tests/cli/data_url.rs +++ b/src/tests/cli/data_url.rs @@ -21,18 +21,18 @@ mod passing { .output() .unwrap(); + // STDERR should be empty + assert_eq!(String::from_utf8_lossy(&out.stderr), ""); + // STDOUT should contain isolated HTML assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), + String::from_utf8_lossy(&out.stdout), "\ \ Hello, World!\n" ); - // STDERR 
should be empty - assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); - - // The exit code should be 0 + // Exit code should be 0 out.assert().code(0); } @@ -46,19 +46,19 @@ mod passing { .output() .unwrap(); + // STDERR should be empty + assert_eq!(String::from_utf8_lossy(&out.stderr), ""); + // STDOUT should contain HTML with no CSS assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), + String::from_utf8_lossy(&out.stdout), "\ \ \ Hello\n" ); - // STDERR should be empty - assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); - - // The exit code should be 0 + // Exit code should be 0 out.assert().code(0); } @@ -72,19 +72,19 @@ mod passing { .output() .unwrap(); + // STDERR should be empty + assert_eq!(String::from_utf8_lossy(&out.stderr), ""); + // STDOUT should contain HTML with no web fonts assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), + String::from_utf8_lossy(&out.stdout), "\ \ \ Hi\n" ); - // STDERR should be empty - assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); - - // The exit code should be 0 + // Exit code should be 0 out.assert().code(0); } @@ -98,18 +98,18 @@ mod passing { .output() .unwrap(); + // STDERR should be empty + assert_eq!(String::from_utf8_lossy(&out.stderr), ""); + // STDOUT should contain HTML with no iframes assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), + String::from_utf8_lossy(&out.stdout), "\ \ Hi\n" ); - // STDERR should be empty - assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); - - // The exit code should be 0 + // Exit code should be 0 out.assert().code(0); } @@ -123,9 +123,12 @@ mod passing { .output() .unwrap(); + // STDERR should be empty + assert_eq!(String::from_utf8_lossy(&out.stderr), ""); + // STDOUT should contain HTML with no images assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), + String::from_utf8_lossy(&out.stdout), format!( "\ \ @@ -140,10 +143,7 @@ mod passing { ) ); - // STDERR should be empty - 
assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); - - // The exit code should be 0 + // Exit code should be 0 out.assert().code(0); } @@ -157,9 +157,12 @@ mod passing { .output() .unwrap(); + // STDERR should be empty + assert_eq!(String::from_utf8_lossy(&out.stderr), ""); + // STDOUT should contain HTML with no JS assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), + String::from_utf8_lossy(&out.stdout), "\ \ \ @@ -168,10 +171,7 @@ mod passing { \n" ); - // STDERR should be empty - assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); - - // The exit code should be 0 + // Exit code should be 0 out.assert().code(0); } } @@ -194,16 +194,16 @@ mod failing { let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); let out = cmd.arg("data:,Hello%2C%20World!").output().unwrap(); - // STDOUT should contain HTML - assert_eq!(std::str::from_utf8(&out.stdout).unwrap(), ""); - // STDERR should contain error description assert_eq!( - std::str::from_utf8(&out.stderr).unwrap(), - "Unsupported data URL media type\n" + String::from_utf8_lossy(&out.stderr), + "Unsupported document media type\n" ); - // The exit code should be 1 + // STDOUT should contain HTML + assert_eq!(String::from_utf8_lossy(&out.stdout), ""); + + // Exit code should be 1 out.assert().code(1); } @@ -216,16 +216,16 @@ mod failing { .output() .unwrap(); + // STDERR should be empty + assert_eq!(String::from_utf8_lossy(&out.stderr), ""); + // STDOUT should contain HTML with no JS in it assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), + String::from_utf8_lossy(&out.stdout), "\n" ); - // STDERR should be empty - assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); - - // The exit code should be 0 + // Exit code should be 0 out.assert().code(0); } } diff --git a/src/tests/cli/local_files.rs b/src/tests/cli/local_files.rs index 0c62e5b..d27af67 100644 --- a/src/tests/cli/local_files.rs +++ b/src/tests/cli/local_files.rs @@ -10,7 +10,7 @@ mod passing { use 
assert_cmd::prelude::*; use std::env; use std::fs; - use std::path::Path; + use std::path::{Path, MAIN_SEPARATOR}; use std::process::Command; use url::Url; @@ -21,18 +21,33 @@ mod passing { str!(env::current_dir().unwrap().to_str().unwrap()).replace("\\", "/"); let out = cmd .arg("-M") - .arg(if cfg!(windows) { - "src\\tests\\data\\basic\\local-file.html" - } else { - "src/tests/data/basic/local-file.html" - }) + .arg(format!( + "src{s}tests{s}data{s}basic{s}local-file.html", + s = MAIN_SEPARATOR + )) .output() .unwrap(); let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" }; + // STDERR should contain list of retrieved file URLs, two missing + assert_eq!( + String::from_utf8_lossy(&out.stderr), + format!( + "\ + {file}{cwd}/src/tests/data/basic/local-file.html\n \ + {file}{cwd}/src/tests/data/basic/local-style.css\n \ + {file}{cwd}/src/tests/data/basic/local-style-does-not-exist.css (not found)\n \ + {file}{cwd}/src/tests/data/basic/monolith.png (not found)\n \ + {file}{cwd}/src/tests/data/basic/local-script.js\n\ + ", + file = file_url_protocol, + cwd = cwd_normalized + ) + ); + // STDOUT should contain HTML from the local file assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), + String::from_utf8_lossy(&out.stdout), "\ \n \ \n \ @@ -47,23 +62,7 @@ mod passing { " ); - // STDERR should contain list of retrieved file URLs, two missing - assert_eq!( - std::str::from_utf8(&out.stderr).unwrap(), - format!( - "\ - {file}{cwd}/src/tests/data/basic/local-file.html\n \ - {file}{cwd}/src/tests/data/basic/local-style.css\n \ - {file}{cwd}/src/tests/data/basic/local-style-does-not-exist.css (not found)\n \ - {file}{cwd}/src/tests/data/basic/monolith.png (not found)\n \ - {file}{cwd}/src/tests/data/basic/local-script.js\n\ - ", - file = file_url_protocol, - cwd = cwd_normalized - ) - ); - - // The exit code should be 0 + // Exit code should be 0 out.assert().code(0); } @@ -79,9 +78,18 @@ mod passing { .output() .unwrap(); + // STDERR should 
contain only the target file + assert_eq!( + String::from_utf8_lossy(&out.stderr), + format!( + "{file_url_html}\n", + file_url_html = Url::from_file_path(fs::canonicalize(&path_html).unwrap()).unwrap(), + ) + ); + // STDOUT should contain HTML from the local file assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), + String::from_utf8_lossy(&out.stdout), format!( "\ \ @@ -100,16 +108,7 @@ mod passing { ) ); - // STDERR should contain only the target file - assert_eq!( - std::str::from_utf8(&out.stderr).unwrap(), - format!( - "{file_url_html}\n", - file_url_html = Url::from_file_path(fs::canonicalize(&path_html).unwrap()).unwrap(), - ) - ); - - // The exit code should be 0 + // Exit code should be 0 out.assert().code(0); } @@ -122,25 +121,27 @@ mod passing { let out = cmd .arg("-M") .arg("-cji") - .arg(if cfg!(windows) { - format!( - "{file}{cwd}/src/tests/data/basic/local-file.html", - file = file_url_protocol, - cwd = cwd_normalized, - ) - } else { - format!( - "{file}{cwd}/src/tests/data/basic/local-file.html", - file = file_url_protocol, - cwd = cwd_normalized, - ) - }) + .arg(format!( + "{file}{cwd}/src/tests/data/basic/local-file.html", + file = file_url_protocol, + cwd = cwd_normalized, + )) .output() .unwrap(); + // STDERR should contain list of retrieved file URLs + assert_eq!( + String::from_utf8_lossy(&out.stderr), + format!( + "{file}{cwd}/src/tests/data/basic/local-file.html\n", + file = file_url_protocol, + cwd = cwd_normalized, + ) + ); + // STDOUT should contain HTML from the local file assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), + String::from_utf8_lossy(&out.stdout), format!( "\ \ @@ -159,17 +160,7 @@ mod passing { ) ); - // STDERR should contain list of retrieved file URLs - assert_eq!( - std::str::from_utf8(&out.stderr).unwrap(), - format!( - "{file}{cwd}/src/tests/data/basic/local-file.html\n", - file = file_url_protocol, - cwd = cwd_normalized, - ) - ); - - // The exit code should be 0 + // Exit code should be 0 
out.assert().code(0); } @@ -181,15 +172,9 @@ mod passing { let out = cmd.arg("-M").arg(path_html.as_os_str()).output().unwrap(); - // STDOUT should contain HTML with date URL for background-image in it - assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), - "
\n\n" - ); - // STDERR should list files that got retrieved assert_eq!( - std::str::from_utf8(&out.stderr).unwrap(), + String::from_utf8_lossy(&out.stderr), format!( "\ {file_url_html}\n \ @@ -200,7 +185,13 @@ mod passing { ) ); - // The exit code should be 0 + // STDOUT should contain HTML with date URL for background-image in it + assert_eq!( + String::from_utf8_lossy(&out.stdout), + "
\n\n" + ); + + // Exit code should be 0 out.assert().code(0); } @@ -229,9 +220,25 @@ mod passing { .output() .unwrap(); + // STDERR should contain list of retrieved file URLs + assert_eq!( + String::from_utf8_lossy(&out.stderr), + format!( + "\ + {file}{cwd}/src/tests/data/integrity/index.html\n \ + {file}{cwd}/src/tests/data/integrity/style.css\n \ + {file}{cwd}/src/tests/data/integrity/style.css\n \ + {file}{cwd}/src/tests/data/integrity/script.js\n \ + {file}{cwd}/src/tests/data/integrity/script.js\n\ + ", + file = file_url_protocol, + cwd = cwd_normalized, + ) + ); + // STDOUT should contain HTML from the local file; integrity attributes should be missing assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), + String::from_utf8_lossy(&out.stdout), format!( "\ \ @@ -247,23 +254,7 @@ mod passing { ) ); - // STDERR should contain list of retrieved file URLs - assert_eq!( - std::str::from_utf8(&out.stderr).unwrap(), - format!( - "\ - {file}{cwd}/src/tests/data/integrity/index.html\n \ - {file}{cwd}/src/tests/data/integrity/style.css\n \ - {file}{cwd}/src/tests/data/integrity/style.css\n \ - {file}{cwd}/src/tests/data/integrity/script.js\n \ - {file}{cwd}/src/tests/data/integrity/script.js\n\ - ", - file = file_url_protocol, - cwd = cwd_normalized, - ) - ); - - // The exit code should be 0 + // Exit code should be 0 out.assert().code(0); } } diff --git a/src/tests/cli/noscript.rs b/src/tests/cli/noscript.rs index 62cb917..85ebfdd 100644 --- a/src/tests/cli/noscript.rs +++ b/src/tests/cli/noscript.rs @@ -22,15 +22,9 @@ mod passing { let out = cmd.arg("-M").arg(path_html.as_os_str()).output().unwrap(); - // STDOUT should contain HTML with no CSS - assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), - "\n\n" - ); - // STDERR should contain target HTML and embedded SVG files assert_eq!( - std::str::from_utf8(&out.stderr).unwrap(), + String::from_utf8_lossy(&out.stderr), format!( "\ {file_url_html}\n \ @@ -41,7 +35,13 @@ mod passing { ) ); - // The exit code 
should be 0 + // STDOUT should contain HTML with no CSS + assert_eq!( + String::from_utf8_lossy(&out.stdout), + "\n\n" + ); + + // Exit code should be 0 out.assert().code(0); } @@ -53,15 +53,9 @@ mod passing { let out = cmd.arg("-Mn").arg(path_html.as_os_str()).output().unwrap(); - // STDOUT should contain HTML with no CSS - assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), - "\n\n" - ); - // STDERR should contain target HTML and embedded SVG files assert_eq!( - std::str::from_utf8(&out.stderr).unwrap(), + String::from_utf8_lossy(&out.stderr), format!( "\ {file_url_html}\n \ @@ -72,7 +66,13 @@ mod passing { ) ); - // The exit code should be 0 + // STDOUT should contain HTML with no CSS + assert_eq!( + String::from_utf8_lossy(&out.stdout), + "\n\n" + ); + + // Exit code should be 0 out.assert().code(0); } @@ -84,15 +84,9 @@ mod passing { let out = cmd.arg("-Mn").arg(path_html.as_os_str()).output().unwrap(); - // STDOUT should contain HTML with no CSS - assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), - "

JS is not active

\n\n" - ); - // STDERR should contain target HTML and embedded SVG files assert_eq!( - std::str::from_utf8(&out.stderr).unwrap(), + String::from_utf8_lossy(&out.stderr), format!( "\ {file_url_html}\n \ @@ -103,7 +97,13 @@ mod passing { ) ); - // The exit code should be 0 + // STDOUT should contain HTML with no CSS + assert_eq!( + String::from_utf8_lossy(&out.stdout), + "

JS is not active

\n\n" + ); + + // Exit code should be 0 out.assert().code(0); } @@ -115,22 +115,9 @@ mod passing { let out = cmd.arg("-Mn").arg(path_html.as_os_str()).output().unwrap(); - // STDOUT should contain HTML with no CSS - assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), - "\ - \ - \ - \ - \ - \n\ - \ - \n" - ); - // STDERR should contain target HTML and embedded SVG files assert_eq!( - std::str::from_utf8(&out.stderr).unwrap(), + String::from_utf8_lossy(&out.stderr), format!( "\ {file_url_html}\n \ @@ -141,7 +128,20 @@ mod passing { ) ); - // The exit code should be 0 + // STDOUT should contain HTML with no CSS + assert_eq!( + String::from_utf8_lossy(&out.stdout), + "\ + \ + \ + \ + \ + \n\ + \ + \n" + ); + + // Exit code should be 0 out.assert().code(0); } @@ -155,16 +155,16 @@ mod passing { .output() .unwrap(); + // STDERR should be empty + assert_eq!(String::from_utf8_lossy(&out.stderr), ""); + // STDOUT should contain unwrapped contents of NOSCRIPT element assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), + String::from_utf8_lossy(&out.stdout), "test\n" ); - // STDERR should be empty - assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); - - // The exit code should be 0 + // Exit code should be 0 out.assert().code(0); } } diff --git a/src/tests/cli/unusual_encodings.rs b/src/tests/cli/unusual_encodings.rs index 5ebd9ac..9493bd2 100644 --- a/src/tests/cli/unusual_encodings.rs +++ b/src/tests/cli/unusual_encodings.rs @@ -8,31 +8,42 @@ #[cfg(test)] mod passing { use assert_cmd::prelude::*; + use encoding_rs::Encoding; use std::env; - use std::process::Command; + use std::path::MAIN_SEPARATOR; + use std::process::{Command, Stdio}; #[test] - fn change_encoding_to_utf_8() { + fn change_iso88591_to_utf8_to_properly_display_html_entities() { let cwd = env::current_dir().unwrap(); let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/"); let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); let out = cmd .arg("-M") - 
.arg(if cfg!(windows) { - "src\\tests\\data\\unusual_encodings\\iso-8859-1.html" - } else { - "src/tests/data/unusual_encodings/iso-8859-1.html" - }) + .arg(format!( + "src{s}tests{s}data{s}unusual_encodings{s}iso-8859-1.html", + s = MAIN_SEPARATOR + )) .output() .unwrap(); let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" }; - // STDOUT should contain newly added base URL + // STDERR should contain only the target file assert_eq!( - std::str::from_utf8(&out.stdout).unwrap(), + String::from_utf8_lossy(&out.stderr), + format!( + "{file}{cwd}/src/tests/data/unusual_encodings/iso-8859-1.html\n", + file = file_url_protocol, + cwd = cwd_normalized, + ) + ); + + // STDOUT should contain original document but with UTF-8 charset + assert_eq!( + String::from_utf8_lossy(&out.stdout), "\ \n \ - \n \ + \n \ \n \ \n \ © Some Company\n \ @@ -40,17 +51,102 @@ mod passing { \n" ); + // Exit code should be 0 + out.assert().code(0); + } + + #[test] + fn properly_save_document_with_gb2312() { + let cwd = env::current_dir().unwrap(); + let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/"); + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + let out = cmd + .arg("-M") + .arg(format!( + "src{s}tests{s}data{s}unusual_encodings{s}gb2312.html", + s = MAIN_SEPARATOR + )) + .output() + .unwrap(); + let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" }; + // STDERR should contain only the target file assert_eq!( - std::str::from_utf8(&out.stderr).unwrap(), + String::from_utf8_lossy(&out.stderr), format!( - "{file}{cwd}/src/tests/data/unusual_encodings/iso-8859-1.html\n", + "{file}{cwd}/src/tests/data/unusual_encodings/gb2312.html\n", file = file_url_protocol, cwd = cwd_normalized, ) ); - // The exit code should be 0 + // STDOUT should contain original document without any modificatons + let s: String; + if let Some(encoding) = Encoding::for_label(b"gb2312") { + let (string, _, _) = 
encoding.decode(&out.stdout); + s = string.to_string(); + } else { + s = String::from_utf8_lossy(&out.stdout).to_string(); + } + assert_eq!( + s, + "\ + \n \ + \n \ + 近七成人减少线下需求\u{3000}银行数字化转型提速--经济·科技--人民网 \n\ + \n\ + \n \ +

近七成人减少线下需求\u{3000}银行数字化转型提速

\n\n\n\ + \ + \n" + ); + + // Exit code should be 0 + out.assert().code(0); + } + + #[test] + fn properly_save_document_with_gb2312_from_stdin() { + let mut echo = Command::new("cat") + .arg(format!( + "src{s}tests{s}data{s}unusual_encodings{s}gb2312.html", + s = MAIN_SEPARATOR + )) + .stdout(Stdio::piped()) + .spawn() + .unwrap(); + let echo_out = echo.stdout.take().unwrap(); + echo.wait().unwrap(); + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.stdin(echo_out); + let out = cmd.arg("-M").arg("-").output().unwrap(); + + // STDERR should be empty + assert_eq!(String::from_utf8_lossy(&out.stderr), ""); + + // STDOUT should contain HTML created out of STDIN + let s: String; + if let Some(encoding) = Encoding::for_label(b"gb2312") { + let (string, _, _) = encoding.decode(&out.stdout); + s = string.to_string(); + } else { + s = String::from_utf8_lossy(&out.stdout).to_string(); + } + assert_eq!( + s, + "\ + \n \ + \n \ + 近七成人减少线下需求\u{3000}银行数字化转型提速--经济·科技--人民网 \n\ + \n\ + \n \ +

近七成人减少线下需求\u{3000}银行数字化转型提速

\n\n\n\ + \ + \n" + ); + + // Exit code should be 0 out.assert().code(0); } } diff --git a/src/tests/css/embed_css.rs b/src/tests/css/embed_css.rs index d02f63c..aeae5dc 100644 --- a/src/tests/css/embed_css.rs +++ b/src/tests/css/embed_css.rs @@ -331,7 +331,7 @@ mod passing { "; const CSS_OUT: &str = "\ #language a[href=\"#translations\"]:before {\n\ - content: url(\"data:;base64,\") \"\\a \";\n\ + content: url(\"data:text/plain;base64,\") \"\\a \";\n\ white-space: pre }\n\ "; diff --git a/src/tests/data/unusual_encodings/gb2312.html b/src/tests/data/unusual_encodings/gb2312.html new file mode 100644 index 0000000..cd86ae6 --- /dev/null +++ b/src/tests/data/unusual_encodings/gb2312.html @@ -0,0 +1,9 @@ + + + + ߳˼ֻת--áƼ-- + + +

߳˼ֻת

+ + diff --git a/src/tests/html/add_favicon.rs b/src/tests/html/add_favicon.rs index 80bee70..ef83553 100644 --- a/src/tests/html/add_favicon.rs +++ b/src/tests/html/add_favicon.rs @@ -14,7 +14,7 @@ mod passing { #[test] fn basic() { let html = "
text
"; - let mut dom = html::html_to_dom(&html); + let mut dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); dom = html::add_favicon(&dom.document, "I_AM_A_FAVICON_DATA_URL".to_string()); diff --git a/src/tests/html/get_base_url.rs b/src/tests/html/get_base_url.rs index a1b959c..d240040 100644 --- a/src/tests/html/get_base_url.rs +++ b/src/tests/html/get_base_url.rs @@ -19,7 +19,7 @@ mod passing { "; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); assert_eq!( html::get_base_url(&dom.document), @@ -38,7 +38,7 @@ mod passing { "; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); assert_eq!( html::get_base_url(&dom.document), @@ -67,7 +67,7 @@ mod failing { "; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); assert_eq!(html::get_base_url(&dom.document), None); } @@ -82,7 +82,7 @@ mod failing { "; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); assert_eq!(html::get_base_url(&dom.document), None); } @@ -97,7 +97,7 @@ mod failing { "; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); assert_eq!(html::get_base_url(&dom.document), Some(str!())); } diff --git a/src/tests/html/get_charset.rs b/src/tests/html/get_charset.rs new file mode 100644 index 0000000..89751ba --- /dev/null +++ b/src/tests/html/get_charset.rs @@ -0,0 +1,72 @@ +// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗ +// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝ +// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗ +// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║ +// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝ +// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ + +#[cfg(test)] +mod passing { + use crate::html; + + #[test] + fn meta_content_type() { + let html = " + + + + + + +"; + let dom = 
html::html_to_dom(&html.as_bytes().to_vec(), str!()); + + assert_eq!(html::get_charset(&dom.document), Some(str!("GB2312"))); + } + + #[test] + fn meta_charset() { + let html = " + + + + + + +"; + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); + + assert_eq!(html::get_charset(&dom.document), Some(str!("GB2312"))); + } + + #[test] + fn multiple_conflicting_meta_charset_first() { + let html = " + + + + + + + +"; + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); + + assert_eq!(html::get_charset(&dom.document), Some(str!("utf-8"))); + } + #[test] + fn multiple_conflicting_meta_content_type_first() { + let html = " + + + + + + + +"; + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); + + assert_eq!(html::get_charset(&dom.document), Some(str!("GB2312"))); + } +} diff --git a/src/tests/html/get_node_attr.rs b/src/tests/html/get_node_attr.rs index a8b7448..f7823b9 100644 --- a/src/tests/html/get_node_attr.rs +++ b/src/tests/html/get_node_attr.rs @@ -14,7 +14,7 @@ mod passing { #[test] fn div_two_style_attributes() { let html = "
"; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let mut count = 0; fn test_walk(node: &Handle, i: &mut i8) { diff --git a/src/tests/html/get_node_name.rs b/src/tests/html/get_node_name.rs index 79da75e..49ffbed 100644 --- a/src/tests/html/get_node_name.rs +++ b/src/tests/html/get_node_name.rs @@ -14,7 +14,7 @@ mod passing { #[test] fn parent_node_names() { let html = "

"; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let mut count = 0; fn test_walk(node: &Handle, i: &mut i8) { diff --git a/src/tests/html/has_favicon.rs b/src/tests/html/has_favicon.rs index bbc8588..f9f7541 100644 --- a/src/tests/html/has_favicon.rs +++ b/src/tests/html/has_favicon.rs @@ -12,7 +12,7 @@ mod passing { #[test] fn icon() { let html = "
text
"; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let res: bool = html::has_favicon(&dom.document); assert!(res); @@ -21,7 +21,7 @@ mod passing { #[test] fn shortcut_icon() { let html = "
text
"; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let res: bool = html::has_favicon(&dom.document); assert!(res); @@ -42,7 +42,7 @@ mod failing { #[test] fn absent() { let html = "
text
"; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let res: bool = html::has_favicon(&dom.document); assert!(!res); diff --git a/src/tests/html/mod.rs b/src/tests/html/mod.rs index b9576da..955d28d 100644 --- a/src/tests/html/mod.rs +++ b/src/tests/html/mod.rs @@ -4,10 +4,11 @@ mod compose_csp; mod create_metadata_tag; mod embed_srcset; mod get_base_url; +mod get_charset; mod get_node_attr; mod get_node_name; mod has_favicon; mod is_icon; +mod serialize_document; mod set_node_attr; -mod stringify_document; mod walk_and_embed_assets; diff --git a/src/tests/html/stringify_document.rs b/src/tests/html/serialize_document.rs similarity index 86% rename from src/tests/html/stringify_document.rs rename to src/tests/html/serialize_document.rs index 4fe2ccc..11ae0c5 100644 --- a/src/tests/html/stringify_document.rs +++ b/src/tests/html/serialize_document.rs @@ -13,11 +13,11 @@ mod passing { #[test] fn div_as_root_element() { let html = "
"; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let options = Options::default(); assert_eq!( - html::stringify_document(&dom.document, &options), + String::from_utf8_lossy(&html::serialize_document(dom, str!(), &options)), "
" ); } @@ -28,15 +28,16 @@ mod passing { \ \
"; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let mut options = Options::default(); options.isolate = true; assert_eq!( - html::stringify_document( - &dom.document, + String::from_utf8_lossy(&html::serialize_document( + dom, + str!(), &options - ), + )), "\ \ \ @@ -59,12 +60,12 @@ mod passing { Unstyled document\ \
"; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let mut options = Options::default(); options.no_css = true; assert_eq!( - html::stringify_document(&dom.document, &options), + String::from_utf8_lossy(&html::serialize_document(dom, str!(), &options)), "\ \ \ @@ -83,15 +84,16 @@ mod passing { Frameless document\ \
"; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let mut options = Options::default(); options.no_frames = true; assert_eq!( - html::stringify_document( - &dom.document, + String::from_utf8_lossy(&html::serialize_document( + dom, + str!(), &options - ), + )), "\ \ \ @@ -115,7 +117,7 @@ mod passing { \ \ "; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let mut options = Options::default(); options.isolate = true; options.no_css = true; @@ -125,10 +127,11 @@ mod passing { options.no_images = true; assert_eq!( - html::stringify_document( - &dom.document, + String::from_utf8_lossy(&html::serialize_document( + dom, + str!(), &options - ), + )), "\ \ \ diff --git a/src/tests/html/set_node_attr.rs b/src/tests/html/set_node_attr.rs index 140895b..9366f20 100644 --- a/src/tests/html/set_node_attr.rs +++ b/src/tests/html/set_node_attr.rs @@ -14,7 +14,7 @@ mod passing { #[test] fn html_lang_and_body_style() { let html = ""; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let mut count = 0; fn test_walk(node: &Handle, i: &mut i8) { @@ -67,7 +67,7 @@ mod passing { #[test] fn body_background() { let html = ""; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let mut count = 0; fn test_walk(node: &Handle, i: &mut i8) { diff --git a/src/tests/html/walk_and_embed_assets.rs b/src/tests/html/walk_and_embed_assets.rs index 7e2ab83..8755cda 100644 --- a/src/tests/html/walk_and_embed_assets.rs +++ b/src/tests/html/walk_and_embed_assets.rs @@ -20,7 +20,7 @@ mod passing { let cache = &mut HashMap::new(); let html: &str = "

"; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let url: Url = Url::parse("http://localhost").unwrap(); let mut options = Options::default(); @@ -42,7 +42,7 @@ mod passing { #[test] fn ensure_no_recursive_iframe() { let html = "

"; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); @@ -65,7 +65,7 @@ mod passing { #[test] fn ensure_no_recursive_frame() { let html = ""; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); @@ -93,7 +93,7 @@ mod passing { \
\ "; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); @@ -129,7 +129,7 @@ mod passing { fn no_images() { let html = "\
"; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); @@ -166,7 +166,7 @@ mod passing { fn no_body_background_images() { let html = ""; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); @@ -190,7 +190,7 @@ mod passing { #[test] fn no_frames() { let html = ""; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); @@ -222,7 +222,7 @@ mod passing { #[test] fn no_iframes() { let html = ""; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); @@ -258,7 +258,7 @@ mod passing { \ \ "; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); @@ -293,7 +293,7 @@ mod passing { fn keeps_integrity_for_linked_assets() { let html = "Has integrity\ "; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); @@ -328,7 +328,7 @@ mod passing { \ \ "; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); @@ -366,7 +366,7 @@ mod passing { \ \ "; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut 
HashMap::new(); @@ -410,7 +410,7 @@ mod passing { \ \ "; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); @@ -452,7 +452,7 @@ mod passing { \ \ "; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); @@ -488,7 +488,7 @@ mod passing { #[test] fn preserves_script_type_json() { let html = ""; - let dom = html::html_to_dom(&html); + let dom = html::html_to_dom(&html.as_bytes().to_vec(), str!()); let url: Url = Url::parse("http://localhost").unwrap(); let cache = &mut HashMap::new(); diff --git a/src/tests/url/parse_data_url.rs b/src/tests/url/parse_data_url.rs index e39ce97..c4a6063 100644 --- a/src/tests/url/parse_data_url.rs +++ b/src/tests/url/parse_data_url.rs @@ -13,9 +13,10 @@ mod passing { #[test] fn parse_text_html_base64() { - let (media_type, data) = url::parse_data_url(&Url::parse("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg==").unwrap()); + let (media_type, charset, data) = url::parse_data_url(&Url::parse("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg==").unwrap()); assert_eq!(media_type, "text/html"); + assert_eq!(charset, "US-ASCII"); assert_eq!( String::from_utf8_lossy(&data), "Work expands so as to fill the time available for its completion" @@ -24,11 +25,12 @@ mod passing { #[test] fn parse_text_html_utf8() { - let (media_type, data) = url::parse_data_url( - &Url::parse("data:text/html;utf8,Work expands so as to fill the time available for its completion").unwrap(), + let (media_type, charset, data) = url::parse_data_url( + &Url::parse("data:text/html;charset=utf8,Work expands so as to fill the time available for its completion").unwrap(), ); 
assert_eq!(media_type, "text/html"); + assert_eq!(charset, "utf8"); assert_eq!( String::from_utf8_lossy(&data), "Work expands so as to fill the time available for its completion" @@ -37,7 +39,7 @@ mod passing { #[test] fn parse_text_html_plaintext() { - let (media_type, data) = url::parse_data_url( + let (media_type, charset, data) = url::parse_data_url( &Url::parse( "data:text/html,Work expands so as to fill the time available for its completion", ) @@ -45,6 +47,7 @@ mod passing { ); assert_eq!(media_type, "text/html"); + assert_eq!(charset, "US-ASCII"); assert_eq!( String::from_utf8_lossy(&data), "Work expands so as to fill the time available for its completion" @@ -53,26 +56,31 @@ mod passing { #[test] fn parse_text_css_url_encoded() { - let (media_type, data) = + let (media_type, charset, data) = url::parse_data_url(&Url::parse("data:text/css,div{background-color:%23000}").unwrap()); assert_eq!(media_type, "text/css"); + assert_eq!(charset, "US-ASCII"); assert_eq!(String::from_utf8_lossy(&data), "div{background-color:#000}"); } #[test] fn parse_no_media_type_base64() { - let (media_type, data) = url::parse_data_url(&Url::parse("data:;base64,dGVzdA==").unwrap()); + let (media_type, charset, data) = + url::parse_data_url(&Url::parse("data:;base64,dGVzdA==").unwrap()); - assert_eq!(media_type, ""); + assert_eq!(media_type, "text/plain"); + assert_eq!(charset, "US-ASCII"); assert_eq!(String::from_utf8_lossy(&data), "test"); } #[test] fn parse_no_media_type_no_encoding() { - let (media_type, data) = url::parse_data_url(&Url::parse("data:;,test%20test").unwrap()); + let (media_type, charset, data) = + url::parse_data_url(&Url::parse("data:;,test%20test").unwrap()); - assert_eq!(media_type, ""); + assert_eq!(media_type, "text/plain"); + assert_eq!(charset, "US-ASCII"); assert_eq!(String::from_utf8_lossy(&data), "test test"); } } @@ -92,9 +100,10 @@ mod failing { #[test] fn empty_data_url() { - let (media_type, data) = 
url::parse_data_url(&Url::parse("data:,").unwrap()); + let (media_type, charset, data) = url::parse_data_url(&Url::parse("data:,").unwrap()); - assert_eq!(media_type, ""); + assert_eq!(media_type, "text/plain"); + assert_eq!(charset, "US-ASCII"); assert_eq!(String::from_utf8_lossy(&data), ""); } } diff --git a/src/tests/utils/detect_media_type.rs b/src/tests/utils/detect_media_type.rs index 970af13..707fc44 100644 --- a/src/tests/utils/detect_media_type.rs +++ b/src/tests/utils/detect_media_type.rs @@ -195,7 +195,7 @@ mod failing { let dummy_url: Url = Url::parse("data:,").unwrap(); assert_eq!( utils::detect_media_type(b"abcdef0123456789", &dummy_url), - "" + "application/octet-stream" ); } } diff --git a/src/tests/utils/indent.rs b/src/tests/utils/indent.rs index 112b712..fb706b3 100644 --- a/src/tests/utils/indent.rs +++ b/src/tests/utils/indent.rs @@ -28,4 +28,9 @@ mod passing { fn three() { assert_eq!(utils::indent(3), " "); } + + #[test] + fn four() { + assert_eq!(utils::indent(4), " "); + } } diff --git a/src/tests/utils/mod.rs b/src/tests/utils/mod.rs index 3231b30..e7c7739 100644 --- a/src/tests/utils/mod.rs +++ b/src/tests/utils/mod.rs @@ -1,3 +1,4 @@ mod detect_media_type; mod indent; +mod parse_content_type; mod retrieve_asset; diff --git a/src/tests/utils/parse_content_type.rs b/src/tests/utils/parse_content_type.rs new file mode 100644 index 0000000..07e4cc0 --- /dev/null +++ b/src/tests/utils/parse_content_type.rs @@ -0,0 +1,86 @@ +// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗ +// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝ +// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗ +// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║ +// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝ +// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ + +#[cfg(test)] +mod passing { + use crate::utils; + + #[test] + fn text_plain_utf8() { + let (media_type, charset, is_base64) = utils::parse_content_type("text/plain;charset=utf8"); + 
assert_eq!(media_type, "text/plain"); + assert_eq!(charset, "utf8"); + assert!(!is_base64); + } + + #[test] + fn text_plain_utf8_spaces() { + let (media_type, charset, is_base64) = + utils::parse_content_type(" text/plain ; charset=utf8 "); + assert_eq!(media_type, "text/plain"); + assert_eq!(charset, "utf8"); + assert!(!is_base64); + } + + #[test] + fn empty() { + let (media_type, charset, is_base64) = utils::parse_content_type(""); + assert_eq!(media_type, "text/plain"); + assert_eq!(charset, "US-ASCII"); + assert!(!is_base64); + } + + #[test] + fn base64() { + let (media_type, charset, is_base64) = utils::parse_content_type(";base64"); + assert_eq!(media_type, "text/plain"); + assert_eq!(charset, "US-ASCII"); + assert!(is_base64); + } + + #[test] + fn text_html_base64() { + let (media_type, charset, is_base64) = utils::parse_content_type("text/html;base64"); + assert_eq!(media_type, "text/html"); + assert_eq!(charset, "US-ASCII"); + assert!(is_base64); + } + + #[test] + fn only_media_type() { + let (media_type, charset, is_base64) = utils::parse_content_type("text/html"); + assert_eq!(media_type, "text/html"); + assert_eq!(charset, "US-ASCII"); + assert!(!is_base64); + } + + #[test] + fn only_media_type_colon() { + let (media_type, charset, is_base64) = utils::parse_content_type("text/html;"); + assert_eq!(media_type, "text/html"); + assert_eq!(charset, "US-ASCII"); + assert!(!is_base64); + } + + #[test] + fn media_type_gb2312_filename() { + let (media_type, charset, is_base64) = + utils::parse_content_type("text/html;charset=GB2312;filename=index.html"); + assert_eq!(media_type, "text/html"); + assert_eq!(charset, "GB2312"); + assert!(!is_base64); + } + + #[test] + fn media_type_filename_gb2312() { + let (media_type, charset, is_base64) = + utils::parse_content_type("text/html;filename=index.html;charset=GB2312"); + assert_eq!(media_type, "text/html"); + assert_eq!(charset, "GB2312"); + assert!(!is_base64); + } +} diff --git a/src/tests/utils/retrieve_asset.rs 
b/src/tests/utils/retrieve_asset.rs index 5e975d0..eee881d 100644 --- a/src/tests/utils/retrieve_asset.rs +++ b/src/tests/utils/retrieve_asset.rs @@ -26,7 +26,7 @@ mod passing { // If both source and target are data URLs, // ensure the result contains target data URL - let (data, final_url, media_type) = utils::retrieve_asset( + let (data, final_url, media_type, charset) = utils::retrieve_asset( cache, &client, &Url::parse("data:text/html;base64,c291cmNl").unwrap(), @@ -35,23 +35,16 @@ mod passing { 0, ) .unwrap(); + assert_eq!(&media_type, "text/html"); + assert_eq!(&charset, "US-ASCII"); assert_eq!( url::create_data_url(&media_type, &data, &final_url), - url::create_data_url( - "text/html", - "target".as_bytes(), - &Url::parse("data:text/html;base64,c291cmNl").unwrap() - ) + Url::parse("data:text/html;base64,dGFyZ2V0").unwrap(), ); assert_eq!( final_url, - url::create_data_url( - "text/html", - "target".as_bytes(), - &Url::parse("data:text/html;base64,c291cmNl").unwrap() - ) + Url::parse("data:text/html;base64,dGFyZ2V0").unwrap(), ); - assert_eq!(&media_type, "text/html"); } #[test] @@ -66,7 +59,7 @@ mod passing { // Inclusion of local assets from local sources should be allowed let cwd = env::current_dir().unwrap(); - let (data, final_url, _media_type) = utils::retrieve_asset( + let (data, final_url, media_type, charset) = utils::retrieve_asset( cache, &client, &Url::parse(&format!( @@ -85,7 +78,9 @@ mod passing { 0, ) .unwrap(); - assert_eq!(url::create_data_url("application/javascript", &data, &final_url), Url::parse("data:application/javascript;base64,ZG9jdW1lbnQuYm9keS5zdHlsZS5iYWNrZ3JvdW5kQ29sb3IgPSAiZ3JlZW4iOwpkb2N1bWVudC5ib2R5LnN0eWxlLmNvbG9yID0gInJlZCI7Cg==").unwrap()); + assert_eq!(&media_type, "application/javascript"); + assert_eq!(&charset, ""); + assert_eq!(url::create_data_url(&media_type, &data, &final_url), 
Url::parse("data:application/javascript;base64,ZG9jdW1lbnQuYm9keS5zdHlsZS5iYWNrZ3JvdW5kQ29sb3IgPSAiZ3JlZW4iOwpkb2N1bWVudC5ib2R5LnN0eWxlLmNvbG9yID0gInJlZCI7Cg==").unwrap()); assert_eq!( final_url, Url::parse(&format!( diff --git a/src/url.rs b/src/url.rs index 878e726..eea0bb3 100644 --- a/src/url.rs +++ b/src/url.rs @@ -1,7 +1,7 @@ use base64; use url::{form_urlencoded, Url}; -use crate::utils::detect_media_type; +use crate::utils::{detect_media_type, parse_content_type}; pub fn clean_url(url: Url) -> Url { let mut url = url.clone(); @@ -37,42 +37,26 @@ pub fn is_url_and_has_protocol(input: &str) -> bool { } } -pub fn parse_data_url(url: &Url) -> (String, Vec) { +pub fn parse_data_url(url: &Url) -> (String, String, Vec) { let path: String = url.path().to_string(); let comma_loc: usize = path.find(',').unwrap_or(path.len()); - let meta_data: String = path.chars().take(comma_loc).collect(); - let raw_data: String = path.chars().skip(comma_loc + 1).collect(); - - let text: String = percent_decode(raw_data); - - let meta_data_items: Vec<&str> = meta_data.split(';').collect(); - let mut media_type: String = str!(); - let mut encoding: &str = ""; - - let mut i: i8 = 0; - for item in &meta_data_items { - if i == 0 { - media_type = str!(item); - } else { - if item.eq_ignore_ascii_case("base64") - || item.eq_ignore_ascii_case("utf8") - || item.eq_ignore_ascii_case("charset=UTF-8") - { - encoding = item; - } - } + // Split data URL into meta data and raw data + let content_type: String = path.chars().take(comma_loc).collect(); + let data: String = path.chars().skip(comma_loc + 1).collect(); - i = i + 1; - } + // Parse meta data + let (media_type, charset, is_base64) = parse_content_type(&content_type); - let data: Vec = if encoding.eq_ignore_ascii_case("base64") { + // Parse raw data into vector of bytes + let text: String = percent_decode(data); + let blob: Vec = if is_base64 { base64::decode(&text).unwrap_or(vec![]) } else { text.as_bytes().to_vec() }; - (media_type, data) 
/// Media types that, despite not being `text/*`, contain plain text
/// and therefore do not need to be base64-encoded when embedded.
const PLAINTEXT_MEDIA_TYPES: &[&str] = &[
    "application/javascript",
    "application/json",
    "image/svg+xml",
];

/// Guesses a media type from a file name's extension.
///
/// Only the final extension is considered and the comparison is
/// case-insensitive ("photo.JPG" => "image/jpeg"). Unknown or missing
/// extensions fall back to "application/octet-stream".
pub fn detect_media_type_by_file_name(filename: &str) -> String {
    // rsplit('.').next() yields the substring after the last dot, or the
    // whole name when there is no dot — same outcome as split + last().
    let extension: String = filename.rsplit('.').next().unwrap_or("").to_lowercase();

    let media_type: &str = match extension.as_str() {
        "avi" => "video/avi",
        "bmp" => "image/bmp",
        "css" => "text/css",
        "flac" => "audio/flac",
        "gif" => "image/gif",
        "htm" | "html" => "text/html",
        "ico" => "image/x-icon",
        "jpeg" | "jpg" => "image/jpeg",
        "js" => "application/javascript",
        "json" => "application/json",
        "mp3" => "audio/mpeg",
        "mp4" | "m4v" => "video/mp4",
        "ogg" => "audio/ogg",
        "ogv" => "video/ogg",
        "pdf" => "application/pdf",
        "png" => "image/png",
        "svg" => "image/svg+xml",
        "swf" => "application/x-shockwave-flash",
        "tif" | "tiff" => "image/tiff",
        "txt" => "text/plain",
        "wav" => "audio/wav",
        "webp" => "image/webp",
        "woff" => "font/woff",
        "woff2" => "font/woff2",
        "xml" => "text/xml",
        _ => "application/octet-stream",
    };

    media_type.to_string()
}

/// Reports whether the given media type carries plain text:
/// any `text/*` type, or one of the known textual application types.
/// The check is case-insensitive.
pub fn is_plaintext_media_type(media_type: &str) -> bool {
    // Lowercase once instead of once per comparison.
    let media_type_lowercased: String = media_type.to_lowercase();
    media_type_lowercased.starts_with("text/")
        || PLAINTEXT_MEDIA_TYPES.contains(&media_type_lowercased.as_str())
}

/// Parses a Content-Type value (an HTTP header or a data-URL meta section)
/// into `(media_type, charset, is_base64)`.
///
/// Defaults follow RFC 2397: media type "text/plain", charset "US-ASCII",
/// not base64. The first `;`-separated item is the media type; later items
/// are scanned for a `base64` token and a `charset=` parameter. Parameter
/// names are matched case-insensitively (RFC 2045 §5.1); unknown parameters
/// such as `filename=` are ignored.
pub fn parse_content_type(content_type: &str) -> (String, String, bool) {
    let mut media_type: String = "text/plain".to_string();
    let mut charset: String = "US-ASCII".to_string();
    let mut is_base64: bool = false;

    for (i, item) in content_type.split(';').enumerate() {
        let item: &str = item.trim();

        if i == 0 {
            // First item is the media type; keep the default when empty.
            if !item.is_empty() {
                media_type = item.to_string();
            }
        } else if item.eq_ignore_ascii_case("base64") {
            is_base64 = true;
        } else {
            // `get(..8)` returns None when the item is shorter than 8 bytes
            // or index 8 is not a char boundary, so the slice below is safe.
            let has_charset_key: bool = item
                .get(..8)
                .map_or(false, |prefix| prefix.eq_ignore_ascii_case("charset="));
            if has_charset_key {
                charset = item[8..].to_string();
            }
        }
    }

    (media_type, charset, is_base64)
}
eprintln!("{}{}", indent(depth).as_str(), &url); } - Ok((fs::read(&path).expect(""), url.clone(), str!())) + let file_blob: Vec = fs::read(&path).expect("Unable to read file"); + + Ok(( + file_blob.clone(), + url.clone(), + detect_media_type(&file_blob, url), + str!(), + )) } } else { if !options.silent { @@ -147,16 +224,19 @@ pub fn retrieve_asset( let cache_key: String = clean_url(url.clone()).as_str().to_string(); if cache.contains_key(&cache_key) { - // URL is in cache, - // we get and return it + // URL is in cache, we get and return it if !options.silent { eprintln!("{}{} (from cache)", indent(depth).as_str(), &url); } - Ok((cache.get(&cache_key).unwrap().to_vec(), url.clone(), str!())) + Ok(( + cache.get(&cache_key).unwrap().to_vec(), + url.clone(), + str!(), + str!(), + )) } else { - // URL not in cache, - // we retrieve the file + // URL not in cache, we retrieve the file match client.get(url.as_str()).send() { Ok(mut response) => { if !options.ignore_errors && response.status() != 200 { @@ -192,18 +272,20 @@ pub fn retrieve_asset( let mut data: Vec = vec![]; response.copy_to(&mut data).unwrap(); - // Attempt to obtain media type by reading Content-Type header - let media_type: &str = response + // Attempt to obtain media type and charset by reading Content-Type header + let content_type: &str = response .headers() .get(CONTENT_TYPE) .and_then(|header| header.to_str().ok()) .unwrap_or(""); + let (media_type, charset, _is_base64) = parse_content_type(&content_type); + // Add retrieved resource to cache cache.insert(new_cache_key, data.clone()); // Return - Ok((data, response.url().clone(), media_type.to_string())) + Ok((data, response.url().clone(), media_type, charset)) } Err(error) => { if !options.silent {