From e0273c664a44c607144e0b2a0217f6396297b94d Mon Sep 17 00:00:00 2001 From: Sunshine Date: Tue, 23 Feb 2021 23:33:45 -1000 Subject: [PATCH] forcefully set document's charset to UTF-8 --- Makefile | 2 +- src/css.rs | 2 +- src/html.rs | 46 ++++++++++------- src/js.rs | 2 +- src/main.rs | 3 ++ src/opts.rs | 4 +- src/tests/cli/local_files.rs | 3 +- src/tests/cli/mod.rs | 1 + src/tests/cli/unusual_encodings.rs | 51 +++++++++++++++++++ .../data/unusual_encodings/iso-8859-1.html | 8 +++ src/utils.rs | 17 ++++--- 11 files changed, 106 insertions(+), 33 deletions(-) create mode 100644 src/tests/cli/unusual_encodings.rs create mode 100644 src/tests/data/unusual_encodings/iso-8859-1.html diff --git a/Makefile b/Makefile index ff15335..425209b 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ build: test: build @cargo test --locked @cargo fmt --all -- --check -.PHONY: test_code_formatting +.PHONY: test lint: @cargo fmt --all -- diff --git a/src/css.rs b/src/css.rs index 2fe6148..5492b88 100644 --- a/src/css.rs +++ b/src/css.rs @@ -26,7 +26,7 @@ const CSS_PROPS_WITH_IMAGE_URLS: &[&str] = &[ "suffix", "symbols", ]; -const CSS_SPECIAL_CHARS: &str = "~!@$%^&*()+=,./'\";:?><[]{}|`#"; +const CSS_SPECIAL_CHARS: &'static str = "~!@$%^&*()+=,./'\";:?><[]{}|`#"; pub fn is_image_url_prop(prop_name: &str) -> bool { CSS_PROPS_WITH_IMAGE_URLS diff --git a/src/html.rs b/src/html.rs index 4c2b582..55b40ae 100644 --- a/src/html.rs +++ b/src/html.rs @@ -28,7 +28,7 @@ struct SrcSetItem<'a> { descriptor: &'a str, } -const ICON_VALUES: &[&str] = &["icon", "shortcut icon"]; +const ICON_VALUES: &'static [&str] = &["icon", "shortcut icon"]; pub fn add_favicon(document: &Handle, favicon_data_url: String) -> RcDom { let mut buf: Vec = Vec::new(); @@ -105,7 +105,7 @@ pub fn compose_csp(options: &Options) -> String { } if options.no_images { - // Note: data: is needed for transparent pixels + // Note: "data:" is required for transparent pixel images to work string_list.push("img-src data:;"); } @@ -127,22 +127,17 @@ pub fn create_metadata_tag(url: &str) -> String { clean_url.set_password(None).unwrap(); } - if is_http_url(url) { - format!( - "", - &clean_url, - timestamp, - env!("CARGO_PKG_NAME"), - env!("CARGO_PKG_VERSION"), - ) - } else { - format!( - "", - timestamp, - env!("CARGO_PKG_NAME"), - env!("CARGO_PKG_VERSION"), - ) - } + format!( + "", + if is_http_url(url) { + &clean_url.as_str() + } else { + "local source" + }, + timestamp, + env!("CARGO_PKG_NAME"), + env!("CARGO_PKG_VERSION"), + ) } Err(_) => str!(), } @@ -498,12 +493,12 @@ pub fn walk_and_embed_assets( } => { match name.local.as_ref() { "meta" => { - // Remove http-equiv attributes from META nodes if they're able to control the page if let Some(meta_attr_http_equiv_value) = get_node_attr(node, "http-equiv") { let meta_attr_http_equiv_value: &str = &meta_attr_http_equiv_value; if meta_attr_http_equiv_value.eq_ignore_ascii_case("refresh") || meta_attr_http_equiv_value.eq_ignore_ascii_case("location") { + // Remove http-equiv attributes from META nodes if they're able to control the page set_node_attr( &node, "http-equiv", @@ -512,7 +507,20 @@ pub fn walk_and_embed_assets( meta_attr_http_equiv_value )), ); + } else if meta_attr_http_equiv_value.eq_ignore_ascii_case("Content-Type") { + // Enforce charset to be set to UTF-8 + if let Some(_attr_value) = get_node_attr(node, "content") { + set_node_attr( + &node, + "content", + Some(str!("text/html; charset=utf-8")), + ); + } } + } else if let Some(_meta_attr_http_equiv_value) = get_node_attr(node, "charset") + { + // Enforce charset to be set to UTF-8 + set_node_attr(&node, "charset", Some(str!("utf-8"))); } } "link" => { diff --git a/src/js.rs b/src/js.rs index 428c4ef..cfb19f6 100644 --- a/src/js.rs +++ b/src/js.rs @@ -1,4 +1,4 @@ -const JS_DOM_EVENT_ATTRS: &[&str] = &[ +const JS_DOM_EVENT_ATTRS: &'static [&str] = &[ // From WHATWG HTML spec 8.1.5.2 "Event handlers on elements, Document objects, and Window objects": // https://html.spec.whatwg.org/#event-handlers-on-elements,-document-objects,-and-window-objects // https://html.spec.whatwg.org/#attributes-3 (table "List of event handler content attributes") diff --git a/src/main.rs b/src/main.rs index 0f1e243..ad885af 100644 --- a/src/main.rs +++ b/src/main.rs @@ -212,6 +212,9 @@ fn main() { } } + // Remove charset meta-tag + // set_charset_meta_to_utf8(&dom.document); + // Serialize DOM tree let mut result: String = stringify_document(&dom.document, &options); diff --git a/src/opts.rs b/src/opts.rs index 84ee88c..1137c6d 100644 --- a/src/opts.rs +++ b/src/opts.rs @@ -21,7 +21,7 @@ pub struct Options { pub target: String, } -const ASCII: &str = " \ +const ASCII: &'static str = " \ _____ ______________ __________ ___________________ ___ | \\ / \\ | | | | | | | \\_/ __ \\_| __ | | ___ ___ |__| | @@ -31,7 +31,7 @@ const ASCII: &str = " \ |___| |__________| \\_____________________| |___| |___| |___| "; const DEFAULT_NETWORK_TIMEOUT: u64 = 120; -const DEFAULT_USER_AGENT: &str = +const DEFAULT_USER_AGENT: &'static str = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0"; impl Options { diff --git a/src/tests/cli/local_files.rs b/src/tests/cli/local_files.rs index 86cf09c..fc0c4b1 100644 --- a/src/tests/cli/local_files.rs +++ b/src/tests/cli/local_files.rs @@ -69,8 +69,7 @@ mod passing { #[test] fn local_file_target_input_absolute_target_path() -> Result<(), Box> { let cwd = env::current_dir().unwrap(); - let cwd_normalized: String = - str!(env::current_dir().unwrap().to_str().unwrap()).replace("\\", "/"); + let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/"); let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; let out = cmd .arg("-M") diff --git a/src/tests/cli/mod.rs b/src/tests/cli/mod.rs index 88a0a28..55d69f6 100644 --- a/src/tests/cli/mod.rs +++ b/src/tests/cli/mod.rs @@ -2,3 +2,4 @@ mod base_url; mod basic; mod data_url; mod local_files; +mod unusual_encodings; diff --git a/src/tests/cli/unusual_encodings.rs b/src/tests/cli/unusual_encodings.rs new file mode 100644 index 0000000..006e745 --- /dev/null +++ b/src/tests/cli/unusual_encodings.rs @@ -0,0 +1,51 @@ +// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗ +// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝ +// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗ +// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║ +// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝ +// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ + +#[cfg(test)] +mod passing { + use assert_cmd::prelude::*; + use std::env; + use std::process::Command; + + #[test] + fn change_encoding_to_utf_8() -> Result<(), Box> { + let cwd = env::current_dir().unwrap(); + let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/"); + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let out = cmd + .arg("-M") + .arg(if cfg!(windows) { + "src\\tests\\data\\unusual_encodings\\iso-8859-1.html" + } else { + "src/tests/data/unusual_encodings/iso-8859-1.html" + }) + .output() + .unwrap(); + let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" }; + + // STDOUT should contain newly added base URL + assert_eq!( + std::str::from_utf8(&out.stdout).unwrap(), + "\n \n \n \n © Some Company\n \n\n\n" + ); + + // STDERR should contain only the target file + assert_eq!( + std::str::from_utf8(&out.stderr).unwrap(), + format!( + "{file}{cwd}/src/tests/data/unusual_encodings/iso-8859-1.html\n", + file = file_url_protocol, + cwd = cwd_normalized, + ) + ); + + // The exit code should be 0 + out.assert().code(0); + + Ok(()) + } +} diff --git a/src/tests/data/unusual_encodings/iso-8859-1.html b/src/tests/data/unusual_encodings/iso-8859-1.html new file mode 100644 index 0000000..f80fd52 --- /dev/null +++ b/src/tests/data/unusual_encodings/iso-8859-1.html @@ -0,0 +1,8 @@ + + + + + + © Some Company + + diff --git a/src/utils.rs b/src/utils.rs index 5fd76be..e9cf90d 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -7,7 +7,7 @@ use std::path::Path; use crate::opts::Options; use crate::url::{clean_url, file_url_to_fs_path, is_data_url, is_file_url, parse_data_url}; -const INDENT: &str = " "; +const INDENT: &'static str = " "; const MAGIC: [[&[u8]; 2]; 18] = [ // Image @@ -34,11 +34,13 @@ const MAGIC: [[&[u8]; 2]; 18] = [ ]; const PLAINTEXT_MEDIA_TYPES: &[&str] = &[ + "application/javascript", "image/svg+xml", - "text/css", - "text/html", - "text/javascript", - "text/plain", + // "text/css", + // "text/csv", + // "text/html", + // "text/javascript", + // "text/plain", ]; pub fn detect_media_type(data: &[u8], url: &str) -> String { @@ -56,7 +58,8 @@ pub fn detect_media_type(data: &[u8], url: &str) -> String { } pub fn is_plaintext_media_type(media_type: &str) -> bool { - PLAINTEXT_MEDIA_TYPES.contains(&media_type.to_lowercase().as_str()) + media_type.to_lowercase().as_str().starts_with("text/") + || PLAINTEXT_MEDIA_TYPES.contains(&media_type.to_lowercase().as_str()) } pub fn indent(level: u32) -> String { @@ -125,7 +128,7 @@ pub fn retrieve_asset( Ok(mut response) => { if !options.ignore_errors && response.status() != 200 { if !options.silent { - eprintln!("Unable to retrieve {} ({})", &url, response.status()); + eprintln!("Unable to retrieve {} (error: {})", &url, response.status()); } // Provoke error return Err(client.get("").send().unwrap_err());