diff --git a/README.md b/README.md index 5beb9ea..dde913e 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,7 @@ or - `-a`: Exclude audio sources - `-b`: Use custom `base URL` - `-c`: Exclude CSS + - `-C`: Save document using custom `charset` - `-e`: Ignore network errors - `-f`: Omit frames - `-F`: Exclude web fonts @@ -80,7 +81,7 @@ or - `-k`: Accept invalid X.509 (TLS) certificates - `-M`: Don't add timestamp and URL information - `-n`: Extract contents of NOSCRIPT elements - - `-o`: Write output to `file` + - `-o`: Write output to `file`, use “-” for STDOUT - `-s`: Be quiet - `-t`: Adjust `network request timeout` - `-u`: Provide custom `User-Agent` diff --git a/src/main.rs b/src/main.rs index 7b9ac16..9bf6bd4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,4 @@ +use encoding_rs::Encoding; use html5ever::rcdom::RcDom; use reqwest::blocking::Client; use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT}; @@ -76,6 +77,14 @@ fn main() { process::exit(1); } + // Check if custom charset is valid + if let Some(custom_charset) = options.charset.clone() { + if !Encoding::for_label_no_replacement(custom_charset.as_bytes()).is_some() { + eprintln!("Unknown encoding: {}", &custom_charset); + process::exit(1); + } + } + let target_url: Url; let mut base_url: Url; let mut use_stdin: bool = false; @@ -296,10 +305,9 @@ fn main() { } } - // Enforce UTF-8 encoding for documents that may end up having garbled html entities - // due to html5ever forcefully converting them into UTF-8 byte sequences. 
- if document_encoding.eq_ignore_ascii_case("iso-8859-1") { - document_encoding = str!("utf-8"); + // Save using specified charset, if given + if let Some(custom_charset) = options.charset.clone() { + document_encoding = custom_charset; dom = set_charset(dom, document_encoding.clone()); } diff --git a/src/opts.rs b/src/opts.rs index f20e82e..2ca692b 100644 --- a/src/opts.rs +++ b/src/opts.rs @@ -6,6 +6,7 @@ pub struct Options { pub no_audio: bool, pub base_url: Option, pub no_css: bool, + pub charset: Option, pub ignore_errors: bool, pub no_frames: bool, pub no_fonts: bool, @@ -48,6 +49,7 @@ impl Options { .args_from_usage("-a, --no-audio 'Removes audio sources'") .args_from_usage("-b, --base-url=[http://localhost/] 'Sets custom base URL'") .args_from_usage("-c, --no-css 'Removes CSS'") + .args_from_usage("-C, --charset=[UTF-8] 'Enforces custom encoding'") .args_from_usage("-e, --ignore-errors 'Ignore network errors'") .args_from_usage("-f, --no-frames 'Removes frames and iframes'") .args_from_usage("-F, --no-fonts 'Removes fonts'") @@ -59,7 +61,9 @@ impl Options { .args_from_usage( "-n, --unwrap-noscript 'Replaces NOSCRIPT elements with their contents'", ) - .args_from_usage("-o, --output=[document.html] 'Writes output to '") + .args_from_usage( + "-o, --output=[document.html] 'Writes output to , use - for STDOUT'", + ) .args_from_usage("-s, --silent 'Suppresses verbosity'") .args_from_usage("-t, --timeout=[60] 'Adjusts network request timeout'") .args_from_usage("-u, --user-agent=[Firefox] 'Sets custom User-Agent string'") @@ -69,7 +73,7 @@ impl Options { .required(true) .takes_value(true) .index(1) - .help("URL or file path, use - for stdin"), + .help("URL or file path, use - for STDIN"), ) .get_matches(); let mut options: Options = Options::default(); @@ -84,6 +88,9 @@ impl Options { options.base_url = Some(str!(base_url)); } options.no_css = app.is_present("no-css"); + if let Some(charset) = app.value_of("charset") { + options.charset = Some(str!(charset)); + 
} options.ignore_errors = app.is_present("ignore-errors"); options.no_frames = app.is_present("no-frames"); options.no_fonts = app.is_present("no-fonts"); diff --git a/src/tests/cli/basic.rs b/src/tests/cli/basic.rs index d3d7b93..df7390a 100644 --- a/src/tests/cli/basic.rs +++ b/src/tests/cli/basic.rs @@ -14,6 +14,21 @@ mod passing { use std::process::{Command, Stdio}; use url::Url; + #[test] + fn print_help_information() { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + let out = cmd.arg("-h").output().unwrap(); + + // STDERR should be empty + assert_eq!(String::from_utf8_lossy(&out.stderr), ""); + + // STDOUT should contain program name, version, and usage information + // TODO + + // Exit code should be 0 + out.assert().code(0); + } + #[test] fn print_version() { let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); diff --git a/src/tests/cli/unusual_encodings.rs b/src/tests/cli/unusual_encodings.rs index 9493bd2..9a1dca3 100644 --- a/src/tests/cli/unusual_encodings.rs +++ b/src/tests/cli/unusual_encodings.rs @@ -13,48 +13,6 @@ mod passing { use std::path::MAIN_SEPARATOR; use std::process::{Command, Stdio}; - #[test] - fn change_iso88591_to_utf8_to_properly_display_html_entities() { - let cwd = env::current_dir().unwrap(); - let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/"); - let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); - let out = cmd - .arg("-M") - .arg(format!( - "src{s}tests{s}data{s}unusual_encodings{s}iso-8859-1.html", - s = MAIN_SEPARATOR - )) - .output() - .unwrap(); - let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" }; - - // STDERR should contain only the target file - assert_eq!( - String::from_utf8_lossy(&out.stderr), - format!( - "{file}{cwd}/src/tests/data/unusual_encodings/iso-8859-1.html\n", - file = file_url_protocol, - cwd = cwd_normalized, - ) - ); - - // STDOUT should contain original document but with UTF-8 charset - 
 assert_eq!( - String::from_utf8_lossy(&out.stdout), - "\ - \n \ - \n \ - \n \ - \n \ - © Some Company\n \ - \n\n\ - \n" - ); - - // Exit code should be 0 - out.assert().code(0); - } - #[test] fn properly_save_document_with_gb2312() { let cwd = env::current_dir().unwrap(); @@ -149,4 +107,133 @@ mod passing { // Exit code should be 0 out.assert().code(0); } + + #[test] + fn properly_save_document_with_gb2312_custom_charset() { + let cwd = env::current_dir().unwrap(); + let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/"); + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + let out = cmd + .arg("-M") + .arg("-C") + .arg("utf8") + .arg(format!( + "src{s}tests{s}data{s}unusual_encodings{s}gb2312.html", + s = MAIN_SEPARATOR + )) + .output() + .unwrap(); + let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" }; + + // STDERR should contain only the target file + assert_eq!( + String::from_utf8_lossy(&out.stderr), + format!( + "{file}{cwd}/src/tests/data/unusual_encodings/gb2312.html\n", + file = file_url_protocol, + cwd = cwd_normalized, + ) + ); + + // STDOUT should contain original document without any modifications + assert_eq!( + String::from_utf8_lossy(&out.stdout).to_string(), + "\ + \n \ + \n \ + 近七成人减少线下需求\u{3000}银行数字化转型提速--经济·科技--人民网 \n\ + \n\ + \n \ +

近七成人减少线下需求\u{3000}银行数字化转型提速

\n\n\n\ + \ + \n" + ); + + // Exit code should be 0 + out.assert().code(0); + } + + #[test] + fn properly_save_document_with_gb2312_custom_charset_bad() { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + let out = cmd + .arg("-M") + .arg("-C") + .arg("utf0") + .arg(format!( + "src{s}tests{s}data{s}unusual_encodings{s}gb2312.html", + s = MAIN_SEPARATOR + )) + .output() + .unwrap(); + + // STDERR should contain error message + assert_eq!( + String::from_utf8_lossy(&out.stderr), + "Unknown encoding: utf0\n" + ); + + // STDOUT should be empty + assert_eq!(String::from_utf8_lossy(&out.stdout).to_string(), ""); + + // Exit code should be 1 + out.assert().code(1); + } +} + +// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗ +// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝ +// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗ +// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║ +// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝ +// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ + +#[cfg(test)] +mod failing { + use assert_cmd::prelude::*; + use std::env; + use std::path::MAIN_SEPARATOR; + use std::process::Command; + + #[test] + fn change_iso88591_to_utf8_to_properly_display_html_entities() { + let cwd = env::current_dir().unwrap(); + let cwd_normalized: String = str!(cwd.to_str().unwrap()).replace("\\", "/"); + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + let out = cmd + .arg("-M") + .arg(format!( + "src{s}tests{s}data{s}unusual_encodings{s}iso-8859-1.html", + s = MAIN_SEPARATOR + )) + .output() + .unwrap(); + let file_url_protocol: &str = if cfg!(windows) { "file:///" } else { "file://" }; + + // STDERR should contain only the target file + assert_eq!( + String::from_utf8_lossy(&out.stderr), + format!( + "{file}{cwd}/src/tests/data/unusual_encodings/iso-8859-1.html\n", + file = file_url_protocol, + cwd = cwd_normalized, + ) + ); + + // STDOUT should contain original document but with UTF-8 charset + assert_eq!( + String::from_utf8_lossy(&out.stdout), 
+ "\ + \n \ + \n \ + \n \ + \n \ + � Some Company\n \ + \n\n\ + \n" + ); + + // Exit code should be 0 + out.assert().code(0); + } } diff --git a/src/tests/opts.rs b/src/tests/opts.rs index ba93e93..b138b57 100644 --- a/src/tests/opts.rs +++ b/src/tests/opts.rs @@ -16,6 +16,7 @@ mod passing { assert_eq!(options.no_audio, false); assert_eq!(options.base_url, None); assert_eq!(options.no_css, false); + assert_eq!(options.charset, None); assert_eq!(options.no_frames, false); assert_eq!(options.no_fonts, false); assert_eq!(options.no_images, false);