add support for data URL targets

pull/119/head
Sunshine 4 years ago
parent b8b6d8cff6
commit 3d2d40e7cd
No known key found for this signature in database
GPG Key ID: B80CA68703CD8AB1

@ -13,7 +13,7 @@ pub fn retrieve_asset(
) -> Result<(String, String), reqwest::Error> {
let cache_key = clean_url(&url);
if is_data_url(&url).unwrap() {
if is_data_url(&url) {
Ok((url.to_string(), url.to_string()))
} else {
if cache.contains_key(&cache_key) {

@ -7,7 +7,7 @@ mod macros;
use crate::args::AppArgs;
use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets};
use monolith::http::retrieve_asset;
use monolith::utils::is_http_url;
use monolith::utils::{data_url_to_text, is_data_url, is_http_url};
use reqwest::blocking::Client;
use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT};
use std::collections::HashMap;
@ -46,11 +46,14 @@ impl Output {
fn main() {
let app_args = AppArgs::get();
let target_url: &str = app_args.url_target.as_str();
let base_url;
let dom;
if !is_http_url(app_args.url_target.as_str()) {
if !is_http_url(target_url) && !is_data_url(target_url) {
eprintln!(
"Only HTTP and HTTPS URLs are allowed but got: {}",
&app_args.url_target
"Only HTTP(S) or data URLs are allowed but got: {}",
&target_url
);
process::exit(1);
}
@ -78,21 +81,23 @@ fn main() {
.expect("Failed to initialize HTTP client");
// Retrieve root document
let (data, final_url) = retrieve_asset(
&mut cache,
&client,
app_args.url_target.as_str(),
false,
"",
app_args.silent,
)
.expect("Could not retrieve assets in HTML");
let dom = html_to_dom(&data);
if is_http_url(target_url) {
let (data, final_url) =
retrieve_asset(&mut cache, &client, target_url, false, "", app_args.silent)
.expect("Could not retrieve assets in HTML");
dom = html_to_dom(&data);
base_url = final_url;
} else if is_data_url(target_url) {
base_url = target_url.to_string();
dom = html_to_dom(&data_url_to_text(target_url));
} else {
process::exit(1);
}
walk_and_embed_assets(
&mut cache,
&client,
&final_url,
&base_url,
&dom.document,
app_args.no_css,
app_args.no_js,

@ -1,6 +1,6 @@
use crate::utils::{
clean_url, data_to_data_url, detect_mimetype, is_data_url, is_http_url, resolve_url,
url_has_protocol,
clean_url, data_to_data_url, data_url_to_text, detect_mimetype, is_data_url, is_http_url,
resolve_url, url_has_protocol,
};
use url::ParseError;
@ -144,20 +144,35 @@ fn test_resolve_url() -> Result<(), ParseError> {
"https://www.w3schools.com/html/default.asp"
);
let resolved_url = resolve_url(
"data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h",
"https://www.kernel.org/category/signatures.html",
)?;
assert_eq!(
resolved_url.as_str(),
"https://www.kernel.org/category/signatures.html"
);
let resolved_url = resolve_url(
"data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h",
"//www.w3schools.com/html/html_iframe.asp",
)
.unwrap_or(str!());
assert_eq!(resolved_url.as_str(), "");
Ok(())
}
#[test]
fn test_is_data_url() {
// passing
assert!(
is_data_url("data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h")
.unwrap_or(false)
);
assert!(is_data_url(
"data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h"
));
// failing
assert!(!is_data_url("https://kernel.org").unwrap_or(false));
assert!(!is_data_url("//kernel.org").unwrap_or(false));
assert!(!is_data_url("").unwrap_or(false));
assert!(!is_data_url("https://kernel.org"));
assert!(!is_data_url("//kernel.org"));
assert!(!is_data_url(""));
}
#[test]
@ -175,3 +190,25 @@ fn test_clean_url() {
"https://somewhere.com/font.eot"
);
}
#[test]
fn test_data_url_to_text() {
    // Every data URL below is expected to yield this same sentence; only the
    // way the payload is labelled/encoded differs between the cases.
    let expected = "Work expands so as to fill the time available for its completion";

    let inputs = [
        // Payload labelled as base64
        "data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg==",
        // Payload labelled as utf8
        "data:text/html;utf8,Work expands so as to fill the time available for its completion",
        // No encoding label at all
        "data:text/html,Work expands so as to fill the time available for its completion",
    ];

    for input in inputs.iter() {
        assert_eq!(data_url_to_text(*input), expected);
    }
}

@ -1,5 +1,5 @@
use crate::http::retrieve_asset;
use base64::encode;
use base64::{decode, encode};
use regex::Regex;
use reqwest::blocking::Client;
use std::collections::HashMap;
@ -37,8 +37,6 @@ use url::{ParseError, Url};
const CSS_URL_REGEX_STR: &str = r###"(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)"###;
lazy_static! {
static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();
static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();
static ref REGEX_CSS_URL: Regex = Regex::new(CSS_URL_REGEX_STR).unwrap();
}
@ -82,19 +80,25 @@ pub fn detect_mimetype(data: &[u8]) -> String {
return String::from_utf8(item[1].to_vec()).unwrap();
}
}
"".to_owned()
str!()
}
pub fn url_has_protocol<T: AsRef<str>>(url: T) -> bool {
HAS_PROTOCOL.is_match(url.as_ref().to_lowercase().as_str())
Url::parse(url.as_ref())
.and_then(|u| Ok(u.scheme().len() > 0))
.unwrap_or(false)
}
pub fn is_data_url<T: AsRef<str>>(url: T) -> Result<bool, ParseError> {
Url::parse(url.as_ref()).and_then(|u| Ok(u.scheme() == "data"))
/// Tells whether `url` is a `data:` URL.
///
/// Returns `true` only when the input parses as a URL and its scheme is
/// exactly `data`. Input that cannot be parsed at all (for example an empty
/// string or a scheme-relative `//host` reference) yields `false`.
pub fn is_data_url<T: AsRef<str>>(url: T) -> bool {
    match Url::parse(url.as_ref()) {
        Ok(parsed) => parsed.scheme() == "data",
        // A parse failure simply means "not a data URL".
        Err(_) => false,
    }
}
pub fn is_http_url<T: AsRef<str>>(path: T) -> bool {
REGEX_URL.is_match(path.as_ref())
/// Tells whether `url` is an HTTP or HTTPS URL.
///
/// Returns `true` only when the input parses as a URL whose scheme is `http`
/// or `https`. Input that cannot be parsed yields `false`.
pub fn is_http_url<T: AsRef<str>>(url: T) -> bool {
    Url::parse(url.as_ref())
        .map(|parsed| {
            let scheme = parsed.scheme();
            scheme == "http" || scheme == "https"
        })
        // A parse failure simply means "not an HTTP(S) URL".
        .unwrap_or(false)
}
pub fn resolve_url<T: AsRef<str>, U: AsRef<str>>(from: T, to: U) -> Result<String, ParseError> {
@ -205,3 +209,33 @@ pub fn clean_url<T: AsRef<str>>(url: T) -> String {
}
result.to_string()
}
/// Extracts the textual payload of a `text/html` data URL.
///
/// The path of the parsed URL is expected to look like one of:
///
/// * `text/html;base64,<payload>` — the payload is base64-decoded and must be
///   valid UTF-8;
/// * `text/html;utf8,<payload>` — the payload is returned as found;
/// * `text/html,<payload>` — the payload is returned as found.
///
/// The media type and the encoding label are matched case-insensitively.
/// No percent-decoding is performed by this function.
///
/// Anything else (input that does not parse as a URL, a different media
/// type, an unrecognised encoding label, or base64 that fails to decode into
/// UTF-8) produces an empty string.
pub fn data_url_to_text<T: AsRef<str>>(url: T) -> String {
    // Returns what follows `prefix` in `text` when the lowercased `text`
    // begins with `prefix` (which must itself be lowercase). The prefix is
    // skipped character by character rather than byte by byte.
    fn after_prefix<'a>(text: &'a str, prefix: &str) -> Option<&'a str> {
        if !text.to_lowercase().starts_with(prefix) {
            return None;
        }
        let mut rest = text.chars();
        for _ in prefix.chars() {
            rest.next();
        }
        Some(rest.as_str())
    }

    let parsed_url = match Url::parse(url.as_ref()) {
        Ok(parsed) => parsed,
        Err(_) => return str!(),
    };

    // Only text/html payloads are understood.
    let after_mime = match after_prefix(parsed_url.path(), "text/html") {
        Some(rest) => rest,
        None => return str!(),
    };

    if after_mime.starts_with(';') {
        // Encoding specified, find out which one
        let labelled = &after_mime[1..];
        if let Some(payload) = after_prefix(labelled, "base64,") {
            decode(payload)
                .ok()
                .and_then(|bytes| String::from_utf8(bytes).ok())
                .unwrap_or_else(|| str!())
        } else if let Some(payload) = after_prefix(labelled, "utf8,") {
            payload.to_string()
        } else {
            str!()
        }
    } else if after_mime.starts_with(',') {
        // Plaintext, no encoding specified
        after_mime[1..].to_string()
    } else {
        str!()
    }
}

Loading…
Cancel
Save