Support other charset (#105)
* scraper: Initial work for other charset support

Not working

* scraper: Get charset from html instead of http headers

Not working

* scraper: Charset working

* scraper: Fix regex ('"' after equals)

* scraper: Use lazy_static for charset regex

* misc: Refactor pr

* tests: Refactor fixtures

* tests: Add html charset support tests

* tests: Charset, split tests

* downloader: Get charset from http headers

* misc: Clarify iterator first

* Apply suggestions from CohenArthur

Co-authored-by: CohenArthur <[email protected]>
Skallwar and CohenArthur authored Jan 5, 2021
1 parent d26647c commit db3c933
Showing 14 changed files with 708 additions and 235 deletions.
500 changes: 308 additions & 192 deletions Cargo.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions Cargo.toml
@@ -33,6 +33,9 @@ percent-encoding = "^2.1"
url = "^2.2"
rand = "^0.8"
regex = "^1.4"
encoding_rs = "^0.8"
lazy_static = "1.4.0"

[dev-dependencies]
tiny_http = "^0.7"
serial_test = "^0.5"
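
For context on the two new dependencies: encoding_rs resolves charset labels and converts bytes between encodings, while lazy_static lets a regex be compiled once and reused. A minimal standalone sketch of both (the label and pattern below are illustrative, not taken from this commit):

use encoding_rs::Encoding;
use lazy_static::lazy_static;
use regex::Regex;

fn main() {
    // encoding_rs resolves WHATWG encoding labels to a decoder/encoder.
    let enc = Encoding::for_label(b"iso-8859-1").unwrap();
    // Per the WHATWG encoding spec, the iso-8859-1 label maps to windows-1252.
    assert_eq!(enc.name(), "windows-1252");

    // lazy_static compiles the regex once, on first use.
    lazy_static! {
        static ref CHARSET: Regex = Regex::new(r"charset\s*=\s*([^;\s]+)").unwrap();
    }
    assert!(CHARSET.is_match("text/html; charset=utf-8"));
}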
47 changes: 36 additions & 11 deletions src/downloader.rs
@@ -1,5 +1,8 @@
use super::response::{Response, ResponseData};
use std::collections::HashMap;

use lazy_static::lazy_static;
use regex::Regex;
use url::Url;

use crate::warn;
@@ -104,26 +107,48 @@ impl Downloader {
};
match req.send() {
Ok(mut data) => {
let data_type = match data.headers().get("content-type") {
Some(data_type) => data_type.to_str().unwrap(),
None => "text/html",
};

let filename = if !Downloader::is_html(data_type) {
lazy_static! {
static ref DATA_TYPE_REGEX: Regex =
Regex::new("^.*(\\b[a-z]+/[a-z-+\\.]+).*$").unwrap();
static ref CHARSET_REGEX: Regex =
Regex::new("^.*charset\\s*=\\s*\"?([^\"\\s;]+).*$").unwrap();
}

let (data_type, charset): (String, Option<String>) =
match data.headers().get("content-type") {
Some(content_type_header) => {
let content_type = content_type_header.to_str().unwrap();
let data_type_captures =
DATA_TYPE_REGEX.captures_iter(&content_type).nth(0);
let data_type = data_type_captures
.map_or(String::from("text/html"), |first| {
String::from(first.get(1).unwrap().as_str().to_lowercase())
});
let charset_captures =
CHARSET_REGEX.captures_iter(&content_type).nth(0);
let charset = charset_captures.map(|first| {
String::from(first.get(1).unwrap().as_str().to_lowercase())
});
(data_type, charset)
}
None => (String::from("text/html"), None),
};

let filename = if !Downloader::is_html(&data_type) {
Downloader::get_filename(data.headers())
} else {
None
};

let data = if Downloader::is_html(data_type) {
ResponseData::Html(data.text().unwrap())
let mut raw_data: Vec<u8> = Vec::new();
data.copy_to(&mut raw_data).unwrap();
let response_data = if Downloader::is_html(&data_type) {
ResponseData::Html(raw_data)
} else {
let mut raw_data: Vec<u8> = Vec::new();
data.copy_to(&mut raw_data).unwrap();
ResponseData::Other(raw_data)
};

Ok(Response::new(data, filename))
Ok(Response::new(response_data, filename, charset))
}

Err(e) => {
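
As a sanity check on the header parsing above, a standalone sketch that runs the same two patterns over a representative Content-Type value (the header string is an illustrative example, not from the commit):

use regex::Regex;

fn main() {
    // The same patterns as DATA_TYPE_REGEX and CHARSET_REGEX above.
    let data_type_re = Regex::new("^.*(\\b[a-z]+/[a-z-+\\.]+).*$").unwrap();
    let charset_re = Regex::new("^.*charset\\s*=\\s*\"?([^\"\\s;]+).*$").unwrap();

    let header = "text/html; charset=ISO-8859-1";

    // Mirrors the map_or/map logic in the diff: default to text/html,
    // lowercase whatever was captured.
    let data_type = data_type_re
        .captures(header)
        .map_or(String::from("text/html"), |c| c[1].to_lowercase());
    let charset = charset_re.captures(header).map(|c| c[1].to_lowercase());

    assert_eq!(data_type, "text/html");
    assert_eq!(charset.as_deref(), Some("iso-8859-1"));
}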
11 changes: 8 additions & 3 deletions src/response.rs
@@ -1,18 +1,23 @@
/// Separates HTML responses and other content (PDFs, images...)
pub enum ResponseData {
Html(String),
Html(Vec<u8>),
Other(Vec<u8>),
}

/// Wrapper around `ResponseData`
pub struct Response {
pub data: ResponseData,
pub filename: Option<String>,
pub charset: Option<String>,
}

impl Response {
/// Create a new Response
pub fn new(data: ResponseData, filename: Option<String>) -> Response {
Response { data, filename }
pub fn new(data: ResponseData, filename: Option<String>, charset: Option<String>) -> Response {
Response {
data,
filename,
charset,
}
}
}
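
With the extra field, constructing a response looks like this (a short sketch assuming the definitions above are in scope; the values are placeholders):

// An HTML response whose charset came from the HTTP Content-Type header.
let resp = Response::new(
    ResponseData::Html(b"<html></html>".to_vec()),
    None,                        // no filename hint for HTML pages
    Some(String::from("utf-8")), // charset propagated by the downloader
);
assert_eq!(resp.charset.as_deref(), Some("utf-8"));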
101 changes: 89 additions & 12 deletions src/scraper.rs
@@ -1,24 +1,26 @@
use crossbeam::channel::{Receiver, Sender, TryRecvError};
use crossbeam::thread;
use encoding_rs::Encoding;
use lazy_static::lazy_static;
use rand::Rng;
use regex::Regex;
use url::Url;

use std::borrow::Borrow;
use std::collections::HashMap;
use std::collections::HashSet;
use std::process;
use std::sync::Mutex;
use std::time;

use rand::Rng;

use super::downloader;

use super::args;
use super::disk;
use super::dom;
use super::downloader;
use super::response;
use super::url_helper;

use crate::{error, info};
use crate::{error, info, warn};

/// Maximum number of empty recv() from the channel
static MAX_EMPTY_RECEIVES: usize = 10;
@@ -91,15 +93,79 @@ impl Scraper {
old_url_str.push_str(&new_url_str);
}

/// Process an HTML file: add new URLs to the channel and prepare for offline navigation
/// Find the charset of the webpage. ``data`` is not a String as it might not be UTF-8.
/// The returned String is lowercased.
/// This is a hack and should be checked in case of a bug.
fn find_charset(data: &[u8], http_charset: Option<String>) -> Option<String> {
lazy_static! {
static ref CHARSET_REGEX: Regex =
Regex::new("<meta.*charset\\s*=\\s*\"?([^\"\\s;]+).*>").unwrap();
}

// We don't know the real charset yet. We hope that the charset is ASCII
// compatible, because Rust Strings are UTF-8 (which is also ASCII compatible).
let data_utf8 = unsafe { String::from_utf8_unchecked(Vec::from(data)) };
let captures = CHARSET_REGEX.captures_iter(&data_utf8).next();

// We use the first match, hoping we are in the <head> of the page... if nothing is found,
// we use the HTTP charset (if any).
captures
.map(|first| String::from(first.get(1).unwrap().as_str().to_lowercase()))
.or(http_charset)
}

/// Convert the data from one charset to another.
fn charset_convert(
data: &[u8],
charset_source: &'static Encoding,
charset_dest: &'static Encoding,
) -> Vec<u8> {
let decode_result = charset_source.decode(data);
let decode_bytes = decode_result.0.borrow();

let encode_result = charset_dest.encode(decode_bytes);
let encode_bytes = encode_result.0.into_owned();

encode_bytes
}

/// Check if the charset requires conversion
fn needs_charset_conversion(charset: &str) -> bool {
match charset {
"utf-8" => false,
_ => true,
}
}

/// Process an HTML file: add new URLs to the channel and prepare for offline navigation
fn handle_html(
scraper: &Scraper,
transmitter: &Sender<(Url, i32)>,
url: &Url,
depth: i32,
data: &str,
data: &[u8],
http_charset: Option<String>,
) -> Vec<u8> {
let dom = dom::Dom::new(data);
let charset_source_str = match Self::find_charset(data, http_charset) {
Some(s) => s,
None => {
warn!("Charset not found for {}, defaulting to UTF-8", url);
String::from("utf-8")
}
};

let need_charset_conversion = Self::needs_charset_conversion(&charset_source_str);

let charset_source =
encoding_rs::Encoding::for_label(&charset_source_str.as_bytes()).unwrap();
let charset_utf8 = encoding_rs::UTF_8;
let utf8_data = if need_charset_conversion {
Self::charset_convert(data, charset_source, charset_utf8)
} else {
Vec::from(data)
};

let dom = dom::Dom::new(&String::from_utf8_lossy(&utf8_data).into_owned());

dom.find_urls_as_strings()
.into_iter()
@@ -119,17 +185,28 @@ impl Scraper {
scraper.fix_domtree(next_url, &next_full_url);
});

dom.serialize().into_bytes()
let utf8_data = dom.serialize().into_bytes();

if need_charset_conversion {
Self::charset_convert(&utf8_data, charset_utf8, charset_source)
} else {
utf8_data
}
}

/// Process a single URL
fn handle_url(scraper: &Scraper, transmitter: &Sender<(Url, i32)>, url: Url, depth: i32) {
match scraper.downloader.get(&url) {
Ok(response) => {
let data = match response.data {
response::ResponseData::Html(data) => {
Scraper::handle_html(scraper, transmitter, &url, depth, &data)
}
response::ResponseData::Html(data) => Scraper::handle_html(
scraper,
transmitter,
&url,
depth,
&data,
response.charset,
),
response::ResponseData::Other(data) => data,
};

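
The decode/re-encode pair in charset_convert is symmetric: handle_html converts the page to UTF-8, rewrites URLs in the DOM, then converts the result back, so the saved file keeps its original charset. A standalone round-trip sketch (the byte values are illustrative):

use encoding_rs::{Encoding, UTF_8, WINDOWS_1252};

// Same shape as Scraper::charset_convert above: decode from the source
// charset into a string, then encode that string into the destination charset.
fn charset_convert(data: &[u8], src: &'static Encoding, dst: &'static Encoding) -> Vec<u8> {
    let (decoded, _, _) = src.decode(data);
    let (encoded, _, _) = dst.encode(&decoded);
    encoded.into_owned()
}

fn main() {
    // "é" is the single byte 0xE9 in windows-1252, and 0xC3 0xA9 in UTF-8.
    let latin1 = [0xE9u8];
    let utf8 = charset_convert(&latin1, WINDOWS_1252, UTF_8);
    assert_eq!(utf8, "é".as_bytes());

    // Converting back restores the original byte.
    let back = charset_convert(&utf8, UTF_8, WINDOWS_1252);
    assert_eq!(back, latin1.to_vec());
}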
17 changes: 12 additions & 5 deletions tests/auth.rs
@@ -2,20 +2,19 @@
mod fixtures;

use fixtures::get_file_count_with_pattern;
use std::fs::read_dir;
use std::process::Command;
use std::process::Stdio;
use std::sync::Once;

const ADDR: &'static str = "http://0.0.0.0:8000";
const PAGE: &'static str = "tests/fixtures/index.html";
static START: Once = Once::new();

#[test]
fn test_auth() {
// Spawn a single instance of a local http server usable by all tests in this module.
START.call_once(|| {
fixtures::spawn_local_http_server(true);
fixtures::spawn_local_http_server(PAGE, true, None);
});

// Tests below are grouped together as they depend on the local_http_server above.
@@ -28,7 +27,7 @@ fn auth_different_host() {
let output_dir = "w4";
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit"))
.args(&[
ADDR,
fixtures::HTTP_ADDR,
"-o",
"w4",
"-a",
@@ -54,7 +53,15 @@
fn auth_valid() {
let output_dir = "w5";
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit"))
.args(&[ADDR, "-o", "w5", "-a", "username password", "-j", "16"])
.args(&[
fixtures::HTTP_ADDR,
"-o",
"w5",
"-a",
"username password",
"-j",
"16",
])
.stdout(Stdio::inherit())
.stderr(Stdio::inherit())
.spawn()
41 changes: 41 additions & 0 deletions tests/charset_html_found.rs
@@ -0,0 +1,41 @@
//! Test for charset detection/conversion
mod fixtures;

use std::fs;
use std::process::{Command, Stdio};
use std::sync::Once;

const PAGE_META: &'static str = "tests/fixtures/charset_test_html.html";
static START: Once = Once::new();

#[test]
fn test_html_charset_found() {
// Spawn a single instance of a local http server usable by all tests in this module.
START.call_once(|| {
fixtures::spawn_local_http_server(PAGE_META, false, None);
});

let output_dir = "charset_html_found";
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit"))
.args(&[fixtures::HTTP_ADDR, "-o", output_dir])
.stdout(Stdio::inherit())
.stderr(Stdio::inherit())
.spawn()
.unwrap();
let status = cmd.wait().unwrap();
assert!(status.success());
let file_path = fs::read_dir(output_dir)
.unwrap()
.next()
.unwrap()
.unwrap()
.path(); // There is only one file in the directory

let data_source = fs::read(PAGE_META).unwrap();
let data_downloaded = fs::read(file_path).unwrap();

assert!(fixtures::do_vecs_match(&data_source, &data_downloaded));

fs::remove_dir_all(output_dir).unwrap();
}
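
The fixture page itself is not included in the diff; what the test exercises can be sketched standalone with the same pattern find_charset compiles in src/scraper.rs (the HTML string below is hypothetical):

use regex::Regex;

fn main() {
    // find_charset applies this pattern to the raw bytes reinterpreted as a
    // string, since the real charset is not known yet at that point.
    let charset_re = Regex::new("<meta.*charset\\s*=\\s*\"?([^\"\\s;]+).*>").unwrap();
    let page = "<html><head><meta charset=\"iso-8859-1\"></head><body></body></html>";

    let charset = charset_re.captures(page).map(|c| c[1].to_lowercase());
    assert_eq!(charset.as_deref(), Some("iso-8859-1"));
}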
41 changes: 41 additions & 0 deletions tests/charset_html_not_found.rs
@@ -0,0 +1,41 @@
//! Test for charset detection/conversion
mod fixtures;

use std::fs;
use std::process::{Command, Stdio};
use std::sync::Once;

const PAGE_NO_META: &'static str = "tests/fixtures/charset_test_html_no_meta.html";
static START: Once = Once::new();

#[test]
fn test_html_charset_not_found() {
// Spawn a single instance of a local http server usable by all tests in this module.
START.call_once(|| {
fixtures::spawn_local_http_server(PAGE_NO_META, false, None);
});

let output_dir = "charset_html_not_found";
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit"))
.args(&[fixtures::HTTP_ADDR, "-o", output_dir])
.stdout(Stdio::inherit())
.stderr(Stdio::inherit())
.spawn()
.unwrap();
let status = cmd.wait().unwrap();
assert!(status.success());
let file_path = fs::read_dir(output_dir)
.unwrap()
.next()
.unwrap()
.unwrap()
.path(); // There is only one file in the directory

let data_source = fs::read(PAGE_NO_META).unwrap();
let data_downloaded = fs::read(file_path).unwrap();

assert!(!fixtures::do_vecs_match(&data_source, &data_downloaded));

fs::remove_dir_all(output_dir).unwrap();
}
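
Both charset tests compare raw bytes through fixtures::do_vecs_match. That helper lives in tests/fixtures and is not part of this diff; a plausible shape for it, assuming a plain element-wise comparison:

// Hypothetical sketch of fixtures::do_vecs_match -- the real helper is in
// tests/fixtures and is not shown in this commit.
fn do_vecs_match<T: PartialEq>(a: &[T], b: &[T]) -> bool {
    a.len() == b.len() && a.iter().zip(b.iter()).all(|(x, y)| x == y)
}

fn main() {
    assert!(do_vecs_match(&[1, 2, 3], &[1, 2, 3]));
    assert!(!do_vecs_match(&[1, 2, 3], &[1, 2, 4]));
}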