Support other charset (#105)
* scraper: Initial work for other charset support

Not working

* scraper: Get charset from html instead of http headers

Not working

* scraper: Charset working

* scraper: Fix regex ('"' after equals)

* scraper: Use lazy_static for charset regex

* misc: Refactor pr

* tests: Refactor fixtures

* tests: Add html charset support tests

* tests: Charset, split tests

* downloader: Get charset from http headers

* misc: Clarify iterator first

* Apply suggestions from CohenArthur

Co-authored-by: CohenArthur <[email protected]>
Skallwar and CohenArthur authored Jan 5, 2021
1 parent d26647c commit db3c933
Showing 14 changed files with 708 additions and 235 deletions.
500 changes: 308 additions & 192 deletions Cargo.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions Cargo.toml
@@ -33,6 +33,9 @@ percent-encoding = "^2.1"
url = "^2.2"
rand = "^0.8"
regex = "^1.4"
encoding_rs = "^0.8"
lazy_static = "1.4.0"

[dev-dependencies]
tiny_http = "^0.7"
serial_test = "^0.5"
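
For context on the two new dependencies: encoding_rs resolves charset labels and converts bytes between encodings, while lazy_static lets a regex be compiled once and reused. A minimal standalone sketch of both (the label and pattern below are illustrative, not taken from this commit):

use encoding_rs::Encoding;
use lazy_static::lazy_static;
use regex::Regex;

fn main() {
    // encoding_rs resolves WHATWG encoding labels to a decoder/encoder.
    let enc = Encoding::for_label(b"iso-8859-1").unwrap();
    // Per the WHATWG encoding spec, the iso-8859-1 label maps to windows-1252.
    assert_eq!(enc.name(), "windows-1252");

    // lazy_static compiles the regex once, on first use.
    lazy_static! {
        static ref CHARSET: Regex = Regex::new(r"charset\s*=\s*([^;\s]+)").unwrap();
    }
    assert!(CHARSET.is_match("text/html; charset=utf-8"));
}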
47 changes: 36 additions & 11 deletions src/downloader.rs
@@ -1,5 +1,8 @@
use super::response::{Response, ResponseData};
use std::collections::HashMap;

use lazy_static::lazy_static;
use regex::Regex;
use url::Url;

use crate::warn;
@@ -104,26 +107,48 @@ impl Downloader {
};
match req.send() {
Ok(mut data) => {
let data_type = match data.headers().get("content-type") {
Some(data_type) => data_type.to_str().unwrap(),
None => "text/html",
};

let filename = if !Downloader::is_html(data_type) {
lazy_static! {
static ref DATA_TYPE_REGEX: Regex =
Regex::new("^.*(\\b[a-z]+/[a-z-+\\.]+).*$").unwrap();
static ref CHARSET_REGEX: Regex =
Regex::new("^.*charset\\s*=\\s*\"?([^\"\\s;]+).*$").unwrap();
}

let (data_type, charset): (String, Option<String>) =
match data.headers().get("content-type") {
Some(content_type_header) => {
let content_type = content_type_header.to_str().unwrap();
let data_type_captures =
DATA_TYPE_REGEX.captures_iter(&content_type).nth(0);
let data_type = data_type_captures
.map_or(String::from("text/html"), |first| {
String::from(first.get(1).unwrap().as_str().to_lowercase())
});
let charset_captures =
CHARSET_REGEX.captures_iter(&content_type).nth(0);
let charset = charset_captures.map(|first| {
String::from(first.get(1).unwrap().as_str().to_lowercase())
});
(data_type, charset)
}
None => (String::from("text/html"), None),
};

let filename = if !Downloader::is_html(&data_type) {
Downloader::get_filename(data.headers())
} else {
None
};

let data = if Downloader::is_html(data_type) {
ResponseData::Html(data.text().unwrap())
let mut raw_data: Vec<u8> = Vec::new();
data.copy_to(&mut raw_data).unwrap();
let response_data = if Downloader::is_html(&data_type) {
ResponseData::Html(raw_data)
} else {
let mut raw_data: Vec<u8> = Vec::new();
data.copy_to(&mut raw_data).unwrap();
ResponseData::Other(raw_data)
};

Ok(Response::new(data, filename))
Ok(Response::new(response_data, filename, charset))
}

Err(e) => {
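
As a sanity check on the header parsing above, a standalone sketch that runs the same two patterns over a representative Content-Type value (the header string is an illustrative example, not from the commit):

use regex::Regex;

fn main() {
    // The same patterns as DATA_TYPE_REGEX and CHARSET_REGEX above.
    let data_type_re = Regex::new("^.*(\\b[a-z]+/[a-z-+\\.]+).*$").unwrap();
    let charset_re = Regex::new("^.*charset\\s*=\\s*\"?([^\"\\s;]+).*$").unwrap();

    let header = "text/html; charset=ISO-8859-1";

    // Mirrors the map_or/map logic in the diff: default to text/html,
    // lowercase whatever was captured.
    let data_type = data_type_re
        .captures(header)
        .map_or(String::from("text/html"), |c| c[1].to_lowercase());
    let charset = charset_re.captures(header).map(|c| c[1].to_lowercase());

    assert_eq!(data_type, "text/html");
    assert_eq!(charset.as_deref(), Some("iso-8859-1"));
}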
11 changes: 8 additions & 3 deletions src/response.rs
@@ -1,18 +1,23 @@
/// Separates HTML responses and other content (PDFs, images...)
pub enum ResponseData {
Html(String),
Html(Vec<u8>),
Other(Vec<u8>),
}

/// Wrapper around `ResponseData`
pub struct Response {
pub data: ResponseData,
pub filename: Option<String>,
pub charset: Option<String>,
}

impl Response {
/// Create a new Response
pub fn new(data: ResponseData, filename: Option<String>) -> Response {
Response { data, filename }
pub fn new(data: ResponseData, filename: Option<String>, charset: Option<String>) -> Response {
Response {
data,
filename,
charset,
}
}
}
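
With the extra field, constructing a response looks like this (a short sketch assuming the definitions above are in scope; the values are placeholders):

// An HTML response whose charset came from the HTTP Content-Type header.
let resp = Response::new(
    ResponseData::Html(b"<html></html>".to_vec()),
    None,                        // no filename hint for HTML pages
    Some(String::from("utf-8")), // charset propagated by the downloader
);
assert_eq!(resp.charset.as_deref(), Some("utf-8"));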
101 changes: 89 additions & 12 deletions src/scraper.rs
@@ -1,24 +1,26 @@
use crossbeam::channel::{Receiver, Sender, TryRecvError};
use crossbeam::thread;
use encoding_rs::Encoding;
use lazy_static::lazy_static;
use rand::Rng;
use regex::Regex;
use url::Url;

use std::borrow::Borrow;
use std::collections::HashMap;
use std::collections::HashSet;
use std::process;
use std::sync::Mutex;
use std::time;

use rand::Rng;

use super::downloader;

use super::args;
use super::disk;
use super::dom;
use super::downloader;
use super::response;
use super::url_helper;

use crate::{error, info};
use crate::{error, info, warn};

/// Maximum number of empty recv() from the channel
static MAX_EMPTY_RECEIVES: usize = 10;
@@ -91,15 +93,79 @@ impl Scraper {
old_url_str.push_str(&new_url_str);
}

/// Process an HTML file: add new URLs to the channel and prepare for offline navigation
/// Find the charset of the webpage. ``data`` is not a String as it might not be UTF-8.
/// The returned String is lowercased.
/// This is a hack and should be checked in case of a bug.
fn find_charset(data: &[u8], http_charset: Option<String>) -> Option<String> {
lazy_static! {
static ref CHARSET_REGEX: Regex =
Regex::new("<meta.*charset\\s*=\\s*\"?([^\"\\s;]+).*>").unwrap();
}

// We don't know the real charset yet. We hope that the charset is ASCII
// compatible, because Rust Strings are UTF-8 (which is also ASCII compatible).
let data_utf8 = unsafe { String::from_utf8_unchecked(Vec::from(data)) };
let captures = CHARSET_REGEX.captures_iter(&data_utf8).next();

// We use the first match, hoping we are in the <head> of the page... if nothing is found,
// we use the HTTP charset (if any).
captures
.map(|first| String::from(first.get(1).unwrap().as_str().to_lowercase()))
.or(http_charset)
}

/// Convert the data from one charset to another.
fn charset_convert(
data: &[u8],
charset_source: &'static Encoding,
charset_dest: &'static Encoding,
) -> Vec<u8> {
let decode_result = charset_source.decode(data);
let decode_bytes = decode_result.0.borrow();

let encode_result = charset_dest.encode(decode_bytes);
let encode_bytes = encode_result.0.into_owned();

encode_bytes
}

/// Check if the charset requires conversion
fn needs_charset_conversion(charset: &str) -> bool {
match charset {
"utf-8" => false,
_ => true,
}
}

/// Process an HTML file: add new URLs to the channel and prepare for offline navigation
fn handle_html(
scraper: &Scraper,
transmitter: &Sender<(Url, i32)>,
url: &Url,
depth: i32,
data: &str,
data: &[u8],
http_charset: Option<String>,
) -> Vec<u8> {
let dom = dom::Dom::new(data);
let charset_source_str = match Self::find_charset(data, http_charset) {
Some(s) => s,
None => {
warn!("Charset not found for {}, defaulting to UTF-8", url);
String::from("utf-8")
}
};

let need_charset_conversion = Self::needs_charset_conversion(&charset_source_str);

let charset_source =
encoding_rs::Encoding::for_label(&charset_source_str.as_bytes()).unwrap();
let charset_utf8 = encoding_rs::UTF_8;
let utf8_data = if need_charset_conversion {
Self::charset_convert(data, charset_source, charset_utf8)
} else {
Vec::from(data)
};

let dom = dom::Dom::new(&String::from_utf8_lossy(&utf8_data).into_owned());

dom.find_urls_as_strings()
.into_iter()
@@ -119,17 +185,28 @@ impl Scraper {
scraper.fix_domtree(next_url, &next_full_url);
});

dom.serialize().into_bytes()
let utf8_data = dom.serialize().into_bytes();

if need_charset_conversion {
Self::charset_convert(&utf8_data, charset_utf8, charset_source)
} else {
utf8_data
}
}

/// Process a single URL
fn handle_url(scraper: &Scraper, transmitter: &Sender<(Url, i32)>, url: Url, depth: i32) {
match scraper.downloader.get(&url) {
Ok(response) => {
let data = match response.data {
response::ResponseData::Html(data) => {
Scraper::handle_html(scraper, transmitter, &url, depth, &data)
}
response::ResponseData::Html(data) => Scraper::handle_html(
scraper,
transmitter,
&url,
depth,
&data,
response.charset,
),
response::ResponseData::Other(data) => data,
};

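
The decode/re-encode pair in charset_convert is symmetric: handle_html converts the page to UTF-8, rewrites URLs in the DOM, then converts the result back, so the saved file keeps its original charset. A standalone round-trip sketch (the byte values are illustrative):

use encoding_rs::{Encoding, UTF_8, WINDOWS_1252};

// Same shape as Scraper::charset_convert above: decode from the source
// charset into a string, then encode that string into the destination charset.
fn charset_convert(data: &[u8], src: &'static Encoding, dst: &'static Encoding) -> Vec<u8> {
    let (decoded, _, _) = src.decode(data);
    let (encoded, _, _) = dst.encode(&decoded);
    encoded.into_owned()
}

fn main() {
    // "é" is the single byte 0xE9 in windows-1252, and 0xC3 0xA9 in UTF-8.
    let latin1 = [0xE9u8];
    let utf8 = charset_convert(&latin1, WINDOWS_1252, UTF_8);
    assert_eq!(utf8, "é".as_bytes());

    // Converting back restores the original byte.
    let back = charset_convert(&utf8, UTF_8, WINDOWS_1252);
    assert_eq!(back, latin1.to_vec());
}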
17 changes: 12 additions & 5 deletions tests/auth.rs
@@ -2,20 +2,19 @@
mod fixtures;

use fixtures::get_file_count_with_pattern;
use std::fs::read_dir;
use std::process::Command;
use std::process::Stdio;
use std::sync::Once;

const ADDR: &'static str = "http://0.0.0.0:8000";
const PAGE: &'static str = "tests/fixtures/index.html";
static START: Once = Once::new();

#[test]
fn test_auth() {
// Spawn a single instance of a local http server usable by all tests in this module.
START.call_once(|| {
fixtures::spawn_local_http_server(true);
fixtures::spawn_local_http_server(PAGE, true, None);
});

// Tests below are grouped together as they depend on the local_http_server above.
@@ -28,7 +27,7 @@ fn auth_different_host() {
let output_dir = "w4";
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit"))
.args(&[
ADDR,
fixtures::HTTP_ADDR,
"-o",
"w4",
"-a",
@@ -54,7 +53,15 @@
fn auth_valid() {
let output_dir = "w5";
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit"))
.args(&[ADDR, "-o", "w5", "-a", "username password", "-j", "16"])
.args(&[
fixtures::HTTP_ADDR,
"-o",
"w5",
"-a",
"username password",
"-j",
"16",
])
.stdout(Stdio::inherit())
.stderr(Stdio::inherit())
.spawn()
41 changes: 41 additions & 0 deletions tests/charset_html_found.rs
@@ -0,0 +1,41 @@
//! Test for charset detection/conversion
mod fixtures;

use std::fs;
use std::process::{Command, Stdio};
use std::sync::Once;

const PAGE_META: &'static str = "tests/fixtures/charset_test_html.html";
static START: Once = Once::new();

#[test]
fn test_html_charset_found() {
// Spawn a single instance of a local http server usable by all tests in this module.
START.call_once(|| {
fixtures::spawn_local_http_server(PAGE_META, false, None);
});

let output_dir = "charset_html_found";
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit"))
.args(&[fixtures::HTTP_ADDR, "-o", output_dir])
.stdout(Stdio::inherit())
.stderr(Stdio::inherit())
.spawn()
.unwrap();
let status = cmd.wait().unwrap();
assert!(status.success());
let file_path = fs::read_dir(output_dir)
.unwrap()
.next()
.unwrap()
.unwrap()
.path(); // There is only one file in the directory

let data_source = fs::read(PAGE_META).unwrap();
let data_downloaded = fs::read(file_path).unwrap();

assert!(fixtures::do_vecs_match(&data_source, &data_downloaded));

fs::remove_dir_all(output_dir).unwrap();
}
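
The fixture page itself is not included in the diff; what the test exercises can be sketched standalone with the same pattern find_charset compiles in src/scraper.rs (the HTML string below is hypothetical):

use regex::Regex;

fn main() {
    // find_charset applies this pattern to the raw bytes reinterpreted as a
    // string, since the real charset is not known yet at that point.
    let charset_re = Regex::new("<meta.*charset\\s*=\\s*\"?([^\"\\s;]+).*>").unwrap();
    let page = "<html><head><meta charset=\"iso-8859-1\"></head><body></body></html>";

    let charset = charset_re.captures(page).map(|c| c[1].to_lowercase());
    assert_eq!(charset.as_deref(), Some("iso-8859-1"));
}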
41 changes: 41 additions & 0 deletions tests/charset_html_not_found.rs
@@ -0,0 +1,41 @@
//! Test for charset detection/conversion
mod fixtures;

use std::fs;
use std::process::{Command, Stdio};
use std::sync::Once;

const PAGE_NO_META: &'static str = "tests/fixtures/charset_test_html_no_meta.html";
static START: Once = Once::new();

#[test]
fn test_html_charset_not_found() {
// Spawn a single instance of a local http server usable by all tests in this module.
START.call_once(|| {
fixtures::spawn_local_http_server(PAGE_NO_META, false, None);
});

let output_dir = "charset_html_not_found";
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit"))
.args(&[fixtures::HTTP_ADDR, "-o", output_dir])
.stdout(Stdio::inherit())
.stderr(Stdio::inherit())
.spawn()
.unwrap();
let status = cmd.wait().unwrap();
assert!(status.success());
let file_path = fs::read_dir(output_dir)
.unwrap()
.next()
.unwrap()
.unwrap()
.path(); // There is only one file in the directory

let data_source = fs::read(PAGE_NO_META).unwrap();
let data_downloaded = fs::read(file_path).unwrap();

assert!(!fixtures::do_vecs_match(&data_source, &data_downloaded));

fs::remove_dir_all(output_dir).unwrap();
}
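
Both charset tests compare raw bytes through fixtures::do_vecs_match. That helper lives in tests/fixtures and is not part of this diff; a plausible shape for it, assuming a plain element-wise comparison:

// Hypothetical sketch of fixtures::do_vecs_match -- the real helper is in
// tests/fixtures and is not shown in this commit.
fn do_vecs_match<T: PartialEq>(a: &[T], b: &[T]) -> bool {
    a.len() == b.len() && a.iter().zip(b.iter()).all(|(x, y)| x == y)
}

fn main() {
    assert!(do_vecs_match(&[1, 2, 3], &[1, 2, 3]));
    assert!(!do_vecs_match(&[1, 2, 3], &[1, 2, 4]));
}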