-
Notifications
You must be signed in to change notification settings - Fork 40
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* scraper: Initial work for other charset support
  Not working
* scraper: Get charset from html instead of http headers
  Not working
* scraper: Charset working
* scraper: Fix regex ('"' after equals)
* scraper: Use lazy_static for charset regex
* misc: Refactor pr
* tests: Refactor fixtures
* tests: Add html charset support tests
* tests: Charset, split tests
* downloader: Get charset from http headers
* misc: Clarify iterator first
* Apply suggestions from CohenArthur
  Co-authored-by: CohenArthur <[email protected]>
- Loading branch information
1 parent
d26647c
commit db3c933
Showing
14 changed files
with
708 additions
and
235 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,18 +1,23 @@ | ||
/// Separates HTML responses from other content (PDFs, images...). | ||
pub enum ResponseData { | ||
// Removed by this diff: the HTML payload used to be carried as a `String`. | ||
Html(String), | ||
// Added by this diff: the HTML payload is now carried as raw bytes. | ||
// NOTE(review): presumably so the text can be decoded later with the detected charset - confirm. | ||
Html(Vec<u8>), | ||
// Any non-HTML payload, carried as raw bytes. | ||
Other(Vec<u8>), | ||
} | ||
|
||
/// Wrapper around `ResponseData` that carries optional metadata next to the payload. | ||
pub struct Response { | ||
// The response payload (HTML or other content). | ||
pub data: ResponseData, | ||
// Optional file name attached to the response. | ||
// NOTE(review): where this value comes from is not visible here - confirm against the caller. | ||
pub filename: Option<String>, | ||
// Added by this diff: optional charset label attached to the response. | ||
// NOTE(review): the commit message mentions both the HTTP headers and the HTML as sources - confirm. | ||
pub charset: Option<String>, | ||
} | ||
|
||
impl Response { | ||
/// Creates a new `Response` by storing the given values in its fields unchanged. | ||
// Removed by this diff: the two-argument constructor and its one-line body. | ||
pub fn new(data: ResponseData, filename: Option<String>) -> Response { | ||
Response { data, filename } | ||
// Added by this diff: the constructor also takes `charset`, so every caller has to pass it. | ||
pub fn new(data: ResponseData, filename: Option<String>, charset: Option<String>) -> Response { | ||
Response { | ||
data, | ||
filename, | ||
charset, | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
//! Test for charset detection/conversion | ||
mod fixtures; | ||
|
||
use std::fs; | ||
use std::process::{Command, Stdio}; | ||
use std::sync::Once; | ||
|
||
// Path of the HTML fixture handed to the local test server and compared with the downloaded file. | ||
const PAGE_META: &'static str = "tests/fixtures/charset_test_html.html"; | ||
// Guards the one-time start of the local HTTP server for this test file. | ||
static START: Once = Once::new(); | ||
|
||
// Runs the `suckit` binary against a local HTTP server set up with `PAGE_META` and asserts that | ||
// `fixtures::do_vecs_match` reports the downloaded file as matching the fixture. | ||
#[test] | ||
fn test_html_charset_found() { | ||
// Spawn a single instance of a local HTTP server usable by all tests in this module. | ||
// `call_once` runs the closure at most once for `START`. | ||
// NOTE(review): the meaning of the `false` and `None` arguments is defined in `fixtures` - confirm. | ||
START.call_once(|| { | ||
fixtures::spawn_local_http_server(PAGE_META, false, None); | ||
}); | ||
|
||
// Directory name passed to the binary through `-o`; it is removed at the end of the test. | ||
let output_dir = "charset_html_found"; | ||
// Launch the compiled `suckit` binary as a child process, sharing this process's stdout/stderr. | ||
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit")) | ||
.args(&[fixtures::HTTP_ADDR, "-o", output_dir]) | ||
.stdout(Stdio::inherit()) | ||
.stderr(Stdio::inherit()) | ||
.spawn() | ||
.unwrap(); | ||
// Block until the child exits and require a successful exit status. | ||
let status = cmd.wait().unwrap(); | ||
assert!(status.success()); | ||
// Take the first entry of the output directory; the test panics if the directory is empty. | ||
let file_path = fs::read_dir(output_dir) | ||
.unwrap() | ||
.next() | ||
.unwrap() | ||
.unwrap() | ||
.path(); // There is only one file in the directory | ||
|
||
// Read both the fixture and the downloaded file as raw bytes. | ||
let data_source = fs::read(PAGE_META).unwrap(); | ||
let data_downloaded = fs::read(file_path).unwrap(); | ||
|
||
// The downloaded bytes are expected to match the fixture according to `do_vecs_match`. | ||
assert!(fixtures::do_vecs_match(&data_source, &data_downloaded)); | ||
|
||
// NOTE(review): this cleanup is skipped when an earlier assert/unwrap panics, leaving the directory behind. | ||
fs::remove_dir_all(output_dir).unwrap(); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
//! Test for charset detection/conversion | ||
mod fixtures; | ||
|
||
use std::fs; | ||
use std::process::{Command, Stdio}; | ||
use std::sync::Once; | ||
|
||
// Path of the HTML fixture handed to the local test server and compared with the downloaded file. | ||
// NOTE(review): judging by the file name, this page has no charset meta tag - confirm against the fixture. | ||
const PAGE_NO_META: &'static str = "tests/fixtures/charset_test_html_no_meta.html"; | ||
// Guards the one-time start of the local HTTP server for this test file. | ||
static START: Once = Once::new(); | ||
|
||
// Runs the `suckit` binary against a local HTTP server set up with `PAGE_NO_META` and asserts that | ||
// `fixtures::do_vecs_match` reports the downloaded file as NOT matching the fixture. | ||
#[test] | ||
fn test_html_charset_not_found() { | ||
// Spawn a single instance of a local HTTP server usable by all tests in this module. | ||
// `call_once` runs the closure at most once for `START`. | ||
// NOTE(review): the meaning of the `false` and `None` arguments is defined in `fixtures` - confirm. | ||
START.call_once(|| { | ||
fixtures::spawn_local_http_server(PAGE_NO_META, false, None); | ||
}); | ||
|
||
// Directory name passed to the binary through `-o`; it is removed at the end of the test. | ||
let output_dir = "charset_html_not_found"; | ||
// Launch the compiled `suckit` binary as a child process, sharing this process's stdout/stderr. | ||
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit")) | ||
.args(&[fixtures::HTTP_ADDR, "-o", output_dir]) | ||
.stdout(Stdio::inherit()) | ||
.stderr(Stdio::inherit()) | ||
.spawn() | ||
.unwrap(); | ||
// Block until the child exits and require a successful exit status. | ||
let status = cmd.wait().unwrap(); | ||
assert!(status.success()); | ||
// Take the first entry of the output directory; the test panics if the directory is empty. | ||
let file_path = fs::read_dir(output_dir) | ||
.unwrap() | ||
.next() | ||
.unwrap() | ||
.unwrap() | ||
.path(); // There is only one file in the directory | ||
|
||
// Read both the fixture and the downloaded file as raw bytes. | ||
let data_source = fs::read(PAGE_NO_META).unwrap(); | ||
let data_downloaded = fs::read(file_path).unwrap(); | ||
|
||
// The downloaded bytes are expected to differ from the fixture according to `do_vecs_match`. | ||
// NOTE(review): why the bytes should differ for this fixture is not visible here - confirm. | ||
assert!(!fixtures::do_vecs_match(&data_source, &data_downloaded)); | ||
|
||
// NOTE(review): this cleanup is skipped when an earlier assert/unwrap panics, leaving the directory behind. | ||
fs::remove_dir_all(output_dir).unwrap(); | ||
} |
Oops, something went wrong.