Merge pull request #13 from tsengia/add-better-errors

Better Error Reporting
tsengia authored Dec 3, 2023
2 parents f7bbbd9 + 0dc7ce5 commit fa8e65b
Showing 5 changed files with 153 additions and 61 deletions.
80 changes: 34 additions & 46 deletions src/algo.rs
@@ -53,55 +53,44 @@ fn check_content_type(response: &Response) -> (bool, Option<String>) {
#[async_recursion]
pub async fn visit_page(
node_index: NodeIndex,
url: Url,
client: &Client,
options: &SpiderOptions,
graph_mutex: &Mutex<&mut PageGraph>,
page_map_mutex: &Mutex<&mut PageMap>,
current_depth: i32,
) -> bool {
let url: Url;
let mut new_nodes = Vec::<NodeIndex>::new();
let mut new_nodes = Vec::<(NodeIndex, Url)>::new();
let mut found_problem: bool = false;
// Reserve some space for our new node indices.
new_nodes.reserve(64);

{
// Momentarily acquire the lock so that we can grab the URL of the page
url = graph_mutex
.lock()
.unwrap()
.node_weight(node_index)
.unwrap()
.url
.clone();
} // End of scope, releases the lock

{
// Start of new scope, this is to get the document, parse links, and update the graph

// Send an HTTP(S) GET request for the desired URL
let response_result = client
.request(reqwest::Method::GET, url.clone())
.send()
.await;
let response: Response;
let is_good = response_result.is_ok();

{
// Acquire a lock on the graph so that we can update it with our findings for this page
let mut graph = graph_mutex.lock().unwrap();
let page = graph.node_weight_mut(node_index).unwrap();

if !is_good {
page.good = false;
if !options.quiet {
if response_result.is_err() {
if options.verbose {
println!("Found bad link! {}", url);
}
page.status_code = response_result.err().unwrap().status();
return false;
}

response = response_result.unwrap();

// Record the HTTP status code
page.status_code = Some(response.status());

// Attempt to get the Content-Type of the page
let (parse_html, content_type) = check_content_type(&response);
page.content_type = content_type.clone();
@@ -133,18 +122,22 @@ pub async fn visit_page(

// Acquire a lock on the graph so that we can update it with our findings for this page
let mut graph = graph_mutex.lock().unwrap();
let page = graph.node_weight_mut(node_index).unwrap();
if contents.is_err() {
page.good = false;
if !options.quiet {
println!("Failed to get contents of page! {}", url);
{
let page = graph.node_weight_mut(node_index).unwrap();
if contents.is_err() {
page.good = Some(false);
if options.verbose {
println!("Failed to get contents of page! {}", url);
}
return false;
}
return false;
}
let contents = contents.unwrap();
let html = Html::parse_document(contents.as_str());

page.good = true;
{
let page = graph.node_weight_mut(node_index).unwrap();
page.good = Some(true);
}

if options.verbose {
println!("Visited page {}", url.as_str());
@@ -162,9 +155,15 @@

// Parse out a URL from the link
let next_url = get_url_from_element(l, &url);
if next_url.is_none() {
println!("Failed to get URL from element: {}", l.html());
if next_url.is_err() {
if options.verbose {
println!("Failed to get URL from element: {}", l.html());
}
found_problem = true;
{
let page = graph.node_weight_mut(node_index).unwrap();
page.errors.push(next_url.unwrap_err());
}
continue;
}
let next_url = next_url.unwrap();
@@ -178,13 +177,7 @@
}

// Target URL has not been visited yet, add a node to the graph
let new_node = graph.add_node(Page {
url: next_url.clone(),
title: None,
content_type: None,
good: false,
checked: false,
});
let new_node = graph.add_node(Page::new(&next_url));

// Add an edge to the graph connecting current page to the target page
graph.add_edge(node_index, new_node, Link { html: l.html() });
@@ -199,17 +192,18 @@
continue;
}

new_nodes.push(new_node);
new_nodes.push((new_node, next_url));
}
}

let mut futures_vec = Vec::new();
futures_vec.reserve_exact(new_nodes.len());

// Create a future for each node we discovered
for node in new_nodes {
for (node, next_url) in new_nodes {
futures_vec.push(visit_page(
node,
next_url,
client,
options,
graph_mutex,
@@ -237,18 +231,12 @@ pub async fn visit_root_page(
let root_index: NodeIndex;
{
// Insert the root page as a node into the graph
root_index = graph.lock().unwrap().add_node(Page {
title: None,
content_type: None,
good: false,
checked: false,
url: url.clone(),
});
root_index = graph.lock().unwrap().add_node(Page::new(url));

// Mark the root node as visited because visit_page assumes
// that the target page is already marked as visited
page_map.lock().unwrap().insert(url.clone(), root_index);
}

visit_page(root_index, client, options, graph, page_map, 0).await
visit_page(root_index, url.clone(), client, options, graph, page_map, 0).await
}
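
For illustration, a minimal sketch (not part of the commit) of the new calling convention in src/algo.rs: discovered links now travel as (NodeIndex, Url) pairs, and visit_page receives its URL as a parameter instead of momentarily locking the graph just to read it back. The usize stand-in for petgraph's NodeIndex is an assumption made to keep the sketch self-contained.

use url::Url;

// Stand-in for petgraph::graph::NodeIndex, for illustration only.
type NodeIndex = usize;

fn fan_out(new_nodes: Vec<(NodeIndex, Url)>) {
    for (node, next_url) in new_nodes {
        // Before this commit, the recursive call re-read the URL from the
        // graph under a lock; now the URL travels with the node index.
        println!("would visit node {} at {}", node, next_url);
    }
}

fn main() {
    let url = Url::parse("https://example.com/").expect("valid URL");
    fan_out(vec![(0, url)]);
}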
54 changes: 52 additions & 2 deletions src/error.rs
@@ -3,13 +3,63 @@
#[derive(Debug)]
/// Custom error type for Spider Crab
pub struct SpiderError {
pub message: String,
pub source_page: Option<String>,
pub target_page: Option<String>,
pub http_error_code: Option<u16>,
pub error_type: SpiderErrorType,
pub html: Option<String>,
}

#[derive(Debug)]
pub enum SpiderErrorType {
InvalidURL,
BrokenLink,
MissingHref,
EmptyHref,
MissingTitle,
FailedCrawl,
}

impl std::error::Error for SpiderError {}

impl std::fmt::Display for SpiderError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "SpiderError: {}", self.message)
let message = self.get_message();
write!(f, "SpiderError ({:?}): {}", self.error_type, message)
}
}

impl SpiderError {
fn get_message(&self) -> String {
match &self.error_type {
SpiderErrorType::BrokenLink => format!(
"Page at {:?} contains a link pointing to {:?}, but {:?} is a bad link!",
self.source_page.as_ref().unwrap(),
self.target_page.as_ref().unwrap(),
self.target_page.as_ref().unwrap()
),
SpiderErrorType::InvalidURL => format!(
"Page at {:?} contains a link with an invalid URL {:?}!",
self.source_page.as_ref().unwrap(),
self.target_page.as_ref().unwrap()
),
SpiderErrorType::MissingHref => format!(
"Page at {:?} contains a link with no href attribute! Element is: {:?}",
self.source_page.as_ref().unwrap(),
self.html.as_ref().unwrap()
),
SpiderErrorType::EmptyHref => format!(
"Page at {:?} contains a link with an empty href attribute! Element is: {:?}",
self.source_page.as_ref().unwrap(),
self.html.as_ref().unwrap()
),
SpiderErrorType::MissingTitle => format!(
"Page at {:?} does not have a title!",
self.source_page.as_ref().unwrap()
),
SpiderErrorType::FailedCrawl => {
String::from("Found a problem while crawling the target webpage!")
}
}
}
}
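
For illustration, a hedged sketch (not part of the commit) of constructing one of the new structured errors by hand and printing it through the Display impl; it assumes the crate is consumed as spider_crab, as src/main.rs does.

use spider_crab::error::{SpiderError, SpiderErrorType};

fn main() {
    let err = SpiderError {
        error_type: SpiderErrorType::BrokenLink,
        source_page: Some(String::from("https://example.com/")),
        target_page: Some(String::from("https://example.com/missing")),
        http_error_code: Some(404),
        html: None,
    };
    // Display dispatches on error_type via get_message(), printing roughly:
    // SpiderError (BrokenLink): Page at "https://example.com/" contains a
    // link pointing to "https://example.com/missing", but ... is a bad link!
    println!("{}", err);
}

Note that the BrokenLink and InvalidURL messages unwrap source_page and target_page, so those fields must be populated for those variants.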
22 changes: 21 additions & 1 deletion src/lib.rs
@@ -1,4 +1,6 @@
use error::SpiderError;
use petgraph::graph::{DiGraph, NodeIndex};
use reqwest::StatusCode;
use scraper::{selector::CssLocalName, Selector};
use std::collections::HashMap;
use std::sync::Mutex;
@@ -22,11 +24,29 @@ pub struct Page {
/// Content-Type that was given when this page was visited
pub content_type: Option<String>,
/// True if the page was visited and a 2XX HTTP status code was returned, false otherwise
pub good: bool,
pub good: Option<bool>,
/// True if this page was visited, false otherwise
pub checked: bool,
/// URL that this page is represented by. Does not include URL parameters or fragments
pub url: Url,
/// HTTP status code returned when this page was visited
pub status_code: Option<StatusCode>,
/// Vector of errors encountered while scraping this page
pub errors: Vec<SpiderError>,
}

impl Page {
pub fn new(url: &Url) -> Self {
Self {
title: None,
content_type: None,
good: None,
checked: false,
url: url.clone(),
status_code: None,
errors: Vec::<SpiderError>::new(),
}
}
}

/// Helper type for the HashMap that maps Urls to Nodes in the graph
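For illustration, a short sketch (not part of the commit) of the new Page::new constructor and the tri-state good field; it assumes Page is exported at the crate root, as the pub declaration in src/lib.rs suggests.

use spider_crab::Page;
use url::Url;

fn main() {
    let url = Url::parse("https://example.com/").expect("valid URL");
    let page = Page::new(&url);

    // good is now an Option<bool> rather than a bool:
    // None = not yet visited, Some(true) = 2XX, Some(false) = failed.
    match page.good {
        None => println!("{} has not been checked yet", page.url),
        Some(true) => println!("{} is good", page.url),
        Some(false) => println!("{} is bad", page.url),
    }
}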
16 changes: 10 additions & 6 deletions src/main.rs
@@ -49,10 +49,6 @@ async fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
let quiet: bool = matches.get_flag("quiet");
let verbose: bool = matches.get_flag("verbose");

if !quiet {
println!("Spider Crab");
}

let mut spider_crab = SpiderCrab::default();
spider_crab.options.add_host(url_str);

@@ -79,10 +75,18 @@
return Ok(());
} else {
if !quiet {
println!("Something failed!");
for page in spider_crab.graph.node_weights() {
for error in &page.errors {
println!("{}", error);
}
}
}
let e = Box::new(SpiderError {
message: String::from("Check failed!"),
error_type: spider_crab::error::SpiderErrorType::FailedCrawl,
source_page: None,
http_error_code: None,
target_page: None,
html: None,
}) as Box<dyn std::error::Error>;
return Err(e);
}
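For illustration, a condensed sketch (not part of the commit) of the new failure path in main(): every structured error collected on the graph is printed, then a single FailedCrawl error is surfaced to the caller. The report_failure helper is hypothetical, and the SpiderCrab import is assumed from the SpiderCrab::default() call in src/main.rs.

use spider_crab::error::{SpiderError, SpiderErrorType};
use spider_crab::SpiderCrab; // assumed export

// Hypothetical helper mirroring the failure branch of main().
fn report_failure(spider: &SpiderCrab) -> Box<dyn std::error::Error> {
    for page in spider.graph.node_weights() {
        for error in &page.errors {
            println!("{}", error);
        }
    }
    Box::new(SpiderError {
        error_type: SpiderErrorType::FailedCrawl,
        source_page: None,
        target_page: None,
        http_error_code: None,
        html: None,
    })
}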
42 changes: 36 additions & 6 deletions src/url_helpers.rs
@@ -1,25 +1,55 @@
//! Helper functions called by the page traversal algorithm
use crate::error::{SpiderError, SpiderErrorType};
use scraper::ElementRef;
use url::{Host, ParseError, Url};

/// Attempt to extract and parse a URL from a `<a>` HTML element
/// Returns `Ok(Url)` if extract + parse was successful
/// Returns `Err(SpiderError)` if extraction or parsing failed
pub fn get_url_from_element(element: ElementRef, current_url: &Url) -> Option<Url> {
let href_attribute = element.attr("href")?;
pub fn get_url_from_element(element: ElementRef, current_url: &Url) -> Result<Url, SpiderError> {
let href_attribute = element.attr("href");

if href_attribute.is_none() {
// Element does not have an href attribute
return Err(SpiderError {
error_type: SpiderErrorType::MissingHref,
source_page: Some(current_url.to_string()),
target_page: None,
http_error_code: None,
html: Some(element.html()),
});
}

let href_attribute = href_attribute.unwrap();

let next_url_str = href_attribute;

if next_url_str.is_empty() {
// href attribute value is ""
return None;
// Element's href attribute value is ""
return Err(SpiderError {
error_type: SpiderErrorType::EmptyHref,
source_page: Some(current_url.to_string()),
target_page: None,
http_error_code: None,
html: Some(element.html()),
});
}

let next_url = parse_relative_or_absolute_url(current_url, next_url_str);
next_url.as_ref()?;

next_url
if next_url.is_none() {
// Failed to parse the URL, report it as an error
return Err(SpiderError {
error_type: SpiderErrorType::InvalidURL,
source_page: Some(current_url.to_string()),
target_page: Some(next_url_str.to_string()),
http_error_code: None,
html: Some(element.html()),
});
}

Ok(next_url.unwrap())
}

/// Attempts to grab the host from `url` and see if it matches any element listed in `hosts`
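For illustration, a hedged sketch (not part of the commit) of consuming the new Result-based get_url_from_element: each failure mode now comes back as a structured SpiderError (MissingHref, EmptyHref, or InvalidURL) instead of a bare None. It assumes url_helpers is a public module of the spider_crab crate.

use scraper::{Html, Selector};
use spider_crab::url_helpers::get_url_from_element; // assumed public module
use url::Url;

fn main() {
    let current_url = Url::parse("https://example.com/").expect("valid URL");
    let html = Html::parse_document(r#"<a href="">empty</a>"#);
    let selector = Selector::parse("a").expect("valid selector");

    for element in html.select(&selector) {
        match get_url_from_element(element, &current_url) {
            Ok(url) => println!("found link to {}", url),
            // The empty href above should produce an EmptyHref error.
            Err(e) => println!("{}", e),
        }
    }
}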
