Merge pull request #13 from tsengia/add-better-errors

Better Error Reporting
tsengia authored Dec 3, 2023
2 parents f7bbbd9 + 0dc7ce5 commit fa8e65b
Showing 5 changed files with 153 additions and 61 deletions.
80 changes: 34 additions & 46 deletions src/algo.rs
@@ -53,55 +53,44 @@ fn check_content_type(response: &Response) -> (bool, Option<String>) {
#[async_recursion]
pub async fn visit_page(
node_index: NodeIndex,
url: Url,
client: &Client,
options: &SpiderOptions,
graph_mutex: &Mutex<&mut PageGraph>,
page_map_mutex: &Mutex<&mut PageMap>,
current_depth: i32,
) -> bool {
let url: Url;
let mut new_nodes = Vec::<NodeIndex>::new();
let mut new_nodes = Vec::<(NodeIndex, Url)>::new();
let mut found_problem: bool = false;
// Reserve some space for our new node indices.
new_nodes.reserve(64);

{
// Momentarily acquire the lock so that we can grab the URL of the page
url = graph_mutex
.lock()
.unwrap()
.node_weight(node_index)
.unwrap()
.url
.clone();
} // End of scope, releases the lock

{
// Start of new scope, this is to get the document, parse links, and update the graph

// Send an HTTP(S) GET request for the desired URL
let response_result = client
.request(reqwest::Method::GET, url.clone())
.send()
.await;
let response: Response;
let is_good = response_result.is_ok();

{
// Acquire a lock on the graph so that we can update it with our findings for this page
let mut graph = graph_mutex.lock().unwrap();
let page = graph.node_weight_mut(node_index).unwrap();

if !is_good {
page.good = false;
if !options.quiet {
if response_result.is_err() {
if options.verbose {
println!("Found bad link! {}", url);
}
page.status_code = response_result.err().unwrap().status();
return false;
}

response = response_result.unwrap();

// Record the HTTP status code
page.status_code = Some(response.status());

// Attempt to get the Content-Type of the page
let (parse_html, content_type) = check_content_type(&response);
page.content_type = content_type.clone();
@@ -133,18 +122,22 @@ pub async fn visit_page(

// Acquire a lock on the graph so that we can update it with our findings for this page
let mut graph = graph_mutex.lock().unwrap();
let page = graph.node_weight_mut(node_index).unwrap();
if contents.is_err() {
page.good = false;
if !options.quiet {
println!("Failed to get contents of page! {}", url);
{
let page = graph.node_weight_mut(node_index).unwrap();
if contents.is_err() {
page.good = Some(false);
if options.verbose {
println!("Failed to get contents of page! {}", url);
}
return false;
}
return false;
}
let contents = contents.unwrap();
let html = Html::parse_document(contents.as_str());

page.good = true;
{
let page = graph.node_weight_mut(node_index).unwrap();
page.good = Some(true);
}

if options.verbose {
println!("Visited page {}", url.as_str());
@@ -162,9 +155,15 @@

// Parse out a URL from the link
let next_url = get_url_from_element(l, &url);
if next_url.is_none() {
println!("Failed to get URL from element: {}", l.html());
if next_url.is_err() {
if options.verbose {
println!("Failed to get URL from element: {}", l.html());
}
found_problem = true;
{
let page = graph.node_weight_mut(node_index).unwrap();
page.errors.push(next_url.unwrap_err());
}
continue;
}
let next_url = next_url.unwrap();
@@ -178,13 +177,7 @@
}

// Target URL has not been visited yet, add a node to the graph
let new_node = graph.add_node(Page {
url: next_url.clone(),
title: None,
content_type: None,
good: false,
checked: false,
});
let new_node = graph.add_node(Page::new(&next_url));

// Add an edge to the graph connecting current page to the target page
graph.add_edge(node_index, new_node, Link { html: l.html() });
@@ -199,17 +192,18 @@
continue;
}

new_nodes.push(new_node);
new_nodes.push((new_node, next_url));
}
}

let mut futures_vec = Vec::new();
futures_vec.reserve_exact(new_nodes.len());

// Create a future for each node we discovered
for node in new_nodes {
for (node, next_url) in new_nodes {
futures_vec.push(visit_page(
node,
next_url,
client,
options,
graph_mutex,
@@ -237,18 +231,12 @@ pub async fn visit_root_page(
let root_index: NodeIndex;
{
// Insert the root page as a node into the graph
root_index = graph.lock().unwrap().add_node(Page {
title: None,
content_type: None,
good: false,
checked: false,
url: url.clone(),
});
root_index = graph.lock().unwrap().add_node(Page::new(url));

// Mark the root node as visited because visit_page assumes
// that the target page is already marked as visited
page_map.lock().unwrap().insert(url.clone(), root_index);
}

visit_page(root_index, client, options, graph, page_map, 0).await
visit_page(root_index, url.clone(), client, options, graph, page_map, 0).await
}
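
For illustration, a minimal sketch (not part of the commit) of the new calling convention in src/algo.rs: discovered links now travel as (NodeIndex, Url) pairs, and visit_page receives its URL as a parameter instead of momentarily locking the graph just to read it back. The usize stand-in for petgraph's NodeIndex is an assumption made to keep the sketch self-contained.

use url::Url;

// Stand-in for petgraph::graph::NodeIndex, for illustration only.
type NodeIndex = usize;

fn fan_out(new_nodes: Vec<(NodeIndex, Url)>) {
    for (node, next_url) in new_nodes {
        // Before this commit, the recursive call re-read the URL from the
        // graph under a lock; now the URL travels with the node index.
        println!("would visit node {} at {}", node, next_url);
    }
}

fn main() {
    let url = Url::parse("https://example.com/").expect("valid URL");
    fan_out(vec![(0, url)]);
}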
54 changes: 52 additions & 2 deletions src/error.rs
@@ -3,13 +3,63 @@
#[derive(Debug)]
/// Custom error type for Spider Crab
pub struct SpiderError {
pub message: String,
pub source_page: Option<String>,
pub target_page: Option<String>,
pub http_error_code: Option<u16>,
pub error_type: SpiderErrorType,
pub html: Option<String>,
}

#[derive(Debug)]
pub enum SpiderErrorType {
InvalidURL,
BrokenLink,
MissingHref,
EmptyHref,
MissingTitle,
FailedCrawl,
}

impl std::error::Error for SpiderError {}

impl std::fmt::Display for SpiderError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "SpiderError: {}", self.message)
let message = self.get_message();
write!(f, "SpiderError ({:?}): {}", self.error_type, message)
}
}

impl SpiderError {
fn get_message(&self) -> String {
match &self.error_type {
SpiderErrorType::BrokenLink => format!(
"Page at {:?} contains a link pointing to {:?}, but {:?} is a bad link!",
self.source_page.as_ref().unwrap(),
self.target_page.as_ref().unwrap(),
self.target_page.as_ref().unwrap()
),
SpiderErrorType::InvalidURL => format!(
"Page at {:?} contains a link with an invalid URL {:?}!",
self.source_page.as_ref().unwrap(),
self.target_page.as_ref().unwrap()
),
SpiderErrorType::MissingHref => format!(
"Page at {:?} contains a link with no href attribute! Element is: {:?}",
self.source_page.as_ref().unwrap(),
self.html.as_ref().unwrap()
),
SpiderErrorType::EmptyHref => format!(
"Page at {:?} contains a link with an empty href attribute! Element is: {:?}",
self.source_page.as_ref().unwrap(),
self.html.as_ref().unwrap()
),
SpiderErrorType::MissingTitle => format!(
"Page at {:?} does not have a title!",
self.source_page.as_ref().unwrap()
),
SpiderErrorType::FailedCrawl => {
String::from("Found a problem while crawling the target webpage!")
}
}
}
}
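
For illustration, a hedged sketch (not part of the commit) of constructing one of the new structured errors by hand and printing it through the Display impl; it assumes the crate is consumed as spider_crab, as src/main.rs does.

use spider_crab::error::{SpiderError, SpiderErrorType};

fn main() {
    let err = SpiderError {
        error_type: SpiderErrorType::BrokenLink,
        source_page: Some(String::from("https://example.com/")),
        target_page: Some(String::from("https://example.com/missing")),
        http_error_code: Some(404),
        html: None,
    };
    // Display dispatches on error_type via get_message(), printing roughly:
    // SpiderError (BrokenLink): Page at "https://example.com/" contains a
    // link pointing to "https://example.com/missing", but ... is a bad link!
    println!("{}", err);
}

Note that the BrokenLink and InvalidURL messages unwrap source_page and target_page, so those fields must be populated for those variants.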
22 changes: 21 additions & 1 deletion src/lib.rs
@@ -1,4 +1,6 @@
use error::SpiderError;
use petgraph::graph::{DiGraph, NodeIndex};
use reqwest::StatusCode;
use scraper::{selector::CssLocalName, Selector};
use std::collections::HashMap;
use std::sync::Mutex;
@@ -22,11 +24,29 @@ pub struct Page {
/// Content-Type that was given when this page was visited
pub content_type: Option<String>,
/// True if the page was visited and a 2XX HTTP status code was returned, false otherwise
pub good: bool,
pub good: Option<bool>,
/// True if this page was visited, false otherwise
pub checked: bool,
/// URL that this page is represented by. Does not include URL parameters or fragments
pub url: Url,
/// HTTP status code returned when this page was visited
pub status_code: Option<StatusCode>,
/// Vector of errors encountered while scraping this page
pub errors: Vec<SpiderError>,
}

impl Page {
pub fn new(url: &Url) -> Self {
Self {
title: None,
content_type: None,
good: None,
checked: false,
url: url.clone(),
status_code: None,
errors: Vec::<SpiderError>::new(),
}
}
}

/// Helper type for the HashMap that maps Urls to Nodes in the graph
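For illustration, a short sketch (not part of the commit) of the new Page::new constructor and the tri-state good field; it assumes Page is exported at the crate root, as the pub declaration in src/lib.rs suggests.

use spider_crab::Page;
use url::Url;

fn main() {
    let url = Url::parse("https://example.com/").expect("valid URL");
    let page = Page::new(&url);

    // good is now an Option<bool> rather than a bool:
    // None = not yet visited, Some(true) = 2XX, Some(false) = failed.
    match page.good {
        None => println!("{} has not been checked yet", page.url),
        Some(true) => println!("{} is good", page.url),
        Some(false) => println!("{} is bad", page.url),
    }
}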
16 changes: 10 additions & 6 deletions src/main.rs
@@ -49,10 +49,6 @@ async fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
let quiet: bool = matches.get_flag("quiet");
let verbose: bool = matches.get_flag("verbose");

if !quiet {
println!("Spider Crab");
}

let mut spider_crab = SpiderCrab::default();
spider_crab.options.add_host(url_str);

@@ -79,10 +75,18 @@
return Ok(());
} else {
if !quiet {
println!("Something failed!");
for page in spider_crab.graph.node_weights() {
for error in &page.errors {
println!("{}", error);
}
}
}
let e = Box::new(SpiderError {
message: String::from("Check failed!"),
error_type: spider_crab::error::SpiderErrorType::FailedCrawl,
source_page: None,
http_error_code: None,
target_page: None,
html: None,
}) as Box<dyn std::error::Error>;
return Err(e);
}
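For illustration, a condensed sketch (not part of the commit) of the new failure path in main(): every structured error collected on the graph is printed, then a single FailedCrawl error is surfaced to the caller. The report_failure helper is hypothetical, and the SpiderCrab import is assumed from the SpiderCrab::default() call in src/main.rs.

use spider_crab::error::{SpiderError, SpiderErrorType};
use spider_crab::SpiderCrab; // assumed export

// Hypothetical helper mirroring the failure branch of main().
fn report_failure(spider: &SpiderCrab) -> Box<dyn std::error::Error> {
    for page in spider.graph.node_weights() {
        for error in &page.errors {
            println!("{}", error);
        }
    }
    Box::new(SpiderError {
        error_type: SpiderErrorType::FailedCrawl,
        source_page: None,
        target_page: None,
        http_error_code: None,
        html: None,
    })
}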
42 changes: 36 additions & 6 deletions src/url_helpers.rs
@@ -1,25 +1,55 @@
//! Helper functions called by the page traversal algorithm
use crate::error::{SpiderError, SpiderErrorType};
use scraper::ElementRef;
use url::{Host, ParseError, Url};

/// Attempt to extract and parse a URL from a `<a>` HTML element
/// Returns `Ok(Url)` if extract + parse was successful
/// Returns `Err(SpiderError)` if extraction or parsing failed
pub fn get_url_from_element(element: ElementRef, current_url: &Url) -> Option<Url> {
let href_attribute = element.attr("href")?;
pub fn get_url_from_element(element: ElementRef, current_url: &Url) -> Result<Url, SpiderError> {
let href_attribute = element.attr("href");

if href_attribute.is_none() {
// Element does not have an href attribute
return Err(SpiderError {
error_type: SpiderErrorType::MissingHref,
source_page: Some(current_url.to_string()),
target_page: None,
http_error_code: None,
html: Some(element.html()),
});
}

let href_attribute = href_attribute.unwrap();

let next_url_str = href_attribute;

if next_url_str.is_empty() {
// href attribute value is ""
return None;
// Element's href attribute value is ""
return Err(SpiderError {
error_type: SpiderErrorType::EmptyHref,
source_page: Some(current_url.to_string()),
target_page: None,
http_error_code: None,
html: Some(element.html()),
});
}

let next_url = parse_relative_or_absolute_url(current_url, next_url_str);
next_url.as_ref()?;

next_url
if next_url.is_none() {
// Failed to parse the URL, report it as an error
return Err(SpiderError {
error_type: SpiderErrorType::InvalidURL,
source_page: Some(current_url.to_string()),
target_page: Some(next_url_str.to_string()),
http_error_code: None,
html: Some(element.html()),
});
}

Ok(next_url.unwrap())
}

/// Attempts to grab the host from `url` and see if it matches any element listed in `hosts`
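For illustration, a hedged sketch (not part of the commit) of consuming the new Result-based get_url_from_element: each failure mode now comes back as a structured SpiderError (MissingHref, EmptyHref, or InvalidURL) instead of a bare None. It assumes url_helpers is a public module of the spider_crab crate.

use scraper::{Html, Selector};
use spider_crab::url_helpers::get_url_from_element; // assumed public module
use url::Url;

fn main() {
    let current_url = Url::parse("https://example.com/").expect("valid URL");
    let html = Html::parse_document(r#"<a href="">empty</a>"#);
    let selector = Selector::parse("a").expect("valid selector");

    for element in html.select(&selector) {
        match get_url_from_element(element, &current_url) {
            Ok(url) => println!("found link to {}", url),
            // The empty href above should produce an EmptyHref error.
            Err(e) => println!("{}", e),
        }
    }
}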
