From 84ce028cd8be427af48bc6e6e50294159e6d8f3c Mon Sep 17 00:00:00 2001
From: Tyler Sengia
Date: Fri, 1 Dec 2023 22:27:38 -0500
Subject: [PATCH 01/10] Expand error types for more detail, and use them in url_helpers.rs

---
 src/error.rs       | 30 ++++++++++++++++++++++++++++--
 src/url_helpers.rs | 24 ++++++++++++++++++------
 2 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/src/error.rs b/src/error.rs
index 995b47e..52bfb89 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -3,13 +3,39 @@
 #[derive(Debug)]
 /// Custom error type for Spider Crab
 pub struct SpiderError {
-    pub message: String,
+    pub source_page: Option<String>,
+    pub target_page: Option<String>,
+    pub http_error_code: Option<u16>,
+    pub error_type: SpiderErrorType
+}
+
+
+#[derive(Debug)]
+pub enum SpiderErrorType {
+    InvalidURL,
+    BrokenLink,
+    MissingHref,
+    EmptyHref,
+    MissingTitle
+}
 
 impl std::error::Error for SpiderError {}
 
 impl std::fmt::Display for SpiderError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "SpiderError: {}", self.message)
+        let message = self.get_message();
+        write!(f, "SpiderError ({:?}): {}", self.error_type, message)
     }
 }
+
+impl SpiderError {
+    fn get_message(&self) -> String {
+        match &self.error_type {
+            SpiderErrorType::BrokenLink => format!("Page at \"{:?}\" contains a link pointing to \"{:?}\", but \"{:?}\" is a bad link!", self.source_page, self.target_page, self.target_page),
+            SpiderErrorType::InvalidURL => format!("Page at \"{:?}\" contains a link with no href attribute!", self.source_page),
+            SpiderErrorType::MissingHref => format!("Page at \"{:?}\" contains a link with an invalid URL '{:?}'!", self.source_page, self.target_page),
+            SpiderErrorType::EmptyHref => format!("Page at \"{:?}\" contains a link with an empty href attribute!", self.source_page),
+            SpiderErrorType::MissingTitle => format!("Page at \"{:?}\" does not have a title!", self.source_page),
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/url_helpers.rs b/src/url_helpers.rs
index 7701502..721cdde 100644
--- a/src/url_helpers.rs
+++ b/src/url_helpers.rs
@@ -2,24 +2,36 @@
 
 use scraper::ElementRef;
 use url::{Host, ParseError, Url};
+use crate::error::SpiderErrorType;
 
 /// Attempt to extract and parse a URL from a `<a>` HTML element
 /// Returns `Some(Url)` if extract + parse was successful
 /// Returns `None` if extraction or parsing failed
-pub fn get_url_from_element(element: ElementRef, current_url: &Url) -> Option<Url> {
-    let href_attribute = element.attr("href")?;
+pub fn get_url_from_element(element: ElementRef, current_url: &Url) -> Result<Url, SpiderErrorType> {
+    let href_attribute = element.attr("href");
+
+    if href_attribute.is_none() {
+        // Element does not have an href attribute
+        return Err(SpiderErrorType::MissingHref);
+    }
+
+    let href_attribute = href_attribute.unwrap();
 
     let next_url_str = href_attribute;
 
     if next_url_str.is_empty() {
-        // href attribute value is ""
-        return None;
+        // Element's href attribute value is ""
+        return Err(SpiderErrorType::EmptyHref);
    }
 
     let next_url = parse_relative_or_absolute_url(current_url, next_url_str);
 
-    next_url.as_ref()?;
-    next_url
+    if next_url.is_none() {
+        // Failed to parse the URL, report it as an error
+        return Err(SpiderErrorType::InvalidURL);
+    }
+
+    Ok(next_url.unwrap())
 }
 
 /// Attempts to grab the host from `url` and see if it matches any element listed in `hosts`

From cbfe4157d8b23c186b87f7dc49168dad05b1957b Mon Sep 17 00:00:00 2001
From: Tyler Sengia
Date: Fri, 1 Dec 2023 22:31:50 -0500
Subject: [PATCH 02/10] Add Other error type
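
Other is a catch-all variant for failures that are not tied to one
specific link; its message reports whatever source_page and
http_error_code context is available. As a rough sketch of the
intended use (main.rs is updated to construct exactly this in patch
04 of this series):

    let e = SpiderError {
        error_type: SpiderErrorType::Other,
        source_page: Some("Unknown".to_string()),
        target_page: None,
        http_error_code: None,
    };
    // The Display impl prints: SpiderError (Other): Other Error! ...
    println!("{}", e);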
---
 src/error.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/error.rs b/src/error.rs
index 52bfb89..fa23c87 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -16,7 +16,8 @@ pub enum SpiderErrorType {
     BrokenLink,
     MissingHref,
     EmptyHref,
-    MissingTitle
+    MissingTitle,
+    Other
 }
 
 impl std::error::Error for SpiderError {}
@@ -33,9 +34,10 @@ impl SpiderError {
         match &self.error_type {
             SpiderErrorType::BrokenLink => format!("Page at \"{:?}\" contains a link pointing to \"{:?}\", but \"{:?}\" is a bad link!", self.source_page, self.target_page, self.target_page),
             SpiderErrorType::InvalidURL => format!("Page at \"{:?}\" contains a link with no href attribute!", self.source_page),
-            SpiderErrorType::MissingHref => format!("Page at \"{:?}\" contains a link with an invalid URL '{:?}'!", self.source_page, self.target_page),
+            SpiderErrorType::MissingHref => format!("Page at \"{:?}\" contains a link with an invalid URL \"{:?}\"!", self.source_page, self.target_page),
             SpiderErrorType::EmptyHref => format!("Page at \"{:?}\" contains a link with an empty href attribute!", self.source_page),
             SpiderErrorType::MissingTitle => format!("Page at \"{:?}\" does not have a title!", self.source_page),
+            SpiderErrorType::Other => format!("Other Error! source_page=\"{:?}\", http_error_code={:?}", self.source_page, self.http_error_code),
         }
     }
 }
\ No newline at end of file

From 3c5bf1dd315d89d44275e29de3e4b850f2720b8b Mon Sep 17 00:00:00 2001
From: Tyler Sengia
Date: Fri, 1 Dec 2023 22:32:01 -0500
Subject: [PATCH 03/10] Add TODO in algo for handling error

---
 src/algo.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/algo.rs b/src/algo.rs
index 6106a21..49190fb 100644
--- a/src/algo.rs
+++ b/src/algo.rs
@@ -162,7 +162,8 @@ pub async fn visit_page(
             // Parse out a URL from the link
             let next_url = get_url_from_element(l, &url);
-            if next_url.is_none() {
+            if next_url.is_err() {
+                // TODO: Transform the error code into an actual error and return it
                 println!("Failed to get URL from element: {}", l.html());
                 found_problem = true;
                 continue;

From 22f36852eb7282223be61c76ce853363df23375f Mon Sep 17 00:00:00 2001
From: Tyler Sengia
Date: Fri, 1 Dec 2023 22:32:10 -0500
Subject: [PATCH 04/10] Update main.rs to compile

---
 src/main.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/main.rs b/src/main.rs
index d0ff428..679af43 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -82,7 +82,10 @@ async fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
             println!("Something failed!");
         }
         let e = Box::new(SpiderError {
-            message: String::from("Check failed!"),
+            error_type: spider_crab::error::SpiderErrorType::Other,
+            source_page: Some("Unknown".to_string()),
+            http_error_code: None,
+            target_page: None
         }) as Box<dyn std::error::Error>;
         return Err(e);
     }

From 1cb5aee7e66a853bc6f2d7aee9e0c8e83b0d1050 Mon Sep 17 00:00:00 2001
From: Tyler Sengia
Date: Fri, 1 Dec 2023 22:32:25 -0500
Subject: [PATCH 05/10] formatting
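
Formatting pass only: add trailing commas, drop a duplicated blank
line, reorder imports, and wrap the get_url_from_element() signature
across multiple lines. The changes are consistent with a plain
cargo fmt run; no functional change is intended.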
---
 src/error.rs       | 7 +++----
 src/main.rs        | 2 +-
 src/url_helpers.rs | 7 +++++--
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/error.rs b/src/error.rs
index fa23c87..216fbf7 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -6,10 +6,9 @@ pub struct SpiderError {
     pub source_page: Option<String>,
     pub target_page: Option<String>,
     pub http_error_code: Option<u16>,
-    pub error_type: SpiderErrorType
+    pub error_type: SpiderErrorType,
 }
 
-
 #[derive(Debug)]
 pub enum SpiderErrorType {
     InvalidURL,
@@ -16,8 +15,8 @@ pub enum SpiderErrorType {
     BrokenLink,
     MissingHref,
     EmptyHref,
     MissingTitle,
-    Other
+    Other,
 }
 
 impl std::error::Error for SpiderError {}
@@ -40,4 +39,4 @@ impl SpiderError {
             SpiderErrorType::Other => format!("Other Error! source_page=\"{:?}\", http_error_code={:?}", self.source_page, self.http_error_code),
         }
     }
-}
\ No newline at end of file
+}
diff --git a/src/main.rs b/src/main.rs
index 679af43..303b871 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -85,7 +85,7 @@ async fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
             error_type: spider_crab::error::SpiderErrorType::Other,
             source_page: Some("Unknown".to_string()),
             http_error_code: None,
-            target_page: None
+            target_page: None,
         }) as Box<dyn std::error::Error>;
         return Err(e);
     }
diff --git a/src/url_helpers.rs b/src/url_helpers.rs
index 721cdde..004329a 100644
--- a/src/url_helpers.rs
+++ b/src/url_helpers.rs
@@ -1,13 +1,16 @@
 //! Helper functions called by the page traversal algorithm
 
+use crate::error::SpiderErrorType;
 use scraper::ElementRef;
 use url::{Host, ParseError, Url};
-use crate::error::SpiderErrorType;
 
 /// Attempt to extract and parse a URL from a `<a>` HTML element
 /// Returns `Some(Url)` if extract + parse was successful
 /// Returns `None` if extraction or parsing failed
-pub fn get_url_from_element(element: ElementRef, current_url: &Url) -> Result<Url, SpiderErrorType> {
+pub fn get_url_from_element(
+    element: ElementRef,
+    current_url: &Url,
+) -> Result<Url, SpiderErrorType> {
     let href_attribute = element.attr("href");
 
     if href_attribute.is_none() {

From 3cd6f4c854fc05a9b9980d3ac1d19f6eba0753ae Mon Sep 17 00:00:00 2001
From: Tyler Sengia
Date: Sun, 3 Dec 2023 01:02:13 -0500
Subject: [PATCH 06/10] Make good in Page struct an optional

---
 src/lib.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index d438eec..2d9557a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -22,11 +22,11 @@ pub struct Page {
     /// Content-Type that was given when this page was visited
     pub content_type: Option<String>,
     /// True if the page was visited and a 2XX HTTP status code was returned, false otherwise
-    pub good: bool,
+    pub good: Option<bool>,
     /// True if this page was visited, false otherwise
     pub checked: bool,
     /// URL that this page is represented by. Does not include URL parameters or fragments
-    pub url: Url,
+    pub url: Url
 }

From 3da9d50473a990ff2da1f0b0ca292805c569ee42 Mon Sep 17 00:00:00 2001
From: Tyler Sengia
Date: Sun, 3 Dec 2023 01:02:57 -0500
Subject: [PATCH 07/10] Pass URL to visit_page() to prevent needing to lock graph mutex to acquire URL
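
visit_page() previously received only a NodeIndex and had to briefly
lock the graph mutex just to read back the URL of the page it was
about to visit. The caller always knows the URL already, so pass it
in directly and queue newly discovered pages as (NodeIndex, Url)
pairs. The call site changes like this (both lines taken from the
diff below):

    // Before: visit_page() read the URL out of the locked graph
    visit_page(root_index, client, options, graph, page_map, 0).await
    // After: the caller hands the URL over directly
    visit_page(root_index, url.clone(), client, options, graph, page_map, 0).await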
---
 src/algo.rs | 39 ++++++++++++++-------------------------
 1 file changed, 14 insertions(+), 25 deletions(-)

diff --git a/src/algo.rs b/src/algo.rs
index 49190fb..8930b1c 100644
--- a/src/algo.rs
+++ b/src/algo.rs
@@ -53,29 +53,18 @@ fn check_content_type(response: &Response) -> (bool, Option<String>) {
 #[async_recursion]
 pub async fn visit_page(
     node_index: NodeIndex,
+    url: Url,
     client: &Client,
     options: &SpiderOptions,
     graph_mutex: &Mutex<&mut PageGraph>,
     page_map_mutex: &Mutex<&mut PageMap>,
     current_depth: i32,
 ) -> bool {
-    let url: Url;
-    let mut new_nodes = Vec::<NodeIndex>::new();
+    let mut new_nodes = Vec::<(NodeIndex,Url)>::new();
     let mut found_problem: bool = false;
 
     // Reserve some space for our new node indices.
     new_nodes.reserve(64);
-    {
-        // Momentarily acquire the lock so that we can grab the URL of the page
-        url = graph_mutex
-            .lock()
-            .unwrap()
-            .node_weight(node_index)
-            .unwrap()
-            .url
-            .clone();
-    } // End of scope, releases the lock
-
     {
         // Start of new scope, this is to get the document, parse links, and update the graph
@@ -85,15 +74,14 @@ pub async fn visit_page(
             .send()
             .await;
         let response: Response;
-        let is_good = response_result.is_ok();
 
         {
             // Acquire a lock on the graph so that we can update it with our findings for this page
             let mut graph = graph_mutex.lock().unwrap();
             let page = graph.node_weight_mut(node_index).unwrap();
-            if !is_good {
-                page.good = false;
+            if response_result.is_err() {
+                // TODO: Insert error into graph
                 if !options.quiet {
                     println!("Found bad link! {}", url);
                 }
@@ -135,7 +123,7 @@ pub async fn visit_page(
         let mut graph = graph_mutex.lock().unwrap();
         let page = graph.node_weight_mut(node_index).unwrap();
         if contents.is_err() {
-            page.good = false;
+            page.good = Some(false);
             if !options.quiet {
                 println!("Failed to get contents of page! {}", url);
             }
@@ -144,7 +132,7 @@ pub async fn visit_page(
         let contents = contents.unwrap();
         let html = Html::parse_document(contents.as_str());
 
-        page.good = true;
+        page.good = Some(true);
 
         if options.verbose {
             println!("Visited page {}", url.as_str());
@@ -183,8 +171,8 @@ pub async fn visit_page(
                 url: next_url.clone(),
                 title: None,
                 content_type: None,
-                good: false,
-                checked: false,
+                good: None,
+                checked: false
             });
 
             // Add an edge to the graph connecting current page to the target page
@@ -200,7 +188,7 @@ pub async fn visit_page(
                 continue;
             }
 
-            new_nodes.push(new_node);
+            new_nodes.push((new_node, next_url));
         }
     }
@@ -208,9 +196,10 @@ pub async fn visit_page(
     futures_vec.reserve_exact(new_nodes.len());
 
     // Create a future for each node we discovered
-    for node in new_nodes {
+    for (node, next_url) in new_nodes {
         futures_vec.push(visit_page(
             node,
+            next_url,
             client,
             options,
             graph_mutex,
@@ -241,9 +230,9 @@ pub async fn visit_root_page(
         root_index = graph.lock().unwrap().add_node(Page {
             title: None,
             content_type: None,
-            good: false,
+            good: None,
             checked: false,
-            url: url.clone(),
+            url: url.clone()
         });
 
         // Mark the root node as visited because visit_page assumes
@@ -251,5 +240,5 @@ pub async fn visit_root_page(
         page_map.lock().unwrap().insert(url.clone(), root_index);
     }
 
-    visit_page(root_index, client, options, graph, page_map, 0).await
+    visit_page(root_index, url.clone(), client, options, graph, page_map, 0).await
 }

From 6c0e985bf45a538d7b81a2e7f0e027132ffbcf60 Mon Sep 17 00:00:00 2001
From: Tyler Sengia
Date: Sun, 3 Dec 2023 01:07:59 -0500
Subject: [PATCH 08/10] Add status_code and new() for Page struct
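
Give Page a new() constructor so the struct literals in algo.rs do
not have to be repeated (and kept in sync) every time a field is
added, and record the HTTP status code returned when a page is
visited. Adding a node now reduces to a single call, e.g.:

    let new_node = graph.add_node(Page::new(&next_url));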
---
 src/algo.rs | 16 ++--------------
 src/lib.rs  | 18 +++++++++++++++++-
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/src/algo.rs b/src/algo.rs
index 8930b1c..a001c64 100644
--- a/src/algo.rs
+++ b/src/algo.rs
@@ -167,13 +167,7 @@ pub async fn visit_page(
             }
 
             // Target URL has not been visited yet, add a node to the graph
-            let new_node = graph.add_node(Page {
-                url: next_url.clone(),
-                title: None,
-                content_type: None,
-                good: None,
-                checked: false
-            });
+            let new_node = graph.add_node(Page::new(&next_url));
 
             // Add an edge to the graph connecting current page to the target page
             graph.add_edge(node_index, new_node, Link { html: l.html() });
@@ -227,13 +221,7 @@ pub async fn visit_root_page(
     let root_index: NodeIndex;
     {
         // Insert the root page as a node into the graph
-        root_index = graph.lock().unwrap().add_node(Page {
-            title: None,
-            content_type: None,
-            good: None,
-            checked: false,
-            url: url.clone()
-        });
+        root_index = graph.lock().unwrap().add_node(Page::new(url));
 
         // Mark the root node as visited because visit_page assumes
         // that the target page is already marked as visited
diff --git a/src/lib.rs b/src/lib.rs
index 2d9557a..49b49a9 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,4 +1,5 @@
 use petgraph::graph::{DiGraph, NodeIndex};
+use reqwest::StatusCode;
 use scraper::{selector::CssLocalName, Selector};
 use std::collections::HashMap;
 use std::sync::Mutex;
@@ -26,7 +27,22 @@ pub struct Page {
     /// True if this page was visited, false otherwise
     pub checked: bool,
     /// URL that this page is represented by. Does not include URL parameters or fragments
-    pub url: Url
+    pub url: Url,
+    /// HTTP status code returned when this page was visited
+    pub status_code: Option<StatusCode>
+}
+
+impl Page {
+    pub fn new(url: &Url) -> Self {
+        Self {
+            title: None,
+            content_type: None,
+            good: None,
+            checked: false,
+            url: url.clone(),
+            status_code: None
+        }
+    }
 }

From 76fb3b7a0fc9574fdef4fd99830c613b839f1723 Mon Sep 17 00:00:00 2001
From: Tyler Sengia
Date: Sun, 3 Dec 2023 01:51:34 -0500
Subject: [PATCH 09/10] Add error reporting
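
Rather than only printing problems to stdout, collect them: each Page
now carries an errors vector, and get_url_from_element() returns a
fully populated SpiderError (with source page and target URL) instead
of a bare SpiderErrorType. As a sketch of the intended use, a caller
can walk the collected errors after a crawl like this (main.rs is
wired up this way in the next patch):

    for page in spider_crab.graph.node_weights() {
        for error in &page.errors {
            println!("{}", error);
        }
    }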
---
 src/algo.rs        | 39 +++++++++++++++++++++++++--------------
 src/lib.rs         |  8 ++++++--
 src/url_helpers.rs | 28 ++++++++++++++++++++--------
 3 files changed, 51 insertions(+), 24 deletions(-)

diff --git a/src/algo.rs b/src/algo.rs
index a001c64..e1dc22e 100644
--- a/src/algo.rs
+++ b/src/algo.rs
@@ -60,14 +60,12 @@ pub async fn visit_page(
     page_map_mutex: &Mutex<&mut PageMap>,
     current_depth: i32,
 ) -> bool {
-    let mut new_nodes = Vec::<(NodeIndex,Url)>::new();
+    let mut new_nodes = Vec::<(NodeIndex, Url)>::new();
     let mut found_problem: bool = false;
 
     // Reserve some space for our new node indices.
     new_nodes.reserve(64);
 
     {
-        // Start of new scope, this is to get the document, parse links, and update the graph
-
         // Send an HTTP(S) GET request for the desired URL
         let response_result = client
             .request(reqwest::Method::GET, url.clone())
             .send()
             .await;
@@ -80,16 +78,20 @@ pub async fn visit_page(
         let response: Response;
 
         {
             // Acquire a lock on the graph so that we can update it with our findings for this page
             let mut graph = graph_mutex.lock().unwrap();
             let page = graph.node_weight_mut(node_index).unwrap();
-            if response_result.is_err() { 
+            if response_result.is_err() {
                 // TODO: Insert error into graph
                 if !options.quiet {
                     println!("Found bad link! {}", url);
                 }
+                page.status_code = response_result.err().unwrap().status();
                 return false;
             }
             response = response_result.unwrap();
+
+            // Record the HTTP status code
+            page.status_code = Some(response.status());
 
             // Attempt to get the Content-Type of the page
             let (parse_html, content_type) = check_content_type(&response);
             page.content_type = content_type.clone();
@@ -121,18 +123,22 @@ pub async fn visit_page(
         // Acquire a lock on the graph so that we can update it with our findings for this page
         let mut graph = graph_mutex.lock().unwrap();
-        let page = graph.node_weight_mut(node_index).unwrap();
-        if contents.is_err() {
-            page.good = Some(false);
-            if !options.quiet {
-                println!("Failed to get contents of page! {}", url);
+        {
+            let page = graph.node_weight_mut(node_index).unwrap();
+            if contents.is_err() {
+                page.good = Some(false);
+                if !options.quiet {
+                    println!("Failed to get contents of page! {}", url);
+                }
+                return false;
             }
-            return false;
         }
-
         let contents = contents.unwrap();
         let html = Html::parse_document(contents.as_str());
-
-        page.good = Some(true);
+        {
+            let page = graph.node_weight_mut(node_index).unwrap();
+            page.good = Some(true);
+        }
 
         if options.verbose {
             println!("Visited page {}", url.as_str());
@@ -151,9 +157,14 @@ pub async fn visit_page(
             // Parse out a URL from the link
             let next_url = get_url_from_element(l, &url);
             if next_url.is_err() {
-                // TODO: Transform the error code into an actual error and return it
-                println!("Failed to get URL from element: {}", l.html());
+                if !options.quiet {
+                    println!("Failed to get URL from element: {}", l.html());
+                }
                 found_problem = true;
+                {
+                    let page = graph.node_weight_mut(node_index).unwrap();
+                    page.errors.push(next_url.unwrap_err());
+                }
                 continue;
             }
             let next_url = next_url.unwrap();
diff --git a/src/lib.rs b/src/lib.rs
index 49b49a9..cf2a1b6 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,3 +1,4 @@
+use error::SpiderError;
 use petgraph::graph::{DiGraph, NodeIndex};
 use reqwest::StatusCode;
 use scraper::{selector::CssLocalName, Selector};
@@ -29,7 +30,9 @@ pub struct Page {
     /// URL that this page is represented by. Does not include URL parameters or fragments
     pub url: Url,
     /// HTTP status code returned when this page was visited
-    pub status_code: Option<StatusCode>
+    pub status_code: Option<StatusCode>,
+    /// Vector of errors encountered while scraping this page
+    pub errors: Vec<SpiderError>,
 }
@@ -40,7 +43,8 @@ impl Page {
             good: None,
             checked: false,
             url: url.clone(),
-            status_code: None
+            status_code: None,
+            errors: Vec::<SpiderError>::new(),
         }
     }
 }
diff --git a/src/url_helpers.rs b/src/url_helpers.rs
index 004329a..b76f92d 100644
--- a/src/url_helpers.rs
+++ b/src/url_helpers.rs
@@ -1,21 +1,23 @@
 //! Helper functions called by the page traversal algorithm
 
-use crate::error::SpiderErrorType;
+use crate::error::{SpiderError, SpiderErrorType};
 use scraper::ElementRef;
 use url::{Host, ParseError, Url};
 
 /// Attempt to extract and parse a URL from a `<a>` HTML element
 /// Returns `Some(Url)` if extract + parse was successful
 /// Returns `None` if extraction or parsing failed
-pub fn get_url_from_element(
-    element: ElementRef,
-    current_url: &Url,
-) -> Result<Url, SpiderErrorType> {
+pub fn get_url_from_element(element: ElementRef, current_url: &Url) -> Result<Url, SpiderError> {
     let href_attribute = element.attr("href");
 
     if href_attribute.is_none() {
         // Element does not have an href attribute
-        return Err(SpiderErrorType::MissingHref);
+        return Err(SpiderError {
+            error_type: SpiderErrorType::MissingHref,
+            source_page: Some(current_url.to_string()),
+            target_page: None,
+            http_error_code: None,
+        });
     }
 
     let href_attribute = href_attribute.unwrap();
@@ -24,14 +26,24 @@ pub fn get_url_from_element(element: ElementRef, current_url: &Url) -> Result<Url, SpiderError> {
     let next_url_str = href_attribute;
 
     if next_url_str.is_empty() {
         // Element's href attribute value is ""
-        return Err(SpiderErrorType::EmptyHref);
+        return Err(SpiderError {
+            error_type: SpiderErrorType::EmptyHref,
+            source_page: Some(current_url.to_string()),
+            target_page: None,
+            http_error_code: None,
+        });
     }
 
     let next_url = parse_relative_or_absolute_url(current_url, next_url_str);
 
     if next_url.is_none() {
         // Failed to parse the URL, report it as an error
-        return Err(SpiderErrorType::InvalidURL);
+        return Err(SpiderError {
+            error_type: SpiderErrorType::InvalidURL,
+            source_page: Some(current_url.to_string()),
+            target_page: Some(next_url_str.to_string()),
+            http_error_code: None,
+        });
     }
 
     Ok(next_url.unwrap())

From 0dc7ce5f494c7c5fc5c820842cf6e3988e024b6c Mon Sep 17 00:00:00 2001
From: Tyler Sengia
Date: Sun, 3 Dec 2023 02:10:46 -0500
Subject: [PATCH 10/10] Finishing touches
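
Tie the error reporting together:

- Replace the Other variant with FailedCrawl and return it from
  main() as the summary error when a crawl finds problems.
- Print the per-page errors collected during the crawl instead of a
  generic "Something failed!" message.
- Demote the "found a problem" println!s from !quiet to verbose.
- Attach the offending element's HTML to link errors so that the
  MissingHref and EmptyHref messages can show the element itself.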
---
 src/algo.rs        |  7 +++----
 src/error.rs       | 37 ++++++++++++++++++++++++++++-------
 src/main.rs        | 15 ++++++++-------
 src/url_helpers.rs |  3 +++
 4 files changed, 44 insertions(+), 18 deletions(-)

diff --git a/src/algo.rs b/src/algo.rs
index e1dc22e..745fa0d 100644
--- a/src/algo.rs
+++ b/src/algo.rs
@@ -79,8 +79,7 @@ pub async fn visit_page(
             let page = graph.node_weight_mut(node_index).unwrap();
 
             if response_result.is_err() {
-                // TODO: Insert error into graph
-                if !options.quiet {
+                if options.verbose {
                     println!("Found bad link! {}", url);
                 }
                 page.status_code = response_result.err().unwrap().status();
@@ -127,7 +126,7 @@ pub async fn visit_page(
             let page = graph.node_weight_mut(node_index).unwrap();
             if contents.is_err() {
                 page.good = Some(false);
-                if !options.quiet {
+                if options.verbose {
                     println!("Failed to get contents of page! {}", url);
                 }
                 return false;
@@ -157,7 +156,7 @@ pub async fn visit_page(
             // Parse out a URL from the link
             let next_url = get_url_from_element(l, &url);
             if next_url.is_err() {
-                if !options.quiet {
+                if options.verbose {
                     println!("Failed to get URL from element: {}", l.html());
                 }
                 found_problem = true;
diff --git a/src/error.rs b/src/error.rs
index 216fbf7..2008180 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -7,6 +7,7 @@ pub struct SpiderError {
     pub target_page: Option<String>,
     pub http_error_code: Option<u16>,
     pub error_type: SpiderErrorType,
+    pub html: Option<String>,
 }
 
 #[derive(Debug)]
@@ -16,7 +17,7 @@ pub enum SpiderErrorType {
     MissingHref,
     EmptyHref,
     MissingTitle,
-    Other,
+    FailedCrawl,
 }
 
 impl std::error::Error for SpiderError {}
@@ -31,12 +32,34 @@ impl SpiderError {
 impl SpiderError {
     fn get_message(&self) -> String {
         match &self.error_type {
-            SpiderErrorType::BrokenLink => format!("Page at \"{:?}\" contains a link pointing to \"{:?}\", but \"{:?}\" is a bad link!", self.source_page, self.target_page, self.target_page),
-            SpiderErrorType::InvalidURL => format!("Page at \"{:?}\" contains a link with no href attribute!", self.source_page),
-            SpiderErrorType::MissingHref => format!("Page at \"{:?}\" contains a link with an invalid URL \"{:?}\"!", self.source_page, self.target_page),
-            SpiderErrorType::EmptyHref => format!("Page at \"{:?}\" contains a link with an empty href attribute!", self.source_page),
-            SpiderErrorType::MissingTitle => format!("Page at \"{:?}\" does not have a title!", self.source_page),
-            SpiderErrorType::Other => format!("Other Error! source_page=\"{:?}\", http_error_code={:?}", self.source_page, self.http_error_code),
+            SpiderErrorType::BrokenLink => format!(
+                "Page at {:?} contains a link pointing to {:?}, but {:?} is a bad link!",
+                self.source_page.as_ref().unwrap(),
+                self.target_page.as_ref().unwrap(),
+                self.target_page.as_ref().unwrap()
+            ),
+            SpiderErrorType::InvalidURL => format!(
+                "Page at {:?} contains a link with an invalid URL {:?}!",
+                self.source_page.as_ref().unwrap(),
+                self.target_page.as_ref().unwrap()
+            ),
+            SpiderErrorType::MissingHref => format!(
+                "Page at {:?} contains a link with no href attribute! Element is: {:?}",
+                self.source_page.as_ref().unwrap(),
+                self.html.as_ref().unwrap()
+            ),
+            SpiderErrorType::EmptyHref => format!(
+                "Page at {:?} contains a link with an empty href attribute! Element is: {:?}",
+                self.source_page.as_ref().unwrap(),
+                self.html.as_ref().unwrap()
+            ),
+            SpiderErrorType::MissingTitle => format!(
+                "Page at {:?} does not have a title!",
+                self.source_page.as_ref().unwrap()
+            ),
+            SpiderErrorType::FailedCrawl => {
+                String::from("Found a problem while crawling the target webpage!")
+            }
         }
     }
 }
diff --git a/src/main.rs b/src/main.rs
index 303b871..cd4b759 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -49,10 +49,6 @@ async fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
     let quiet: bool = matches.get_flag("quiet");
     let verbose: bool = matches.get_flag("verbose");
 
-    if !quiet {
-        println!("Spider Crab");
-    }
-
     let mut spider_crab = SpiderCrab::default();
 
     spider_crab.options.add_host(url_str);
@@ -79,13 +75,18 @@ async fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
         return Ok(());
     } else {
         if !quiet {
-            println!("Something failed!");
+            for page in spider_crab.graph.node_weights() {
+                for error in &page.errors {
+                    println!("{}", error);
+                }
+            }
         }
         let e = Box::new(SpiderError {
-            error_type: spider_crab::error::SpiderErrorType::Other,
-            source_page: Some("Unknown".to_string()),
+            error_type: spider_crab::error::SpiderErrorType::FailedCrawl,
+            source_page: None,
             http_error_code: None,
             target_page: None,
+            html: None,
         }) as Box<dyn std::error::Error>;
         return Err(e);
     }
diff --git a/src/url_helpers.rs b/src/url_helpers.rs
index b76f92d..30cccab 100644
--- a/src/url_helpers.rs
+++ b/src/url_helpers.rs
@@ -17,6 +17,7 @@ pub fn get_url_from_element(element: ElementRef, current_url: &Url) -> Result<Url, SpiderError> {
             source_page: Some(current_url.to_string()),
             target_page: None,
             http_error_code: None,
+            html: Some(element.html()),
         });
     }
 
@@ -30,6 +31,7 @@ pub fn get_url_from_element(element: ElementRef, current_url: &Url) -> Result<Url, SpiderError> {
             source_page: Some(current_url.to_string()),
             target_page: None,
             http_error_code: None,
+            html: Some(element.html()),
         });
     }
 
@@ -42,6 +44,7 @@ pub fn get_url_from_element(element: ElementRef, current_url: &Url) -> Result<Url, SpiderError> {
             source_page: Some(current_url.to_string()),
             target_page: Some(next_url_str.to_string()),
             http_error_code: None,
+            html: Some(element.html()),
         });
     }