From 84ce028cd8be427af48bc6e6e50294159e6d8f3c Mon Sep 17 00:00:00 2001
From: Tyler Sengia
Date: Fri, 1 Dec 2023 22:27:38 -0500
Subject: [PATCH 01/10] Expand error types for more detail, and use them in url_helpers.rs

---
 src/error.rs       | 30 ++++++++++++++++++++++++++++--
 src/url_helpers.rs | 24 ++++++++++++++++++------
 2 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/src/error.rs b/src/error.rs
index 995b47e..52bfb89 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -3,13 +3,39 @@
 #[derive(Debug)]
 /// Custom error type for Spider Crab
 pub struct SpiderError {
-    pub message: String,
+    pub source_page: Option<String>,
+    pub target_page: Option<String>,
+    pub http_error_code: Option<u16>,
+    pub error_type: SpiderErrorType
+}
+
+
+#[derive(Debug)]
+pub enum SpiderErrorType {
+    InvalidURL,
+    BrokenLink,
+    MissingHref,
+    EmptyHref,
+    MissingTitle
+}
 
 impl std::error::Error for SpiderError {}
 
 impl std::fmt::Display for SpiderError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "SpiderError: {}", self.message)
+        let message = self.get_message();
+        write!(f, "SpiderError ({:?}): {}", self.error_type, message)
     }
 }
+
+impl SpiderError {
+    fn get_message(&self) -> String {
+        match &self.error_type {
+            SpiderErrorType::BrokenLink => format!("Page at \"{:?}\" contains a link pointing to \"{:?}\", but \"{:?}\" is a bad link!", self.source_page, self.target_page, self.target_page),
+            SpiderErrorType::InvalidURL => format!("Page at \"{:?}\" contains a link with no href attribute!", self.source_page),
+            SpiderErrorType::MissingHref => format!("Page at \"{:?}\" contains a link with an invalid URL '{:?}'!", self.source_page, self.target_page),
+            SpiderErrorType::EmptyHref => format!("Page at \"{:?}\" contains a link with an empty href attribute!", self.source_page),
+            SpiderErrorType::MissingTitle => format!("Page at \"{:?}\" does not have a title!", self.source_page),
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/url_helpers.rs b/src/url_helpers.rs
index 7701502..721cdde 100644
--- a/src/url_helpers.rs
+++ b/src/url_helpers.rs
@@ -2,24 +2,36 @@
 
 use scraper::ElementRef;
 use url::{Host, ParseError, Url};
+use crate::error::SpiderErrorType;
 
 /// Attempt to extract and parse a URL from a `<a>` HTML element
 /// Returns `Some(Url)` if extract + parse was successful
 /// Returns `None` if extraction or parsing failed
-pub fn get_url_from_element(element: ElementRef, current_url: &Url) -> Option<Url> {
-    let href_attribute = element.attr("href")?;
+pub fn get_url_from_element(element: ElementRef, current_url: &Url) -> Result<Url, SpiderErrorType> {
+    let href_attribute = element.attr("href");
+
+    if href_attribute.is_none() {
+        // Element does not have an href attribute
+        return Err(SpiderErrorType::MissingHref);
+    }
+
+    let href_attribute = href_attribute.unwrap();
 
     let next_url_str = href_attribute;
 
     if next_url_str.is_empty() {
-        // href attribute value is ""
-        return None;
+        // Element's href attribute value is ""
+        return Err(SpiderErrorType::EmptyHref);
    }
 
     let next_url = parse_relative_or_absolute_url(current_url, next_url_str);
 
-    next_url.as_ref()?;
-    next_url
+    if next_url.is_none() {
+        // Failed to parse the URL, report it as an error
+        return Err(SpiderErrorType::InvalidURL);
+    }
+
+    Ok(next_url.unwrap())
 }
 
 /// Attempts to grab the host from `url` and see if it matches any element listed in `hosts`

From cbfe4157d8b23c186b87f7dc49168dad05b1957b Mon Sep 17 00:00:00 2001
From: Tyler Sengia
Date: Fri, 1 Dec 2023 22:31:50 -0500
Subject: [PATCH 02/10] Add Other error type
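
Other is a catch-all variant for failures that are not tied to one
specific link; its message reports whatever source_page and
http_error_code context is available. As a rough sketch of the
intended use (main.rs is updated to construct exactly this in patch
04 of this series):

    let e = SpiderError {
        error_type: SpiderErrorType::Other,
        source_page: Some("Unknown".to_string()),
        target_page: None,
        http_error_code: None,
    };
    // The Display impl prints: SpiderError (Other): Other Error! ...
    println!("{}", e);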
---
 src/error.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/error.rs b/src/error.rs
index 52bfb89..fa23c87 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -16,7 +16,8 @@ pub enum SpiderErrorType {
     BrokenLink,
     MissingHref,
     EmptyHref,
-    MissingTitle
+    MissingTitle,
+    Other
 }
 
 impl std::error::Error for SpiderError {}
@@ -33,9 +34,10 @@ impl SpiderError {
         match &self.error_type {
             SpiderErrorType::BrokenLink => format!("Page at \"{:?}\" contains a link pointing to \"{:?}\", but \"{:?}\" is a bad link!", self.source_page, self.target_page, self.target_page),
             SpiderErrorType::InvalidURL => format!("Page at \"{:?}\" contains a link with no href attribute!", self.source_page),
-            SpiderErrorType::MissingHref => format!("Page at \"{:?}\" contains a link with an invalid URL '{:?}'!", self.source_page, self.target_page),
+            SpiderErrorType::MissingHref => format!("Page at \"{:?}\" contains a link with an invalid URL \"{:?}\"!", self.source_page, self.target_page),
             SpiderErrorType::EmptyHref => format!("Page at \"{:?}\" contains a link with an empty href attribute!", self.source_page),
             SpiderErrorType::MissingTitle => format!("Page at \"{:?}\" does not have a title!", self.source_page),
+            SpiderErrorType::Other => format!("Other Error! source_page=\"{:?}\", http_error_code={:?}", self.source_page, self.http_error_code),
         }
     }
 }
\ No newline at end of file

From 3c5bf1dd315d89d44275e29de3e4b850f2720b8b Mon Sep 17 00:00:00 2001
From: Tyler Sengia
Date: Fri, 1 Dec 2023 22:32:01 -0500
Subject: [PATCH 03/10] Add TODO in algo for handling error

---
 src/algo.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/algo.rs b/src/algo.rs
index 6106a21..49190fb 100644
--- a/src/algo.rs
+++ b/src/algo.rs
@@ -162,7 +162,8 @@ pub async fn visit_page(
             // Parse out a URL from the link
             let next_url = get_url_from_element(l, &url);
-            if next_url.is_none() {
+            if next_url.is_err() {
+                // TODO: Transform the error code into an actual error and return it
                 println!("Failed to get URL from element: {}", l.html());
                 found_problem = true;
                 continue;

From 22f36852eb7282223be61c76ce853363df23375f Mon Sep 17 00:00:00 2001
From: Tyler Sengia
Date: Fri, 1 Dec 2023 22:32:10 -0500
Subject: [PATCH 04/10] Update main.rs to compile

---
 src/main.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/main.rs b/src/main.rs
index d0ff428..679af43 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -82,7 +82,10 @@ async fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
             println!("Something failed!");
         }
         let e = Box::new(SpiderError {
-            message: String::from("Check failed!"),
+            error_type: spider_crab::error::SpiderErrorType::Other,
+            source_page: Some("Unknown".to_string()),
+            http_error_code: None,
+            target_page: None
         }) as Box<dyn std::error::Error>;
         return Err(e);
     }

From 1cb5aee7e66a853bc6f2d7aee9e0c8e83b0d1050 Mon Sep 17 00:00:00 2001
From: Tyler Sengia
Date: Fri, 1 Dec 2023 22:32:25 -0500
Subject: [PATCH 05/10] formatting
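
Formatting pass only: add trailing commas, drop a duplicated blank
line, reorder imports, and wrap the get_url_from_element() signature
across multiple lines. The changes are consistent with a plain
cargo fmt run; no functional change is intended.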
---
 src/error.rs       | 7 +++----
 src/main.rs        | 2 +-
 src/url_helpers.rs | 7 +++++--
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/error.rs b/src/error.rs
index fa23c87..216fbf7 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -6,10 +6,9 @@ pub struct SpiderError {
     pub source_page: Option<String>,
     pub target_page: Option<String>,
     pub http_error_code: Option<u16>,
-    pub error_type: SpiderErrorType
+    pub error_type: SpiderErrorType,
 }
 
-
 #[derive(Debug)]
 pub enum SpiderErrorType {
     InvalidURL,
@@ -16,8 +15,8 @@ pub enum SpiderErrorType {
     BrokenLink,
     MissingHref,
     EmptyHref,
     MissingTitle,
-    Other
+    Other,
 }
 
 impl std::error::Error for SpiderError {}
@@ -40,4 +39,4 @@ impl SpiderError {
             SpiderErrorType::Other => format!("Other Error! source_page=\"{:?}\", http_error_code={:?}", self.source_page, self.http_error_code),
         }
     }
-}
\ No newline at end of file
+}
diff --git a/src/main.rs b/src/main.rs
index 679af43..303b871 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -85,7 +85,7 @@ async fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
             error_type: spider_crab::error::SpiderErrorType::Other,
             source_page: Some("Unknown".to_string()),
             http_error_code: None,
-            target_page: None
+            target_page: None,
         }) as Box<dyn std::error::Error>;
         return Err(e);
     }
diff --git a/src/url_helpers.rs b/src/url_helpers.rs
index 721cdde..004329a 100644
--- a/src/url_helpers.rs
+++ b/src/url_helpers.rs
@@ -1,13 +1,16 @@
 //! Helper functions called by the page traversal algorithm
 
+use crate::error::SpiderErrorType;
 use scraper::ElementRef;
 use url::{Host, ParseError, Url};
-use crate::error::SpiderErrorType;
 
 /// Attempt to extract and parse a URL from a `<a>` HTML element
 /// Returns `Some(Url)` if extract + parse was successful
 /// Returns `None` if extraction or parsing failed
-pub fn get_url_from_element(element: ElementRef, current_url: &Url) -> Result<Url, SpiderErrorType> {
+pub fn get_url_from_element(
+    element: ElementRef,
+    current_url: &Url,
+) -> Result<Url, SpiderErrorType> {
     let href_attribute = element.attr("href");
 
     if href_attribute.is_none() {

From 3cd6f4c854fc05a9b9980d3ac1d19f6eba0753ae Mon Sep 17 00:00:00 2001
From: Tyler Sengia
Date: Sun, 3 Dec 2023 01:02:13 -0500
Subject: [PATCH 06/10] Make good in Page struct an optional

---
 src/lib.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index d438eec..2d9557a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -22,11 +22,11 @@ pub struct Page {
     /// Content-Type that was given when this page was visited
     pub content_type: Option<String>,
     /// True if the page was visited and a 2XX HTTP status code was returned, false otherwise
-    pub good: bool,
+    pub good: Option<bool>,
     /// True if this page was visited, false otherwise
     pub checked: bool,
     /// URL that this page is represented by. Does not include URL parameters or fragments
-    pub url: Url,
+    pub url: Url
 }

From 3da9d50473a990ff2da1f0b0ca292805c569ee42 Mon Sep 17 00:00:00 2001
From: Tyler Sengia
Date: Sun, 3 Dec 2023 01:02:57 -0500
Subject: [PATCH 07/10] Pass URL to visit_page() to prevent needing to lock graph mutex to acquire URL
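
visit_page() previously received only a NodeIndex and had to briefly
lock the graph mutex just to read back the URL of the page it was
about to visit. The caller always knows the URL already, so pass it
in directly and queue newly discovered pages as (NodeIndex, Url)
pairs. The call site changes like this (both lines taken from the
diff below):

    // Before: visit_page() read the URL out of the locked graph
    visit_page(root_index, client, options, graph, page_map, 0).await
    // After: the caller hands the URL over directly
    visit_page(root_index, url.clone(), client, options, graph, page_map, 0).await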
---
 src/algo.rs | 39 ++++++++++++++-------------------------
 1 file changed, 14 insertions(+), 25 deletions(-)

diff --git a/src/algo.rs b/src/algo.rs
index 49190fb..8930b1c 100644
--- a/src/algo.rs
+++ b/src/algo.rs
@@ -53,29 +53,18 @@ fn check_content_type(response: &Response) -> (bool, Option<String>) {
 #[async_recursion]
 pub async fn visit_page(
     node_index: NodeIndex,
+    url: Url,
     client: &Client,
     options: &SpiderOptions,
     graph_mutex: &Mutex<&mut PageGraph>,
     page_map_mutex: &Mutex<&mut PageMap>,
     current_depth: i32,
 ) -> bool {
-    let url: Url;
-    let mut new_nodes = Vec::<NodeIndex>::new();
+    let mut new_nodes = Vec::<(NodeIndex,Url)>::new();
     let mut found_problem: bool = false;
 
     // Reserve some space for our new node indices.
     new_nodes.reserve(64);
-    {
-        // Momentarily acquire the lock so that we can grab the URL of the page
-        url = graph_mutex
-            .lock()
-            .unwrap()
-            .node_weight(node_index)
-            .unwrap()
-            .url
-            .clone();
-    } // End of scope, releases the lock
-
     {
         // Start of new scope, this is to get the document, parse links, and update the graph
@@ -85,15 +74,14 @@ pub async fn visit_page(
             .send()
             .await;
         let response: Response;
-        let is_good = response_result.is_ok();
 
         {
             // Acquire a lock on the graph so that we can update it with our findings for this page
             let mut graph = graph_mutex.lock().unwrap();
             let page = graph.node_weight_mut(node_index).unwrap();
-            if !is_good {
-                page.good = false;
+            if response_result.is_err() {
+                // TODO: Insert error into graph
                 if !options.quiet {
                     println!("Found bad link! {}", url);
                 }
@@ -135,7 +123,7 @@ pub async fn visit_page(
         let mut graph = graph_mutex.lock().unwrap();
         let page = graph.node_weight_mut(node_index).unwrap();
         if contents.is_err() {
-            page.good = false;
+            page.good = Some(false);
             if !options.quiet {
                 println!("Failed to get contents of page! {}", url);
             }
@@ -144,7 +132,7 @@ pub async fn visit_page(
         let contents = contents.unwrap();
         let html = Html::parse_document(contents.as_str());
 
-        page.good = true;
+        page.good = Some(true);
 
         if options.verbose {
             println!("Visited page {}", url.as_str());
@@ -183,8 +171,8 @@ pub async fn visit_page(
                 url: next_url.clone(),
                 title: None,
                 content_type: None,
-                good: false,
-                checked: false,
+                good: None,
+                checked: false
             });
 
             // Add an edge to the graph connecting current page to the target page
@@ -200,7 +188,7 @@ pub async fn visit_page(
                 continue;
             }
 
-            new_nodes.push(new_node);
+            new_nodes.push((new_node, next_url));
         }
     }
@@ -208,9 +196,10 @@ pub async fn visit_page(
     futures_vec.reserve_exact(new_nodes.len());
 
     // Create a future for each node we discovered
-    for node in new_nodes {
+    for (node, next_url) in new_nodes {
         futures_vec.push(visit_page(
             node,
+            next_url,
             client,
             options,
             graph_mutex,
@@ -241,9 +230,9 @@ pub async fn visit_root_page(
         root_index = graph.lock().unwrap().add_node(Page {
             title: None,
             content_type: None,
-            good: false,
+            good: None,
             checked: false,
-            url: url.clone(),
+            url: url.clone()
         });
 
         // Mark the root node as visited because visit_page assumes
@@ -251,5 +240,5 @@ pub async fn visit_root_page(
         page_map.lock().unwrap().insert(url.clone(), root_index);
     }
 
-    visit_page(root_index, client, options, graph, page_map, 0).await
+    visit_page(root_index, url.clone(), client, options, graph, page_map, 0).await
 }

From 6c0e985bf45a538d7b81a2e7f0e027132ffbcf60 Mon Sep 17 00:00:00 2001
From: Tyler Sengia
Date: Sun, 3 Dec 2023 01:07:59 -0500
Subject: [PATCH 08/10] Add status_code and new() for Page struct
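
Give Page a new() constructor so the struct literals in algo.rs do
not have to be repeated (and kept in sync) every time a field is
added, and record the HTTP status code returned when a page is
visited. Adding a node now reduces to a single call, e.g.:

    let new_node = graph.add_node(Page::new(&next_url));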
---
 src/algo.rs | 16 ++--------------
 src/lib.rs  | 18 +++++++++++++++++-
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/src/algo.rs b/src/algo.rs
index 8930b1c..a001c64 100644
--- a/src/algo.rs
+++ b/src/algo.rs
@@ -167,13 +167,7 @@ pub async fn visit_page(
             }
 
             // Target URL has not been visited yet, add a node to the graph
-            let new_node = graph.add_node(Page {
-                url: next_url.clone(),
-                title: None,
-                content_type: None,
-                good: None,
-                checked: false
-            });
+            let new_node = graph.add_node(Page::new(&next_url));
 
             // Add an edge to the graph connecting current page to the target page
             graph.add_edge(node_index, new_node, Link { html: l.html() });
@@ -227,13 +221,7 @@ pub async fn visit_root_page(
     let root_index: NodeIndex;
     {
         // Insert the root page as a node into the graph
-        root_index = graph.lock().unwrap().add_node(Page {
-            title: None,
-            content_type: None,
-            good: None,
-            checked: false,
-            url: url.clone()
-        });
+        root_index = graph.lock().unwrap().add_node(Page::new(url));
 
         // Mark the root node as visited because visit_page assumes
         // that the target page is already marked as visited
diff --git a/src/lib.rs b/src/lib.rs
index 2d9557a..49b49a9 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,4 +1,5 @@
 use petgraph::graph::{DiGraph, NodeIndex};
+use reqwest::StatusCode;
 use scraper::{selector::CssLocalName, Selector};
 use std::collections::HashMap;
 use std::sync::Mutex;
@@ -26,7 +27,22 @@ pub struct Page {
     /// True if this page was visited, false otherwise
     pub checked: bool,
     /// URL that this page is represented by. Does not include URL parameters or fragments
-    pub url: Url
+    pub url: Url,
+    /// HTTP status code returned when this page was visited
+    pub status_code: Option<StatusCode>
+}
+
+impl Page {
+    pub fn new(url: &Url) -> Self {
+        Self {
+            title: None,
+            content_type: None,
+            good: None,
+            checked: false,
+            url: url.clone(),
+            status_code: None
+        }
+    }
 }

From 76fb3b7a0fc9574fdef4fd99830c613b839f1723 Mon Sep 17 00:00:00 2001
From: Tyler Sengia
Date: Sun, 3 Dec 2023 01:51:34 -0500
Subject: [PATCH 09/10] Add error reporting
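
Rather than only printing problems to stdout, collect them: each Page
now carries an errors vector, and get_url_from_element() returns a
fully populated SpiderError (with source page and target URL) instead
of a bare SpiderErrorType. As a sketch of the intended use, a caller
can walk the collected errors after a crawl like this (main.rs is
wired up this way in the next patch):

    for page in spider_crab.graph.node_weights() {
        for error in &page.errors {
            println!("{}", error);
        }
    }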
---
 src/algo.rs        | 39 +++++++++++++++++++++++++--------------
 src/lib.rs         |  8 ++++++--
 src/url_helpers.rs | 28 ++++++++++++++++++++--------
 3 files changed, 51 insertions(+), 24 deletions(-)

diff --git a/src/algo.rs b/src/algo.rs
index a001c64..e1dc22e 100644
--- a/src/algo.rs
+++ b/src/algo.rs
@@ -60,14 +60,12 @@ pub async fn visit_page(
     page_map_mutex: &Mutex<&mut PageMap>,
     current_depth: i32,
 ) -> bool {
-    let mut new_nodes = Vec::<(NodeIndex,Url)>::new();
+    let mut new_nodes = Vec::<(NodeIndex, Url)>::new();
     let mut found_problem: bool = false;
 
     // Reserve some space for our new node indices.
     new_nodes.reserve(64);
 
     {
-        // Start of new scope, this is to get the document, parse links, and update the graph
-
         // Send an HTTP(S) GET request for the desired URL
         let response_result = client
             .request(reqwest::Method::GET, url.clone())
             .send()
             .await;
@@ -80,16 +78,20 @@ pub async fn visit_page(
         let response: Response;
 
         {
             // Acquire a lock on the graph so that we can update it with our findings for this page
             let mut graph = graph_mutex.lock().unwrap();
             let page = graph.node_weight_mut(node_index).unwrap();
-            if response_result.is_err() { 
+            if response_result.is_err() {
                 // TODO: Insert error into graph
                 if !options.quiet {
                     println!("Found bad link! {}", url);
                 }
+                page.status_code = response_result.err().unwrap().status();
                 return false;
             }
             response = response_result.unwrap();
+
+            // Record the HTTP status code
+            page.status_code = Some(response.status());
 
             // Attempt to get the Content-Type of the page
             let (parse_html, content_type) = check_content_type(&response);
             page.content_type = content_type.clone();
@@ -121,18 +123,22 @@ pub async fn visit_page(
         // Acquire a lock on the graph so that we can update it with our findings for this page
         let mut graph = graph_mutex.lock().unwrap();
-        let page = graph.node_weight_mut(node_index).unwrap();
-        if contents.is_err() {
-            page.good = Some(false);
-            if !options.quiet {
-                println!("Failed to get contents of page! {}", url);
+        {
+            let page = graph.node_weight_mut(node_index).unwrap();
+            if contents.is_err() {
+                page.good = Some(false);
+                if !options.quiet {
+                    println!("Failed to get contents of page! {}", url);
+                }
+                return false;
             }
-            return false;
         }
-
         let contents = contents.unwrap();
         let html = Html::parse_document(contents.as_str());
-
-        page.good = Some(true);
+        {
+            let page = graph.node_weight_mut(node_index).unwrap();
+            page.good = Some(true);
+        }
 
         if options.verbose {
             println!("Visited page {}", url.as_str());
@@ -151,9 +157,14 @@ pub async fn visit_page(
             // Parse out a URL from the link
             let next_url = get_url_from_element(l, &url);
             if next_url.is_err() {
-                // TODO: Transform the error code into an actual error and return it
-                println!("Failed to get URL from element: {}", l.html());
+                if !options.quiet {
+                    println!("Failed to get URL from element: {}", l.html());
+                }
                 found_problem = true;
+                {
+                    let page = graph.node_weight_mut(node_index).unwrap();
+                    page.errors.push(next_url.unwrap_err());
+                }
                 continue;
             }
             let next_url = next_url.unwrap();
diff --git a/src/lib.rs b/src/lib.rs
index 49b49a9..cf2a1b6 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,3 +1,4 @@
+use error::SpiderError;
 use petgraph::graph::{DiGraph, NodeIndex};
 use reqwest::StatusCode;
 use scraper::{selector::CssLocalName, Selector};
@@ -29,7 +30,9 @@ pub struct Page {
     /// URL that this page is represented by. Does not include URL parameters or fragments
     pub url: Url,
     /// HTTP status code returned when this page was visited
-    pub status_code: Option<StatusCode>
+    pub status_code: Option<StatusCode>,
+    /// Vector of errors encountered while scraping this page
+    pub errors: Vec<SpiderError>,
 }
@@ -40,7 +43,8 @@ impl Page {
             good: None,
             checked: false,
             url: url.clone(),
-            status_code: None
+            status_code: None,
+            errors: Vec::<SpiderError>::new(),
         }
     }
 }
diff --git a/src/url_helpers.rs b/src/url_helpers.rs
index 004329a..b76f92d 100644
--- a/src/url_helpers.rs
+++ b/src/url_helpers.rs
@@ -1,21 +1,23 @@
 //! Helper functions called by the page traversal algorithm
 
-use crate::error::SpiderErrorType;
+use crate::error::{SpiderError, SpiderErrorType};
 use scraper::ElementRef;
 use url::{Host, ParseError, Url};
 
 /// Attempt to extract and parse a URL from a `<a>` HTML element
 /// Returns `Some(Url)` if extract + parse was successful
 /// Returns `None` if extraction or parsing failed
-pub fn get_url_from_element(
-    element: ElementRef,
-    current_url: &Url,
-) -> Result<Url, SpiderErrorType> {
+pub fn get_url_from_element(element: ElementRef, current_url: &Url) -> Result<Url, SpiderError> {
     let href_attribute = element.attr("href");
 
     if href_attribute.is_none() {
         // Element does not have an href attribute
-        return Err(SpiderErrorType::MissingHref);
+        return Err(SpiderError {
+            error_type: SpiderErrorType::MissingHref,
+            source_page: Some(current_url.to_string()),
+            target_page: None,
+            http_error_code: None,
+        });
     }
 
     let href_attribute = href_attribute.unwrap();
@@ -24,14 +26,24 @@ pub fn get_url_from_element(element: ElementRef, current_url: &Url) -> Result<Url, SpiderError> {
     let next_url_str = href_attribute;
 
     if next_url_str.is_empty() {
         // Element's href attribute value is ""
-        return Err(SpiderErrorType::EmptyHref);
+        return Err(SpiderError {
+            error_type: SpiderErrorType::EmptyHref,
+            source_page: Some(current_url.to_string()),
+            target_page: None,
+            http_error_code: None,
+        });
     }
 
     let next_url = parse_relative_or_absolute_url(current_url, next_url_str);
 
     if next_url.is_none() {
         // Failed to parse the URL, report it as an error
-        return Err(SpiderErrorType::InvalidURL);
+        return Err(SpiderError {
+            error_type: SpiderErrorType::InvalidURL,
+            source_page: Some(current_url.to_string()),
+            target_page: Some(next_url_str.to_string()),
+            http_error_code: None,
+        });
     }
 
     Ok(next_url.unwrap())

From 0dc7ce5f494c7c5fc5c820842cf6e3988e024b6c Mon Sep 17 00:00:00 2001
From: Tyler Sengia
Date: Sun, 3 Dec 2023 02:10:46 -0500
Subject: [PATCH 10/10] Finishing touches
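
Tie the error reporting together:

- Replace the Other variant with FailedCrawl and return it from
  main() as the summary error when a crawl finds problems.
- Print the per-page errors collected during the crawl instead of a
  generic "Something failed!" message.
- Demote the "found a problem" println!s from !quiet to verbose.
- Attach the offending element's HTML to link errors so that the
  MissingHref and EmptyHref messages can show the element itself.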
---
 src/algo.rs        |  7 +++----
 src/error.rs       | 37 ++++++++++++++++++++++++++++-------
 src/main.rs        | 15 ++++++++-------
 src/url_helpers.rs |  3 +++
 4 files changed, 44 insertions(+), 18 deletions(-)

diff --git a/src/algo.rs b/src/algo.rs
index e1dc22e..745fa0d 100644
--- a/src/algo.rs
+++ b/src/algo.rs
@@ -79,8 +79,7 @@ pub async fn visit_page(
             let page = graph.node_weight_mut(node_index).unwrap();
 
             if response_result.is_err() {
-                // TODO: Insert error into graph
-                if !options.quiet {
+                if options.verbose {
                     println!("Found bad link! {}", url);
                 }
                 page.status_code = response_result.err().unwrap().status();
@@ -127,7 +126,7 @@ pub async fn visit_page(
             let page = graph.node_weight_mut(node_index).unwrap();
             if contents.is_err() {
                 page.good = Some(false);
-                if !options.quiet {
+                if options.verbose {
                     println!("Failed to get contents of page! {}", url);
                 }
                 return false;
@@ -157,7 +156,7 @@ pub async fn visit_page(
             // Parse out a URL from the link
             let next_url = get_url_from_element(l, &url);
             if next_url.is_err() {
-                if !options.quiet {
+                if options.verbose {
                     println!("Failed to get URL from element: {}", l.html());
                 }
                 found_problem = true;
diff --git a/src/error.rs b/src/error.rs
index 216fbf7..2008180 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -7,6 +7,7 @@ pub struct SpiderError {
     pub target_page: Option<String>,
     pub http_error_code: Option<u16>,
     pub error_type: SpiderErrorType,
+    pub html: Option<String>,
 }
 
 #[derive(Debug)]
@@ -16,7 +17,7 @@ pub enum SpiderErrorType {
     MissingHref,
     EmptyHref,
     MissingTitle,
-    Other,
+    FailedCrawl,
 }
 
 impl std::error::Error for SpiderError {}
@@ -31,12 +32,34 @@ impl SpiderError {
 impl SpiderError {
     fn get_message(&self) -> String {
         match &self.error_type {
-            SpiderErrorType::BrokenLink => format!("Page at \"{:?}\" contains a link pointing to \"{:?}\", but \"{:?}\" is a bad link!", self.source_page, self.target_page, self.target_page),
-            SpiderErrorType::InvalidURL => format!("Page at \"{:?}\" contains a link with no href attribute!", self.source_page),
-            SpiderErrorType::MissingHref => format!("Page at \"{:?}\" contains a link with an invalid URL \"{:?}\"!", self.source_page, self.target_page),
-            SpiderErrorType::EmptyHref => format!("Page at \"{:?}\" contains a link with an empty href attribute!", self.source_page),
-            SpiderErrorType::MissingTitle => format!("Page at \"{:?}\" does not have a title!", self.source_page),
-            SpiderErrorType::Other => format!("Other Error! source_page=\"{:?}\", http_error_code={:?}", self.source_page, self.http_error_code),
+            SpiderErrorType::BrokenLink => format!(
+                "Page at {:?} contains a link pointing to {:?}, but {:?} is a bad link!",
+                self.source_page.as_ref().unwrap(),
+                self.target_page.as_ref().unwrap(),
+                self.target_page.as_ref().unwrap()
+            ),
+            SpiderErrorType::InvalidURL => format!(
+                "Page at {:?} contains a link with an invalid URL {:?}!",
+                self.source_page.as_ref().unwrap(),
+                self.target_page.as_ref().unwrap()
+            ),
+            SpiderErrorType::MissingHref => format!(
+                "Page at {:?} contains a link with no href attribute! Element is: {:?}",
+                self.source_page.as_ref().unwrap(),
+                self.html.as_ref().unwrap()
+            ),
+            SpiderErrorType::EmptyHref => format!(
+                "Page at {:?} contains a link with an empty href attribute! Element is: {:?}",
+                self.source_page.as_ref().unwrap(),
+                self.html.as_ref().unwrap()
+            ),
+            SpiderErrorType::MissingTitle => format!(
+                "Page at {:?} does not have a title!",
+                self.source_page.as_ref().unwrap()
+            ),
+            SpiderErrorType::FailedCrawl => {
+                String::from("Found a problem while crawling the target webpage!")
+            }
         }
     }
 }
diff --git a/src/main.rs b/src/main.rs
index 303b871..cd4b759 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -49,10 +49,6 @@ async fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
     let quiet: bool = matches.get_flag("quiet");
     let verbose: bool = matches.get_flag("verbose");
 
-    if !quiet {
-        println!("Spider Crab");
-    }
-
     let mut spider_crab = SpiderCrab::default();
 
     spider_crab.options.add_host(url_str);
@@ -79,13 +75,18 @@ async fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
         return Ok(());
     } else {
         if !quiet {
-            println!("Something failed!");
+            for page in spider_crab.graph.node_weights() {
+                for error in &page.errors {
+                    println!("{}", error);
+                }
+            }
         }
         let e = Box::new(SpiderError {
-            error_type: spider_crab::error::SpiderErrorType::Other,
-            source_page: Some("Unknown".to_string()),
+            error_type: spider_crab::error::SpiderErrorType::FailedCrawl,
+            source_page: None,
             http_error_code: None,
             target_page: None,
+            html: None,
         }) as Box<dyn std::error::Error>;
         return Err(e);
     }
diff --git a/src/url_helpers.rs b/src/url_helpers.rs
index b76f92d..30cccab 100644
--- a/src/url_helpers.rs
+++ b/src/url_helpers.rs
@@ -17,6 +17,7 @@ pub fn get_url_from_element(element: ElementRef, current_url: &Url) -> Result<Url, SpiderError> {
             source_page: Some(current_url.to_string()),
             target_page: None,
             http_error_code: None,
+            html: Some(element.html()),
         });
     }
 
@@ -30,6 +31,7 @@ pub fn get_url_from_element(element: ElementRef, current_url: &Url) -> Result<Url, SpiderError> {
             source_page: Some(current_url.to_string()),
             target_page: None,
             http_error_code: None,
+            html: Some(element.html()),
         });
     }
 
@@ -42,6 +44,7 @@ pub fn get_url_from_element(element: ElementRef, current_url: &Url) -> Result<Url, SpiderError> {
             source_page: Some(current_url.to_string()),
             target_page: Some(next_url_str.to_string()),
             http_error_code: None,
+            html: Some(element.html()),
         });
     }