From 3b27b6aa86c5cf0a696264f64e211258fc9a4548 Mon Sep 17 00:00:00 2001 From: Tyler Sengia Date: Wed, 13 Dec 2023 20:13:11 -0500 Subject: [PATCH 1/7] Add log and stderrlog to Cargo.toml --- Cargo.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 601fbce..fc9492d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,8 @@ futures = "0.3.29" petgraph = "0.6.4" reqwest = "0.11.22" scraper = "0.18.1" +stderrlog = "0.5.4" +log = "0.4.20" tokio = { version = "1.34.0", features = ["macros"] } url = "2.4.1" From 70a256f9fa566e9a5f130a83accc364d2e006214 Mon Sep 17 00:00:00 2001 From: Tyler Sengia Date: Wed, 13 Dec 2023 20:13:41 -0500 Subject: [PATCH 2/7] Switch to using standard logging interface --- src/algo.rs | 32 +++++++++++--------------------- src/lib.rs | 10 +--------- src/main.rs | 49 ++++++++++++++++++++++++++----------------------- 3 files changed, 38 insertions(+), 53 deletions(-) diff --git a/src/algo.rs b/src/algo.rs index da0b25d..1dd6528 100644 --- a/src/algo.rs +++ b/src/algo.rs @@ -1,6 +1,7 @@ //! Holds algorithm(s) used to traverse across a website use async_recursion::async_recursion; +use log::{error, info, warn}; use petgraph::graph::NodeIndex; use reqwest::{Client, Response}; use scraper::{Element, Html}; @@ -80,9 +81,7 @@ pub async fn visit_page( page.checked = true; if response_result.is_err() { - if options.verbose { - println!("Found bad link! {}", url); - } + error!("Found bad link! {}", url); page.status_code = response_result.err().unwrap().status(); page.good = Some(false); return false; @@ -99,12 +98,10 @@ pub async fn visit_page( // If Content-Type is not HTML, then don't try to parse the HTML if !parse_html { - if options.verbose { - println!( - "Not parsing HTML for: {}, Content-Type is {:?}", - url, content_type - ); - } + warn!( + "Not parsing HTML for: {}, Content-Type is {:?}", + url, content_type + ); return true; } @@ -112,9 +109,7 @@ pub async fn visit_page( let parse_html = check_host(&options.hosts, &url); if !parse_html { - if options.verbose { - println!("Not parsing HTML for: {}, outside of domain", url); - } + info!("Not parsing HTML for: {}, outside of domain", url); return true; } } @@ -128,9 +123,7 @@ pub async fn visit_page( let page = graph.node_weight_mut(node_index).unwrap(); if contents.is_err() { page.good = Some(false); - if options.verbose { - println!("Failed to get contents of page! {}", url); - } + error!("Failed to get contents of page! {}", url); return false; } } @@ -147,9 +140,7 @@ pub async fn visit_page( } } - if options.verbose { - println!("Visited page {}", url.as_str()); - } + info!("Visited page {}", url.as_str()); let links = html.select(options.link_selector.as_ref()); @@ -164,9 +155,8 @@ pub async fn visit_page( // Parse out a URL from the link let next_url = get_url_from_element(l, &url); if next_url.is_err() { - if options.verbose { - println!("Failed to get URL from element: {}", l.html()); - } + error!("Failed to get URL from element: {}", l.html()); + found_problem = true; { let page = graph.node_weight_mut(node_index).unwrap(); diff --git a/src/lib.rs b/src/lib.rs index 144a61b..231e4bc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,5 @@ use error::SpiderError; +use log::{error, info, warn}; use petgraph::graph::{DiGraph, NodeIndex}; use reqwest::StatusCode; use scraper::{selector::CssLocalName, Selector}; @@ -69,10 +70,6 @@ pub struct SpiderOptions { pub link_selector: Box, /// Scraper CSS Selector for title elements pub title_selector: Box, - /// Flag to enable quiet mode. 
True if quiet mode enabled. - pub quiet: bool, - /// Flag to enable verbose mode. True if verbose mode enabled. - pub verbose: bool, /// Name of the CSS class that marks elements to not check URLs for pub skip_class: CssLocalName, /// Vector of hosts (domain names and IP addresses) that Spider Crab will traverse @@ -104,11 +101,6 @@ impl Default for SpiderOptions { max_depth: -1, link_selector: Box::new(Selector::parse("a").expect("Invalid title selector!")), title_selector: Box::new(Selector::parse("title").expect("Invalid title selector!")), - quiet: false, - #[cfg(test)] - verbose: true, - #[cfg(not(test))] - verbose: false, skip_class: CssLocalName::from("scrab-skip"), hosts: vec![], } diff --git a/src/main.rs b/src/main.rs index 858c75d..1ee1af3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,5 +1,7 @@ +use log::{error, info, warn}; use std::fs::File; use std::io::Write; +use stderrlog; use clap::{Arg, ArgAction, Command}; use spider_crab::error::SpiderError; @@ -17,7 +19,6 @@ fn save_graph_file( #[tokio::main(flavor = "current_thread")] async fn main() -> std::result::Result<(), Box> { let matches = Command::new("Spider Crab") - .version("0.0.1") .about("Checks links and images in a webpage.") .author("Tyler Sengia") .arg( @@ -38,12 +39,11 @@ async fn main() -> std::result::Result<(), Box> { .arg( Arg::new("quiet") .short('q') - .long("quiet") - .action(ArgAction::SetTrue) - .help("Do not print to STDOUT or STDERR."), + .action(ArgAction::Append) + .help("Increase message verbosity"), ) .arg( - Arg::new("verbose") + Arg::new("verbosity") .short('v') .long("verbose") .action(ArgAction::SetTrue) @@ -65,16 +65,24 @@ async fn main() -> std::result::Result<(), Box> { let depth: i32 = *matches.get_one::("depth").expect("Invalid depth!"); - let quiet: bool = matches.get_flag("quiet"); - let verbose: bool = matches.get_flag("verbose"); + let verbose: usize = matches + .get_occurrences::("verbosity") + .unwrap() + .count(); let dot_output_file = matches.get_one::("dot"); + stderrlog::new() + .module(module_path!()) + .quiet(matches.get_flag("quiet")) + .verbosity(verbose) + .init() + .unwrap(); + let mut spider_crab = SpiderCrab::default(); spider_crab.options.add_host(url_str); spider_crab.options.max_depth = depth; - spider_crab.options.verbose = verbose; const EXPECTED_PAGES: usize = 50; spider_crab.graph.reserve_edges(200); @@ -83,16 +91,12 @@ async fn main() -> std::result::Result<(), Box> { let result = spider_crab.visit_website(url_str).await; - if !quiet { - println!("Discovered {} pages", spider_crab.graph.node_count()); - println!("Visited {} pages", spider_crab.map.len()); - println!("Discovered {} links", spider_crab.graph.edge_count()); - } + info!("Discovered {} pages", spider_crab.graph.node_count()); + info!("Visited {} pages", spider_crab.map.len()); + info!("Discovered {} links", spider_crab.graph.edge_count()); if result { - if !quiet { - println!("All links good!"); - } + info!("All links good!"); if dot_output_file.is_some() { let save_result = save_graph_file(&spider_crab, dot_output_file.unwrap()); if save_result.is_err() { @@ -101,13 +105,12 @@ async fn main() -> std::result::Result<(), Box> { } return Ok(()); } else { - if !quiet { - for page in spider_crab.graph.node_weights() { - for error in &page.errors { - println!("{}", error); - } + for page in spider_crab.graph.node_weights() { + for error in &page.errors { + error!("{}", error); } } + let e = Box::new(SpiderError { error_type: spider_crab::error::SpiderErrorType::FailedCrawl, source_page: None, @@ -118,11 
+121,11 @@ async fn main() -> std::result::Result<(), Box> { if dot_output_file.is_some() { let save_result = save_graph_file(&spider_crab, dot_output_file.unwrap()); if save_result.is_err() { - eprintln!( + error!( "Save to Dot output file {} failed!", dot_output_file.unwrap() ); - eprintln!("Error: {:?}", save_result.err().unwrap()); + error!("Error: {:?}", save_result.err().unwrap()); } } return Err(e); From 31d3906b0b87652adee6555c7f1e0dee8efd2478 Mon Sep 17 00:00:00 2001 From: Tyler Sengia Date: Wed, 13 Dec 2023 20:14:13 -0500 Subject: [PATCH 3/7] Linter fixes --- src/lib.rs | 2 +- src/main.rs | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 231e4bc..cc1a007 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,5 @@ use error::SpiderError; -use log::{error, info, warn}; + use petgraph::graph::{DiGraph, NodeIndex}; use reqwest::StatusCode; use scraper::{selector::CssLocalName, Selector}; diff --git a/src/main.rs b/src/main.rs index 1ee1af3..f42e581 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,7 +1,6 @@ -use log::{error, info, warn}; +use log::{error, info}; use std::fs::File; use std::io::Write; -use stderrlog; use clap::{Arg, ArgAction, Command}; use spider_crab::error::SpiderError; From 620d748d327d979504cb5651e251c39700056eee Mon Sep 17 00:00:00 2001 From: Tyler Sengia Date: Wed, 13 Dec 2023 20:22:58 -0500 Subject: [PATCH 4/7] Fix argument parsing --- src/main.rs | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/main.rs b/src/main.rs index f42e581..65f459a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -38,14 +38,13 @@ async fn main() -> std::result::Result<(), Box> { .arg( Arg::new("quiet") .short('q') - .action(ArgAction::Append) - .help("Increase message verbosity"), + .action(ArgAction::SetTrue) + .help("Silence logging output."), ) .arg( Arg::new("verbosity") .short('v') - .long("verbose") - .action(ArgAction::SetTrue) + .action(ArgAction::Count) .help("Print more log messages."), ) .arg( @@ -64,17 +63,14 @@ async fn main() -> std::result::Result<(), Box> { let depth: i32 = *matches.get_one::("depth").expect("Invalid depth!"); - let verbose: usize = matches - .get_occurrences::("verbosity") - .unwrap() - .count(); + let verbose = matches.get_count("verbosity"); let dot_output_file = matches.get_one::("dot"); stderrlog::new() .module(module_path!()) .quiet(matches.get_flag("quiet")) - .verbosity(verbose) + .verbosity(verbose as usize) .init() .unwrap(); From b7429bd7c75560113509bf8f9aa9298ec89570a4 Mon Sep 17 00:00:00 2001 From: Tyler Sengia Date: Wed, 13 Dec 2023 20:29:16 -0500 Subject: [PATCH 5/7] Add error cases --- src/error.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/error.rs b/src/error.rs index 2008180..f51021f 100644 --- a/src/error.rs +++ b/src/error.rs @@ -13,7 +13,8 @@ pub struct SpiderError { #[derive(Debug)] pub enum SpiderErrorType { InvalidURL, - BrokenLink, + MissingPage, + UnableToRetrieve, MissingHref, EmptyHref, MissingTitle, @@ -32,10 +33,12 @@ impl std::fmt::Display for SpiderError { impl SpiderError { fn get_message(&self) -> String { match &self.error_type { - SpiderErrorType::BrokenLink => format!( - "Page at {:?} contains a link pointing to {:?}, but {:?} is a bad link!", - self.source_page.as_ref().unwrap(), - self.target_page.as_ref().unwrap(), + SpiderErrorType::MissingPage => format!( + "Page at {:?} does not exist!", + self.target_page.as_ref().unwrap() + ), + SpiderErrorType::UnableToRetrieve => format!( + "Failed to 
retrieve content for page {:?}!", self.target_page.as_ref().unwrap() ), SpiderErrorType::InvalidURL => format!( From 7bcfb44362b883a85f1757ce1684b4f08833c005 Mon Sep 17 00:00:00 2001 From: Tyler Sengia Date: Wed, 13 Dec 2023 20:36:24 -0500 Subject: [PATCH 6/7] Add better error cases --- src/error.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/error.rs b/src/error.rs index f51021f..2b58c8e 100644 --- a/src/error.rs +++ b/src/error.rs @@ -13,7 +13,7 @@ pub struct SpiderError { #[derive(Debug)] pub enum SpiderErrorType { InvalidURL, - MissingPage, + HTTPError, UnableToRetrieve, MissingHref, EmptyHref, @@ -33,14 +33,15 @@ impl std::fmt::Display for SpiderError { impl SpiderError { fn get_message(&self) -> String { match &self.error_type { - SpiderErrorType::MissingPage => format!( - "Page at {:?} does not exist!", - self.target_page.as_ref().unwrap() - ), SpiderErrorType::UnableToRetrieve => format!( "Failed to retrieve content for page {:?}!", self.target_page.as_ref().unwrap() ), + SpiderErrorType::HTTPError => format!( + "HTTP GET request received status code {:?} for page {:?}!", + self.http_error_code.as_ref().unwrap(), + self.target_page.as_ref().unwrap() + ), SpiderErrorType::InvalidURL => format!( "Page at {:?} contains a link with an invalid URL {:?}!", self.source_page.as_ref().unwrap(), From c548f456b7d9dc4be477c7867e4f043573729d5a Mon Sep 17 00:00:00 2001 From: Tyler Sengia Date: Wed, 13 Dec 2023 20:36:43 -0500 Subject: [PATCH 7/7] Push errors to page graph instead of relying on logging output --- src/algo.rs | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/algo.rs b/src/algo.rs index a1d2554..1d5f00e 100644 --- a/src/algo.rs +++ b/src/algo.rs @@ -8,6 +8,7 @@ use scraper::{Element, Html}; use std::sync::Mutex; use url::Url; +use crate::error::{SpiderError, SpiderErrorType}; use crate::url_helpers::{check_host, get_url_from_element}; use crate::{Link, Page, PageGraph, PageMap, SpiderOptions}; @@ -81,9 +82,15 @@ pub async fn visit_page( page.checked = true; if response_result.is_err() { - error!("Found bad link! {}", url); page.status_code = response_result.err().unwrap().status(); page.good = Some(false); + page.errors.push(SpiderError { + html: None, + source_page: None, + target_page: Some(url.to_string()), + http_error_code: None, + error_type: SpiderErrorType::UnableToRetrieve, + }); return false; } @@ -92,8 +99,14 @@ pub async fn visit_page( // Record the HTTP status code page.status_code = Some(response.status()); if !response.status().is_success() { - println!("Found bad link! {}", url); page.good = Some(false); + page.errors.push(SpiderError { + html: None, + source_page: None, + target_page: Some(url.to_string()), + http_error_code: Some(response.status().as_u16()), + error_type: SpiderErrorType::HTTPError, + }); return false; }
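
A note on the logging setup introduced by this series (not part of the patches themselves): after patch 4, the -q flag and repeated -v flags feed straight into stderrlog, where verbosity 0 shows only error!, 1 adds warn!, 2 adds info!, 3 adds debug!, and 4 adds trace!. The sketch below is a minimal, standalone illustration of that mapping, assuming clap 4.x plus the log and stderrlog versions added to Cargo.toml in patch 1; the command name "verbosity-demo" and the message strings are made up for the example and do not appear in the patches.

use clap::{Arg, ArgAction, Command};
use log::{error, info, warn};

fn main() {
    // Hypothetical command name, for illustration only.
    let matches = Command::new("verbosity-demo")
        .arg(
            Arg::new("quiet")
                .short('q')
                .action(ArgAction::SetTrue)
                .help("Silence logging output."),
        )
        .arg(
            Arg::new("verbosity")
                .short('v')
                .action(ArgAction::Count)
                .help("Print more log messages."),
        )
        .get_matches();

    // stderrlog verbosity levels: 0 = error, 1 = warn, 2 = info, 3 = debug, 4 = trace.
    // quiet(true) suppresses all output regardless of the verbosity level.
    stderrlog::new()
        .module(module_path!())
        .quiet(matches.get_flag("quiet"))
        .verbosity(matches.get_count("verbosity") as usize)
        .init()
        .unwrap();

    error!("shown unless -q is passed");
    warn!("shown with -v or higher");
    info!("shown with -vv or higher");
}

Run with no flags, this prints only the error line to stderr; with -vv it also prints the warn and info lines. The same mapping applies to spider_crab's main(): the "Discovered ... pages" / "Visited ... pages" summary now goes through info!, so with the default verbosity of 0 it is only printed when at least -vv is supplied.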