Merge pull request #17 from tsengia/add-logging
Add logging
tsengia authored Dec 14, 2023
2 parents bf43f91 + c548f45 commit 3de8a3e
Showing 5 changed files with 59 additions and 60 deletions.
2 changes: 2 additions & 0 deletions Cargo.toml
@@ -12,6 +12,8 @@ futures = "0.3.29"
petgraph = "0.6.4"
reqwest = "0.11.22"
scraper = "0.18.1"
stderrlog = "0.5.4"
log = "0.4.20"
tokio = { version = "1.34.0", features = ["macros"] }
url = "2.4.1"

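For orientation: the two new dependencies split responsibilities. The log crate supplies the level-tagged macros (error!, warn!, info!) that replace the println! calls throughout this diff, and stderrlog is the backend that writes them to stderr. A minimal sketch of how they fit together, assuming stderrlog 0.5 defaults (the messages are illustrative only, not taken from this repository):

use log::{error, info, warn};

fn main() {
    // stderrlog 0.5 takes a numeric verbosity: 0 = error, 1 = warn, 2 = info, 3 = debug, 4 = trace.
    stderrlog::new()
        .module(module_path!())
        .verbosity(2)
        .init()
        .unwrap();

    info!("visited page");           // shown at verbosity >= 2
    warn!("skipping non-HTML page"); // shown at verbosity >= 1
    error!("failed to fetch page");  // shown unless the logger is quieted
}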
47 changes: 25 additions & 22 deletions src/algo.rs
@@ -1,12 +1,14 @@
//! Holds algorithm(s) used to traverse across a website
use async_recursion::async_recursion;
use log::{error, info, warn};
use petgraph::graph::NodeIndex;
use reqwest::{Client, Response};
use scraper::{Element, Html};
use std::sync::Mutex;
use url::Url;

use crate::error::{SpiderError, SpiderErrorType};
use crate::url_helpers::{check_host, get_url_from_element};
use crate::{Link, Page, PageGraph, PageMap, SpiderOptions};

@@ -80,11 +82,15 @@ pub async fn visit_page(

page.checked = true;
if response_result.is_err() {
if options.verbose {
println!("Found bad link! {}", url);
}
page.status_code = response_result.err().unwrap().status();
page.good = Some(false);
page.errors.push(SpiderError {
html: None,
source_page: None,
target_page: Some(url.to_string()),
http_error_code: None,
error_type: SpiderErrorType::UnableToRetrieve,
});
return false;
}

@@ -93,8 +99,14 @@
// Record the HTTP status code
page.status_code = Some(response.status());
if !response.status().is_success() {
println!("Found bad link! {}", url);
page.good = Some(false);
page.errors.push(SpiderError {
html: None,
source_page: None,
target_page: Some(url.to_string()),
http_error_code: Some(response.status().as_u16()),
error_type: SpiderErrorType::HTTPError,
});
return false;
}

@@ -104,22 +116,18 @@

// If Content-Type is not HTML, then don't try to parse the HTML
if !parse_html {
if options.verbose {
println!(
"Not parsing HTML for: {}, Content-Type is {:?}",
url, content_type
);
}
warn!(
"Not parsing HTML for: {}, Content-Type is {:?}",
url, content_type
);
return true;
}

// Check to see if the domain is inside the starting domain.
let parse_html = check_host(&options.hosts, &url);

if !parse_html {
if options.verbose {
println!("Not parsing HTML for: {}, outside of domain", url);
}
info!("Not parsing HTML for: {}, outside of domain", url);
return true;
}
}
@@ -133,9 +141,7 @@ pub async fn visit_page(
let page = graph.node_weight_mut(node_index).unwrap();
if contents.is_err() {
page.good = Some(false);
if options.verbose {
println!("Failed to get contents of page! {}", url);
}
error!("Failed to get contents of page! {}", url);
return false;
}
}
@@ -152,9 +158,7 @@
}
}

if options.verbose {
println!("Visited page {}", url.as_str());
}
info!("Visited page {}", url.as_str());

let links = html.select(options.link_selector.as_ref());

@@ -169,9 +173,8 @@
// Parse out a URL from the link
let next_url = get_url_from_element(l, &url);
if next_url.is_err() {
if options.verbose {
println!("Failed to get URL from element: {}", l.html());
}
error!("Failed to get URL from element: {}", l.html());

found_problem = true;
{
let page = graph.node_weight_mut(node_index).unwrap();
14 changes: 9 additions & 5 deletions src/error.rs
@@ -13,7 +13,8 @@ pub struct SpiderError {
#[derive(Debug)]
pub enum SpiderErrorType {
InvalidURL,
BrokenLink,
HTTPError,
UnableToRetrieve,
MissingHref,
EmptyHref,
MissingTitle,
@@ -32,10 +33,13 @@ impl std::fmt::Display for SpiderError {
impl SpiderError {
fn get_message(&self) -> String {
match &self.error_type {
SpiderErrorType::BrokenLink => format!(
"Page at {:?} contains a link pointing to {:?}, but {:?} is a bad link!",
self.source_page.as_ref().unwrap(),
self.target_page.as_ref().unwrap(),
SpiderErrorType::UnableToRetrieve => format!(
"Failed to retrieve content for page {:?}!",
self.target_page.as_ref().unwrap()
),
SpiderErrorType::HTTPError => format!(
"HTTP GET request received status code {:?} for page {:?}!",
self.http_error_code.as_ref().unwrap(),
self.target_page.as_ref().unwrap()
),
SpiderErrorType::InvalidURL => format!(
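To illustrate the two variants that replace BrokenLink, here is a hedged sketch of building and reporting an error from inside the crate, in the style of src/algo.rs above. The example values are hypothetical; only the fields visible in this diff are set:

use crate::error::{SpiderError, SpiderErrorType};
use log::error;

fn report_example() {
    let err = SpiderError {
        html: None,
        source_page: None,
        target_page: Some(String::from("https://example.com/broken")),
        http_error_code: Some(404),
        error_type: SpiderErrorType::HTTPError,
    };
    // Display delegates to get_message(), so this logs the
    // "HTTP GET request received status code ..." text shown above.
    error!("{}", err);
}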
10 changes: 1 addition & 9 deletions src/lib.rs
@@ -1,4 +1,5 @@
use error::SpiderError;

use petgraph::graph::{DiGraph, NodeIndex};
use reqwest::StatusCode;
use scraper::{selector::CssLocalName, Selector};
@@ -69,10 +70,6 @@ pub struct SpiderOptions {
pub link_selector: Box<Selector>,
/// Scraper CSS Selector for title elements
pub title_selector: Box<Selector>,
/// Flag to enable quiet mode. True if quiet mode enabled.
pub quiet: bool,
/// Flag to enable verbose mode. True if verbose mode enabled.
pub verbose: bool,
/// Name of the CSS class that marks elements to not check URLs for
pub skip_class: CssLocalName,
/// Vector of hosts (domain names and IP addresses) that Spider Crab will traverse
@@ -104,11 +101,6 @@ impl Default for SpiderOptions {
max_depth: -1,
link_selector: Box::new(Selector::parse("a").expect("Invalid title selector!")),
title_selector: Box::new(Selector::parse("title").expect("Invalid title selector!")),
quiet: false,
#[cfg(test)]
verbose: true,
#[cfg(not(test))]
verbose: false,
skip_class: CssLocalName::from("scrab-skip"),
hosts: vec![],
}
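With the quiet and verbose fields gone from SpiderOptions, output control now lives entirely in the logger, and callers only configure crawl behavior. A brief sketch, assuming SpiderCrab is exported from the crate root as used in src/main.rs (the host and depth values are arbitrary examples):

use spider_crab::SpiderCrab;

fn configure() -> SpiderCrab {
    let mut crab = SpiderCrab::default();
    crab.options.add_host("example.com"); // same call main.rs makes with the URL argument
    crab.options.max_depth = 3;
    crab
}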
46 changes: 22 additions & 24 deletions src/main.rs
@@ -1,3 +1,4 @@
use log::{error, info};
use std::fs::File;
use std::io::Write;

@@ -17,7 +18,6 @@ fn save_graph_file(
#[tokio::main(flavor = "current_thread")]
async fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
let matches = Command::new("Spider Crab")
.version("0.0.1")
.about("Checks links and images in a webpage.")
.author("Tyler Sengia")
.arg(
@@ -38,15 +38,13 @@ async fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
.arg(
Arg::new("quiet")
.short('q')
.long("quiet")
.action(ArgAction::SetTrue)
.help("Do not print to STDOUT or STDERR."),
.help("Silence logging output."),
)
.arg(
Arg::new("verbose")
Arg::new("verbosity")
.short('v')
.long("verbose")
.action(ArgAction::SetTrue)
.action(ArgAction::Count)
.help("Print more log messages."),
)
.arg(
@@ -65,16 +63,21 @@ async fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {

let depth: i32 = *matches.get_one::<i32>("depth").expect("Invalid depth!");

let quiet: bool = matches.get_flag("quiet");
let verbose: bool = matches.get_flag("verbose");
let verbose = matches.get_count("verbosity");

let dot_output_file = matches.get_one::<String>("dot");

stderrlog::new()
.module(module_path!())
.quiet(matches.get_flag("quiet"))
.verbosity(verbose as usize)
.init()
.unwrap();

let mut spider_crab = SpiderCrab::default();
spider_crab.options.add_host(url_str);

spider_crab.options.max_depth = depth;
spider_crab.options.verbose = verbose;

const EXPECTED_PAGES: usize = 50;
spider_crab.graph.reserve_edges(200);
@@ -83,16 +86,12 @@ async fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {

let result = spider_crab.visit_website(url_str).await;

if !quiet {
println!("Discovered {} pages", spider_crab.graph.node_count());
println!("Visited {} pages", spider_crab.map.len());
println!("Discovered {} links", spider_crab.graph.edge_count());
}
info!("Discovered {} pages", spider_crab.graph.node_count());
info!("Visited {} pages", spider_crab.map.len());
info!("Discovered {} links", spider_crab.graph.edge_count());

if result {
if !quiet {
println!("All links good!");
}
info!("All links good!");
if dot_output_file.is_some() {
let save_result = save_graph_file(&spider_crab, dot_output_file.unwrap());
if save_result.is_err() {
@@ -101,13 +100,12 @@ async fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
}
return Ok(());
} else {
if !quiet {
for page in spider_crab.graph.node_weights() {
for error in &page.errors {
println!("{}", error);
}
for page in spider_crab.graph.node_weights() {
for error in &page.errors {
error!("{}", error);
}
}

let e = Box::new(SpiderError {
error_type: spider_crab::error::SpiderErrorType::FailedCrawl,
source_page: None,
@@ -118,11 +116,11 @@ async fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
if dot_output_file.is_some() {
let save_result = save_graph_file(&spider_crab, dot_output_file.unwrap());
if save_result.is_err() {
eprintln!(
error!(
"Save to Dot output file {} failed!",
dot_output_file.unwrap()
);
eprintln!("Error: {:?}", save_result.err().unwrap());
error!("Error: {:?}", save_result.err().unwrap());
}
}
return Err(e);
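As a usage note, the new verbosity flag counts repeated -v occurrences (ArgAction::Count) and feeds that count straight into stderrlog, while -q silences output entirely. A condensed sketch of that wiring, with the program name and argument set reduced to placeholders:

use clap::{Arg, ArgAction, Command};

fn main() {
    let matches = Command::new("example")
        .arg(Arg::new("quiet").short('q').action(ArgAction::SetTrue))
        .arg(Arg::new("verbosity").short('v').action(ArgAction::Count))
        .get_matches();

    // No flag = errors only; -v adds warnings, -vv adds info, -vvv adds debug.
    stderrlog::new()
        .module(module_path!())
        .quiet(matches.get_flag("quiet"))
        .verbosity(matches.get_count("verbosity") as usize)
        .init()
        .unwrap();
}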
