Skip to content

Commit

Permalink
parser: fancyindex compatibility fix, and add timezone fmt support in…
Browse files Browse the repository at this point in the history
… fancyindex
  • Loading branch information
taoky committed Aug 24, 2024
1 parent 17cf5f8 commit c77ba0e
Show file tree
Hide file tree
Showing 12 changed files with 153 additions and 21 deletions.
21 changes: 21 additions & 0 deletions fixtures/misc/1/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<html>
<head>
<title>Misc test index 1</title>
</head>
<body>
<h2>/etc/</h2>
<table>
<tr>
<th>Filename</th>
<th>Size</th>
<th>Last Modified</th>
<th>SHA256</th>
</tr>
<tr>
<td><a href="passwd">passwd</a></td>
<td>3.3 KB</td>
<td>2024-08-24 15:04:11 +0000</td>
<td>477a3d43f692aeaf1c7f40c0c91bffde3e2e638d8e90c668422373ee82a18521</td>
</tr></table>
</body>
</html>
1 change: 1 addition & 0 deletions src/cli/sync.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ fn extension_push_task(worker: &Worker<Task>, wake: &AtomicUsize, package: &Exte
// size and mtime would be ignored as skip_check is set
size: None,
mtime: NaiveDateTime::default(),
timezone: None,
skip_check: true,
}),
relative: package.relative.clone(),
Expand Down
10 changes: 8 additions & 2 deletions src/compare.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,15 @@ pub fn should_download_by_list(
}
}
.into();
let remote_mtime = naive_to_utc(&remote.mtime, remote_timezone);
// Use remote timezone or not?
let timezone = match remote.timezone {
None => remote_timezone,
Some(tz) => Some(tz),
};
let remote_mtime = naive_to_utc(&remote.mtime, timezone);
let offset = remote_mtime - local_mtime;
debug!("DateTime offset: {:?} {:?}", path, offset);
match remote_timezone {
match timezone {
None => {
// allow an offset to up to 24hrs
offset.num_hours().abs() > 24
Expand Down Expand Up @@ -107,6 +112,7 @@ pub fn should_download_by_head(path: &Path, resp: &reqwest::Response, size_only:
.expect("No content-length from upstream"),
)),
mtime: utils::get_response_mtime(resp).unwrap().naive_utc(),
timezone: None,
skip_check: false,
};
should_download_by_list(path, &item, FixedOffset::east_opt(0), false, size_only)
Expand Down
12 changes: 10 additions & 2 deletions src/listing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,8 @@ pub struct ListItem {
pub size: Option<FileSize>,
/// mtime is parsed from HTML, which is the local datetime of the "server" (not necessarily localtime or UTC)
pub mtime: NaiveDateTime,
/// Some HTML provides "timezone", parser shall set this if so (otherwise just None)
pub timezone: Option<FixedOffset>,
/// Don't check size and mtime: download only if the file doesn't exist.
/// This is expected to be set by apt/yum parser extension (parser will not use this).
pub skip_check: bool,
Expand All @@ -142,13 +144,15 @@ impl ListItem {
type_: FileType,
size: Option<FileSize>,
mtime: NaiveDateTime,
timezone: Option<FixedOffset>,
) -> Self {
Self {
url,
name,
type_,
size,
mtime,
timezone,
skip_check: false,
}
}
Expand All @@ -161,10 +165,14 @@ impl Display for ListItem {
None => String::from("(none)"),
};
let mtime_str = self.mtime.format("%Y-%m-%d %H:%M:%S").to_string();
let timezone = match self.timezone {
None => "",
Some(tz) => &format!("({})", tz),
};
write!(
f,
"{} {:?} {} {} {}",
self.url, self.type_, size_str, mtime_str, self.name
"{} {:?} {} {}{} {}",
self.url, self.type_, size_str, mtime_str, timezone, self.name
)
}
}
Expand Down
1 change: 1 addition & 0 deletions src/parser/apache_f2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ impl Parser for ApacheF2ListingParser {
}
},
date,
None,
))
}

Expand Down
2 changes: 1 addition & 1 deletion src/parser/caddy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ impl Parser for CaddyListingParser {
// Store UTC time
let date = NaiveDateTime::parse_from_str(mtime, "%Y-%m-%dT%H:%M:%S%Z")?;

items.push(ListItem::new(href, name, type_, size, date))
items.push(ListItem::new(href, name, type_, size, date, None))
}

Ok(ListResult::List(items))
Expand Down
1 change: 1 addition & 0 deletions src/parser/directory_lister.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ impl Parser for DirectoryListerListingParser {
}
},
date,
None,
))
}

Expand Down
9 changes: 8 additions & 1 deletion src/parser/docker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,14 @@ impl Parser for DockerListingParser {
href.set_path(&format!("{}/", href.path()));
}

items.push(ListItem::new(href, name.to_string(), type_, size, date))
items.push(ListItem::new(
href,
name.to_string(),
type_,
size,
date,
None,
))
}
Ok(ListResult::List(items))
}
Expand Down
78 changes: 68 additions & 10 deletions src/parser/fancyindex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ use crate::{
};

use super::*;
use anyhow::Result;
use chrono::NaiveDateTime;
use anyhow::{anyhow, Result};
use chrono::{DateTime, NaiveDateTime};
use scraper::{Html, Selector};

#[derive(Debug, Clone, Default)]
Expand All @@ -27,11 +27,27 @@ impl Parser for FancyIndexListingParser {
let selector = Selector::parse("tbody tr").unwrap();
let mut items = Vec::new();
for element in document.select(&selector) {
let link_selector = Selector::parse("td.link a").unwrap();
let size_selector = Selector::parse("td.size").unwrap();
let date_selector = Selector::parse("td.date").unwrap();
// let link_selector = Selector::parse("td.link a").unwrap();
// let size_selector = Selector::parse("td.size").unwrap();
// let date_selector = Selector::parse("td.date").unwrap();

let a = element.select(&link_selector).next().unwrap();
// Select <td> in order, instead of using class name, to improve compatibility for strange pages
let td_selector = Selector::parse("td").unwrap();
let mut td_iterator = element.select(&td_selector);

let td_a = match td_iterator.next() {
Some(tda) => tda,
None => {
warn!("Cannot find <td> in this <tr> (header maybe?), skipping...");
continue;
}
};
let a = match td_a.select(&Selector::parse("a").unwrap()).next() {
Some(a) => a,
None => {
return Err(anyhow!("Cannot find <a> in first cell."));
}
};
let href = a.value().attr("href").unwrap();
let displayed_filename = a.inner_html();

Expand All @@ -46,14 +62,23 @@ impl Parser for FancyIndexListingParser {
} else {
FileType::File
};
let size = element.select(&size_selector).next().unwrap().inner_html();
let size = td_iterator.next().unwrap().inner_html();
let size = size.trim();
let date = element.select(&date_selector).next().unwrap().inner_html();
let date = td_iterator.next().unwrap().inner_html();
let date = date.trim();

// decide (guess) which time format to use
let (date_fmt, _) = guess_date_fmt(date);
let date = NaiveDateTime::parse_from_str(date, &date_fmt)?;
let naive_date;
let timezone;
if !date_fmt_has_timezone(&date_fmt) {
naive_date = NaiveDateTime::parse_from_str(date, &date_fmt)?;
timezone = None;
} else {
let date = DateTime::parse_from_str(date, &date_fmt)?;
naive_date = date.naive_utc();
timezone = Some(date.offset().to_owned());
}

items.push(ListItem::new(
href,
Expand All @@ -67,7 +92,8 @@ impl Parser for FancyIndexListingParser {
Some(FileSize::HumanizedBinary(n_size, unit))
}
},
date,
naive_date,
timezone,
));
}

Expand All @@ -77,6 +103,8 @@ impl Parser for FancyIndexListingParser {

#[cfg(test)]
mod tests {
use chrono::FixedOffset;

use super::*;
use crate::listing::SizeUnit;
use crate::parser::tests::*;
Expand Down Expand Up @@ -148,4 +176,34 @@ mod tests {
_ => unreachable!(),
}
}

#[test]
fn test_misc_1() {
    // The fixture served here is NOT a genuine fancyindex page; its table merely
    // follows the same layout, which is enough for this parser to handle it.
    let context = init_async_context();
    let url = Url::parse("http://localhost:1921/misc/1/").unwrap();
    let result = FancyIndexListingParser.get_list(&context, &url).unwrap();

    // Anything other than a plain listing is unexpected for this fixture.
    let ListResult::List(items) = result else {
        unreachable!()
    };

    assert_eq!(items.len(), 1);
    let entry = &items[0];
    assert_eq!(entry.name, "passwd");
    assert_eq!(entry.type_, FileType::File);
    assert_eq!(
        entry.size,
        Some(FileSize::HumanizedBinary(3.3, SizeUnit::K))
    );

    let expected_mtime =
        NaiveDateTime::parse_from_str("2024-08-24 15:04:11", "%Y-%m-%d %H:%M:%S").unwrap();
    assert_eq!(entry.mtime, expected_mtime);
    // The fixture's date cell ends in "+0000", so a zero offset must be reported.
    assert_eq!(entry.timezone, FixedOffset::east_opt(0));
}
}
2 changes: 1 addition & 1 deletion src/parser/lighttpd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ impl Parser for LighttpdListingParser {
};

// debug!("{} {} {} {:?} {:?}", href, name, mtime, size, type_);
items.push(ListItem::new(href, name, type_, size, mtime))
items.push(ListItem::new(href, name, type_, size, mtime, None))
}

Ok(ListResult::List(items))
Expand Down
36 changes: 32 additions & 4 deletions src/parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,15 +82,27 @@ fn contains_two_colons(s: &str) -> bool {
s.matches(':').count() >= 2
}

fn has_numeric_prefix(s: &str) -> bool {
/// Returns `true` when `s` starts with four ASCII digits (e.g. a leading `YYYY` year).
///
/// Strings shorter than four bytes never match.
fn has_four_numeric_prefix(s: &str) -> bool {
    // ASCII digits are single-byte in UTF-8, so inspecting the first four bytes
    // is equivalent to inspecting the first four characters.
    match s.as_bytes().get(..4) {
        Some(head) => head.iter().all(|b| b.is_ascii_digit()),
        None => false,
    }
}

/// Returns `true` when `s` ends with a numeric offset of the form `+HHMM` or `-HHMM`,
/// i.e. a sign character followed by exactly four ASCII digits.
///
/// The check works on bytes rather than on a collected `Vec<char>`. The sign and
/// the digits are all single-byte ASCII, so the result is the same for every input
/// the character-based check could handle. Inputs with multi-byte characters whose
/// byte length is at least 5 but whose character count is smaller (which previously
/// caused an index underflow panic) now simply return `false`.
fn has_timezone_suffix(s: &str) -> bool {
    let bytes = s.as_bytes();
    if bytes.len() < 5 {
        return false;
    }
    // Split the trailing five bytes into the sign byte and the four digit bytes.
    let (sign, digits) = bytes[bytes.len() - 5..].split_at(1);
    matches!(sign, [b'+' | b'-']) && digits.iter().all(u8::is_ascii_digit)
}

// Returns format and regex string
fn guess_date_fmt(date: &str) -> (String, String) {
let two_colons = contains_two_colons(date);
let abbr_month = contains_abbreviated_month(date);
let year_first = has_numeric_prefix(date);
let year_first = has_four_numeric_prefix(date);
let has_timezone = has_timezone_suffix(date);
let (dfmt, dfmt_regex) = match (abbr_month, year_first) {
(true, true) => ("%Y-%b-%d", r"\d{4}-\w{3}-\d{2}"),
(true, false) => ("%d-%b-%Y", r"\d{2}-\w{3}-\d{4}"),
Expand All @@ -102,12 +114,21 @@ fn guess_date_fmt(date: &str) -> (String, String) {
} else {
("%H:%M", r"\d{2}:\d{2}")
};
let (zfmt, zfmt_regex) = if has_timezone {
(" %z", r" [+-]\d{4}")
} else {
("", "")
};
(
format!("{} {}", dfmt, tfmt),
format!("{} {}", dfmt_regex, tfmt_regex),
format!("{} {}{}", dfmt, tfmt, zfmt),
format!("{} {}{}", dfmt_regex, tfmt_regex, zfmt_regex),
)
}

/// Reports whether the given date format string contains the `%z` specifier,
/// which is the marker used here for formats that carry a numeric `+HHMM`/`-HHMM` offset.
fn date_fmt_has_timezone(datefmt: &str) -> bool {
    const OFFSET_SPECIFIER: &str = "%z";
    datefmt.contains(OFFSET_SPECIFIER)
}

#[cfg(test)]
mod tests {
use super::*;
Expand All @@ -130,5 +151,12 @@ mod tests {
r"\d{4}-\w{3}-\d{2} \d{2}:\d{2}".to_owned()
)
);
assert_eq!(
guess_date_fmt("2023-11-27 14:22:08 +0000"),
(
"%Y-%m-%d %H:%M:%S %z".to_owned(),
r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} [+-]\d{4}".to_owned()
)
);
}
}
1 change: 1 addition & 0 deletions src/parser/nginx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ impl Parser for NginxListingParser {
}
},
date,
None,
))
}
Ok(ListResult::List(items))
Expand Down

0 comments on commit c77ba0e

Please sign in to comment.