-
Notifications
You must be signed in to change notification settings - Fork 3
/
logfetch.pl
executable file
·87 lines (71 loc) · 2.6 KB
/
logfetch.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env perl
# fetch IRC logs for safekeeping
use strict;
use warnings;
# based on http://search.cpan.org/perldoc?HTML::Tree::Scanning
use HTML::TreeBuilder;
use LWP::Simple;
use Readonly;
use Carp;
use English '-no_match_vars';
#use Smart::Comments;
# http://irclog.perlgeek.de/text.pl?channel=crimsonfu;date=2012-02-08
Readonly my $BASE_URL => 'http://irclog.perlgeek.de';
Readonly my $CHANNEL_NAME => 'crimsonfu';
Readonly my $INDEX_URL => "$BASE_URL/$CHANNEL_NAME";
Readonly my $LOG_DIR => 'irclogs';
Readonly my $SLEEP_INTERVAL => .5;
Readonly my $EXPECTED_IN_CWD => '_config.yml';
#my $file = '/tmp/index.html';
#my $index_tree = HTML::TreeBuilder->new->parse_file($file);
croak "Please cd to root of repo. Couldn't find $EXPECTED_IN_CWD: $OS_ERROR"
unless -f $EXPECTED_IN_CWD;
# NB: not Y3K compliant
my @local_logs = glob("$LOG_DIR/2*");
s{^$LOG_DIR/}{}g foreach @local_logs;
s{\.html$}{}g foreach @local_logs;
my $newish_local_log = $local_logs[-3];
my $index_content = get $INDEX_URL
or croak "Couldn't download index page: $OS_ERROR";
my $index_tree = HTML::TreeBuilder->new->parse($index_content)->eof;
my @dates_with_activity = get_dates_with_activity();
## @dates_with_activity;
my $index_page = build_index_page(@dates_with_activity);
## $index_page;
mkdir $LOG_DIR unless -d $LOG_DIR;
open my $fh, '>', "$LOG_DIR/index.md" or croak $OS_ERROR;
print {$fh} $index_page;
close $fh or croak $OS_ERROR;
for my $date (@dates_with_activity) {
# skip dates for logs we already have
next if $date lt $newish_local_log;
#print "$date is greater than $newish_local_log\n";
#print $date, "\n";
my $day_url = "$BASE_URL/$CHANNEL_NAME/$date/text";
my $chat_content = get $day_url
or croak "Couldn't download chat content for from $day_url: $OS_ERROR";
#print $chat_content, "\n";
open my $fh, '>:encoding(UTF-8)', "$LOG_DIR/$date.html" or croak $OS_ERROR;
print {$fh} $chat_content;
close $fh or croak $OS_ERROR;
sleep $SLEEP_INTERVAL;
}
sub build_index_page {
my @dates = @_;
my $page = "---\nlayout: default\ntitle: IRC logs\n---\n";
for my $date (@dates) {
$page .= "* [$date]($date.html)\n";
}
return $page;
}
sub get_dates_with_activity {
my @dates;
for my $date_with_activity ( $index_tree->look_down( class => 'calendars' )->look_down( _tag => 'a' ) ) {
my $href = $date_with_activity->attr('href');
my ($date_only) = $href =~ m{^/$CHANNEL_NAME/(\d\d\d\d-\d\d-\d\d)$}sm;
push @dates, $date_only;
}
my @dates_sorted = sort @dates;
return @dates_sorted;
}
$index_tree->delete();