-
Notifications
You must be signed in to change notification settings - Fork 1
/
generate_planet
executable file
·206 lines (191 loc) · 6.15 KB
/
generate_planet
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
#!/usr/bin/env perl
use strict;
use warnings;
use autodie;
use feature qw(say);
use Text::CSV;
use XML::Feed;
use DateTime;
use Regexp::Assemble;
use File::Slurp qw(write_file);
use Fcntl qw(:flock);
use Encode qw(encode_utf8);
use IO::Handle ();
# Disable output buffering on both standard streams.
STDOUT->autoflush(1);
STDERR->autoflush(1);

#
# Configuration
#

# Directory the script changes into before doing any work; the CSV file,
# the feed files and the generated planet file are all relative to it.
my $work_dir = 'work';

# Contributor list and the names of the columns read from it.
my $csv_file            = 'feeds.csv';
my $csv_site_url_column = 'site_url';
my $csv_feed_url_column = 'feed_url';
my $csv_author_column   = 'author';
my $csv_id_column       = 'md5';

# Metadata and file name of the generated aggregate feed.
my $planet_title  = 'Blog posts related to network booting';
my $planet_author = 'NetworkBoot.org';
my $planet_link   = 'http://networkboot.org/planet/';
my $planet_file   = 'planet.xml';

# An entry is only kept if its text mentions at least one of these.
my @keywords = (
    'networkboot.org',
    'network boot',
    'network booting',
    qw( PXE netboot ),
    qw( iPXE gPXE Etherboot ),
    qw( DHCP BOOTP TFTP ),
    qw( iSCSI NFS AoE ),
    qw( grub ),
    qw( PXELinux SYSLinux ),
);
my $keyword_re = build_keyword_re(@keywords);

#
# Main flow: contributors -> feeds -> filtered entries -> planet feed
#

chdir $work_dir;

my @contributors = get_contributors_from_csv();
my @feeds        = load_feeds(@contributors);
my @entries      = extract_entries(@feeds);
my $feed         = create_aggregate_feed(@entries);

say "Writing $planet_file...";
my %write_options = ( binmode => ':raw', atomic => 1 );
write_file( $planet_file, \%write_options, $feed->as_xml );

exit;
############################
# Read the list of contributors from the CSV file named by $csv_file.
#
# The file is parsed as ';'-separated, UTF-8 encoded text whose first row
# holds the column names. It is read while holding a shared lock. From
# every following row the columns named by the $csv_*_column settings are
# picked out.
#
# Returns a list of hashrefs with the keys site_url, feed_url, author, id.
# Dies if the file does not contain a header row (e.g. it is empty).
sub get_contributors_from_csv {
    say "Extracting contributors from $csv_file...";

    my $csv = Text::CSV->new({
        'sep_char'  => ';',
        'auto_diag' => 1,
        'binary'    => 1,
    });

    open my $csv_fh, '<:encoding(UTF-8)', $csv_file;
    flock($csv_fh, LOCK_SH);

    # getline() returns undef when there is nothing to read; report that
    # explicitly instead of failing on an undef dereference.
    my $header = $csv->getline($csv_fh)
        or die "$csv_file: no header row found";
    $csv->column_names( @{$header} );

    my @contributors;
    while ( my $row = $csv->getline_hr($csv_fh) ) {
        push @contributors, {
            'site_url' => $row->{$csv_site_url_column},
            'feed_url' => $row->{$csv_feed_url_column},
            'author'   => $row->{$csv_author_column},
            'id'       => $row->{$csv_id_column},
        };
    }

    flock($csv_fh, LOCK_UN);
    close $csv_fh;

    return @contributors;
}
# Parse the feed file of every contributor and clean up leftover feed files.
#
# Takes a list of contributor hashrefs (keys: site_url, feed_url, author,
# id). The feed of a contributor is read from "<id>.xml" in the current
# directory. Contributors without a feed_url or id are skipped, and so are
# feed files that XML::Feed cannot parse.
#
# Every parsed feed is converted to Atom if necessary and annotated with
# the contributor's site URL (link, and base if none is set), feed URL
# (self link) and author.
#
# Afterwards every file in the current directory that looks like
# "<32 hex digits>.xml" and was not parsed successfully above is deleted.
# NOTE: this includes the file of a contributor that is still listed but
# whose feed failed to parse.
#
# Returns the list of parsed feeds.
sub load_feeds {
    my (@contributors) = @_;
    say "Loading feeds...";

    my @feeds;
    my %file_map;    # feed files that were parsed successfully

    # Load feed files
    for my $contributor (@contributors) {
        my $site_url = $contributor->{'site_url'};
        my $feed_url = $contributor->{'feed_url'};
        my $author   = $contributor->{'author'};
        my $id       = $contributor->{'id'};
        next unless $feed_url;
        next unless $id;

        my $feed_file = $id . ".xml";
        say "Reading $feed_file...";

        # A plain (non-reference) scalar is taken as a file name by parse()
        my $feed = XML::Feed->parse("$feed_file");
        unless ($feed) {
            say "\tUnable to read $feed_file";

            # errstr() reads the last error; error() is the setter and
            # would reset it. Only consulted after a failure because the
            # class-level message is not cleared by a successful parse.
            my $error = XML::Feed->errstr;
            say "\tError occurred: $error" if $error;
            next;
        }
        say "\t" . $feed->format . ": $feed_url";

        # $feed->title docs doesn't say if it returns character or byte string
        my $title = $feed->title // '';

        # Make sure it is always bytes
        $title = encode_utf8($title) if utf8::is_utf8($title);
        say qq{\t"} . $title . qq{"};

        # Convert it to Atom, if needed
        $feed = $feed->convert('Atom') unless $feed->format =~ m/Atom/;

        # Add some metadata
        $feed->link($site_url)      if $site_url;
        $feed->base($site_url)      if $site_url && !$feed->base;
        $feed->self_link($feed_url) if $feed_url;
        $feed->author($author)      if $author;

        $file_map{$feed_file} = 1;
        push @feeds, $feed;
    }

    # Remove stale feed files
    say "Removing stale feed files...";
    opendir my $dh, '.';
    while ( defined( my $name = readdir $dh ) ) {
        next unless $name =~ m/\A[0-9a-f]{32}\.xml\z/;
        next if $file_map{$name};
        say "\t$name";
        unlink $name;
    }
    closedir $dh;

    return @feeds;
}
# Collect the entries of all given feeds, newest first.
#
# For every entry:
#   - an entry without an issued date gets its modified date as issued
#     date; entries that have neither are dropped
#   - the feed's author (if any) is copied onto the entry
#   - the feed's base is copied onto the entry unless it has its own
#   - filter_entry() decides whether the entry matches the keywords
#     (and rewrites the title to include the author name)
#
# Returns the list of remaining entries sorted by issued date, descending.
sub extract_entries {
    my (@feeds) = @_;
    say "Extracting entries...";

    # Pairs of [ issued date, entry ]. The date is fetched once per entry
    # here so that the sort comparator below does not have to call the
    # accessor again for every comparison.
    my @dated;

    for my $feed ( grep { $_ } @feeds ) {
        my $feed_author = $feed->author;

        for my $entry ( grep { $_ } $feed->entries ) {

            # Try to work around the fact that some entries lack an
            # issued date, but have a modified date. Use the modification
            # date as issued date.
            $entry->issued( $entry->modified )
                if !$entry->issued && $entry->modified;
            my $issued = $entry->issued or next;

            $entry->author($feed_author) if $feed_author;
            $entry->base( $feed->base )  if !$entry->base;

            # Scalar context: undef means "does not match the keywords"
            my $kept = filter_entry($entry);
            next unless defined $kept;

            push @dated, [ $issued, $kept ];
        }
    }

    # Sort ascending, then reverse: newest first
    my @newest_first =
        reverse
        sort { DateTime->compare( $a->[0], $b->[0] ) }
        @dated;

    return map { $_->[1] } @newest_first;
}
# Decide whether a feed entry is relevant, based on its textual parts.
#
# The title, categories, summary body and content body of the entry are
# joined and searched for $keyword_re. As a side effect the title of the
# entry is prefixed with "<author>: " whenever the entry has an author,
# whether or not the entry matches.
#
# Returns the entry if it matches, or if the object lacks one of the
# title/content/summary/category methods and therefore cannot be checked.
# Returns nothing for a non-matching or undefined entry.
sub filter_entry {
    my ($entry) = @_;
    return unless defined $entry;

    # Objects that don't offer all the text accessors pass unfiltered
    for my $accessor (qw(title content summary category)) {
        return $entry unless $entry->can($accessor);
    }

    my @parts = (
        $entry->title,
        $entry->category,    # list context, might be multiple
        ( $entry->summary && $entry->summary->body ),
        ( $entry->content && $entry->content->body ),
    );
    my $text = join "\n", grep { defined } @parts;

    # Put name of author in title, if present. This happens after $text
    # was assembled, so the author name itself is never searched.
    if ( $entry->author ) {
        my $prefixed_title = $entry->author . ': ' . $entry->title;
        $entry->title($prefixed_title);
    }

    # NOTE(review): a precompiled qr// object keeps the flags it was
    # compiled with, so the /i here does not by itself change the case
    # sensitivity of $keyword_re.
    return unless $text =~ m/$keyword_re/i;
    return $entry;
}
# Build one case-insensitive regex that matches any of the given keywords.
#
# Every keyword is matched literally (regex metacharacters are escaped)
# and may occur anywhere in the text; no word boundaries are implied.
#
# Case-insensitivity is compiled into the returned object on purpose:
# a precompiled qr// keeps its own flags, so a caller writing
# m/$re/i cannot switch it on after the fact.
#
# Undefined and empty keywords are skipped (an empty alternative would
# match every text). A keyword such as '0' is a valid keyword.
#
# Returns a compiled regex. Without any usable keyword the returned regex
# never matches.
sub build_keyword_re {
    my (@keywords) = @_;

    my @literals =
        map  { quotemeta $_ }
        grep { defined $_ && length $_ }
        @keywords;

    # (?!) is an empty negative lookahead: it fails at every position
    return qr/(?!)/ unless @literals;

    # A plain alternation is sufficient for a keyword list of this size
    my $alternation = join '|', @literals;
    return qr/(?:$alternation)/i;
}
# Assemble the planet feed from the given entries.
#
# Creates a new Atom feed, sets its author, link and title from the
# $planet_* settings and appends the entries in the order given.
#
# Returns the new XML::Feed object.
sub create_aggregate_feed {
    my (@entries) = @_;
    say "Creating aggregate feed...";

    my $planet = XML::Feed->new('Atom');

    # Feed level metadata (set in this order)
    $planet->author($planet_author);
    $planet->link($planet_link);
    $planet->title($planet_title);

    foreach my $entry (@entries) {
        $planet->add_entry($entry);
    }

    return $planet;
}