http_crawler.js
/**
 * @author Nikolai Tschacher
 * @version 1.0
 * @last_modified April 2020
 * @website: scrapeulous.com
 *
 * This is a simple (but functional) http crawler that can be executed
 * within the scrapeulous.com crawling infrastructure.
 *
 * @param options.stay_within_domain: only enqueue urls that belong to the same domain as the start url. Default: true
 * @param options.allow_arbitrary_subdomains: also allow www.example.org if the initial url is example.org. Default: true
 * @param options.only_urls_with_text: only add href urls of <a> elements that have non-empty element text. Default: true
 */
class HttpCrawler extends HttpWorker {
  async crawl(url) {
    // set default options without overriding options explicitly set to false
    if (this.options.stay_within_domain === undefined) {
      this.options.stay_within_domain = true;
    }
    if (this.options.allow_arbitrary_subdomains === undefined) {
      this.options.allow_arbitrary_subdomains = true;
    }
    if (this.options.only_urls_with_text === undefined) {
      this.options.only_urls_with_text = true;
    }

    let parsed_url;
    try {
      parsed_url = new URL(url);
    } catch (err) {
      return {
        error: `url ${url} is invalid: ${err.message}`
      };
    }

    // set a random desktop user agent
    let user_agent = new this.UserAgent({deviceCategory: 'desktop'}).toString();
    let headers = {'User-Agent': user_agent};

    // download the page body
    let html;
    try {
      let response = await this.Got(url, {headers: headers});
      html = response.body;
    } catch (err) {
      // keep the error shape consistent with the invalid-url case above
      return {
        error: err.toString()
      };
    }

    // extract all <a> hrefs, normalize them and either enqueue them
    // for further crawling or return them directly
    let $ = this.Cheerio.load(html);
    let links = this.getLinks($);
    let cleaned_links = this.cleanLinks(links, parsed_url);

    if (this.enqueue) {
      this.enqueue(null, cleaned_links, {allow_duplicates: false});
    } else {
      return cleaned_links;
    }
  }
  /**
   * Resolves, filters and de-duplicates the extracted links.
   */
  cleanLinks(links, parsed_url) {
    let filtered = [];
    for (let link of links) {
      let skip = false;
      let url;
      try {
        // resolve relative hrefs against the page url
        // https://developer.mozilla.org/en-US/docs/Web/API/URL/URL
        url = new URL(link.link, parsed_url);
      } catch (err) {
        this.logger.warn(`url ${link.link} cannot be parsed`);
        continue;
      }
      if (this.options.stay_within_domain) {
        let same_host = url.hostname === parsed_url.hostname;
        if (!same_host && this.options.allow_arbitrary_subdomains) {
          // accept subdomains such as www.example.org when the initial url is example.org
          same_host = url.hostname.endsWith('.' + parsed_url.hostname) ||
            parsed_url.hostname.endsWith('.' + url.hostname);
        }
        if (!same_host) {
          skip = true;
        }
      }
      if (!skip) {
        // strip the fragment, it does not identify a distinct resource
        url.hash = '';
        filtered.push(url.toString());
      }
    }
    // remove duplicates
    return [...new Set(filtered)];
  }
  /**
   * Collects all <a> elements that have a non-empty href attribute.
   */
  getLinks($) {
    let all_links = [];
    $('a').each(function(i, link) {
      let link_text = $(link).text();
      let href = $(link).attr('href');
      if (href && href.trim()) {
        let add = true;
        if (this.options.only_urls_with_text && link_text.trim().length === 0) {
          // skip links without visible anchor text
          add = false;
        }
        if (add) {
          all_links.push({
            link: href,
            text: link_text
          });
        }
      }
    }.bind(this));
    return all_links;
  }
}
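
// ---------------------------------------------------------------------------
// Illustration only: the sketch below is NOT part of the scrapeulous runtime
// and the function name `demoCleanLinks` is purely hypothetical. It restates
// the filtering rules of cleanLinks() with plain Node.js (the global WHATWG
// `URL` class), so the normalization behaviour -- resolving relative hrefs,
// keeping same-host urls, stripping fragments, de-duplicating -- can be
// sanity-checked without the HttpWorker infrastructure.
function demoCleanLinks(hrefs, base_url) {
  const base = new URL(base_url);
  const filtered = [];
  for (const href of hrefs) {
    let url;
    try {
      // resolve relative hrefs against the base url
      url = new URL(href, base);
    } catch (err) {
      continue; // unparseable href, comparable to the logger.warn branch above
    }
    if (url.hostname !== base.hostname) {
      continue; // corresponds to options.stay_within_domain
    }
    url.hash = ''; // drop the fragment
    filtered.push(url.toString());
  }
  return [...new Set(filtered)];
}

// Example (assumed urls, for illustration):
// demoCleanLinks(['/about', 'contact.html', '#top', 'https://other.org/x'],
//                'https://example.org/index.html');
// // => ['https://example.org/about',
// //     'https://example.org/contact.html',
// //     'https://example.org/index.html']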