This repository has been archived by the owner on Feb 12, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawler-await.js
executable file
·171 lines (116 loc) · 5.45 KB
/
crawler-await.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
/*
By Robin Herzog
Goal: Give a score to each lead in order to focus sales on the good leads.
What we have:
1. Email
2. First name & Last Name
3. Company domain name
4. Linkedin Profile
5. Keywords
6. Job
7. Company name
8. Industry in french
Type of possible website :
1. Agency event (https://www.dynamicevents.com/)
2. WTF website like universalmusic.com
3. Website of a particular event
4. Website of organizer but not in French Or English
Why qualify :
1. AnyLeads will get many leads, but not all of them organize tech event conferences. Qualifying leads lets sales focus on the ones with the best potential.
How to qualify:
1. People are already filtered by LinkedIn criteria, but that filter can't be precise enough.
2. The Keywords, Job and Company Name data are limited. The LinkedIn profile URL is known, but how do we access it?
3. Idea 1 : Analyse company domain to find words such as (tech, event, summit...)
Hypothesis:
1. The domain contains many keywords that match with our criteria
2. The domain does not match BUT Linkedin Profile match with our criteria
3. The domain and Linkedin does not match with our criteria
4. AnyLeads can't find the email but we could find it with the mail
Machine Learning:
1. Website texts could be classified with ML
2. Linkedin Profile could be classified with ML
To Do:
1. Get all url from website
2. Extract all texts from it
3. Compare with criteria
4. Give a note
*/
/*
@Website scoring
What makes a good website:
1. Finding specific words (ML?)
2. Finding events they organize
3. Read their company page on Linkedin
*/
const request = require('request');
const cheerio = require('cheerio');
const Url = require('url');
const parseDomain = require("parse-domain");
const validUrl = require('valid-url');
const followRedirects = require('follow-redirects');
const rp = require('request-promise');
const phantom = require('phantom');
// Replace the module's export object with a fresh empty object and keep a
// local alias named `exports`; properties assigned to `exports` below
// (e.g. `crawlWeb`) are therefore what this module exposes.
var exports = module.exports = {};
exports.crawlWeb = function(startUrl, callback) {
return new Promise((resolve) => {
console.log('Crawler started');
var baseParsedStartUrl = Url.parse(startUrl);
var baseStartUrl = baseParsedStartUrl.protocol + '//' + baseParsedStartUrl.hostname;
var pagesVisited = {};
var numPagesVisited = 0;
var pagesToVisit = [];
pagesToVisit.push(startUrl);
crawl();
function crawl() {
let nextPage = pagesToVisit.pop();
if (nextPage in pagesVisited) {
crawl();
} else {
if (nextPage) {
collectInternalLinks(nextPage, crawl);
}
}
// Quand il n'y a plus de nextPage, le crawl a fini.
if (nextPage === undefined) {
resolve(Object.keys(pagesVisited));
}
}
async function collectInternalLinks(startUrl, callback) {
const instance = await phantom.create(['--load-images=no']);
const page = await instance.createPage();
await page.on("onResourceRequested", function(requestData) {
//console.info('Requesting', requestData.url)
});
const status = await page.open(startUrl);
const content = await page.property('content');
$ = cheerio.load(content);
console.log(startUrl);
console.log(numPagesVisited + '<= AWAIT =================================');
pagesVisited[startUrl] = true;
numPagesVisited++;
let parcedStartDomain = Url.parse(startUrl);
$("a").each(function (index, a) {
let toQueueUrl = $(a).attr('href');
if (toQueueUrl) {
//Permet de transformer les liens relative en absolue
let parsedUrlOrigin = Url.parse(toQueueUrl);
toQueueUrl = Url.resolve(baseStartUrl,parsedUrlOrigin.href);
let parsedUrl = Url.parse(toQueueUrl);
//let parsedUrlDomain = parseDomain(parsedUrl.host);
if (validUrl.isUri(parsedUrl.href)) {
if ((toQueueUrl.match(/(https?:\/\/[-\w;\/?:@&=+$\|\_.!~*\|'()\[\]%#,☺]+[\w\/#](\(\))?)(?=$|[\s',\|\(\).:;?\-\[\]>\)])/) && !toQueueUrl.match(['#']) && !toQueueUrl.match(['mailto']) && !toQueueUrl.match(/^https?:\/\/(?:[a-z\-]+\.)+[a-z]{2,6}(?:\/[^\/#?]+)+\.(?:jpe?g|gif|png|pdf|mp3|mp4|eps)$/)) || (parsedUrl.hostname === null && toQueueUrl.match(/((\/[a-zA-Z][a-z[A-Z]*)+?)/)) && !toQueueUrl.match(/.(?:jpe?g|gif|png|pdf|mp3|mp4|eps)$/)) {
//console.log(parsedUrl);
//console.log(parsedUrlDomain);
if (parcedStartDomain.hostname === parsedUrl.hostname) {
//console.log(toQueueUrl);
pagesToVisit.push(toQueueUrl);
}
}
}
}
});
//await instance.exit();
callback();
}
});
};