This repository has been archived by the owner on Feb 15, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 6
/
crawl.js
74 lines (59 loc) · 2.1 KB
/
crawl.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
const cheerio = require('cheerio')
const axios = require('axios')
/**
 * Crawls a single page and collects the distinct links found on it.
 *
 * Steps:
 *   1. Download the document at `crawlUrl` with axios.
 *   2. Load the response body into cheerio and visit every `<a>` element.
 *   3. Pass each `href` through `validateURL`, keeping only truthy results.
 *   4. Return the kept URLs as a Set, so duplicates collapse automatically.
 *
 * @param {string} crawlUrl - URL of the page to fetch and scan.
 * @returns {Promise<Set<string>>} Distinct validated URLs, in order of first
 *   appearance on the page.
 */
module.exports.crawl = async (crawlUrl) => {
  const { data: html } = await axios.get(crawlUrl)
  console.log('crawl started: ', crawlUrl)

  const parserOptions = {
    withDomLvl1: true,
    normalizeWhitespace: false,
    xmlMode: false,
    decodeEntities: false
  }
  const $ = cheerio.load(html, parserOptions)

  // A Set keeps first-insertion order and ignores repeats, so no separate
  // de-duplication pass is needed.
  const uniqueUrls = new Set()

  // Walk every anchor on the crawled page
  $('a').each((index, anchor) => {
    const href = $(anchor).attr('href')
    console.log(index, href)

    const accepted = validateURL(crawlUrl, href)
    if (!accepted) return // rejected link; returns undefined so iteration continues

    console.log('Valid foundURL: ', accepted)
    uniqueUrls.add(accepted)
  })

  return uniqueUrls
}
/**
 * Validates a link found on a crawled page and normalises it to an absolute
 * URL on the same host as the crawled page.
 *
 * The link is resolved against `crawlUrl` using the WHATWG URL parser, so
 * relative hrefs ("/about", "about", "../x", "//host/path") are handled the
 * same way a browser would resolve them. Query strings and fragments are
 * dropped from the result.
 *
 * A link is rejected (returns undefined) when it is:
 *   - empty / missing, or a pure in-page anchor ("#section");
 *   - syntactically invalid;
 *   - not http/https (mailto:, javascript:, tel:, ...);
 *   - on a different host (host comparison includes the port);
 *   - a reference back to the crawled page itself.
 *
 * @param {string} crawlUrl - Absolute URL of the page the link was found on.
 * @param {string|undefined|null} linkUrl - Raw `href` value from the page.
 * @returns {string|undefined} `protocol//host/pathname` of the accepted link,
 *   or undefined if the link is not valid.
 * @throws {TypeError} If `crawlUrl` itself is not a valid absolute URL.
 */
const validateURL = (crawlUrl, linkUrl) => {
  if (!linkUrl) return // Remove nulls/empty hrefs
  if (linkUrl.charAt(0) === '#') return // Remove anchor hrefs

  // An invalid crawl URL is a caller error, so this is allowed to throw.
  const parsedCrawlUrl = new URL(crawlUrl)

  let parsedUrl
  try {
    // Passing a base lets relative hrefs resolve instead of throwing.
    parsedUrl = new URL(linkUrl, parsedCrawlUrl)
  } catch (err) {
    // Malformed href on the page: an expected "not valid" outcome, not a failure.
    return
  }

  // Ensure http/https (exact scheme match, not a substring test)
  if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') return

  // Check same domain
  if (parsedUrl.host !== parsedCrawlUrl.host) return

  const foundUrl = `${parsedUrl.protocol}//${parsedUrl.host}${parsedUrl.pathname}`

  // Remove self references. Compare against the parsed form of the crawl URL
  // (fragment dropped) so that e.g. "http://example.com" and
  // "http://example.com/" are recognised as the same page. The crawl URL's
  // query string is kept in the comparison, so "/a" found on "/a?x=1" is
  // still treated as a distinct page.
  const normalizedCrawlUrl =
    `${parsedCrawlUrl.protocol}//${parsedCrawlUrl.host}${parsedCrawlUrl.pathname}${parsedCrawlUrl.search}`
  if (foundUrl === normalizedCrawlUrl) return

  return foundUrl
}