-
Notifications
You must be signed in to change notification settings - Fork 0
/
process.go
81 lines (72 loc) · 1.8 KB
/
process.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
package main
import (
"bufio"
"compress/gzip"
"net/url"
"os"
"strconv"
"strings"
log "github.com/sirupsen/logrus"
"github.com/zeebo/xxh3"
)
// process reads the frontier file at path line by line and appends every
// not-yet-seen "F+" URL to outFile, one URL per line.
//
// A path ending in ".gz" is read through a gzip reader; any other path is read
// as plain text. Each line is split on single spaces: the first field must be
// exactly "F+" and the second field is parsed as the URL. Lines with another
// first field are counted as parsed and otherwise ignored.
//
// stats is updated as lines are handled: every line increments ParsedCounter
// and URIsPerSecond; "F+" lines with a missing or unparsable URL increment
// ParsingFailures; URLs already recorded in seencheck increment
// DuplicateCounter; new URLs increment UniqueCounter and, when written to
// outFile, SeedsListSize. A new URL whose host is rejected by isHostExcluded is
// recorded in seencheck but not written.
//
// Failures to open or read the frontier, or to query seencheck, terminate the
// program via log.Fatal; a failed write to outFile panics.
func process(path string, outFile *os.File, seencheck *Seencheck, stats *Stats) {
	var scanner *bufio.Scanner

	// Open frontier file
	frontier, err := os.Open(path)
	if err != nil {
		log.Fatal(err)
	}
	defer frontier.Close()

	// If the file ends with .gz, we open it as a gzip file
	if strings.HasSuffix(path, ".gz") {
		reader, err := gzip.NewReader(frontier)
		if err != nil {
			log.Fatal(err)
		}
		defer reader.Close()

		scanner = bufio.NewScanner(reader)
	} else {
		scanner = bufio.NewScanner(frontier)
	}

	// The host filter only runs when at least one exclusion rule is configured.
	hostFilterEnabled := arguments.MaxHostOccurrence != -1 || len(arguments.ExcludedHosts) > 0

	for scanner.Scan() {
		stats.ParsedCounter.Incr(1)
		stats.URIsPerSecond.Incr(1)

		fields := strings.Split(scanner.Text(), " ")
		if fields[0] != "F+" {
			continue
		}

		// An "F+" marker with nothing after it carries no URL; indexing
		// fields[1] would panic, so count it as a parsing failure instead.
		if len(fields) < 2 {
			stats.ParsingFailures.Incr(1)
			continue
		}

		// Parse URL
		parsed, err := url.Parse(fields[1])
		if err != nil {
			stats.ParsingFailures.Incr(1)
			continue
		}

		// Generate URL hash
		hash := strconv.FormatUint(xxh3.HashString(parsed.String()), 10)

		// Check if we already saw the URL, if yes then we skip it, else we
		// add it to the seed list
		found, _, err := seencheck.IsSeen(hash)
		if err != nil {
			log.Fatal(err)
		}
		if found {
			stats.DuplicateCounter.Incr(1)
			continue
		}

		// Check host to see if we should exclude the URL. Excluded URLs are
		// still recorded as seen so later occurrences count as duplicates.
		if hostFilterEnabled && isHostExcluded(parsed.Host, stats) {
			stats.UniqueCounter.Incr(1)
			seencheck.Seen(hash, parsed.Host)
			continue
		}

		if _, err := outFile.WriteString(parsed.String() + "\n"); err != nil {
			panic(err)
		}

		stats.UniqueCounter.Incr(1)
		stats.SeedsListSize.Incr(1)
		seencheck.Seen(hash, parsed.Host)
	}

	if err := scanner.Err(); err != nil {
		log.Fatal(err)
	}
}