-
Notifications
You must be signed in to change notification settings - Fork 95
/
maybeurl.go
81 lines (67 loc) · 1.81 KB
/
maybeurl.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
package jsluice
import (
"net/url"
"strings"
)
// fileExtensions is the set of file extensions that MaybeURL treats as
// evidence that a dotted string names a file. It is populated by init.
var fileExtensions set

// init builds fileExtensions from the fixed list of known extensions.
func init() {
	known := []string{
		"js", "css", "html", "htm",
		"xhtml", "xlsx", "xls", "docx",
		"doc", "pdf", "rss", "xml",
		"php", "phtml", "asp", "aspx",
		"asmx", "ashx", "cgi", "pl",
		"rb", "py", "do", "jsp",
		"jspa", "json", "jsonp", "txt",
	}
	fileExtensions = newSet(known)
}
// MaybeURL reports whether in looks like a URL or a path.
//
// It is a heuristic filter intended for string literals and is
// deliberately restrictive: it prefers missing a few real URLs over
// producing many false positives. A string is accepted when it
//   - starts with a slash, or
//   - parses as a URL with no scheme or an http/https scheme and has
//     a hostname containing a dot, or
//     a query string with at least one non-empty value, or
//     a path ending in one of the known file extensions
//     (matched case-insensitively).
func MaybeURL(in string) bool {
	// Cheap pre-filter: most string literals contain none of these
	// characters, so we can skip the cost of parsing them as URLs.
	if !strings.ContainsAny(in, "/?.") {
		return false
	}

	// Be fairly restrictive to cut out things like regex strings and
	// blocks of HTML. This misses a handful of URLs, but that is
	// better than emitting a ton of false positives.
	if strings.ContainsAny(in, " ()!<>'\"`{}^$,") {
		return false
	}

	// Prone to false positives, but in the wild most strings that
	// start with a slash are actually paths.
	if strings.HasPrefix(in, "/") {
		return true
	}

	// Parse it so the individual parts can be inspected.
	u, err := url.Parse(in)
	if err != nil {
		return false
	}

	// A scheme is optional, but if present it must be http or https.
	switch strings.ToLower(u.Scheme) {
	case "", "http", "https":
	default:
		return false
	}

	// Valid-looking hostname: anything with at least one dot.
	if strings.Contains(u.Hostname(), ".") {
		return true
	}

	// A query string counts if any parameter carries a non-empty
	// value. Every value of a repeated key is checked, so "a=&a=1"
	// qualifies even though the first value is empty.
	for _, values := range u.Query() {
		for _, v := range values {
			if v != "" {
				return true
			}
		}
	}

	// Known file extensions are the last resort; without a dot in the
	// path there is nothing left to check.
	dot := strings.LastIndexByte(u.Path, '.')
	if dot < 0 {
		return false
	}

	// The known extensions are stored lowercase, so fold the case to
	// let "INDEX.HTML" match the same way "index.html" does.
	ext := strings.ToLower(u.Path[dot+1:])
	return fileExtensions.Contains(ext)
}