-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathurl.go
111 lines (93 loc) · 2.71 KB
/
url.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
package url
import (
"crypto/tls"
"io"
"log"
"net/http"
"net/url"
"regexp"
"strings"
"time"
"github.com/yhat/scrape"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
"github.com/seabird-chat/seabird-go/pb"
)
// NOTE: This isn't perfect in any sense of the word, but it's pretty close
// and I don't know if it's worth the time to make it better.
var (
	// urlRegex matches http/https URLs up to the next space. Deliberately
	// loose: it will also capture any trailing punctuation stuck to the URL.
	urlRegex = regexp.MustCompile(`https?://[^ ]+`)
	// newlineRegex matches a newline plus any surrounding whitespace; used
	// to collapse multi-line page titles onto a single line.
	newlineRegex = regexp.MustCompile(`\s*\n\s*`)
)
// messageCallback is invoked for every incoming channel message. It fans the
// raw text out to the registered message callbacks, then dispatches every URL
// found in the text to the matching host-specific link handlers.
func (c *Client) messageCallback(source *pb.ChannelSource, text string) {
	// Run the generic message callbacks off-thread so they can never block
	// the URL matching below. Note that it may be better to invoke these
	// serially and let each callback spin up goroutines as needed.
	go func() {
		for _, callback := range c.messageCallbacks {
			callback(c, source, text)
		}
	}()

	// Each URL in the message gets its own goroutine.
	for _, match := range urlRegex.FindAllString(text, -1) {
		go func(raw string) {
			parsed, err := url.ParseRequestURI(raw)
			if err != nil {
				return
			}

			// Drop any trailing slashes from the path.
			parsed.Path = strings.TrimRight(parsed.Path, "/")

			// Try the exact host first; when it carries a "www." prefix,
			// also try the bare host. This is not perfect, but it fixes a
			// number of issues. Alternatively, we could require the
			// linkifiers to register multiple times.
			hosts := []string{parsed.Host}
			if bare := strings.TrimPrefix(parsed.Host, "www."); bare != parsed.Host {
				hosts = append(hosts, bare)
			}

			for _, host := range hosts {
				for _, handler := range c.callbacks[host] {
					if handler(c, source, parsed) {
						return
					}
				}
			}

			// Nobody claimed this URL, so fall back to the default link
			// provider.
			defaultLinkProvider(c, source, raw)
		}(match)
	}
}
// NOTE: This nasty work is done so we ignore invalid ssl certs. We know what
// we're doing, I promise. Famous last words.
//nolint:gosec
var client = &http.Client{
	Transport: &http.Transport{
		TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
	},
	// Cap the total request time so a slow or hung page can't pin a
	// goroutine indefinitely.
	Timeout: 5 * time.Second,
}
// defaultLinkProvider is the fallback handler for URLs that no registered
// linkifier claimed. It fetches the page, extracts the contents of the first
// <title> tag, and replies with it. It reports whether a reply was sent.
func defaultLinkProvider(c *Client, source *pb.ChannelSource, rawURL string) bool {
	resp, err := client.Get(rawURL)
	if err != nil {
		return false
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return false
	}

	// We only search the first 1MB of the page; if a title isn't in there,
	// we deal with it.
	root, err := html.Parse(io.LimitReader(resp.Body, 1024*1024))
	if err != nil {
		log.Printf("Failed to parse HTML from URL: %s", err)
		return false
	}

	// Scrape the tree for the first title node we find.
	n, ok := scrape.Find(root, scrape.ByTag(atom.Title))
	if !ok {
		// URL not handled.
		return false
	}

	// Collapse multi-line titles onto one line and strip surrounding
	// whitespace before replying.
	title := strings.TrimSpace(newlineRegex.ReplaceAllLiteralString(scrape.Text(n), " "))
	c.Replyf(source, "Title: %s", title)
	return true
}