-
Notifications
You must be signed in to change notification settings - Fork 0
/
title.py
166 lines (139 loc) · 6.04 KB
/
title.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
from __future__ import unicode_literals, absolute_import, print_function, division
import re
import sopel.module
import collections
import requests
from sopel import web, tools, __version__
helptext = collections.namedtuple('HelpText', 'perms command line')
USER_AGENT = 'Sopel/{} (http://sopel.chat)'.format(__version__)
default_headers = {'User-Agent': USER_AGENT,
'Accept-Language': 'en-US,en *;q=0.5'}
url_finder = None
# These are used to clean up the title tag before actually parsing it. Not the
# world's best way to do this, but it'll do for now.
title_tag_data = re.compile('<(/?)title( [^>]+)?>', re.IGNORECASE)
quoted_title = re.compile('[\'"]<title>[\'"]', re.IGNORECASE)
# This is another regex that presumably does something important.
re_dcc = re.compile(r'(?i)dcc\ssend')
# This sets the maximum number of bytes that should be read in order to find
# the title. We don't want it too high, or a link to a big file/stream will
# just keep downloading until there's no more memory. 640k ought to be enough
# for anybody.
max_bytes = 655360
def setup(bot):
global url_finder
url_finder = re.compile(r'(?u)(https?:\/\/[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b(?:[-a-zA-Z0-9@:%_\+.~#?&//=]*))', re.IGNORECASE)
if not bot.memory.contains('last_seen_url'):
bot.memory['last_seen_url'] = tools.SopelMemory()
if not bot.memory.contains('url_callbacks'):
bot.memory['url_callbacks'] = tools.SopelMemory()
if not bot.memory.contains('help'):
bot.memory['help'] = tools.SopelMemory()
bot.memory['help']['title'] = tools.SopelMemory()
bot.memory['help']['title']['short'] = 'Privmsg or notices link titles'
bot.memory['help']['title']['long'] = {
helptext('all', '!title [status]', 'Prints status of titles for you'),
helptext('all', '!title on|notice', 'Sets titles to notice'),
helptext('all', '!title privmsg', 'Sets titles to privmsg'),
helptext('all', '!title off', 'Turns titles off')
}
@sopel.module.commands('title')
def title(bot, trigger):
if not trigger.group(2) or trigger.group(3) == 'status':
status = bot.db.get_nick_value(trigger.nick, 'title')
bot.notice('Your titles are set to {}'.format(status or 'off'), trigger.nick)
if trigger.group(3) == 'on':
bot.db.set_nick_value(trigger.nick, 'title', 'on')
bot.notice('Titles enabled', trigger.nick)
elif trigger.group(3) == 'privmsg':
bot.db.set_nick_value(trigger.nick, 'title', 'privmsg')
bot.notice('Titles enabled (privmsg)', trigger.nick)
elif trigger.group(3) == 'notice':
bot.db.set_nick_value(trigger.nick, 'title', 'notice')
bot.notice('Titles enabled (notice)', trigger.nick)
elif trigger.group(3) == 'off':
bot.db.set_nick_value(trigger.nick, 'title', 'off')
bot.notice('Titles disabled', trigger.nick)
@sopel.module.rule('(?u).*(https?://\S+).*')
def title_auto(bot, trigger):
# Avoid fetching known malicious links
if 'safety_cache' in bot.memory and trigger in bot.memory['safety_cache']:
if bot.memory['safety_cache'][trigger]['positives'] > 1:
return
urls = re.findall(url_finder, trigger)
if len(urls) == 0:
return
results = process_urls(bot, trigger, urls)
bot.memory['last_seen_url'][trigger.sender] = urls[-1]
for title, domain in results[:4]:
message = '[ {} ] - {}'.format(title, domain)
# Guard against responding to other instances of this bot.
if message != trigger:
if trigger.sender in bot.channels:
for user in bot.channels[trigger.sender].users:
enabled = bot.db.get_nick_value(user, 'title') or 'off'
if enabled == 'on' or enabled == 'notice':
bot.notice(message, user)
elif enabled == 'privmsg':
bot.say(message, user)
def process_urls(bot, trigger, urls):
results = []
for url in urls:
try:
url = web.iri_to_uri(url)
except:
pass
title = None
for regex, function in tools.iteritems(bot.memory['url_callbacks']):
match = regex.search(url)
if match is not None:
title = function(bot, trigger, match)
if title:
results.append((title, get_hostname(url)))
break
if not title:
title = find_title(url, verify=bot.config.core.verify_ssl)
if title:
results.append((title, get_hostname(url)))
return results
def find_title(url, verify=True):
"""Return the title for the given URL."""
try:
response = requests.get(url, stream=True, verify=verify, headers=default_headers)
except:
return None
try:
content = b''
for byte in response.iter_content(chunk_size=512):
content += byte
if b'</title>' in content or len(content) > max_bytes:
break
content = content.decode('utf-8', errors='ignore')
finally:
# need to close the connexion because we have not read all the data
response.close()
# Some cleanup that I don't really grok, but was in the original, so
# we'll keep it (with the compiled regexes made global) for now.
content = title_tag_data.sub(r'<\1title>', content)
content = quoted_title.sub('', content)
start = content.find('<title>')
end = content.find('</title>')
if start == -1 or end == -1:
return
title = web.decode(content[start + 7:end])
title = title.strip()[:200]
title = ' '.join(title.split()) # cleanly remove multiple spaces
# More cryptic regex substitutions. This one looks to be myano's invention.
title = re_dcc.sub('', title)
return title or None
def get_hostname(url):
idx = 7
if url.startswith('https://'):
idx = 8
elif url.startswith('ftp://'):
idx = 6
hostname = url[idx:]
slash = hostname.find('/')
if slash != -1:
hostname = hostname[:slash]
return hostname