Skip to content

Commit

Permalink
Web: Add workaround for minified Reddit URLs
Browse files Browse the repository at this point in the history
  • Loading branch information
progval committed Oct 14, 2024
1 parent 2aa1f91 commit b13ebeb
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 11 deletions.
47 changes: 36 additions & 11 deletions plugins/Web/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,22 +150,47 @@ def noIgnore(self, irc, msg):
def getTitle(self, irc, url, raiseErrors, msg):
size = conf.supybot.protocols.http.peekSize()

parsed_url = utils.web.urlparse(url)
if parsed_url.netloc in ('youtube.com', 'youtu.be') \
or parsed_url.netloc.endswith(('.youtube.com')):
# there is a lot of Javascript before the <title>
size = max(819200, size)
if parsed_url.netloc in ('reddit.com', 'www.reddit.com', 'new.reddit.com'):
# Since 2022-03, New Reddit has 'Reddit - Dive into anything' as
# <title> on every page.
parsed_url = parsed_url._replace(netloc='old.reddit.com')
url = utils.web.urlunparse(parsed_url)

def url_workaround(url):
"""Returns a new URL that should be the target of a new request,
or None if the request is fine as it is.
The returned URL may be the same as the parameter, in case
something else was changed by this function through side-effects.
"""
nonlocal size
parsed_url = utils.web.urlparse(url)
print(repr(parsed_url.netloc))
if parsed_url.netloc in ('youtube.com', 'youtu.be') \
or parsed_url.netloc.endswith(('.youtube.com')):
# there is a lot of Javascript before the <title>
if size < 819200:
size = max(819200, size)
return url
else:
return None
if parsed_url.netloc in ('reddit.com', 'www.reddit.com', 'new.reddit.com'):
# Since 2022-03, New Reddit has 'Reddit - Dive into anything' as
# <title> on every page.
parsed_url = parsed_url._replace(netloc='old.reddit.com')
url = utils.web.urlunparse(parsed_url)
self.log.debug("Rewrite URL to %s", url)
return url

return None

url = url_workaround(url) or url
timeout = self.registryValue('timeout')
headers = conf.defaultHttpHeaders(irc.network, msg.channel)
try:
fd = utils.web.getUrlFd(url, timeout=timeout, headers=headers)
target = fd.geturl()
fixed_target = url_workaround(target)
if fixed_target is not None:
# happens when using minification services linking to one of
# the websites handled by url_workaround; eg. v.redd.it
fd.close()
fd = utils.web.getUrlFd(fixed_target, timeout=timeout, headers=headers)
target = fd.geturl()
text = fd.read(size)
response_headers = fd.headers
fd.close()
Expand Down
3 changes: 3 additions & 0 deletions plugins/Web/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,9 @@ def testtitleReddit(self):
self.assertRegexp(
'title https://www.reddit.com/r/irc/',
'Internet Relay Chat')
self.assertRegexp(
'title https://v.redd.it/odhemxo6giud1',
'Small Kitty Big Goals : MadeMeSmile')

def testTitleMarcinfo(self):
# Checks that we don't crash on 'Content-Type: text/html;'
Expand Down

0 comments on commit b13ebeb

Please sign in to comment.