Skip to content

Commit

Permalink
fix: remove html tags in overviews
Browse files Browse the repository at this point in the history
  • Loading branch information
SethFalco committed Jul 30, 2023
1 parent 2b49cc7 commit 317d880
Show file tree
Hide file tree
Showing 3 changed files with 139 additions and 2 deletions.
4 changes: 2 additions & 2 deletions resources/lib/item_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import xbmcgui

from .utils import (
datetime_from_string, get_art_url, image_url, get_current_datetime
datetime_from_string, get_art_url, image_url, get_current_datetime, plainify_html
)
from .lazylogger import LazyLogger

Expand Down Expand Up @@ -300,7 +300,7 @@ def extract_item_info(item, gui_options):
item_details.resume_time = int(reasonable_ticks / 10000)

item_details.series_name = item.get("SeriesName", '')
item_details.plot = item.get("Overview", '')
item_details.plot = plainify_html(item.get("Overview", ''))

runtime = item.get("RunTimeTicks")
if item_details.is_folder is False and runtime:
Expand Down
123 changes: 123 additions & 0 deletions resources/lib/plainhtmlparser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
from html.parser import HTMLParser


class PlainHTMLParser(HTMLParser):
    '''
    HTMLParser implementation that strips HTML tags, preserving the content.

    This is not intended to interpret HTML, nor output sanitized and secure
    HTML that's safe to use in a web browser.

    This parses a string that may contain HTML, and removes HTML tags, and
    content that isn't intended for users to read, such as <script> and
    <style>. It will preserve the content of tags that is semantically
    intended to be read by the user, such as <a>, <p>, and <span>.

    Implementation should always be locale independent. It works with the
    HTML, not string/ASCII content.

    Usage: call feed() with the text, then close(); the stripped text is
    then available in the ``result`` attribute (``None`` until close()).
    '''

    # List of HTML elements, excluding obsolete and deprecated HTML elements.
    #
    # This allows us to separate stylized text from actual HTML. For example,
    # if a description or movie name contains "<3", it will be preserved
    # because "3" is not in the list of known HTML elements.
    #
    # See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element
    html_elements = [
        "html", "base", "head", "link", "meta", "style", "title", "body",
        "address", "article", "aside", "footer", "header", "h1", "h2", "h3",
        "h4", "h5", "h6", "hgroup", "main", "nav", "section", "search",
        "blockquote", "dd", "div", "dl", "dt", "figcaption", "figure", "hr",
        "li", "menu", "ol", "p", "pre", "ul", "a", "abbr", "b", "bdi", "bdo",
        "br", "cite", "code", "data", "dfn", "em", "i", "kbd", "mark", "q",
        "rp", "rt", "ruby", "s", "samp", "small", "span", "strong", "sub",
        "sup", "time", "u", "var", "wbr", "area", "audio", "img", "map",
        "track", "video", "embed", "iframe", "object", "picture", "portal",
        "source", "svg", "math", "canvas", "noscript", "script", "del", "ins",
        "caption", "col", "colgroup", "table", "tbody", "td", "tfoot", "th",
        "thead", "tr", "button", "datalist", "fieldset", "form", "input",
        "label", "legend", "meter", "optgroup", "option", "output", "progress",
        "select", "textarea", "details", "dialog", "summary", "slot", "template"
    ]

    # List of inline HTML elements. Lets us know which elements don't need to
    # have whitespace prepended to their text.
    inline_elements = [
        "a", "b", "em", "i", "s", "span", "strong", "sub", "sup", "u"
    ]

    # Denylist of tags with content we don't want to display.
    #
    # We drop <summary> because it's part of the <details> tag, but since
    # we'll always display the details anyway, there is no need for the
    # summary of it.
    tag_denylist = [
        "head", "meta", "style", "canvas", "noscript", "script", "summary"
    ]

    def __init__(self):
        super().__init__(convert_charrefs=True)
        # Stack of currently open known HTML elements (innermost last).
        # NOTE(review): every known element is pushed on its start tag and
        # only popped when its end tag arrives while it is on top of the
        # stack, so elements that never get an end tag stay on the stack.
        self.elements = []
        # Output fragments, joined into ``result`` by close().
        self.accumulator = []
        # Text received since the last tag, not yet moved to the accumulator.
        self.pending_data = []
        # Final plain text; populated by close().
        self.result = None

    def handle_starttag(self, tag, _):
        '''
        Track a known element, or keep an unknown tag's raw text as content.
        '''
        # Everything nested in a denylisted element is dropped.
        if self.elements and self.elements[-1] in PlainHTMLParser.tag_denylist:
            return

        self.handle_pending_data()

        if tag not in PlainHTMLParser.html_elements:
            # Not real HTML (stylized text); preserve it verbatim.
            self.accumulator.append(self.get_starttag_text())
            return

        self.elements.append(tag)

    def handle_endtag(self, tag):
        '''
        Close the innermost element if it matches, or keep an unknown end tag
        as content.
        '''
        self.handle_pending_data()

        if self.elements and self.elements[-1] == tag:
            self.elements.pop()
            return
        if self.elements and self.elements[-1] in PlainHTMLParser.tag_denylist:
            return

        if tag not in PlainHTMLParser.html_elements:
            # get_starttag_text() holds the most recent *start* tag (or None
            # if there has been none), never the end tag being handled, so
            # the end tag is rebuilt from its name instead. HTMLParser passes
            # the name lower-cased.
            self.accumulator.append("</" + tag + ">")

    def handle_startendtag(self, tag, _):
        '''
        Drop a known self-closing tag, or keep an unknown one as content.
        '''
        if self.elements and self.elements[-1] in PlainHTMLParser.tag_denylist:
            return

        self.handle_pending_data()

        if tag not in PlainHTMLParser.html_elements:
            self.accumulator.append(self.get_starttag_text())

    def handle_data(self, data):
        '''
        Buffer text unless it is inside a denylisted element.
        '''
        if self.elements and self.elements[-1] in PlainHTMLParser.tag_denylist:
            return

        self.pending_data.append(data)

    def close(self):
        '''
        Finish parsing and store the stripped text in ``self.result``.
        '''
        super().close()
        self.handle_pending_data()
        self.result = "".join(self.accumulator)

    def handle_pending_data(self):
        '''
        Move buffered text to the accumulator, prepending a space when the
        innermost open element is not an inline element and output already
        exists.
        '''
        if not self.pending_data:
            return

        data_concat = "".join(self.pending_data)

        if (self.accumulator and self.elements
                and self.elements[-1] not in PlainHTMLParser.inline_elements):
            data_concat = " " + data_concat

        self.accumulator.append(data_concat)
        self.pending_data.clear()
14 changes: 14 additions & 0 deletions resources/lib/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

from .lazylogger import LazyLogger
from .kodi_utils import HomeWindow
from .plainhtmlparser import PlainHTMLParser

# hack to get datetime strptime loaded
throwaway = time.strptime('20110101', '%Y%m%d')
Expand Down Expand Up @@ -451,3 +452,16 @@ def get_bitrate(enum_value):
7000, 8000, 9000, 10000, 12000, 14000, 16000, 18000,
20000, 25000, 30000, 35000, 40000, 100000, 1000000, 2147483]
return bitrate[int(enum_value) if enum_value else 24] * 1000


def plainify_html(body):
    '''
    Strip HTML elements from the string, preserving human-readable content.

    :param body: Text that may contain HTML markup.
    :return: The ``result`` of a PlainHTMLParser that was fed ``body`` and
        then closed.
    :raises ValueError: If ``body`` is None.
    '''
    # Identity comparison is the correct test for the None singleton; "=="
    # can be overridden by the operand's __eq__.
    if body is None:
        raise ValueError("body must not be None")

    parser = PlainHTMLParser()
    parser.feed(body)
    parser.close()
    return parser.result

0 comments on commit 317d880

Please sign in to comment.