-
Notifications
You must be signed in to change notification settings - Fork 0
/
filter_metatext.py
45 lines (33 loc) · 1.37 KB
/
filter_metatext.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import os
import re
# Regular expressions that match metatext fragments.
# Each pattern is applied with re.fullmatch, so it must describe the whole
# fragment, not just a part of it.
metatext_patterns = [
    r'888',
    # The dots in the host name are escaped so they match a literal '.'
    # instead of any character.
    r'NPO ONDERTITELING TT888, \d+\ninformatie: service\.npo\.nl',
]
def matches_metatext_re(fragment):
    '''Returns True if the whole fragment matches one of `metatext_patterns`.'''
    for candidate in metatext_patterns:
        # fullmatch: the pattern has to cover the entire fragment.
        if re.fullmatch(candidate, fragment):
            return True
    return False
def is_in_parentheses(fragment):
    '''Returns True if a fragment starts with '(' and ends with ')'.'''
    if not fragment.startswith('('):
        return False
    return fragment.endswith(')')
def is_uppercase(fragment):
    '''
    Returns True if a fragment contains at least one letter A-Z
    and is left unchanged by upper-casing; returns False otherwise.
    '''
    # Require an actual capital letter so that fragments made up only of
    # digits or punctuation (which upper() also leaves unchanged) do not count.
    # Comparing with None yields a real bool instead of a match object / None.
    has_capital = re.search(r'[A-Z]', fragment) is not None
    return has_capital and fragment.upper() == fragment
def start_with_asterisk(fragment):
    '''Returns True if a fragment begins with '*', as song lyrics often do.'''
    has_leading_asterisk = fragment.startswith('*')
    return has_leading_asterisk
def qualifies_for_any(fragment, functions):
    '''
    Returns True as soon as one of the callables in `functions`
    gives a truthy result for `fragment`, and False if none does.
    '''
    return any(check(fragment) for check in functions)
def is_metatext(fragment):
    '''Returns True if a fragment passes any metatext check and should thus be ignored.'''
    return qualifies_for_any(
        fragment,
        [
            matches_metatext_re,
            is_in_parentheses,
            is_uppercase,
            start_with_asterisk,
        ],
    )
def filter_metatext(text_fragments):
    '''
    Filters an iterable of text fragments (strings),
    lazily yielding only those that `is_metatext` does not flag
    (so metatext segments like '888' are dropped).
    '''
    def is_regular_text(text):
        return not is_metatext(text)

    return filter(is_regular_text, text_fragments)