Skip to content

Commit

Permalink
try fix regexp
Browse files Browse the repository at this point in the history
  • Loading branch information
hndgzkn committed Jan 23, 2023
1 parent cf43b2b commit 942ad61
Showing 1 changed file with 29 additions and 16 deletions.
45 changes: 29 additions & 16 deletions wikiextractor/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@

# match tail after wikilink
tailRE = re.compile('\w+')
syntaxhighlight = re.compile('<syntaxhighlight .*?>(.*?)</syntaxhighlight>', re.DOTALL)
syntaxhighlight = re.compile(
'<syntaxhighlight .*?>(.*?)</syntaxhighlight>', re.DOTALL)

## PARAMS ####################################################################

Expand Down Expand Up @@ -170,7 +171,8 @@ def clean(extractor, text, expand_templates=False, html_safe=True):
text = dots.sub('...', text)
text = re.sub(u' (,:\.\)\]»)', r'\1', text)
text = re.sub(u'(\[\(«) ', r'\1', text)
text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuation
# lines with only punctuation
text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U)
text = text.replace(',,', ',').replace(',.', '.')
if html_safe:
text = html.escape(text, quote=False)
Expand Down Expand Up @@ -220,7 +222,7 @@ def compact(text, mark_headers=False):

headers[lev] = title
# drop previous headers
headers = { k:v for k,v in headers.items() if k <= lev }
headers = {k: v for k, v in headers.items() if k <= lev}
emptySection = True
continue
# Handle page title
Expand Down Expand Up @@ -380,7 +382,8 @@ def dropSpans(spans, text):
# as well as U+3000 is IDEOGRAPHIC SPACE for bug 19052
EXT_LINK_URL_CLASS = r'[^][<>"\x00-\x20\x7F\s]'
ExtLinkBracketedRegex = re.compile(
'\[(((?i)' + '|'.join(wgUrlProtocols) + ')' + EXT_LINK_URL_CLASS + r'+)\s*([^\]\x00-\x08\x0a-\x1F]*?)\]',
(r'\[(((?i)' + '|'.join(wgUrlProtocols) + ')' +
EXT_LINK_URL_CLASS + r'+)\s*([^\]\x00-\x08\x0a-\x1F]*?)\]'),
re.S | re.U)
EXT_IMAGE_REGEX = re.compile(
r"""^(http://|https://)([^][<>"\x00-\x20\x7F\s]+)
Expand Down Expand Up @@ -746,7 +749,8 @@ def fixup(m):


def ignoreTag(tag):
left = re.compile(r'<%s\b.*?>' % tag, re.IGNORECASE | re.DOTALL) # both <ref> and <reference>
left = re.compile(r'<%s\b.*?>' % tag, re.IGNORECASE |
re.DOTALL) # both <ref> and <reference>
right = re.compile(r'</\s*%s>' % tag, re.IGNORECASE)
ignored_tag_patterns.append((left, right))

Expand Down Expand Up @@ -792,6 +796,7 @@ def resetIgnoredTags():

# ======================================================================


class Template(list):
"""
A Template is a list of TemplateText or TemplateArgs
Expand All @@ -806,11 +811,11 @@ def parse(cls, body):
# {{#if:{{{{{#if:{{{nominee|}}}|nominee|candidate}}|}}}|
#
start = 0
for s,e in findMatchingBraces(body, 3):
for s, e in findMatchingBraces(body, 3):
tpl.append(TemplateText(body[start:s]))
tpl.append(TemplateArg(body[s+3:e-3]))
start = e
tpl.append(TemplateText(body[start:])) # leftover
tpl.append(TemplateText(body[start:])) # leftover
return tpl

def subst(self, params, extractor, depth=0):
Expand Down Expand Up @@ -852,6 +857,7 @@ class TemplateArg():
parameter to a template.
Has a name and a default value, both of which are Templates.
"""

def __init__(self, parameter):
"""
:param parameter: the parts of a tplarg.
Expand Down Expand Up @@ -890,10 +896,11 @@ def subst(self, params, extractor, depth):
paramName = extractor.expandTemplates(paramName)
res = ''
if paramName in params:
res = params[paramName] # use parameter value specified in template invocation
# use parameter value specified in template invocation
res = params[paramName]
elif self.default: # use the default value
defaultValue = self.default.subst(params, extractor, depth+1)
res = extractor.expandTemplates(defaultValue)
res = extractor.expandTemplates(defaultValue)
#logging.debug('subst arg %d %s -> %s' % (depth, paramName, res))
return res

Expand Down Expand Up @@ -949,7 +956,8 @@ def clean_text(self, text, mark_headers=False, expand_templates=True,
:param mark_headers: True to distinguish headers from paragraphs
e.g. "## Section 1"
"""
self.magicWords['namespace'] = self.title[:max(0, self.title.find(":"))]
self.magicWords['namespace'] = self.title[:max(
0, self.title.find(":"))]
self.magicWords['pagename'] = self.title
self.magicWords['fullpagename'] = self.title
self.magicWords['currentyear'] = time.strftime('%Y')
Expand All @@ -975,7 +983,7 @@ def extract(self, out, html_safe=True):

if self.to_json:
json_data = {
'id': self.id,
'id': self.id,
'revid': self.revid,
'url': self.url,
'title': self.title,
Expand All @@ -985,7 +993,8 @@ def extract(self, out, html_safe=True):
out.write(out_str)
out.write('\n')
else:
header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, self.url, self.title)
header = '<doc id="%s" url="%s" title="%s">\n' % (
self.id, self.url, self.title)
# Separate header from text with a newline.
header += self.title + '\n\n'
footer = "\n</doc>\n"
Expand Down Expand Up @@ -1119,7 +1128,8 @@ def templateParams(self, parameters):
if ']]' not in param: # if the value does not contain a link, trim whitespace
param = param.strip()
templateParams[str(unnamedParameterCounter)] = param
logging.debug(' templateParams> %s', '|'.join(templateParams.values()))
logging.debug(' templateParams> %s',
'|'.join(templateParams.values()))
return templateParams

def expandTemplate(self, body):
Expand Down Expand Up @@ -1198,7 +1208,8 @@ def expandTemplate(self, body):
colon = title.find(':')
if colon > 1:
funct = title[:colon]
parts[0] = title[colon + 1:].strip() # side-effect (parts[0] not used later)
# side-effect (parts[0] not used later)
parts[0] = title[colon + 1:].strip()
# arguments after first are not evaluated
ret = callParserFunction(funct, parts, self.frame)
return self.expandTemplates(ret)
Expand Down Expand Up @@ -1394,7 +1405,8 @@ def findMatchingBraces(text, ldelim=0):

if ldelim: # 2-3
reOpen = re.compile('[{]{%d,}' % ldelim) # at least ldelim
reNext = re.compile('[{]{2,}|}{2,}') # at least 2 open or close braces
# at least 2 open or close braces
reNext = re.compile('[{]{2,}|}{2,}')
else:
reOpen = re.compile('{{2,}|\[{2,}')
reNext = re.compile('{{2,}|}{2,}|\[{2,}|]{2,}') # at least 2
Expand Down Expand Up @@ -1473,7 +1485,8 @@ def findBalanced(text, openDelim, closeDelim):
"""
openPat = '|'.join([re.escape(x) for x in openDelim])
# pattern for delimiters expected after each opening delimiter
afterPat = {o: re.compile(openPat + '|' + c, re.DOTALL) for o, c in zip(openDelim, closeDelim)}
afterPat = {o: re.compile(openPat + '|' + c, re.DOTALL)
for o, c in zip(openDelim, closeDelim)}
stack = []
start = 0
cur = 0
Expand Down

0 comments on commit 942ad61

Please sign in to comment.