try fix regexp

attardi · Jan 23, 2023 · 942ad61 · 942ad61
1 parent cf43b2b
commit 942ad61
Showing 1 changed file with 29 additions and 16 deletions.
diff --git a/wikiextractor/extract.py b/wikiextractor/extract.py
@@ -31,7 +31,8 @@
 
 # match tail after wikilink
 tailRE = re.compile('\w+')
-syntaxhighlight = re.compile('&lt;syntaxhighlight .*?&gt;(.*?)&lt;/syntaxhighlight&gt;', re.DOTALL)
+syntaxhighlight = re.compile(
+    '&lt;syntaxhighlight .*?&gt;(.*?)&lt;/syntaxhighlight&gt;', re.DOTALL)
 
 ## PARAMS ####################################################################
 
@@ -170,7 +171,8 @@ def clean(extractor, text, expand_templates=False, html_safe=True):
     text = dots.sub('...', text)
     text = re.sub(u' (,:\.\)\]»)', r'\1', text)
     text = re.sub(u'(\[\(«) ', r'\1', text)
-    text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U)  # lines with only punctuations
+    # drop lines that contain only punctuation
+    text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U)
     text = text.replace(',,', ',').replace(',.', '.')
     if html_safe:
         text = html.escape(text, quote=False)
@@ -220,7 +222,7 @@ def compact(text, mark_headers=False):
 
             headers[lev] = title
             # drop previous headers
-            headers = { k:v for k,v in headers.items() if k <= lev }
+            headers = {k: v for k, v in headers.items() if k <= lev}
             emptySection = True
             continue
         # Handle page title
@@ -380,7 +382,8 @@ def dropSpans(spans, text):
 # as well as U+3000 is IDEOGRAPHIC SPACE for bug 19052
 EXT_LINK_URL_CLASS = r'[^][<>"\x00-\x20\x7F\s]'
 ExtLinkBracketedRegex = re.compile(
-    '\[(((?i)' + '|'.join(wgUrlProtocols) + ')' + EXT_LINK_URL_CLASS + r'+)\s*([^\]\x00-\x08\x0a-\x1F]*?)\]',
+    (r'\[(((?i)' + '|'.join(wgUrlProtocols) + ')' +
+     EXT_LINK_URL_CLASS + r'+)\s*([^\]\x00-\x08\x0a-\x1F]*?)\]'),
     re.S | re.U)
 EXT_IMAGE_REGEX = re.compile(
     r"""^(http://|https://)([^][<>"\x00-\x20\x7F\s]+)
@@ -746,7 +749,8 @@ def fixup(m):
 
 
 def ignoreTag(tag):
-    left = re.compile(r'<%s\b.*?>' % tag, re.IGNORECASE | re.DOTALL)  # both <ref> and <reference>
+    left = re.compile(r'<%s\b.*?>' % tag, re.IGNORECASE |
+                      re.DOTALL)  # both <ref> and <reference>
     right = re.compile(r'</\s*%s>' % tag, re.IGNORECASE)
     ignored_tag_patterns.append((left, right))
 
@@ -792,6 +796,7 @@ def resetIgnoredTags():
 
 # ======================================================================
 
+
 class Template(list):
     """
     A Template is a list of TemplateText or TemplateArgs
@@ -806,11 +811,11 @@ def parse(cls, body):
         # {{#if:{{{{{#if:{{{nominee|}}}|nominee|candidate}}|}}}|
         #
         start = 0
-        for s,e in findMatchingBraces(body, 3):
+        for s, e in findMatchingBraces(body, 3):
             tpl.append(TemplateText(body[start:s]))
             tpl.append(TemplateArg(body[s+3:e-3]))
             start = e
-        tpl.append(TemplateText(body[start:])) # leftover
+        tpl.append(TemplateText(body[start:]))  # leftover
         return tpl
 
     def subst(self, params, extractor, depth=0):
@@ -852,6 +857,7 @@ class TemplateArg():
     parameter to a template.
     Has a name and a default value, both of which are Templates.
     """
+
     def __init__(self, parameter):
         """
         :param parameter: the parts of a tplarg.
@@ -890,10 +896,11 @@ def subst(self, params, extractor, depth):
         paramName = extractor.expandTemplates(paramName)
         res = ''
         if paramName in params:
-            res = params[paramName]  # use parameter value specified in template invocation
+            # use parameter value specified in template invocation
+            res = params[paramName]
         elif self.default:            # use the default value
             defaultValue = self.default.subst(params, extractor, depth+1)
-            res =  extractor.expandTemplates(defaultValue)
+            res = extractor.expandTemplates(defaultValue)
         #logging.debug('subst arg %d %s -> %s' % (depth, paramName, res))
         return res
 
@@ -949,7 +956,8 @@ def clean_text(self, text, mark_headers=False, expand_templates=True,
         :param mark_headers: True to distinguish headers from paragraphs
           e.g. "## Section 1"
         """
-        self.magicWords['namespace'] = self.title[:max(0, self.title.find(":"))]
+        self.magicWords['namespace'] = self.title[:max(
+            0, self.title.find(":"))]
         self.magicWords['pagename'] = self.title
         self.magicWords['fullpagename'] = self.title
         self.magicWords['currentyear'] = time.strftime('%Y')
@@ -975,7 +983,7 @@ def extract(self, out, html_safe=True):
 
         if self.to_json:
             json_data = {
-		'id': self.id,
+                'id': self.id,
                 'revid': self.revid,
                 'url': self.url,
                 'title': self.title,
@@ -985,7 +993,8 @@ def extract(self, out, html_safe=True):
             out.write(out_str)
             out.write('\n')
         else:
-            header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, self.url, self.title)
+            header = '<doc id="%s" url="%s" title="%s">\n' % (
+                self.id, self.url, self.title)
             # Separate header from text with a newline.
             header += self.title + '\n\n'
             footer = "\n</doc>\n"
@@ -1119,7 +1128,8 @@ def templateParams(self, parameters):
                 if ']]' not in param:  # if the value does not contain a link, trim whitespace
                     param = param.strip()
                 templateParams[str(unnamedParameterCounter)] = param
-        logging.debug('   templateParams> %s', '|'.join(templateParams.values()))
+        logging.debug('   templateParams> %s',
+                      '|'.join(templateParams.values()))
         return templateParams
 
     def expandTemplate(self, body):
@@ -1198,7 +1208,8 @@ def expandTemplate(self, body):
         colon = title.find(':')
         if colon > 1:
             funct = title[:colon]
-            parts[0] = title[colon + 1:].strip()  # side-effect (parts[0] not used later)
+            # side-effect (parts[0] not used later)
+            parts[0] = title[colon + 1:].strip()
             # arguments after first are not evaluated
             ret = callParserFunction(funct, parts, self.frame)
             return self.expandTemplates(ret)
@@ -1394,7 +1405,8 @@ def findMatchingBraces(text, ldelim=0):
 
     if ldelim:  # 2-3
         reOpen = re.compile('[{]{%d,}' % ldelim)  # at least ldelim
-        reNext = re.compile('[{]{2,}|}{2,}')  # at least 2 open or close bracces
+        # at least 2 open or close braces
+        reNext = re.compile('[{]{2,}|}{2,}')
     else:
         reOpen = re.compile('{{2,}|\[{2,}')
         reNext = re.compile('{{2,}|}{2,}|\[{2,}|]{2,}')  # at least 2
@@ -1473,7 +1485,8 @@ def findBalanced(text, openDelim, closeDelim):
     """
     openPat = '|'.join([re.escape(x) for x in openDelim])
     # pattern for delimiters expected after each opening delimiter
-    afterPat = {o: re.compile(openPat + '|' + c, re.DOTALL) for o, c in zip(openDelim, closeDelim)}
+    afterPat = {o: re.compile(openPat + '|' + c, re.DOTALL)
+                for o, c in zip(openDelim, closeDelim)}
     stack = []
     start = 0
     cur = 0