From e397ec3376a31b45b06534ff18d22945c225edd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Vr=C5=A1ovsk=C3=BD?= Date: Fri, 12 Jan 2024 21:19:22 +0100 Subject: [PATCH] feat: better formatting in text mode --- wikiextractor/WikiExtractor.py | 2 +- wikiextractor/extract.py | 58 +++++++++++++++++++--------------- 2 files changed, 34 insertions(+), 26 deletions(-) diff --git a/wikiextractor/WikiExtractor.py b/wikiextractor/WikiExtractor.py index 830235d..ad8a331 100755 --- a/wikiextractor/WikiExtractor.py +++ b/wikiextractor/WikiExtractor.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8 -*- # ============================================================================= diff --git a/wikiextractor/extract.py b/wikiextractor/extract.py index a00e23d..fab0589 100644 --- a/wikiextractor/extract.py +++ b/wikiextractor/extract.py @@ -118,7 +118,7 @@ def clean(extractor, text, expand_templates=False, html_safe=True): text = bold_italic.sub(r'\1', text) text = bold.sub(r'\1', text) text = italic_quote.sub(r'"\1"', text) - text = italic.sub(r'"\1"', text) + text = italic.sub(r'\1', text) text = quote_quote.sub(r'"\1"', text) # residuals of unbalanced quotes text = text.replace("'''", '').replace("''", '"') @@ -201,7 +201,8 @@ def compact(text, mark_headers=False): if not line: if len(listLevel): # implies Extractor.HtmlFormatting for c in reversed(listLevel): - page.append(listClose[c]) + if Extractor.HtmlFormatting: + page.append(listClose[c]) listLevel = '' continue @@ -212,6 +213,8 @@ def compact(text, mark_headers=False): lev = len(m.group(1)) if Extractor.HtmlFormatting: page.append("%s" % (lev, title, lev)) + else: + page.append("\n%s" % title) if title and title[-1] not in '!?': title += '.' @@ -236,33 +239,37 @@ def compact(text, mark_headers=False): # handle lists # @see https://www.mediawiki.org/wiki/Help:Formatting elif line[0] in '*#;': - if Extractor.HtmlFormatting: - # close extra levels - l = 0 - for c in listLevel: - if l < len(line) and c != line[l]: - for extra in reversed(listLevel[l:]): + # close extra levels + l = 0 + for c in listLevel: + if l < len(line) and c != line[l]: + for extra in reversed(listLevel[l:]): + if Extractor.HtmlFormatting: page.append(listClose[extra]) - listLevel = listLevel[:l] - break - l += 1 - if l < len(line) and line[l] in '*#;:': - # add new level (only one, no jumps) - # FIXME: handle jumping levels - type = line[l] + listLevel = listLevel[:l] + break + l += 1 + if l < len(line) and line[l] in '*#;:': + # add new level (only one, no jumps) + # FIXME: handle jumping levels + type = line[l] + if Extractor.HtmlFormatting: page.append(listOpen[type]) - listLevel += type - line = line[l+1:].strip() - else: - # continue on same level + listLevel += type + line = line[l+1:].strip() + else: + # continue on same level + if l < len(line): type = line[l-1] - line = line[l:].strip() + line = line[l:].strip() + if Extractor.HtmlFormatting: page.append(listItem[type] % line) else: - continue + page.append("- %s" % line) elif len(listLevel): # implies Extractor.HtmlFormatting for c in reversed(listLevel): - page.append(listClose[c]) + if Extractor.HtmlFormatting: + page.append(listClose[c]) listLevel = [] # Drop residuals of lists @@ -617,7 +624,8 @@ def __setitem__(self, name, value): '__INDEX__', '__NOINDEX__', '__STATICREDIRECT__', - '__DISAMBIG__' + '__DISAMBIG__', + '__NOEDITSECTION__', ) @@ -668,7 +676,7 @@ def __setitem__(self, name, value): 'abbr', 'b', 'big', 'blockquote', 'center', 'cite', 'div', 'em', 'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd', 'nowiki', 'p', 'plaintext', 's', 'span', 'strike', 'strong', - 'sub', 'sup', 'tt', 'u', 'var' + 'sub', 'sup', 'tt', 'u', 'var', 'templatestyles', 'indicator', 'br', ) placeholder_tags = {'math': 'formula', 'code': 'codice'} @@ -913,7 +921,7 @@ class Extractor(): ## # Whether to preserve section titles - keepSections = True + keepSections = False ## # Whether to output text with HTML formatting elements in files.