Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better formatting in text mode #327

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion wikiextractor/WikiExtractor.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# =============================================================================
Expand Down
58 changes: 33 additions & 25 deletions wikiextractor/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def clean(extractor, text, expand_templates=False, html_safe=True):
text = bold_italic.sub(r'\1', text)
text = bold.sub(r'\1', text)
text = italic_quote.sub(r'"\1"', text)
text = italic.sub(r'"\1"', text)
text = italic.sub(r'\1', text)
text = quote_quote.sub(r'"\1"', text)
# residuals of unbalanced quotes
text = text.replace("'''", '').replace("''", '"')
Expand Down Expand Up @@ -201,7 +201,8 @@ def compact(text, mark_headers=False):
if not line:
if len(listLevel): # may be non-empty in text mode too; HTML closers are guarded below
for c in reversed(listLevel):
page.append(listClose[c])
if Extractor.HtmlFormatting:
page.append(listClose[c])
listLevel = ''
continue

Expand All @@ -212,6 +213,8 @@ def compact(text, mark_headers=False):
lev = len(m.group(1))
if Extractor.HtmlFormatting:
page.append("<h%d>%s</h%d>" % (lev, title, lev))
else:
page.append("\n%s" % title)
if title and title[-1] not in '!?':
title += '.'

Expand All @@ -236,33 +239,37 @@ def compact(text, mark_headers=False):
# handle lists
# @see https://www.mediawiki.org/wiki/Help:Formatting
elif line[0] in '*#;':
if Extractor.HtmlFormatting:
# close extra levels
l = 0
for c in listLevel:
if l < len(line) and c != line[l]:
for extra in reversed(listLevel[l:]):
# close extra levels
l = 0
for c in listLevel:
if l < len(line) and c != line[l]:
for extra in reversed(listLevel[l:]):
if Extractor.HtmlFormatting:
page.append(listClose[extra])
listLevel = listLevel[:l]
break
l += 1
if l < len(line) and line[l] in '*#;:':
# add new level (only one, no jumps)
# FIXME: handle jumping levels
type = line[l]
listLevel = listLevel[:l]
break
l += 1
if l < len(line) and line[l] in '*#;:':
# add new level (only one, no jumps)
# FIXME: handle jumping levels
type = line[l]
if Extractor.HtmlFormatting:
page.append(listOpen[type])
listLevel += type
line = line[l+1:].strip()
else:
# continue on same level
listLevel += type
line = line[l+1:].strip()
else:
# continue on same level
if l < len(line):
type = line[l-1]
line = line[l:].strip()
line = line[l:].strip()
if Extractor.HtmlFormatting:
page.append(listItem[type] % line)
else:
continue
page.append("- %s" % line)
elif len(listLevel): # may be non-empty in text mode too; HTML closers are guarded below
for c in reversed(listLevel):
page.append(listClose[c])
if Extractor.HtmlFormatting:
page.append(listClose[c])
listLevel = []

# Drop residuals of lists
Expand Down Expand Up @@ -617,7 +624,8 @@ def __setitem__(self, name, value):
'__INDEX__',
'__NOINDEX__',
'__STATICREDIRECT__',
'__DISAMBIG__'
'__DISAMBIG__',
'__NOEDITSECTION__',
)


Expand Down Expand Up @@ -668,7 +676,7 @@ def __setitem__(self, name, value):
'abbr', 'b', 'big', 'blockquote', 'center', 'cite', 'div', 'em',
'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd', 'nowiki',
'p', 'plaintext', 's', 'span', 'strike', 'strong',
'sub', 'sup', 'tt', 'u', 'var'
'sub', 'sup', 'tt', 'u', 'var', 'templatestyles', 'indicator', 'br',
)

placeholder_tags = {'math': 'formula', 'code': 'codice'}
Expand Down Expand Up @@ -913,7 +921,7 @@ class Extractor():

##
# Whether to preserve section titles
keepSections = True
keepSections = False

##
# Whether to output text with HTML formatting elements in <doc> files.
Expand Down