Skip to content

Commit

Permalink
Fix whitespace issues around wrapping
Browse files Browse the repository at this point in the history
This fixes various issues relating to how input whitespace is handled
and how wrapping handles whitespace resulting from hard line breaks.

This PR uses a branch based on that for matthewwithanm#120 to avoid conflicts with
the fixes and associated test changes there.  My suggestion is thus
first to merge matthewwithanm#120 (which fixes two open issues), then to merge the
remaining changes from this PR.

Wrapping paragraphs has the effect of losing all newlines including
those from `<br>` tags, contrary to HTML semantics (wrapping should be
a matter of pretty-printing the output; input whitespace from the HTML
input should be normalized, but `<br>` should remain as a hard line
break).  To fix this, we need to wrap the portions of a paragraph
between hard line breaks separately.  For this to work, ensure that
when wrapping, all input whitespace is normalized at an early stage,
including turning newlines into spaces.  (Only ASCII whitespace is
handled this way; `\s` is not used as it's not clear Unicode
whitespace should get such normalization.)

When not wrapping, there is still too much input whitespace
preservation.  If the input contains a blank line, that ends up as a
paragraph break in the output, or breaks the header formatting when
appearing in a header tag, though in terms of HTML semantics such a
blank line is no different from a space.  In the case of an ATX
header, even a single newline appearing in the output breaks the
Markdown.  Thus, when not wrapping, arrange for input whitespace
containing at least one `\r` or `\n` to be normalized to a single
newline, and in the ATX header case, normalize to a space.

Fixes matthewwithanm#130
(probably, not sure exactly what the HTML input there is)

Fixes matthewwithanm#88
(a related case, anyway; the actual input in matthewwithanm#88 has already been fixed)
  • Loading branch information
jsm28 committed Oct 3, 2024
1 parent 4399ee7 commit c2ffe46
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 9 deletions.
29 changes: 23 additions & 6 deletions markdownify/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
convert_heading_re = re.compile(r'convert_h(\d+)')
line_beginning_re = re.compile(r'^', re.MULTILINE)
whitespace_re = re.compile(r'[\t ]+')
all_whitespace_re = re.compile(r'[\s]+')
all_whitespace_re = re.compile(r'[\t \r\n]+')
newline_whitespace_re = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
html_heading_re = re.compile(r'h[1-6]')


Expand Down Expand Up @@ -168,7 +169,11 @@ def process_text(self, el):

# normalize whitespace if we're not inside a preformatted element
if not el.find_parent('pre'):
text = whitespace_re.sub(' ', text)
if self.options['wrap']:
text = all_whitespace_re.sub(' ', text)
else:
text = newline_whitespace_re.sub('\n', text)
text = whitespace_re.sub(' ', text)

# escape special characters if we're not inside a preformatted or code element
if not el.find_parent(['pre', 'code', 'kbd', 'samp']):
Expand Down Expand Up @@ -286,6 +291,7 @@ def convert_hn(self, n, el, text, convert_as_inline):
if style == UNDERLINED and n <= 2:
line = '=' if n == 1 else '-'
return self.underline(text, line)
text = all_whitespace_re.sub(' ', text)
hashes = '#' * n
if style == ATX_CLOSED:
return '\n%s %s %s\n\n' % (hashes, text, hashes)
Expand Down Expand Up @@ -351,10 +357,21 @@ def convert_p(self, el, text, convert_as_inline):
if convert_as_inline:
return text
if self.options['wrap']:
text = fill(text,
width=self.options['wrap_width'],
break_long_words=False,
break_on_hyphens=False)
# Preserve newlines (and preceding whitespace) resulting
# from <br> tags. Newlines in the input have already been
# replaced by spaces.
lines = text.split('\n')
new_lines = []
for line in lines:
line = line.lstrip()
line_no_trailing = line.rstrip()
trailing = line[len(line_no_trailing):]
line = fill(line,
width=self.options['wrap_width'],
break_long_words=False,
break_on_hyphens=False)
new_lines.append(line + trailing)
text = '\n'.join(new_lines)
return '\n\n%s\n\n' % text if text else ''

def convert_pre(self, el, text, convert_as_inline):
Expand Down
1 change: 1 addition & 0 deletions tests/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ def test_soup():

def test_whitespace():
assert md(' a b \t\t c ') == ' a b c '
assert md(' a b \n\n c ') == ' a b\nc '
15 changes: 13 additions & 2 deletions tests/test_conversions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, UNDERSCORE
from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE


def inline_tests(tag, markup):
Expand Down Expand Up @@ -113,6 +113,7 @@ def test_em():

def test_header_with_space():
assert md('<h3>\n\nHello</h3>') == '\n### Hello\n\n'
assert md('<h3>Hello\n\n\nWorld</h3>') == '\n### Hello World\n\n'
assert md('<h4>\n\nHello</h4>') == '\n#### Hello\n\n'
assert md('<h5>\n\nHello</h5>') == '\n##### Hello\n\n'
assert md('<h5>\n\nHello\n\n</h5>') == '\n##### Hello\n\n'
Expand Down Expand Up @@ -174,7 +175,7 @@ def test_hn_nested_img():
("alt='Alt Text' title='Optional title'", "Alt Text", " \"Optional title\""),
]
for image_attributes, markdown, title in image_attributes_to_markdown:
assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>') == '\n### A ' + markdown + ' B\n\n'
assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>') == '\n### A' + (' ' + markdown + ' ' if markdown else ' ') + 'B\n\n'
assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>', keep_inline_images_in=['h3']) == '\n### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n'


Expand Down Expand Up @@ -214,10 +215,20 @@ def test_kbd():
def test_p():
assert md('<p>hello</p>') == '\n\nhello\n\n'
assert md('<p>123456789 123456789</p>') == '\n\n123456789 123456789\n\n'
assert md('<p>123456789\n\n\n123456789</p>') == '\n\n123456789\n123456789\n\n'
assert md('<p>123456789\n\n\n123456789</p>', wrap=True, wrap_width=80) == '\n\n123456789 123456789\n\n'
assert md('<p>123456789 123456789</p>', wrap=True, wrap_width=10) == '\n\n123456789\n123456789\n\n'
assert md('<p><a href="https://example.com">Some long link</a></p>', wrap=True, wrap_width=10) == '\n\n[Some long\nlink](https://example.com)\n\n'
assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n'
assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n'
assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n12345 \n67890\n\n'
assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=50, newline_style=SPACES) == '\n\n12345 \n67890\n\n'
assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n'
assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n'
assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n12345678901 \n12345\n\n'
assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=50, newline_style=SPACES) == '\n\n12345678901 \n12345\n\n'
assert md('<p>1234 5678 9012<br />67890</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n1234 5678\n9012\\\n67890\n\n'
assert md('<p>1234 5678 9012<br />67890</p>', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n1234 5678\n9012 \n67890\n\n'
assert md('First<p>Second</p><p>Third</p>Fourth') == 'First\n\nSecond\n\nThird\n\nFourth'


Expand Down
2 changes: 1 addition & 1 deletion tests/test_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def test_table():
assert md(table) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
assert md(table_with_html_content) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| **Jill** | *Smith* | [50](#) |\n| Eve | Jackson | 94 |\n\n'
assert md(table_with_paragraphs) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
assert md(table_with_linebreaks) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith Jackson | 50 |\n| Eve | Jackson Smith | 94 |\n\n'
assert md(table_with_linebreaks) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith Jackson | 50 |\n| Eve | Jackson Smith | 94 |\n\n'
assert md(table_with_header_column) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
assert md(table_head_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
assert md(table_head_body_missing_head) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
Expand Down

0 comments on commit c2ffe46

Please sign in to comment.