Skip to content

Commit

Permalink
Update pre/tree/postprocessors
Browse files Browse the repository at this point in the history
  • Loading branch information
waylan committed Sep 21, 2023
1 parent dfa6944 commit f7b06d5
Show file tree
Hide file tree
Showing 5 changed files with 73 additions and 49 deletions.
6 changes: 3 additions & 3 deletions markdown/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,11 +283,11 @@ def set_output_format(self, format: str) -> Markdown:
raise
return self

# Note: the `tag` argument is not type annotated as ElementTree uses many various objects as tags.
# Note: the `tag` argument is type annotated `Any` as ElementTree uses many different kinds of objects as tags.
# As there is no standardization in ElementTree, the type of a given tag is unpredictable.
def is_block_level(self, tag) -> bool:
def is_block_level(self, tag: Any) -> bool:
"""
Check if the given tag is a block level HTML tag.
Check if the given `tag` is a block level HTML tag.
Returns `True` for any string listed in `Markdown.block_level_elements`. A `tag` which is
not a string always returns `False`.
Expand Down
35 changes: 21 additions & 14 deletions markdown/postprocessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,11 @@
# License: BSD (see LICENSE.md for details).

"""
POST-PROCESSORS
=============================================================================
Markdown also allows post-processors, which are similar to preprocessors in
that they need to implement a "run" method. However, they are run after core
processing.
Post-processors run on the text of the entire document after it has been serialized into a string.
Postprocessors should be used to work with the text just before output. Usually, they are used to add
back sections that were extracted in a preprocessor, fix up outgoing encodings, or wrap the whole
document.
"""

Expand All @@ -34,7 +33,7 @@
import re


def build_postprocessors(md, **kwargs):
def build_postprocessors(md: Markdown, **kwargs: Any) -> util.Registry:
""" Build the default postprocessors for Markdown. """
postprocessors = util.Registry()
postprocessors.register(RawHtmlPostprocessor(md), 'raw_html', 30)
Expand All @@ -46,16 +45,16 @@ class Postprocessor(util.Processor):
"""
Postprocessors are run after the ElementTree is converted back into text.
Each Postprocessor implements a "run" method that takes a pointer to a
Each Postprocessor implements a `run` method that takes a pointer to a
text string, modifies it as necessary and returns a text string.
Postprocessors must extend markdown.Postprocessor.
Postprocessors must extend `Postprocessor`.
"""

def run(self, text):
def run(self, text: str) -> str:
"""
Subclasses of Postprocessor should implement a `run` method, which
Subclasses of `Postprocessor` should implement a `run` method, which
takes the html document as a single text string and returns a
(possibly modified) string.
Expand All @@ -68,7 +67,7 @@ class RawHtmlPostprocessor(Postprocessor):

BLOCK_LEVEL_REGEX = re.compile(r'^\<\/?([^ >]+)')

def run(self, text):
def run(self, text: str):
""" Iterate over html stash and restore html. """
replacements = OrderedDict()
for i in range(self.md.htmlStash.html_counter):
Expand Down Expand Up @@ -101,7 +100,8 @@ def substitute_match(m):
else:
return self.run(processed_text)

def isblocklevel(self, html):
def isblocklevel(self, html: str) -> bool:
""" Check if a block of HTML is block-level. """
m = self.BLOCK_LEVEL_REGEX.match(html)
if m:
if m.group(1)[0] in ('!', '?', '@', '%'):
Expand All @@ -110,7 +110,7 @@ def isblocklevel(self, html):
return self.md.is_block_level(m.group(1))
return False

def stash_to_string(self, text):
def stash_to_string(self, text: str) -> str:
""" Convert a stashed object to a string. """
return str(text)

Expand All @@ -128,7 +128,14 @@ def run(self, text):
"use 'treeprocessors.UnescapeTreeprocessor' instead."
)
class UnescapePostprocessor(Postprocessor):
""" Restore escaped chars """
"""
Restore escaped chars.
!!! warning "Deprecated"
This class is deprecated and will be removed in the future; use
[`UnescapeTreeprocessor`][markdown.treeprocessors.UnescapeTreeprocessor] instead.
"""

RE = re.compile(r'{}(\d+){}'.format(util.STX, util.ETX))

Expand Down
30 changes: 16 additions & 14 deletions markdown/preprocessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,9 @@
# License: BSD (see LICENSE.md for details).

"""
PRE-PROCESSORS
=============================================================================
Preprocessors work on source text before we start doing anything too
complicated.
Preprocessors work on source text before it is broken down into its individual parts.
This is an excellent place to clean up bad characters or to extract portions for later
processing that the parser may otherwise choke on.
"""

from __future__ import annotations
Expand All @@ -32,8 +30,8 @@
import re


def build_preprocessors(md, **kwargs):
""" Build the default set of preprocessors used by Markdown. """
def build_preprocessors(md: Markdown, **kwargs: Any) -> util.Registry:
""" Build and return the default set of preprocessors used by Markdown. """
preprocessors = util.Registry()
preprocessors.register(NormalizeWhitespace(md), 'normalize_whitespace', 30)
preprocessors.register(HtmlBlockPreprocessor(md), 'html_block', 20)
Expand All @@ -44,16 +42,16 @@ class Preprocessor(util.Processor):
"""
Preprocessors are run after the text is broken into lines.
Each preprocessor implements a "run" method that takes a pointer to a
Each preprocessor implements a `run` method that takes a pointer to a
list of lines of the document, modifies it as necessary and returns
either the same pointer or a pointer to a new list.
Preprocessors must extend markdown.Preprocessor.
Preprocessors must extend `Preprocessor`.
"""
def run(self, lines):
def run(self, lines: list[str]) -> list[str]:
"""
Each subclass of Preprocessor should override the `run` method, which
Each subclass of `Preprocessor` should override the `run` method, which
takes the document as a list of strings split by newlines and returns
the (possibly modified) list of lines.
Expand All @@ -64,7 +62,7 @@ def run(self, lines):
class NormalizeWhitespace(Preprocessor):
""" Normalize whitespace for consistent parsing. """

def run(self, lines):
def run(self, lines: list[str]) -> list[str]:
source = '\n'.join(lines)
source = source.replace(util.STX, "").replace(util.ETX, "")
source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
Expand All @@ -74,9 +72,13 @@ def run(self, lines):


class HtmlBlockPreprocessor(Preprocessor):
"""Remove html blocks from the text and store them for later retrieval."""
"""
Remove html blocks from the text and store them for later retrieval.
The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the [`Markdown`][markdown.Markdown] instance.
"""

def run(self, lines):
def run(self, lines: list[str]) -> list[str]:
source = '\n'.join(lines)
parser = HTMLExtractor(self.md)
parser.feed(source)
Expand Down
40 changes: 23 additions & 17 deletions markdown/treeprocessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,13 @@

# License: BSD (see LICENSE.md for details).

"""
Tree processors manipulate the tree created by block processors. They can even create an entirely
new `ElementTree` object. This is an excellent place for creating summaries, adding collected
references, or last minute adjustments.
"""

from __future__ import annotations

import re
Expand All @@ -26,7 +33,7 @@
from . import inlinepatterns


def build_treeprocessors(md, **kwargs):
def build_treeprocessors(md: Markdown, **kwargs: Any) -> util.Registry:
""" Build the default `treeprocessors` for Markdown. """
treeprocessors = util.Registry()
treeprocessors.register(InlineProcessor(md), 'inline', 20)
Expand All @@ -35,8 +42,8 @@ def build_treeprocessors(md, **kwargs):
return treeprocessors


def isString(s):
""" Check if it's string """
def isString(s: Any) -> bool:
""" Return `True` if object is a string but not an [`AtomicString`][markdown.util.AtomicString]. """
if not isinstance(s, util.AtomicString):
return isinstance(s, str)
return False
Expand All @@ -47,17 +54,16 @@ class Treeprocessor(util.Processor):
`Treeprocessor`s are run on the `ElementTree` object before serialization.
Each `Treeprocessor` implements a `run` method that takes a pointer to an
`ElementTree`, modifies it as necessary and returns an `ElementTree`
object.
`Element` and modifies it as necessary.
`Treeprocessors` must extend `markdown.Treeprocessor`.
"""
def run(self, root):
def run(self, root: etree.Element) -> etree.Element | None:
"""
Subclasses of `Treeprocessor` should implement a `run` method, which
takes a root `ElementTree`. This method can return another `ElementTree`
object, and the existing root `ElementTree `will be replaced, or it can
takes a root `Element`. This method can return another `Element`
object, and the existing root `Element` will be replaced, or it can
modify the current tree and return `None`.
"""
pass # pragma: no cover
Expand Down Expand Up @@ -320,18 +326,18 @@ def __build_ancestors(self, parent, parents):
ancestors.reverse()
parents.extend(ancestors)

def run(self, tree: etree.ElementTree, ancestors: Sequence[str] | None = None) -> etree.ElementTree:
def run(self, tree: etree.Element, ancestors: Sequence[str] | None = None) -> etree.Element:
"""Apply inline patterns to a parsed Markdown tree.
Iterate over `ElementTree`, find elements with inline tag, apply inline
patterns and append newly created Elements to tree. If you don't
want to process your data with inline patterns, instead of normal
string, use subclass `AtomicString`:
Iterate over `Element`, find elements with inline tag, apply inline
patterns and append newly created Elements to tree. To avoid further
patterns and append newly created Elements to tree. To prevent a string from being
processed with inline patterns, use the [`AtomicString`][markdown.util.AtomicString] subclass instead of a normal string:
node.text = markdown.AtomicString("This will not be processed.")
node.text = markdown.util.AtomicString("This will not be processed.")
Arguments:
tree: `ElementTree` object, representing Markdown tree.
tree: `Element` object, representing Markdown tree.
ancestors: List of parent tag names that precede the tree node (if needed).
Returns:
Expand Down Expand Up @@ -409,8 +415,8 @@ def _prettifyETree(self, elem):
if not elem.tail or not elem.tail.strip():
elem.tail = i

def run(self, root):
""" Add line breaks to `ElementTree` root object. """
def run(self, root: etree.Element) -> None:
""" Add line breaks to `Element` object and its children. """

self._prettifyETree(root)
# Do `<br />`'s separately as they are often in the middle of
Expand Down
11 changes: 10 additions & 1 deletion markdown/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,16 @@ class AtomicString(str):


class Processor:
def __init__(self, md=None):
""" The base class for all processors.
Attributes:
Processor.md: The `Markdown` instance passed in at initialization.
Arguments:
md: The `Markdown` instance this processor is a part of.
"""
def __init__(self, md: Markdown=None):
self.md = md


Expand Down

0 comments on commit f7b06d5

Please sign in to comment.