Update pre/tree/postprocessors

Python-Markdown · Sep 21, 2023 · f7b06d5 · f7b06d5
1 parent dfa6944
commit f7b06d5
Show file tree

Hide file tree

Showing 5 changed files with 73 additions and 49 deletions.
diff --git a/markdown/core.py b/markdown/core.py
@@ -283,11 +283,11 @@ def set_output_format(self, format: str) -> Markdown:
             raise
         return self
 
-    # Note: the `tag` argument is not type annotated as ElementTree uses many various objects as tags.
+    # Note: the `tag` argument is type annotated `Any` as ElementTree uses many different kinds of objects as tags.
     # As there is no standardization in ElementTree, the type of a given tag is unpredictable.
-    def is_block_level(self, tag) -> bool:
+    def is_block_level(self, tag: Any) -> bool:
         """
-        Check if the given tag is a block level HTML tag.
+        Check if the given `tag` is a block level HTML tag.
 
         Returns `True` for any string listed in `Markdown.block_level_elements`. A `tag` which is
         not a string always returns `False`.

diff --git a/markdown/postprocessors.py b/markdown/postprocessors.py
@@ -18,12 +18,11 @@
 # License: BSD (see LICENSE.md for details).
 
 """
-POST-PROCESSORS
-=============================================================================
 
-Markdown also allows post-processors, which are similar to preprocessors in
-that they need to implement a "run" method. However, they are run after core
-processing.
+Post-processors run on the text of the entire document after it has been serialized into a string.
+Postprocessors should be used to work with the text just before output. Usually, they are used to add
+back sections that were extracted in a preprocessor, fix up outgoing encodings, or wrap the whole
+document.
 
 """
 
@@ -34,7 +33,7 @@
 import re
 
 
-def build_postprocessors(md, **kwargs):
+def build_postprocessors(md: Markdown, **kwargs: Any) -> util.Registry:
     """ Build the default postprocessors for Markdown. """
     postprocessors = util.Registry()
     postprocessors.register(RawHtmlPostprocessor(md), 'raw_html', 30)
@@ -46,16 +45,16 @@ class Postprocessor(util.Processor):
     """
     Postprocessors are run after the ElementTree is converted back into text.
 
-    Each Postprocessor implements a "run" method that takes a pointer to a
+    Each Postprocessor implements a `run` method that takes a pointer to a
     text string, modifies it as necessary and returns a text string.
 
-    Postprocessors must extend markdown.Postprocessor.
+    Postprocessors must extend `Postprocessor`.
 
     """
 
-    def run(self, text):
+    def run(self, text: str) -> str:
         """
-        Subclasses of Postprocessor should implement a `run` method, which
+        Subclasses of `Postprocessor` should implement a `run` method, which
         takes the html document as a single text string and returns a
         (possibly modified) string.
 
@@ -68,7 +67,7 @@ class RawHtmlPostprocessor(Postprocessor):
 
     BLOCK_LEVEL_REGEX = re.compile(r'^\<\/?([^ >]+)')
 
-    def run(self, text):
+    def run(self, text: str):
         """ Iterate over html stash and restore html. """
         replacements = OrderedDict()
         for i in range(self.md.htmlStash.html_counter):
@@ -101,7 +100,8 @@ def substitute_match(m):
         else:
             return self.run(processed_text)
 
-    def isblocklevel(self, html):
+    def isblocklevel(self, html: str) -> bool:
+        """ Check if a block of HTML is block-level. """
         m = self.BLOCK_LEVEL_REGEX.match(html)
         if m:
             if m.group(1)[0] in ('!', '?', '@', '%'):
@@ -110,7 +110,7 @@ def isblocklevel(self, html):
             return self.md.is_block_level(m.group(1))
         return False
 
-    def stash_to_string(self, text):
+    def stash_to_string(self, text: str) -> str:
         """ Convert a stashed object to a string. """
         return str(text)
 
@@ -128,7 +128,14 @@ def run(self, text):
     "use 'treeprocessors.UnescapeTreeprocessor' instead."
 )
 class UnescapePostprocessor(Postprocessor):
-    """ Restore escaped chars """
+    """
+    Restore escaped chars.
+
+    !!! warning "Deprecated"
+
+        This class is deprecated and will be removed in the future; use
+        [`UnescapeTreeprocessor`][markdown.treeprocessors.UnescapeTreeprocessor] instead.
+    """
 
     RE = re.compile(r'{}(\d+){}'.format(util.STX, util.ETX))
 

diff --git a/markdown/preprocessors.py b/markdown/preprocessors.py
@@ -18,11 +18,9 @@
 # License: BSD (see LICENSE.md for details).
 
 """
-PRE-PROCESSORS
-=============================================================================
-
-Preprocessors work on source text before we start doing anything too
-complicated.
+Preprocessors work on source text before it is broken down into its individual parts.
+This is an excellent place to clean up bad characters or to extract portions for later
+processing that the parser may otherwise choke on.
 """
 
 from __future__ import annotations
@@ -32,8 +30,8 @@
 import re
 
 
-def build_preprocessors(md, **kwargs):
-    """ Build the default set of preprocessors used by Markdown. """
+def build_preprocessors(md: Markdown, **kwargs: Any) -> util.Registry:
+    """ Build and return the default set of preprocessors used by Markdown. """
     preprocessors = util.Registry()
     preprocessors.register(NormalizeWhitespace(md), 'normalize_whitespace', 30)
     preprocessors.register(HtmlBlockPreprocessor(md), 'html_block', 20)
@@ -44,16 +42,16 @@ class Preprocessor(util.Processor):
     """
     Preprocessors are run after the text is broken into lines.
 
-    Each preprocessor implements a "run" method that takes a pointer to a
+    Each preprocessor implements a `run` method that takes a pointer to a
     list of lines of the document, modifies it as necessary and returns
     either the same pointer or a pointer to a new list.
 
-    Preprocessors must extend markdown.Preprocessor.
+    Preprocessors must extend `Preprocessor`.
 
     """
-    def run(self, lines):
+    def run(self, lines: list[str]) -> list[str]:
         """
-        Each subclass of Preprocessor should override the `run` method, which
+        Each subclass of `Preprocessor` should override the `run` method, which
         takes the document as a list of strings split by newlines and returns
         the (possibly modified) list of lines.
 
@@ -64,7 +62,7 @@ def run(self, lines):
 class NormalizeWhitespace(Preprocessor):
     """ Normalize whitespace for consistent parsing. """
 
-    def run(self, lines):
+    def run(self, lines: list[str]) -> list[str]:
         source = '\n'.join(lines)
         source = source.replace(util.STX, "").replace(util.ETX, "")
         source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
@@ -74,9 +72,13 @@ def run(self, lines):
 
 
 class HtmlBlockPreprocessor(Preprocessor):
-    """Remove html blocks from the text and store them for later retrieval."""
+    """
+    Remove html blocks from the text and store them for later retrieval.
+
+    The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the [`Markdown`][markdown.Markdown] instance.
+    """
 
-    def run(self, lines):
+    def run(self, lines: list[str]) -> list[str]:
         source = '\n'.join(lines)
         parser = HTMLExtractor(self.md)
         parser.feed(source)

diff --git a/markdown/treeprocessors.py b/markdown/treeprocessors.py
@@ -17,6 +17,13 @@
 
 # License: BSD (see LICENSE.md for details).
 
+"""
+Tree processors manipulate the tree created by block processors. They can even create an entirely
+new `ElementTree` object. This is an excellent place for creating summaries, adding collected
+references, or last minute adjustments.
+
+"""
+
 from __future__ import annotations
 
 import re
@@ -26,7 +33,7 @@
 from . import inlinepatterns
 
 
-def build_treeprocessors(md, **kwargs):
+def build_treeprocessors(md: Markdown, **kwargs: Any) -> util.Registry:
     """ Build the default `treeprocessors` for Markdown. """
     treeprocessors = util.Registry()
     treeprocessors.register(InlineProcessor(md), 'inline', 20)
@@ -35,8 +42,8 @@ def build_treeprocessors(md, **kwargs):
     return treeprocessors
 
 
-def isString(s):
-    """ Check if it's string """
+def isString(s: Any) -> bool:
+    """ Return `True` if object is a string but not an [`AtomicString`][markdown.util.AtomicString]. """
     if not isinstance(s, util.AtomicString):
         return isinstance(s, str)
     return False
@@ -47,17 +54,16 @@ class Treeprocessor(util.Processor):
     `Treeprocessor`s are run on the `ElementTree` object before serialization.
 
     Each `Treeprocessor` implements a `run` method that takes a pointer to an
-    `ElementTree`, modifies it as necessary and returns an `ElementTree`
-    object.
+    `Element` and modifies it as necessary.
 
     `Treeprocessors` must extend `markdown.Treeprocessor`.
 
     """
-    def run(self, root):
+    def run(self, root: etree.Element) -> etree.Element | None:
         """
         Subclasses of `Treeprocessor` should implement a `run` method, which
-        takes a root `ElementTree`. This method can return another `ElementTree`
-        object, and the existing root `ElementTree `will be replaced, or it can
+        takes a root `Element`. This method can return another `Element`
+        object, and the existing root `Element` will be replaced, or it can
         modify the current tree and return `None`.
         """
         pass  # pragma: no cover
@@ -320,18 +326,18 @@ def __build_ancestors(self, parent, parents):
         ancestors.reverse()
         parents.extend(ancestors)
 
-    def run(self, tree: etree.ElementTree, ancestors: Sequence[str] | None = None) -> etree.ElementTree:
+    def run(self, tree: etree.Element, ancestors: Sequence[str] | None = None) -> etree.Element:
         """Apply inline patterns to a parsed Markdown tree.
 
-        Iterate over `ElementTree`, find elements with inline tag, apply inline
-        patterns and append newly created Elements to tree.  If you don't
-        want to process your data with inline patterns, instead of normal
-        string, use subclass `AtomicString`:
+        Iterate over `Element`, find elements with inline tag, apply inline
+        patterns and append newly created Elements to tree.  To avoid further
+        processing of string with inline patterns, instead of normal string,
+        use subclass [`AtomicString`][markdown.util.AtomicString]:
 
-            node.text = markdown.AtomicString("This will not be processed.")
+            node.text = markdown.util.AtomicString("This will not be processed.")
 
         Arguments:
-            tree: `ElementTree` object, representing Markdown tree.
+            tree: `Element` object, representing Markdown tree.
             ancestors: List of parent tag names that precede the tree node (if needed).
 
         Returns:
@@ -409,8 +415,8 @@ def _prettifyETree(self, elem):
         if not elem.tail or not elem.tail.strip():
             elem.tail = i
 
-    def run(self, root):
-        """ Add line breaks to `ElementTree` root object. """
+    def run(self, root: etree.Element) -> None:
+        """ Add line breaks to `Element` object and its children. """
 
         self._prettifyETree(root)
         # Do `<br />`'s separately as they are often in the middle of

diff --git a/markdown/util.py b/markdown/util.py
@@ -169,7 +169,16 @@ class AtomicString(str):
 
 
 class Processor:
-    def __init__(self, md=None):
+    """ The base class for all processors.
+
+    Attributes:
+        Processor.md: The `Markdown` instance passed in at initialization.
+
+    Arguments:
+        md: The `Markdown` instance this processor is a part of.
+
+    """
+    def __init__(self, md: Markdown=None):
         self.md = md