Add several open-source text extraction libraries #293

Draft: wants to merge 10 commits into main
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -55,6 +55,9 @@ processing = [
# "readability-lxml @ git+https://github.com/huggingface/python-readability.git@speedup",
"tldextract",
"trafilatura>=1.8.0,<1.12.0",
"justext",
"resiliparse",
"readabilipy",
"tokenizers",
"ftfy",
"fasteners",
5 changes: 5 additions & 0 deletions src/datatrove/pipeline/extractors/__init__.py
@@ -1,2 +1,7 @@
from .inscriptis import Inscriptis
from .justext import Justext
from .modular import ReadabilityInscriptis
from .readabilipy import ReadabiliPy
from .readability import Readability
from .resiliparse import Resiliparse
from .trafilatura import Trafilatura
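With these exports in place, all of the new extractors are available from one module and share the same `extract(html) -> str` interface defined by `BaseExtractor`; a minimal sketch (assuming the optional dependencies added to pyproject.toml above are installed):

from datatrove.pipeline.extractors import Inscriptis, Justext, ReadabiliPy, Resiliparse

# Readability is omitted here because it requires a postprocessor argument (see below)
for extractor in (Inscriptis(), Justext(), ReadabiliPy(), Resiliparse()):
    print(extractor.name)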
63 changes: 63 additions & 0 deletions src/datatrove/pipeline/extractors/inscriptis.py
@@ -0,0 +1,63 @@
import re

from .base import BaseExtractor


class Inscriptis(BaseExtractor):
    """Inscriptis extractor. It uses https://github.com/weblyzard/inscriptis

    We're using the main entry point of inscriptis: the `get_text` function.
    No specific data structure is exchanged with Inscriptis, only the HTML is passed and the extracted text is returned.

    Args:
        timeout: the timeout for extraction, per document, in seconds
        max_new_lines: the maximum number of consecutive newlines to keep in the extracted text
        deduplicate_captions: whether to remove duplicate captions
        display_links: whether to display link targets
        display_anchors: whether to display anchor texts
        **kwargs: any other option will be passed to inscriptis
    """

    name = "⛏ Inscriptis"
    _requires_dependencies = ["inscriptis"]

    def __init__(
        self,
        timeout: float = 0.1,
        max_new_lines: int = 2,
        deduplicate_captions: bool = True,
        display_links: bool = False,
        display_anchors: bool = True,
        **kwargs,
    ):
        super().__init__(timeout)
        self.new_line_chars = "\n" * max_new_lines
        self.deduplicate_captions = deduplicate_captions
        self.display_links = display_links
        self.display_anchors = display_anchors
        self.kwargs = kwargs
        # matches runs of more than `max_new_lines` consecutive newlines
        self.regex_excessive_lines = re.compile(r"(" + self.new_line_chars + "\n+)")

    def extract(self, text: str) -> str:
        """
        Args:
            text: str: html content

        Returns: plaintext extracted text
        """
        from inscriptis import get_text
        from inscriptis.css_profiles import CSS_PROFILES
        from inscriptis.model.config import ParserConfig

        text = get_text(
            html_content=text,
            config=ParserConfig(
                css=CSS_PROFILES["strict"],
                deduplicate_captions=self.deduplicate_captions,
                display_links=self.display_links,
                display_anchors=self.display_anchors,
                **self.kwargs,
            ),
        )

        # remove excessive empty lines
        return self.regex_excessive_lines.sub(self.new_line_chars, text).strip()
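For a quick standalone check, the extractor can be called directly on an HTML string; a minimal sketch (the sample HTML is made up, and calling `extract` directly bypasses the pipeline's timeout machinery):

from datatrove.pipeline.extractors import Inscriptis

extractor = Inscriptis(max_new_lines=2)
html = "<html><body><h1>Title</h1><p>Some paragraph text.</p></body></html>"
print(extractor.extract(html))  # plain text with at most 2 consecutive newlines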
87 changes: 87 additions & 0 deletions src/datatrove/pipeline/extractors/justext.py
@@ -0,0 +1,87 @@
from .base import BaseExtractor


class Justext(BaseExtractor):
    """Justext extractor. It uses https://github.com/miso-belica/jusText

    We're using the main entry point of justext: the `justext` function.
    No specific data structure is exchanged with Justext, only the HTML is passed and the extracted text is returned.

    Args:
        stoplist: the list of stopwords used for classification. Defaults to justext's English stoplist
        length_low: the character count below which a paragraph is considered short
        length_high: the character count above which a paragraph can be classified as good
        stopwords_low: the stopword density below which a paragraph is considered low on stopwords
        stopwords_high: the stopword density above which a paragraph can be classified as good
        max_link_density: the maximum proportion of link characters allowed in a good paragraph
        max_heading_distance: the maximum distance (in characters) from a good paragraph within which short headings are still kept
        no_headings: whether to disable the special handling of headings
        remove_boilerplate: whether to drop paragraphs classified as boilerplate from the extracted text
        timeout: the timeout for extraction, per document, in seconds
        **kwargs: any other option will be passed to justext
    """

    name = "⛏ Justext"
    _requires_dependencies = ["justext"]

    def __init__(
        self,
        stoplist: list[str] | None = None,
        length_low: int = 70,
        length_high: int = 200,
        stopwords_low: float = 0.3,
        stopwords_high: float = 0.32,
        max_link_density: float = 0.2,
        max_heading_distance: int = 200,
        no_headings: bool = False,
        remove_boilerplate: bool = True,
        timeout: float = 0.1,
        **kwargs,
    ):
        super().__init__(timeout)
        if stoplist is None:
            stoplist = self.get_stoplist(lang="english")
        self.stoplist = frozenset(stoplist)
        self.length_low = length_low
        self.length_high = length_high
        self.stopwords_low = stopwords_low
        self.stopwords_high = stopwords_high
        self.max_link_density = max_link_density
        self.max_heading_distance = max_heading_distance
        self.no_headings = no_headings
        self.remove_boilerplate = remove_boilerplate
        self.kwargs = kwargs

    @staticmethod
    def get_stoplist(lang: str = "english") -> list[str]:
        from justext import get_stoplist

        return get_stoplist(lang)

    def extract(self, text: str) -> str:
        """
        Args:
            text: str: html content

        Returns: plaintext extracted text
        """
        from justext import justext

        paragraphs = justext(
            text,
            stoplist=self.stoplist,
            length_low=self.length_low,
            length_high=self.length_high,
            stopwords_low=self.stopwords_low,
            stopwords_high=self.stopwords_high,
            max_link_density=self.max_link_density,
            max_heading_distance=self.max_heading_distance,
            no_headings=self.no_headings,
            **self.kwargs,
        )

        # join the text blocks with double newlines, optionally dropping boilerplate blocks
        if self.remove_boilerplate:
            return "\n\n".join(p.text for p in paragraphs if not p.is_boilerplate)
        return "\n\n".join(p.text for p in paragraphs)
57 changes: 57 additions & 0 deletions src/datatrove/pipeline/extractors/readabilipy.py
@@ -0,0 +1,57 @@
from .base import BaseExtractor


class ReadabiliPy(BaseExtractor):
    """ReadabiliPy extractor. It uses https://github.com/alan-turing-institute/ReadabiliPy

    We're using the main entry point of ReadabiliPy: the `simple_json_from_html_string` function.
    The extracted content is returned as plain text.

    Args:
        timeout: the timeout for extraction, per document, in seconds
        use_readability: whether to use Mozilla's Readability.js (requires Node.js)
        content_digests: whether to include content digests in the output
        node_indexes: whether to include node indexes in the output
        **kwargs: any other option will be passed to ReadabiliPy
    """

    name = "⛏ ReadabiliPy"
    _requires_dependencies = ["readabilipy"]

    def __init__(
        self,
        timeout: float = 0.1,
        use_readability: bool = False,
        content_digests: bool = False,
        node_indexes: bool = False,
        **kwargs,
    ):
        super().__init__(timeout)
        self.use_readability = use_readability
        self.content_digests = content_digests
        self.node_indexes = node_indexes
        self.kwargs = kwargs

    def extract(self, text: str) -> str:
        """
        Args:
            text: str: html content

        Returns: plaintext extracted text
        """
        from readabilipy import simple_json_from_html_string

        result = simple_json_from_html_string(
            text,
            use_readability=self.use_readability,
            content_digests=self.content_digests,
            node_indexes=self.node_indexes,
            **self.kwargs,
        )

        content = result.get("plain_text", "")

        # "plain_text" is a list of text blocks; join them with blank lines in between
        if isinstance(content, list):
            content = "\n\n".join(block["text"] for block in content)

        return content
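A quick standalone check (a sketch; the sample HTML is made up, and with use_readability=True the machine would also need Node.js):

from datatrove.pipeline.extractors import ReadabiliPy

extractor = ReadabiliPy(use_readability=False)
html = "<html><body><article><p>Main article text.</p></article></body></html>"
print(extractor.extract(html))  # "plain_text" blocks joined by blank lines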
59 changes: 59 additions & 0 deletions src/datatrove/pipeline/extractors/readability.py
@@ -0,0 +1,59 @@
from .base import BaseExtractor


class Readability(BaseExtractor):
    """Readability extractor. It uses https://github.com/buriy/python-readability

    We're using the main entry point of readability-lxml: the `Document` class, which cleans up the HTML and outputs a
    cleaned HTML string.

    The postprocessor (another Datatrove extractor) is used to convert the cleaned HTML to plain text.

    Args:
        postprocessor: the Datatrove extractor used to convert the cleaned HTML to plain text
        timeout: the timeout for extraction, per document, in seconds
        min_text_length: the minimum length of text to consider
        retry_length: number of chars to use when searching for body
        url: the URL of the page (optional, used for better parsing)
        **kwargs: any other option will be passed to readability
    """

    name = "⛏ Readability"
    _requires_dependencies = ["readability"]

    def __init__(
        self,
        postprocessor: BaseExtractor,
        timeout: float = 0.1,
        min_text_length: int = 25,
        retry_length: int = 250,
        url: str | None = None,
        **kwargs,
    ):
        super().__init__(timeout)
        self.postprocessor = postprocessor
        self.min_text_length = min_text_length
        self.retry_length = retry_length
        self.url = url
        self.kwargs = kwargs

    def extract(self, text: str) -> str:
        """
        Args:
            text: str: html content

        Returns: plaintext extracted text
        """
        from readability import Document

        doc = Document(
            text,
            min_text_length=self.min_text_length,
            retry_length=self.retry_length,
            url=self.url,
            **self.kwargs,
        )

        # readability only cleans the HTML; delegate HTML-to-text conversion to the postprocessor
        cleaned_html = doc.summary()

        return self.postprocessor.extract(cleaned_html)
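Because Readability only returns cleaned HTML, it always needs a postprocessor; a minimal sketch wiring it to the Inscriptis extractor from this same PR (the sample HTML is made up):

from datatrove.pipeline.extractors import Inscriptis, Readability

extractor = Readability(postprocessor=Inscriptis())
html = "<html><body><div><p>Article body long enough to pass min_text_length.</p></div></body></html>"
print(extractor.extract(html))  # doc.summary() output converted to plain text by Inscriptis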
74 changes: 74 additions & 0 deletions src/datatrove/pipeline/extractors/resiliparse.py
@@ -0,0 +1,74 @@
from .base import BaseExtractor


class Resiliparse(BaseExtractor):
    """Resiliparse extractor. It uses https://resiliparse.chatnoir.eu/en/latest/index.html

    We're using the main entry point of resiliparse's text extraction: the `extract_plain_text` function.
    No specific data structure is exchanged with Resiliparse, only the HTML is passed and the extracted text is returned.

    Args:
        timeout: the timeout for extraction, per document, in seconds
        preserve_formatting: whether to preserve basic formatting (paragraphs, lists) in the text
        main_content: whether to only extract the main content of the document
        list_bullets: whether to keep bullets in front of list items
        alt_texts: whether to include the alt texts of images
        links: whether to include the targets of links
        form_fields: whether to include the text contents of form fields
        noscript: whether to include the contents of <noscript> elements
        comments: whether to treat comment sections as part of the main content
        skip_elements: a list of CSS selectors for elements to skip
        **kwargs: any other option will be passed to resiliparse
    """

    name = "⛏ Resiliparse"
    _requires_dependencies = ["resiliparse"]

    def __init__(
        self,
        preserve_formatting: bool = True,
        main_content: bool = True,
        list_bullets: bool = True,
        alt_texts: bool = False,
        links: bool = False,
        form_fields: bool = False,
        noscript: bool = False,
        comments: bool = True,
        skip_elements: list | None = None,
        timeout: float = 0.1,
        **kwargs,
    ):
        super().__init__(timeout)
        self.preserve_formatting = preserve_formatting
        self.main_content = main_content
        self.list_bullets = list_bullets
        self.alt_texts = alt_texts
        self.links = links
        self.form_fields = form_fields
        self.noscript = noscript
        self.comments = comments
        self.skip_elements = skip_elements
        self.kwargs = kwargs

    def extract(self, text: str) -> str:
        """
        Args:
            text: str: html content

        Returns: plaintext extracted text
        """
        from resiliparse.extract.html2text import extract_plain_text

        return extract_plain_text(
            text,
            preserve_formatting=self.preserve_formatting,
            main_content=self.main_content,
            list_bullets=self.list_bullets,
            alt_texts=self.alt_texts,
            links=self.links,
            form_fields=self.form_fields,
            noscript=self.noscript,
            comments=self.comments,
            skip_elements=self.skip_elements,
            **self.kwargs,
        )
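And a quick standalone check of the Resiliparse extractor (a sketch; the sample HTML is made up):

from datatrove.pipeline.extractors import Resiliparse

extractor = Resiliparse(main_content=True, comments=False)
html = "<html><body><main><p>Body text.</p></main><footer>Footer links</footer></body></html>"
print(extractor.extract(html))  # plain text of the main content only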