diff --git a/pyproject.toml b/pyproject.toml index cf226903..b29025ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,9 @@ processing = [ # "readability-lxml @ git+https://github.com/huggingface/python-readability.git@speedup", "tldextract", "trafilatura>=1.8.0,<1.12.0", + "justext", + "resiliparse", + "readabilipy", "tokenizers", "ftfy", "fasteners", diff --git a/src/datatrove/pipeline/extractors/__init__.py b/src/datatrove/pipeline/extractors/__init__.py index 9620bd2f..c4ef7a89 100644 --- a/src/datatrove/pipeline/extractors/__init__.py +++ b/src/datatrove/pipeline/extractors/__init__.py @@ -1,2 +1,7 @@ +from .inscriptis import Inscriptis +from .justext import Justext from .modular import ReadabilityInscriptis +from .readabilipy import ReadabiliPy +from .readability import Readability +from .resiliparse import Resiliparse from .trafilatura import Trafilatura diff --git a/src/datatrove/pipeline/extractors/inscriptis.py b/src/datatrove/pipeline/extractors/inscriptis.py new file mode 100644 index 00000000..721b308c --- /dev/null +++ b/src/datatrove/pipeline/extractors/inscriptis.py @@ -0,0 +1,63 @@ +import re + +from .base import BaseExtractor + + +class Inscriptis(BaseExtractor): + """Inscriptis extractor, it uses https://github.com/weblyzard/inscriptis + + We're using the main entry point of inscriptis: the `get_text` function. + No specific data structure is exchanged with Inscriptis, only the HTML is passed and the extracted text is returned. 
+ + Args: + timeout: the timeout for extraction, per document, in seconds + deduplicate_captions: whether to remove duplicate captions + display_links: whether to display link targets + display_anchors: whether to display anchor texts + **kwargs: any other option will be passed to inscriptis + """ + + name = "⛏ Inscriptis" + _requires_dependencies = ["inscriptis"] + + def __init__( + self, + timeout: float = 0.1, + max_new_lines: int = 2, + deduplicate_captions: bool = True, + display_links: bool = False, + display_anchors: bool = True, + **kwargs, + ): + super().__init__(timeout) + self.new_line_chars = "\n" * max_new_lines + self.deduplicate_captions = deduplicate_captions + self.display_links = display_links + self.display_anchors = display_anchors + self.kwargs = kwargs + self.regex_excessive_lines = re.compile(r"(" + self.new_line_chars + "\n+)") + + def extract(self, text: str) -> str: + """ + Args: + text: str: html content + + Returns: plaintext extracted text + """ + from inscriptis import get_text + from inscriptis.css_profiles import CSS_PROFILES + from inscriptis.model.config import ParserConfig + + text = get_text( + html_content=text, + config=ParserConfig( + css=CSS_PROFILES["strict"], + deduplicate_captions=self.deduplicate_captions, + display_links=self.display_links, + display_anchors=self.display_anchors, + **self.kwargs, + ), + ) + + # remove excessive empty lines + return self.regex_excessive_lines.sub(self.new_line_chars, text).strip() diff --git a/src/datatrove/pipeline/extractors/justext.py b/src/datatrove/pipeline/extractors/justext.py new file mode 100644 index 00000000..616e9e60 --- /dev/null +++ b/src/datatrove/pipeline/extractors/justext.py @@ -0,0 +1,87 @@ +from .base import BaseExtractor + + +class Justext(BaseExtractor): + """Justext extractor, it uses https://github.com/miso-belica/jusText + + We're actually only using the main entry point of justext: the `justext` function. 
+ No specific data structure is exchanged with Justext, only the text is passed and the extracted text is returned. + + Args: + length_low: the minimum length of a paragraph + length_high: the maximum length of a paragraph + stopwords_low: the minimum stopwords ratio of a paragraph + stopwords_high: the maximum stopwords ratio of a paragraph + max_link_density: the maximum link density of a paragraph + max_heading_distance: the maximum distance between headings of a paragraph + no_headings: whether to remove headings from the extracted text + remove_boilerplate: whether to remove boilerplate from the extracted text + kwargs: any other option will be passed to justext + timeout: the timeout for extraction, per document, in seconds + """ + + name = "⛏ Justext" + _requires_dependencies = ["justext"] + + def __init__( + self, + stoplist: list[str] = None, + length_low: int = 70, + length_high: int = 200, + stopwords_low: float = 0.3, + stopwords_high: float = 0.32, + max_link_density: float = 0.2, + max_heading_distance: int = 200, + no_headings: bool = False, + remove_boilerplate: bool = True, + timeout: float = 0.1, + **kwargs, + ): + super().__init__(timeout) + if stoplist is None: + stoplist = self.get_stoplist(lang="english") + self.stoplist = frozenset(stoplist) + self.length_low = length_low + self.length_high = length_high + self.stopwords_low = stopwords_low + self.stopwords_high = stopwords_high + self.max_link_density = max_link_density + self.max_heading_distance = max_heading_distance + self.no_headings = no_headings + self.remove_boilerplate = remove_boilerplate + self.kwargs = kwargs + + @staticmethod + def get_stoplist(lang: str = "english") -> list[str]: + from justext import get_stoplist + + return get_stoplist(lang) + + def extract(self, text: str) -> str: + """ + + Args: + text: str: html content + + Returns: plaintext extracted text + """ + from justext import justext + + paragraphs = justext( + text, + stoplist=self.stoplist, + 
length_low=self.length_low, + length_high=self.length_high, + stopwords_low=self.stopwords_low, + stopwords_high=self.stopwords_high, + max_link_density=self.max_link_density, + max_heading_distance=self.max_heading_distance, + no_headings=self.no_headings, + **self.kwargs, + ) + + # Join text blocks with double newlines to separate them + if self.remove_boilerplate: + return "\n\n".join([p.text for p in paragraphs if not p.is_boilerplate]) + else: + return "\n\n".join([p.text for p in paragraphs]) diff --git a/src/datatrove/pipeline/extractors/readabilipy.py b/src/datatrove/pipeline/extractors/readabilipy.py new file mode 100644 index 00000000..5b1c7861 --- /dev/null +++ b/src/datatrove/pipeline/extractors/readabilipy.py @@ -0,0 +1,57 @@ +from .base import BaseExtractor + + +class ReadabiliPy(BaseExtractor): + """ReadabiliPy extractor, it uses https://github.com/alan-turing-institute/ReadabiliPy + + We're using the main entry point of ReadabiliPy: the `simple_json_from_html_string` function. + The extracted content is returned as plain text. 
+ + Args: + timeout: the timeout for extraction, per document, in seconds + use_readability: whether to use Mozilla's Readability.js (requires Node.js) + content_digests: whether to include content digests in the output + node_indexes: whether to include node indexes in the output + **kwargs: any other option will be passed to ReadabiliPy + """ + + name = "⛏ ReadabiliPy" + _requires_dependencies = ["readabilipy"] + + def __init__( + self, + timeout: float = 0.1, + use_readability: bool = False, + content_digests: bool = False, + node_indexes: bool = False, + **kwargs, + ): + super().__init__(timeout) + self.use_readability = use_readability + self.content_digests = content_digests + self.node_indexes = node_indexes + self.kwargs = kwargs + + def extract(self, text: str) -> str: + """ + Args: + text: str: html content + + Returns: plaintext extracted text + """ + from readabilipy import simple_json_from_html_string + + result = simple_json_from_html_string( + text, + use_readability=self.use_readability, + content_digests=self.content_digests, + node_indexes=self.node_indexes, + **self.kwargs, + ) + + content = result.get("plain_text", "") + + if isinstance(content, list): + content = "\n\n".join(block["text"] for block in content) + + return content diff --git a/src/datatrove/pipeline/extractors/readability.py b/src/datatrove/pipeline/extractors/readability.py new file mode 100644 index 00000000..f993220f --- /dev/null +++ b/src/datatrove/pipeline/extractors/readability.py @@ -0,0 +1,59 @@ +from .base import BaseExtractor + + +class Readability(BaseExtractor): + """Readability extractor, it uses https://github.com/buriy/python-readability + + We're using the main entry point of readability-lxml: the `Document` class, which cleans up the HTML and outputs a + cleaned HTML string. 
+ + The postprocessor (another Datatrove extractor) is used to convert the cleaned HTML to plain text + + Args: + timeout: the timeout for extraction, per document, in seconds + min_text_length: the minimum length of text to consider + retry_length: number of chars to use when searching for body + url: the URL of the page (optional, used for better parsing) + keep_classes: list of classes to keep in the extracted content + **kwargs: any other option will be passed to readability + """ + + name = "⛏ Readability" + _requires_dependencies = ["readability"] + + def __init__( + self, + postprocessor: BaseExtractor, + timeout: float = 0.1, + min_text_length: int = 25, + retry_length: int = 250, + url: str = None, + **kwargs, + ): + super().__init__(timeout) + self.postprocessor = postprocessor + self.min_text_length = min_text_length + self.retry_length = retry_length + self.url = url + self.kwargs = kwargs + + def extract(self, text: str) -> str: + """ + Args: + text: str: html content + + Returns: plaintext extracted text + """ + from readability import Document + + doc = Document( + text, + min_text_length=self.min_text_length, + retry_length=self.retry_length, + url=self.url, + **self.kwargs, + ) + + cleaned_html = doc.summary() + + return self.postprocessor.extract(cleaned_html) diff --git a/src/datatrove/pipeline/extractors/resiliparse.py b/src/datatrove/pipeline/extractors/resiliparse.py new file mode 100644 index 00000000..74c2a997 --- /dev/null +++ b/src/datatrove/pipeline/extractors/resiliparse.py @@ -0,0 +1,74 @@ +from .base import BaseExtractor + + +class Resiliparse(BaseExtractor): + """ + Resiliparse extractor, it uses https://resiliparse.chatnoir.eu/en/latest/index.html + + We're actually only using the main entry point of resiliparse's text extraction: the `extract_plain_text` function. + No specific data structure is exchanged with Resiliparse, only the text is passed and the extracted text is returned. 
+ + Args: + timeout: the timeout for extraction, per document, in seconds + preserve_formatting: whether to preserve the formatting of the text + main_content: whether to extract the main content of the document + list_bullets: whether to extract the bullets of the document + alt_texts: whether to extract the alt texts of the document + links: whether to extract the links of the document + form_fields: whether to extract the form fields of the document + noscript: whether to extract the noscript of the document + comments: whether to extract the comments that are present in the document + skip_elements: whether to skip the elements of the document + """ + + name = "⛏ Resiliparse" + _requires_dependencies = ["resiliparse"] + + def __init__( + self, + preserve_formatting: bool = True, + main_content: bool = True, + list_bullets: bool = True, + alt_texts: bool = False, + links: bool = False, + form_fields: bool = False, + noscript: bool = False, + comments: bool = True, + skip_elements: list = None, + timeout: float = 0.1, + **kwargs, + ): + super().__init__(timeout) + self.preserve_formatting = preserve_formatting + self.main_content = main_content + self.list_bullets = list_bullets + self.alt_texts = alt_texts + self.links = links + self.form_fields = form_fields + self.noscript = noscript + self.comments = comments + self.skip_elements = skip_elements + + def extract(self, text: str) -> str: + """ + + Args: + text: str: html content + + Returns: plaintext extracted text + + """ + from resiliparse.extract.html2text import extract_plain_text + + return extract_plain_text( + text, + preserve_formatting=self.preserve_formatting, + main_content=self.main_content, + list_bullets=self.list_bullets, + alt_texts=self.alt_texts, + links=self.links, + form_fields=self.form_fields, + noscript=self.noscript, + comments=self.comments, + skip_elements=self.skip_elements, + ) diff --git a/tests/pipeline/test_extractors.py b/tests/pipeline/test_extractors.py index 
ec7e1417..b306e1e8 100644 --- a/tests/pipeline/test_extractors.py +++ b/tests/pipeline/test_extractors.py @@ -1,8 +1,23 @@ import unittest -from datatrove.pipeline.extractors import ReadabilityInscriptis, Trafilatura +from datatrove.pipeline.extractors import ( + Inscriptis, + Justext, + ReadabiliPy, + Readability, + ReadabilityInscriptis, + Resiliparse, + Trafilatura, +) -from ..utils import require_inscriptis, require_readability, require_trafilatura +from ..utils import ( + require_inscriptis, + require_justext, + require_readabilipy, + require_readability, + require_resiliparse, + require_trafilatura, +) ARTICLE_HTML = "
<html><body><article><p>Hello World!</p></article></body></html>
" @@ -16,6 +31,32 @@ def test_basic_article_trafilatura(self): @require_readability @require_inscriptis - def test_basic_article_readability(self): + def test_basic_article_readability_inscriptis(self): extractor = ReadabilityInscriptis(min_text_length=10, min_text_score=1) self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!") + + @require_justext + def test_basic_article_justext(self): + extractor = Justext(remove_boilerplate=False) + self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!") + + @require_resiliparse + def test_basic_article_resiliparse(self): + extractor = Resiliparse() + self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!") + + @require_readabilipy + def test_basic_article_readabilipy(self): + extractor = ReadabiliPy() + self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!") + + @require_inscriptis + def test_basic_article_inscriptis(self): + extractor = Inscriptis() + self.assertEqual(extractor.extract(ARTICLE_HTML), "Hello World!") + + @require_readability + def test_basic_article_readability(self): + extractor = Readability(min_text_length=10, min_text_score=1) + postprocessor = Trafilatura() + self.assertEqual(extractor.extract(ARTICLE_HTML, postprocessor=postprocessor), "Hello World!") diff --git a/tests/utils.py b/tests/utils.py index 3d076308..4d00755a 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -55,6 +55,14 @@ def require_trafilatura(test_case): return test_case +def require_justext(test_case): + try: + import justext # noqa: F401 + except ImportError: + test_case = unittest.skip("test requires justext")(test_case) + return test_case + + def require_readability(test_case): try: import readability # noqa: F401 @@ -63,6 +71,14 @@ def require_readability(test_case): return test_case +def require_resiliparse(test_case): + try: + import resiliparse # noqa: F401 + except ImportError: + test_case = unittest.skip("test requires resiliparse")(test_case) + return test_case + + def 
require_inscriptis(test_case): try: import inscriptis # noqa: F401 @@ -71,6 +87,14 @@ def require_inscriptis(test_case): return test_case +def require_readabilipy(test_case): + try: + import readabilipy # noqa: F401 + except ImportError: + test_case = unittest.skip("test requires readabilipy")(test_case) + return test_case + + def require_pyarrow(test_case): try: import pyarrow # noqa: F401