Add several open-source text extraction libraries #293

Draft: wants to merge 10 commits into main
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -55,6 +55,9 @@ processing = [
# "readability-lxml @ git+https://github.com/huggingface/python-readability.git@speedup",
"tldextract",
"trafilatura>=1.8.0,<1.12.0",
"justext",
"resiliparse",
"readabilipy",
"tokenizers",
"ftfy",
"fasteners",
5 changes: 5 additions & 0 deletions src/datatrove/pipeline/extractors/__init__.py
@@ -1,2 +1,7 @@
from .inscriptis import Inscriptis
from .justext import Justext
from .modular import ReadabilityInscriptis
from .readabilipy import ReadabiliPy
from .readability import Readability
from .resiliparse import Resiliparse
from .trafilatura import Trafilatura
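With these exports in place, all of the new extractors are available from one module and share the same `extract(html) -> str` interface defined by `BaseExtractor`; a minimal sketch (assuming the optional dependencies added to pyproject.toml above are installed):

from datatrove.pipeline.extractors import Inscriptis, Justext, ReadabiliPy, Resiliparse

# Readability is omitted here because it requires a postprocessor argument (see below)
for extractor in (Inscriptis(), Justext(), ReadabiliPy(), Resiliparse()):
    print(extractor.name)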
63 changes: 63 additions & 0 deletions src/datatrove/pipeline/extractors/inscriptis.py
@@ -0,0 +1,63 @@
import re

from .base import BaseExtractor


class Inscriptis(BaseExtractor):
    """Inscriptis extractor. It uses https://github.com/weblyzard/inscriptis

    We're using the main entry point of inscriptis: the `get_text` function.
    No specific data structure is exchanged with Inscriptis, only the HTML is passed and the extracted text is returned.

    Args:
        timeout: the timeout for extraction, per document, in seconds
        max_new_lines: the maximum number of consecutive newlines to keep in the extracted text
        deduplicate_captions: whether to remove duplicate captions
        display_links: whether to display link targets
        display_anchors: whether to display anchor texts
        **kwargs: any other option will be passed to inscriptis
    """

    name = "⛏ Inscriptis"
    _requires_dependencies = ["inscriptis"]

    def __init__(
        self,
        timeout: float = 0.1,
        max_new_lines: int = 2,
        deduplicate_captions: bool = True,
        display_links: bool = False,
        display_anchors: bool = True,
        **kwargs,
    ):
        super().__init__(timeout)
        self.new_line_chars = "\n" * max_new_lines
        self.deduplicate_captions = deduplicate_captions
        self.display_links = display_links
        self.display_anchors = display_anchors
        self.kwargs = kwargs
        # matches runs of more than `max_new_lines` consecutive newlines
        self.regex_excessive_lines = re.compile(r"(" + self.new_line_chars + "\n+)")

    def extract(self, text: str) -> str:
        """
        Args:
            text: str: html content

        Returns: plaintext extracted text
        """
        from inscriptis import get_text
        from inscriptis.css_profiles import CSS_PROFILES
        from inscriptis.model.config import ParserConfig

        text = get_text(
            html_content=text,
            config=ParserConfig(
                css=CSS_PROFILES["strict"],
                deduplicate_captions=self.deduplicate_captions,
                display_links=self.display_links,
                display_anchors=self.display_anchors,
                **self.kwargs,
            ),
        )

        # remove excessive empty lines
        return self.regex_excessive_lines.sub(self.new_line_chars, text).strip()
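For a quick standalone check, the extractor can be called directly on an HTML string; a minimal sketch (the sample HTML is made up, and calling `extract` directly bypasses the pipeline's timeout machinery):

from datatrove.pipeline.extractors import Inscriptis

extractor = Inscriptis(max_new_lines=2)
html = "<html><body><h1>Title</h1><p>Some paragraph text.</p></body></html>"
print(extractor.extract(html))  # plain text with at most 2 consecutive newlines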
87 changes: 87 additions & 0 deletions src/datatrove/pipeline/extractors/justext.py
@@ -0,0 +1,87 @@
from .base import BaseExtractor


class Justext(BaseExtractor):
    """Justext extractor. It uses https://github.com/miso-belica/jusText

    We're using the main entry point of justext: the `justext` function.
    No specific data structure is exchanged with Justext, only the HTML is passed and the extracted text is returned.

    Args:
        stoplist: the list of stopwords used for classification. Defaults to justext's English stoplist
        length_low: the character count below which a paragraph is considered short
        length_high: the character count above which a paragraph can be classified as good
        stopwords_low: the stopword density below which a paragraph is considered low on stopwords
        stopwords_high: the stopword density above which a paragraph can be classified as good
        max_link_density: the maximum proportion of link characters allowed in a good paragraph
        max_heading_distance: the maximum distance (in characters) from a good paragraph within which short headings are still kept
        no_headings: whether to disable the special handling of headings
        remove_boilerplate: whether to drop paragraphs classified as boilerplate from the extracted text
        timeout: the timeout for extraction, per document, in seconds
        **kwargs: any other option will be passed to justext
    """

    name = "⛏ Justext"
    _requires_dependencies = ["justext"]

    def __init__(
        self,
        stoplist: list[str] | None = None,
        length_low: int = 70,
        length_high: int = 200,
        stopwords_low: float = 0.3,
        stopwords_high: float = 0.32,
        max_link_density: float = 0.2,
        max_heading_distance: int = 200,
        no_headings: bool = False,
        remove_boilerplate: bool = True,
        timeout: float = 0.1,
        **kwargs,
    ):
        super().__init__(timeout)
        if stoplist is None:
            stoplist = self.get_stoplist(lang="english")
        self.stoplist = frozenset(stoplist)
        self.length_low = length_low
        self.length_high = length_high
        self.stopwords_low = stopwords_low
        self.stopwords_high = stopwords_high
        self.max_link_density = max_link_density
        self.max_heading_distance = max_heading_distance
        self.no_headings = no_headings
        self.remove_boilerplate = remove_boilerplate
        self.kwargs = kwargs

    @staticmethod
    def get_stoplist(lang: str = "english") -> list[str]:
        from justext import get_stoplist

        return get_stoplist(lang)

    def extract(self, text: str) -> str:
        """
        Args:
            text: str: html content

        Returns: plaintext extracted text
        """
        from justext import justext

        paragraphs = justext(
            text,
            stoplist=self.stoplist,
            length_low=self.length_low,
            length_high=self.length_high,
            stopwords_low=self.stopwords_low,
            stopwords_high=self.stopwords_high,
            max_link_density=self.max_link_density,
            max_heading_distance=self.max_heading_distance,
            no_headings=self.no_headings,
            **self.kwargs,
        )

        # join the text blocks with double newlines, optionally dropping boilerplate blocks
        if self.remove_boilerplate:
            return "\n\n".join(p.text for p in paragraphs if not p.is_boilerplate)
        return "\n\n".join(p.text for p in paragraphs)
57 changes: 57 additions & 0 deletions src/datatrove/pipeline/extractors/readabilipy.py
@@ -0,0 +1,57 @@
from .base import BaseExtractor


class ReadabiliPy(BaseExtractor):
    """ReadabiliPy extractor. It uses https://github.com/alan-turing-institute/ReadabiliPy

    We're using the main entry point of ReadabiliPy: the `simple_json_from_html_string` function.
    The extracted content is returned as plain text.

    Args:
        timeout: the timeout for extraction, per document, in seconds
        use_readability: whether to use Mozilla's Readability.js (requires Node.js)
        content_digests: whether to include content digests in the output
        node_indexes: whether to include node indexes in the output
        **kwargs: any other option will be passed to ReadabiliPy
    """

    name = "⛏ ReadabiliPy"
    _requires_dependencies = ["readabilipy"]

    def __init__(
        self,
        timeout: float = 0.1,
        use_readability: bool = False,
        content_digests: bool = False,
        node_indexes: bool = False,
        **kwargs,
    ):
        super().__init__(timeout)
        self.use_readability = use_readability
        self.content_digests = content_digests
        self.node_indexes = node_indexes
        self.kwargs = kwargs

    def extract(self, text: str) -> str:
        """
        Args:
            text: str: html content

        Returns: plaintext extracted text
        """
        from readabilipy import simple_json_from_html_string

        result = simple_json_from_html_string(
            text,
            use_readability=self.use_readability,
            content_digests=self.content_digests,
            node_indexes=self.node_indexes,
            **self.kwargs,
        )

        content = result.get("plain_text", "")

        # "plain_text" is a list of text blocks; join them with blank lines in between
        if isinstance(content, list):
            content = "\n\n".join(block["text"] for block in content)

        return content
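A quick standalone check (a sketch; the sample HTML is made up, and with use_readability=True the machine would also need Node.js):

from datatrove.pipeline.extractors import ReadabiliPy

extractor = ReadabiliPy(use_readability=False)
html = "<html><body><article><p>Main article text.</p></article></body></html>"
print(extractor.extract(html))  # "plain_text" blocks joined by blank lines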
59 changes: 59 additions & 0 deletions src/datatrove/pipeline/extractors/readability.py
@@ -0,0 +1,59 @@
from .base import BaseExtractor


class Readability(BaseExtractor):
    """Readability extractor. It uses https://github.com/buriy/python-readability

    We're using the main entry point of readability-lxml: the `Document` class, which cleans up the HTML and outputs a
    cleaned HTML string.

    The postprocessor (another Datatrove extractor) is used to convert the cleaned HTML to plain text.

    Args:
        postprocessor: the Datatrove extractor used to convert the cleaned HTML to plain text
        timeout: the timeout for extraction, per document, in seconds
        min_text_length: the minimum length of text to consider
        retry_length: number of chars to use when searching for body
        url: the URL of the page (optional, used for better parsing)
        **kwargs: any other option will be passed to readability
    """

    name = "⛏ Readability"
    _requires_dependencies = ["readability"]

    def __init__(
        self,
        postprocessor: BaseExtractor,
        timeout: float = 0.1,
        min_text_length: int = 25,
        retry_length: int = 250,
        url: str | None = None,
        **kwargs,
    ):
        super().__init__(timeout)
        self.postprocessor = postprocessor
        self.min_text_length = min_text_length
        self.retry_length = retry_length
        self.url = url
        self.kwargs = kwargs

    def extract(self, text: str) -> str:
        """
        Args:
            text: str: html content

        Returns: plaintext extracted text
        """
        from readability import Document

        doc = Document(
            text,
            min_text_length=self.min_text_length,
            retry_length=self.retry_length,
            url=self.url,
            **self.kwargs,
        )

        # readability only cleans the HTML; delegate HTML-to-text conversion to the postprocessor
        cleaned_html = doc.summary()

        return self.postprocessor.extract(cleaned_html)
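Because Readability only returns cleaned HTML, it always needs a postprocessor; a minimal sketch wiring it to the Inscriptis extractor from this same PR (the sample HTML is made up):

from datatrove.pipeline.extractors import Inscriptis, Readability

extractor = Readability(postprocessor=Inscriptis())
html = "<html><body><div><p>Article body long enough to pass min_text_length.</p></div></body></html>"
print(extractor.extract(html))  # doc.summary() output converted to plain text by Inscriptis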
74 changes: 74 additions & 0 deletions src/datatrove/pipeline/extractors/resiliparse.py
@@ -0,0 +1,74 @@
from .base import BaseExtractor


class Resiliparse(BaseExtractor):
    """Resiliparse extractor. It uses https://resiliparse.chatnoir.eu/en/latest/index.html

    We're using the main entry point of resiliparse's text extraction: the `extract_plain_text` function.
    No specific data structure is exchanged with Resiliparse, only the HTML is passed and the extracted text is returned.

    Args:
        timeout: the timeout for extraction, per document, in seconds
        preserve_formatting: whether to preserve basic formatting (paragraphs, lists) in the text
        main_content: whether to only extract the main content of the document
        list_bullets: whether to keep bullets in front of list items
        alt_texts: whether to include the alt texts of images
        links: whether to include the targets of links
        form_fields: whether to include the text contents of form fields
        noscript: whether to include the contents of <noscript> elements
        comments: whether to treat comment sections as part of the main content
        skip_elements: a list of CSS selectors for elements to skip
        **kwargs: any other option will be passed to resiliparse
    """

    name = "⛏ Resiliparse"
    _requires_dependencies = ["resiliparse"]

    def __init__(
        self,
        preserve_formatting: bool = True,
        main_content: bool = True,
        list_bullets: bool = True,
        alt_texts: bool = False,
        links: bool = False,
        form_fields: bool = False,
        noscript: bool = False,
        comments: bool = True,
        skip_elements: list | None = None,
        timeout: float = 0.1,
        **kwargs,
    ):
        super().__init__(timeout)
        self.preserve_formatting = preserve_formatting
        self.main_content = main_content
        self.list_bullets = list_bullets
        self.alt_texts = alt_texts
        self.links = links
        self.form_fields = form_fields
        self.noscript = noscript
        self.comments = comments
        self.skip_elements = skip_elements
        self.kwargs = kwargs

    def extract(self, text: str) -> str:
        """
        Args:
            text: str: html content

        Returns: plaintext extracted text
        """
        from resiliparse.extract.html2text import extract_plain_text

        return extract_plain_text(
            text,
            preserve_formatting=self.preserve_formatting,
            main_content=self.main_content,
            list_bullets=self.list_bullets,
            alt_texts=self.alt_texts,
            links=self.links,
            form_fields=self.form_fields,
            noscript=self.noscript,
            comments=self.comments,
            skip_elements=self.skip_elements,
            **self.kwargs,
        )
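And a quick standalone check of the Resiliparse extractor (a sketch; the sample HTML is made up):

from datatrove.pipeline.extractors import Resiliparse

extractor = Resiliparse(main_content=True, comments=False)
html = "<html><body><main><p>Body text.</p></main><footer>Footer links</footer></body></html>"
print(extractor.extract(html))  # plain text of the main content only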