Skip to content

Commit

Permalink
fix: move postprocessor to init
Browse files (browse the repository at this point in the history)
  • Loading branch information
garrethlee committed Sep 27, 2024
1 parent ea3a915 commit 891850e
Showing 1 changed file with 8 additions and 7 deletions.
15 changes: 8 additions & 7 deletions src/datatrove/pipeline/extractors/readability.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@
class Readability(BaseExtractor):
"""Readability extractor, it uses https://github.com/buriy/python-readability
We're using the main entry point of readability-lxml: the `Document` class.
No specific data structure is exchanged with Readability, only the HTML is passed and the extracted text is returned.
We're using the main entry point of readability-lxml: the `Document` class, which cleans up the HTML and outputs a
cleaned HTML string.
The postprocessor (another Datatrove extractor) is used to convert the cleaned HTML to plain text
Args:
timeout: the timeout for extraction, per document, in seconds
Expand All @@ -21,19 +23,21 @@ class Readability(BaseExtractor):

def __init__(
    self,
    postprocessor: BaseExtractor,
    timeout: float = 0.1,
    min_text_length: int = 25,
    retry_length: int = 250,
    url: str = None,
    **kwargs,
):
    """Configure the extractor and the postprocessor that finishes extraction.

    Args:
        postprocessor: extractor whose `extract` method is called on the
            cleaned HTML produced by readability; its return value is what
            `extract` returns.
        timeout: passed through to `BaseExtractor.__init__`.
        min_text_length: forwarded to readability's `Document` as
            `min_text_length`.
        retry_length: stored on the instance (presumably forwarded to
            `Document` as well -- confirm against `extract`).
        url: stored on the instance (presumably forwarded to `Document` as
            well -- confirm against `extract`).
        **kwargs: stored on the instance as `self.kwargs` (presumably extra
            `Document` keyword arguments -- confirm against `extract`).

    Raises:
        ValueError: if `postprocessor` is None.
    """
    # Fail fast: `extract` unconditionally calls `self.postprocessor.extract`,
    # so a missing postprocessor would otherwise only surface later as an
    # AttributeError on the first extracted document.
    if postprocessor is None:
        raise ValueError("A postprocessor (extractor) must be provided")

    super().__init__(timeout)
    self.postprocessor = postprocessor
    self.min_text_length = min_text_length
    self.retry_length = retry_length
    self.url = url
    self.kwargs = kwargs

def extract(self, text: str, postprocessor: BaseExtractor) -> str:
def extract(self, text: str) -> str:
"""
Args:
text: str: html content
Expand All @@ -42,9 +46,6 @@ def extract(self, text: str, postprocessor: BaseExtractor) -> str:
"""
from readability import Document

if not postprocessor:
raise ValueError("A postprocessor (extractor) must be provided")

doc = Document(
text,
min_text_length=self.min_text_length,
Expand All @@ -55,4 +56,4 @@ def extract(self, text: str, postprocessor: BaseExtractor) -> str:

cleaned_html = doc.summary()

return postprocessor.extract(cleaned_html)
return self.postprocessor.extract(cleaned_html)

0 comments on commit 891850e

Please sign in to comment.