forked from huggingface/datatrove
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrafilatura.py
55 lines (44 loc) · 1.71 KB
/
trafilatura.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from .base import BaseExtractor
class Trafilatura(BaseExtractor):
    """Trafilatura extractor, it uses https://trafilatura.readthedocs.io/en/latest/index.html

    Only the main entry point of trafilatura is used: the `extract` function.
    No specific data structure is exchanged with trafilatura, only the html text is passed
    and the extracted text is returned.
    Alternatively and identically, `trafilatura` could be used through its command line interface.

    Args:
        favour_precision: prefer less text but correct extraction.
        include_images: not implemented currently; passing True raises NotImplementedError
        timeout: the timeout for extraction, per document, in seconds (forwarded to BaseExtractor)
        deduplicate: trafilatura's deduplicate option
        **kwargs: any other option will be passed to trafilatura's `extract`. An option given
            here takes precedence over the values this class sets itself
            (`favor_precision`, `include_comments`, `deduplicate`).

    Raises:
        NotImplementedError: if `include_images` is True.
    """

    name = "⛏ Trafilatura"
    _requires_dependencies = ["trafilatura"]

    def __init__(
        self,
        favour_precision: bool = True,
        include_images: bool = False,
        timeout: float = 0.1,
        deduplicate: bool = True,
        **kwargs,
    ):
        # Reject the unsupported option up front, before any other setup is done.
        if include_images:
            raise NotImplementedError("include_images is not supported by the Trafilatura extractor")
        super().__init__(timeout)
        self.favour_precision = favour_precision
        self.include_images = include_images
        self.deduplicate = deduplicate
        self.kwargs = kwargs

    def extract(self, text: str) -> str:
        """Run trafilatura's `extract` on one html document.

        Args:
            text: str: html content

        Returns: plain text extracted text
            NOTE(review): trafilatura documents that `extract` can return None when
            nothing could be extracted -- confirm the caller handles that.
        """
        # Imported lazily so the module can be loaded without trafilatura installed.
        from trafilatura import extract

        # Build a single options mapping so that a key supplied through **kwargs overrides
        # our defaults instead of triggering "got multiple values for keyword argument".
        options = {
            "favor_precision": self.favour_precision,
            "include_comments": False,
            "deduplicate": self.deduplicate,
            **self.kwargs,
        }
        return extract(text, **options)