Skip to content

Commit

Permalink
feat: changed configs & pyproject
Browse files (browse the repository at this point in the history)
  • Loading branch information
garrethlee committed Sep 27, 2024
1 parent c085736 commit ea3a915
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 0 deletions.
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number | Diff line number | Diff line change
Expand Up @@ -55,6 +55,9 @@ processing = [
# "readability-lxml @ git+https://github.com/huggingface/python-readability.git@speedup",
"tldextract",
"trafilatura>=1.8.0,<1.12.0",
"justext",
"resiliparse",
"readabilipy",
"tokenizers",
"ftfy",
"fasteners",
Expand Down
3 changes: 3 additions & 0 deletions src/datatrove/pipeline/extractors/__init__.py
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,7 @@
from .inscriptis import Inscriptis
from .justext import Justext
from .modular import ReadabilityInscriptis
from .readabilipy import ReadabiliPy
from .readability import Readability
from .resiliparse import Resiliparse
from .trafilatura import Trafilatura

0 comments on commit ea3a915

Please sign in to comment.