Skip to content

Commit

Permalink
config test pass
Browse files Browse the repository at this point in the history
  • Loading branch information
BeachWang committed Sep 5, 2024
1 parent f32e359 commit a41ccb6
Show file tree
Hide file tree
Showing 54 changed files with 175 additions and 181 deletions.
6 changes: 0 additions & 6 deletions data_juicer/ops/base_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,6 @@ def __init__(self, *args, **kwargs):
"""
Base class of operators.
:param extra_requirements: the extra requirements of the OP,
check and auto install here
:param text_key: the key name of field that stores sample texts
to be processed.
:param image_key: the key name of field that stores sample image list
Expand All @@ -136,10 +134,6 @@ def __init__(self, *args, **kwargs):
:param video_key: the key name of field that stores sample video list
to be processed
"""
# check and auto install extra dependencies
self.extra_requirements = kwargs.get('extra_requirements', [])
AUTOINSTALL.check(self.extra_requirements)

# init data keys
self.text_key = kwargs.get('text_key', 'text')
self.image_key = kwargs.get('image_key', 'images')
Expand Down
5 changes: 3 additions & 2 deletions data_juicer/ops/deduplicator/document_minhash_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.model_utils import prepare_sentencepiece_model

from ..base_op import OPERATORS, Deduplicator
from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator
from ..common.helper_func import UnionFind, split_on_whitespace

OP_NAME = 'document_minhash_deduplicator'
Expand Down Expand Up @@ -148,7 +148,8 @@ def __init__(
:param tokenizer_model: path for the sentencepiece model, used for
sentencepiece tokenization.
"""
super().__init__(extra_requirements=['scipy'], *args, **kwargs)
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['scipy'])
# about minhash computation
self.tokenization = tokenization
self.window_size = window_size
Expand Down
7 changes: 3 additions & 4 deletions data_juicer/ops/deduplicator/document_simhash_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from data_juicer.utils.constant import HashKeys
from data_juicer.utils.lazy_loader import LazyLoader

from ..base_op import OPERATORS, Deduplicator
from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator
from ..common.helper_func import split_on_whitespace

OP_NAME = 'document_simhash_deduplicator'
Expand Down Expand Up @@ -55,9 +55,8 @@ def __init__(self,
num_blocks
"""
# about simhash computation
super().__init__(extra_requirements=['simhash-pybind'],
*args,
**kwargs)
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['simhash-pybind'])
self.tokenization = tokenization
self.window_size = window_size
self.lowercase = lowercase
Expand Down
5 changes: 3 additions & 2 deletions data_juicer/ops/deduplicator/image_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import load_data_with_context, load_image

from ..base_op import OPERATORS, Deduplicator
from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator
from ..op_fusion import LOADED_IMAGES
from .document_deduplicator import DocumentDeduplicator

Expand Down Expand Up @@ -48,7 +48,8 @@ def __init__(self,
:param args: extra args
:param kwargs: extra args
"""
super().__init__(extra_requirements=['imagededup'], *args, **kwargs)
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['imagededup'])
if method not in HASH_METHOD:
raise ValueError(f'Keep strategy [{method}] is not supported. '
f'Can only be one of {HASH_METHOD}.')
Expand Down
6 changes: 3 additions & 3 deletions data_juicer/ops/deduplicator/ray_image_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import load_data_with_context, load_image

from ..base_op import OPERATORS
from ..base_op import AUTOINSTALL, OPERATORS
from ..op_fusion import LOADED_IMAGES
from .ray_basic_deduplicator import RayBasicDeduplicator

Expand Down Expand Up @@ -44,11 +44,11 @@ def __init__(self,
:param args: extra args
:param kwargs: extra args
"""
super().__init__(extra_requirements=['imagededup'],
redis_host=redis_host,
super().__init__(redis_host=redis_host,
redis_port=redis_port,
*args,
**kwargs)
AUTOINSTALL.check(['imagededup'])
if method not in HASH_METHOD:
raise ValueError(f'Keep strategy [{method}] is not supported. '
f'Can only be one of {HASH_METHOD}.')
Expand Down
5 changes: 3 additions & 2 deletions data_juicer/ops/filter/alphanumeric_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Filter
from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..common import get_words_from_document

OP_NAME = 'alphanumeric_filter'
Expand Down Expand Up @@ -38,7 +38,8 @@ def __init__(self,
:param args: extra args
:param kwargs: extra args
"""
super().__init__(extra_requirements=['transformers'], *args, **kwargs)
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['transformers'])
self.tokenization = tokenization
self.min_ratio = min_ratio
self.max_ratio = max_ratio
Expand Down
5 changes: 3 additions & 2 deletions data_juicer/ops/filter/flagged_words_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from data_juicer.utils.model_utils import get_model, prepare_model

from ...utils.asset_utils import ASSET_DIR, load_words_asset
from ..base_op import OPERATORS, Filter
from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..common import (SPECIAL_CHARACTERS, get_words_from_document,
words_refinement)
from ..op_fusion import INTER_WORDS
Expand Down Expand Up @@ -51,7 +51,8 @@ def __init__(self,
:param args: extra args
:param kwargs: extra args
"""
super().__init__(extra_requirements=['sentencepiece'], *args, **kwargs)
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['sentencepiece'])
self.lang = lang
self.max_ratio = max_ratio
self.use_words_aug = use_words_aug
Expand Down
10 changes: 4 additions & 6 deletions data_juicer/ops/filter/image_aesthetics_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from data_juicer.utils.mm_utils import load_data_with_context, load_image

from ...utils.model_utils import get_model, prepare_model
from ..base_op import OPERATORS, Filter
from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..op_fusion import LOADED_IMAGES

OP_NAME = 'image_aesthetics_filter'
Expand Down Expand Up @@ -49,11 +49,9 @@ def __init__(self,
:param kwargs: Extra keyword arguments.
"""

super().__init__(extra_requirements=[
'torch', 'transformers', 'simple-aesthetics-predictor'
],
*args,
**kwargs)
super().__init__(*args, **kwargs)
AUTOINSTALL.check(
['torch', 'transformers', 'simple-aesthetics-predictor'])
if hf_scorer_model == '':
hf_scorer_model = \
'shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE'
Expand Down
5 changes: 3 additions & 2 deletions data_juicer/ops/filter/image_face_ratio_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
load_image)
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, UNFORKABLE, Filter
from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Filter
from ..op_fusion import LOADED_IMAGES

OP_NAME = 'image_face_ratio_filter'
Expand Down Expand Up @@ -53,7 +53,8 @@ def __init__(self,
:param args: Extra positional arguments.
:param kwargs: Extra keyword arguments.
"""
super().__init__(extra_requirements=['opencv-python'], *args, **kwargs)
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['opencv-python'])

if cv_classifier == '':
cv_classifier = os.path.join(cv2.data.haarcascades,
Expand Down
7 changes: 3 additions & 4 deletions data_juicer/ops/filter/image_nsfw_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from data_juicer.utils.mm_utils import load_data_with_context, load_image
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Filter
from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..op_fusion import LOADED_IMAGES

OP_NAME = 'image_nsfw_filter'
Expand Down Expand Up @@ -43,9 +43,8 @@ def __init__(self,
:param args: extra args
:param kwargs: extra args
"""
super().__init__(extra_requirements=['torch', 'transformers'],
*args,
**kwargs)
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['torch', 'transformers'])
self.score_threshold = score_threshold
if any_or_all not in ['any', 'all']:
raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
Expand Down
7 changes: 3 additions & 4 deletions data_juicer/ops/filter/image_text_matching_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
load_image, remove_special_tokens)
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Filter
from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..op_fusion import LOADED_IMAGES

OP_NAME = 'image_text_matching_filter'
Expand Down Expand Up @@ -57,9 +57,8 @@ def __init__(self,
:param args: extra args
:param kwargs: extra args
"""
super().__init__(extra_requirements=['torch', 'transformers'],
*args,
**kwargs)
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['torch', 'transformers'])
self.min_score = min_score
self.max_score = max_score
if reduce_mode not in ['avg', 'max', 'min']:
Expand Down
7 changes: 3 additions & 4 deletions data_juicer/ops/filter/image_text_similarity_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
load_image, remove_special_tokens)
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Filter
from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..op_fusion import LOADED_IMAGES

OP_NAME = 'image_text_similarity_filter'
Expand Down Expand Up @@ -57,9 +57,8 @@ def __init__(self,
:param args: extra args
:param kwargs: extra args
"""
super().__init__(extra_requirements=['torch', 'transformers'],
*args,
**kwargs)
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['torch', 'transformers'])
self.min_score = min_score
self.max_score = max_score
if reduce_mode not in ['avg', 'max', 'min']:
Expand Down
7 changes: 3 additions & 4 deletions data_juicer/ops/filter/image_watermark_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from data_juicer.utils.mm_utils import load_data_with_context, load_image
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Filter
from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..op_fusion import LOADED_IMAGES

OP_NAME = 'image_watermark_filter'
Expand Down Expand Up @@ -47,9 +47,8 @@ def __init__(self,
:param args: extra args
:param kwargs: extra args
"""
super().__init__(extra_requirements=['torch', 'transformers'],
*args,
**kwargs)
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['torch', 'transformers'])
self.prob_threshold = prob_threshold
if any_or_all not in ['any', 'all']:
raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
Expand Down
7 changes: 3 additions & 4 deletions data_juicer/ops/filter/language_id_score_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Filter
from ..base_op import AUTOINSTALL, OPERATORS, Filter

OP_NAME = 'language_id_score_filter'

Expand All @@ -33,9 +33,8 @@ def __init__(self,
:param args: extra args
:param kwargs: extra args
"""
super().__init__(extra_requirements=['fasttext', 'fasttext-wheel'],
*args,
**kwargs)
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['fasttext', 'fasttext-wheel'])
if not lang:
# lang is [], '' or None
self.lang = None
Expand Down
7 changes: 3 additions & 4 deletions data_juicer/ops/filter/perplexity_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Filter
from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..common import get_words_from_document
from ..op_fusion import INTER_WORDS

Expand Down Expand Up @@ -38,9 +38,8 @@ def __init__(self,
:param args: extra args
:param kwargs: extra args
"""
super().__init__(extra_requirements=['sentencepiece', 'kenlm'],
*args,
**kwargs)
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['sentencepiece', 'kenlm'])
self.max_ppl = max_ppl
self.lang = lang
self.sp_model_key = prepare_model(model_type='sentencepiece',
Expand Down
7 changes: 3 additions & 4 deletions data_juicer/ops/filter/phrase_grounding_recall_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
remove_special_tokens)
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Filter
from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..op_fusion import LOADED_IMAGES

OP_NAME = 'phrase_grounding_recall_filter'
Expand Down Expand Up @@ -116,9 +116,8 @@ def __init__(self,
:param args: extra args
:param kwargs: extra args
"""
super().__init__(extra_requirements=['torch', 'transformers', 'nltk'],
*args,
**kwargs)
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['torch', 'transformers', 'nltk'])
self.min_recall = min_recall
self.max_recall = max_recall
if reduce_mode not in ['avg', 'max', 'min']:
Expand Down
5 changes: 3 additions & 2 deletions data_juicer/ops/filter/stopwords_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Filter
from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..common import (SPECIAL_CHARACTERS, get_words_from_document,
words_refinement)
from ..op_fusion import INTER_WORDS
Expand Down Expand Up @@ -53,7 +53,8 @@ def __init__(self,
:param args: extra args
:param kwargs: extra args
"""
super().__init__(extra_requirements=['sentencepiece'], *args, **kwargs)
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['sentencepiece'])
self.lang = lang
self.min_ratio = min_ratio
self.use_words_aug = use_words_aug
Expand Down
5 changes: 3 additions & 2 deletions data_juicer/ops/filter/text_action_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from data_juicer.utils.mm_utils import remove_special_tokens
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Filter
from ..base_op import AUTOINSTALL, OPERATORS, Filter

OP_NAME = 'text_action_filter'

Expand All @@ -27,7 +27,8 @@ def __init__(self,
will be filtered if their action number in the text is below this
parameter.
"""
super().__init__(extra_requirements=['spacy-pkuseg'], *args, **kwargs)
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['spacy-pkuseg'])

if lang not in ['en', 'zh']:
raise ValueError(
Expand Down
5 changes: 3 additions & 2 deletions data_juicer/ops/filter/text_entity_dependency_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from data_juicer.utils.mm_utils import remove_special_tokens
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Filter
from ..base_op import AUTOINSTALL, OPERATORS, Filter

OP_NAME = 'text_entity_dependency_filter'

Expand Down Expand Up @@ -34,7 +34,8 @@ def __init__(self,
'any': keep this sample if any object is dependent. 'all': keep this
sample only if all objects are dependent.
"""
super().__init__(extra_requirements=['spacy-pkuseg'], *args, **kwargs)
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['spacy-pkuseg'])

if lang not in ['en', 'zh']:
raise ValueError(
Expand Down
Loading

0 comments on commit a41ccb6

Please sign in to comment.