config test pass

modelscope · Sep 5, 2024 · a41ccb6 · a41ccb6
1 parent f32e359
commit a41ccb6
Show file tree

Hide file tree

Showing 54 changed files with 175 additions and 181 deletions.
diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py
@@ -125,8 +125,6 @@ def __init__(self, *args, **kwargs):
         """
         Base class of operators.
 
-        :param extra_requirements: the extra requirements of the OP,
-            check and auto install here
         :param text_key: the key name of field that stores sample texts
             to be processed.
         :param image_key: the key name of field that stores sample image list
@@ -136,10 +134,6 @@ def __init__(self, *args, **kwargs):
         :param video_key: the key name of field that stores sample video list
             to be processed
         """
-        # check and auto install extra dependencies
-        self.extra_requirements = kwargs.get('extra_requirements', [])
-        AUTOINSTALL.check(self.extra_requirements)
-
         # init data keys
         self.text_key = kwargs.get('text_key', 'text')
         self.image_key = kwargs.get('image_key', 'images')

diff --git a/data_juicer/ops/deduplicator/document_minhash_deduplicator.py b/data_juicer/ops/deduplicator/document_minhash_deduplicator.py
@@ -16,7 +16,7 @@
 from data_juicer.utils.lazy_loader import LazyLoader
 from data_juicer.utils.model_utils import prepare_sentencepiece_model
 
-from ..base_op import OPERATORS, Deduplicator
+from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator
 from ..common.helper_func import UnionFind, split_on_whitespace
 
 OP_NAME = 'document_minhash_deduplicator'
@@ -148,7 +148,8 @@ def __init__(
         :param tokenizer_model: path for the sentencepiece model, used for
             sentencepiece tokenization.
         """
-        super().__init__(extra_requirements=['scipy'], *args, **kwargs)
+        super().__init__(*args, **kwargs)
+        AUTOINSTALL.check(['scipy'])
         # about minhash computation
         self.tokenization = tokenization
         self.window_size = window_size

diff --git a/data_juicer/ops/deduplicator/document_simhash_deduplicator.py b/data_juicer/ops/deduplicator/document_simhash_deduplicator.py
@@ -13,7 +13,7 @@
 from data_juicer.utils.constant import HashKeys
 from data_juicer.utils.lazy_loader import LazyLoader
 
-from ..base_op import OPERATORS, Deduplicator
+from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator
 from ..common.helper_func import split_on_whitespace
 
 OP_NAME = 'document_simhash_deduplicator'
@@ -55,9 +55,8 @@ def __init__(self,
             num_blocks
         """
         # about simhash computation
-        super().__init__(extra_requirements=['simhash-pybind'],
-                         *args,
-                         **kwargs)
+        super().__init__(*args, **kwargs)
+        AUTOINSTALL.check(['simhash-pybind'])
         self.tokenization = tokenization
         self.window_size = window_size
         self.lowercase = lowercase

diff --git a/data_juicer/ops/deduplicator/image_deduplicator.py b/data_juicer/ops/deduplicator/image_deduplicator.py
@@ -7,7 +7,7 @@
 from data_juicer.utils.lazy_loader import LazyLoader
 from data_juicer.utils.mm_utils import load_data_with_context, load_image
 
-from ..base_op import OPERATORS, Deduplicator
+from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator
 from ..op_fusion import LOADED_IMAGES
 from .document_deduplicator import DocumentDeduplicator
 
@@ -48,7 +48,8 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
-        super().__init__(extra_requirements=['imagededup'], *args, **kwargs)
+        super().__init__(*args, **kwargs)
+        AUTOINSTALL.check(['imagededup'])
         if method not in HASH_METHOD:
             raise ValueError(f'Keep strategy [{method}] is not supported. '
                              f'Can only be one of {HASH_METHOD}.')

diff --git a/data_juicer/ops/deduplicator/ray_image_deduplicator.py b/data_juicer/ops/deduplicator/ray_image_deduplicator.py
@@ -4,7 +4,7 @@
 from data_juicer.utils.lazy_loader import LazyLoader
 from data_juicer.utils.mm_utils import load_data_with_context, load_image
 
-from ..base_op import OPERATORS
+from ..base_op import AUTOINSTALL, OPERATORS
 from ..op_fusion import LOADED_IMAGES
 from .ray_basic_deduplicator import RayBasicDeduplicator
 
@@ -44,11 +44,11 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
-        super().__init__(extra_requirements=['imagededup'],
-                         redis_host=redis_host,
+        super().__init__(redis_host=redis_host,
                          redis_port=redis_port,
                          *args,
                          **kwargs)
+        AUTOINSTALL.check(['imagededup'])
         if method not in HASH_METHOD:
             raise ValueError(f'Keep strategy [{method}] is not supported. '
                              f'Can only be one of {HASH_METHOD}.')

diff --git a/data_juicer/ops/filter/alphanumeric_filter.py b/data_juicer/ops/filter/alphanumeric_filter.py
@@ -5,7 +5,7 @@
 from data_juicer.utils.constant import Fields, StatsKeys
 from data_juicer.utils.model_utils import get_model, prepare_model
 
-from ..base_op import OPERATORS, Filter
+from ..base_op import AUTOINSTALL, OPERATORS, Filter
 from ..common import get_words_from_document
 
 OP_NAME = 'alphanumeric_filter'
@@ -38,7 +38,8 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
-        super().__init__(extra_requirements=['transformers'], *args, **kwargs)
+        super().__init__(*args, **kwargs)
+        AUTOINSTALL.check(['transformers'])
         self.tokenization = tokenization
         self.min_ratio = min_ratio
         self.max_ratio = max_ratio

diff --git a/data_juicer/ops/filter/flagged_words_filter.py b/data_juicer/ops/filter/flagged_words_filter.py
@@ -8,7 +8,7 @@
 from data_juicer.utils.model_utils import get_model, prepare_model
 
 from ...utils.asset_utils import ASSET_DIR, load_words_asset
-from ..base_op import OPERATORS, Filter
+from ..base_op import AUTOINSTALL, OPERATORS, Filter
 from ..common import (SPECIAL_CHARACTERS, get_words_from_document,
                       words_refinement)
 from ..op_fusion import INTER_WORDS
@@ -51,7 +51,8 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
-        super().__init__(extra_requirements=['sentencepiece'], *args, **kwargs)
+        super().__init__(*args, **kwargs)
+        AUTOINSTALL.check(['sentencepiece'])
         self.lang = lang
         self.max_ratio = max_ratio
         self.use_words_aug = use_words_aug

diff --git a/data_juicer/ops/filter/image_aesthetics_filter.py b/data_juicer/ops/filter/image_aesthetics_filter.py
@@ -7,7 +7,7 @@
 from data_juicer.utils.mm_utils import load_data_with_context, load_image
 
 from ...utils.model_utils import get_model, prepare_model
-from ..base_op import OPERATORS, Filter
+from ..base_op import AUTOINSTALL, OPERATORS, Filter
 from ..op_fusion import LOADED_IMAGES
 
 OP_NAME = 'image_aesthetics_filter'
@@ -49,11 +49,9 @@ def __init__(self,
         :param kwargs: Extra keyword arguments.
         """
 
-        super().__init__(extra_requirements=[
-            'torch', 'transformers', 'simple-aesthetics-predictor'
-        ],
-                         *args,
-                         **kwargs)
+        super().__init__(*args, **kwargs)
+        AUTOINSTALL.check(
+            ['torch', 'transformers', 'simple-aesthetics-predictor'])
         if hf_scorer_model == '':
             hf_scorer_model = \
                 'shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE'

diff --git a/data_juicer/ops/filter/image_face_ratio_filter.py b/data_juicer/ops/filter/image_face_ratio_filter.py
@@ -10,7 +10,7 @@
                                         load_image)
 from data_juicer.utils.model_utils import get_model, prepare_model
 
-from ..base_op import OPERATORS, UNFORKABLE, Filter
+from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Filter
 from ..op_fusion import LOADED_IMAGES
 
 OP_NAME = 'image_face_ratio_filter'
@@ -53,7 +53,8 @@ def __init__(self,
         :param args: Extra positional arguments.
         :param kwargs: Extra keyword arguments.
         """
-        super().__init__(extra_requirements=['opencv-python'], *args, **kwargs)
+        super().__init__(*args, **kwargs)
+        AUTOINSTALL.check(['opencv-python'])
 
         if cv_classifier == '':
             cv_classifier = os.path.join(cv2.data.haarcascades,

diff --git a/data_juicer/ops/filter/image_nsfw_filter.py b/data_juicer/ops/filter/image_nsfw_filter.py
@@ -6,7 +6,7 @@
 from data_juicer.utils.mm_utils import load_data_with_context, load_image
 from data_juicer.utils.model_utils import get_model, prepare_model
 
-from ..base_op import OPERATORS, Filter
+from ..base_op import AUTOINSTALL, OPERATORS, Filter
 from ..op_fusion import LOADED_IMAGES
 
 OP_NAME = 'image_nsfw_filter'
@@ -43,9 +43,8 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
-        super().__init__(extra_requirements=['torch', 'transformers'],
-                         *args,
-                         **kwargs)
+        super().__init__(*args, **kwargs)
+        AUTOINSTALL.check(['torch', 'transformers'])
         self.score_threshold = score_threshold
         if any_or_all not in ['any', 'all']:
             raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '

diff --git a/data_juicer/ops/filter/image_text_matching_filter.py b/data_juicer/ops/filter/image_text_matching_filter.py
@@ -8,7 +8,7 @@
                                         load_image, remove_special_tokens)
 from data_juicer.utils.model_utils import get_model, prepare_model
 
-from ..base_op import OPERATORS, Filter
+from ..base_op import AUTOINSTALL, OPERATORS, Filter
 from ..op_fusion import LOADED_IMAGES
 
 OP_NAME = 'image_text_matching_filter'
@@ -57,9 +57,8 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
-        super().__init__(extra_requirements=['torch', 'transformers'],
-                         *args,
-                         **kwargs)
+        super().__init__(*args, **kwargs)
+        AUTOINSTALL.check(['torch', 'transformers'])
         self.min_score = min_score
         self.max_score = max_score
         if reduce_mode not in ['avg', 'max', 'min']:

diff --git a/data_juicer/ops/filter/image_text_similarity_filter.py b/data_juicer/ops/filter/image_text_similarity_filter.py
@@ -8,7 +8,7 @@
                                         load_image, remove_special_tokens)
 from data_juicer.utils.model_utils import get_model, prepare_model
 
-from ..base_op import OPERATORS, Filter
+from ..base_op import AUTOINSTALL, OPERATORS, Filter
 from ..op_fusion import LOADED_IMAGES
 
 OP_NAME = 'image_text_similarity_filter'
@@ -57,9 +57,8 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
-        super().__init__(extra_requirements=['torch', 'transformers'],
-                         *args,
-                         **kwargs)
+        super().__init__(*args, **kwargs)
+        AUTOINSTALL.check(['torch', 'transformers'])
         self.min_score = min_score
         self.max_score = max_score
         if reduce_mode not in ['avg', 'max', 'min']:

diff --git a/data_juicer/ops/filter/image_watermark_filter.py b/data_juicer/ops/filter/image_watermark_filter.py
@@ -6,7 +6,7 @@
 from data_juicer.utils.mm_utils import load_data_with_context, load_image
 from data_juicer.utils.model_utils import get_model, prepare_model
 
-from ..base_op import OPERATORS, Filter
+from ..base_op import AUTOINSTALL, OPERATORS, Filter
 from ..op_fusion import LOADED_IMAGES
 
 OP_NAME = 'image_watermark_filter'
@@ -47,9 +47,8 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
-        super().__init__(extra_requirements=['torch', 'transformers'],
-                         *args,
-                         **kwargs)
+        super().__init__(*args, **kwargs)
+        AUTOINSTALL.check(['torch', 'transformers'])
         self.prob_threshold = prob_threshold
         if any_or_all not in ['any', 'all']:
             raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '

diff --git a/data_juicer/ops/filter/language_id_score_filter.py b/data_juicer/ops/filter/language_id_score_filter.py
@@ -7,7 +7,7 @@
 from data_juicer.utils.lazy_loader import LazyLoader
 from data_juicer.utils.model_utils import get_model, prepare_model
 
-from ..base_op import OPERATORS, Filter
+from ..base_op import AUTOINSTALL, OPERATORS, Filter
 
 OP_NAME = 'language_id_score_filter'
 
@@ -33,9 +33,8 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
-        super().__init__(extra_requirements=['fasttext', 'fasttext-wheel'],
-                         *args,
-                         **kwargs)
+        super().__init__(*args, **kwargs)
+        AUTOINSTALL.check(['fasttext', 'fasttext-wheel'])
         if not lang:
             # lang is [], '' or None
             self.lang = None

diff --git a/data_juicer/ops/filter/perplexity_filter.py b/data_juicer/ops/filter/perplexity_filter.py
@@ -8,7 +8,7 @@
 from data_juicer.utils.lazy_loader import LazyLoader
 from data_juicer.utils.model_utils import get_model, prepare_model
 
-from ..base_op import OPERATORS, Filter
+from ..base_op import AUTOINSTALL, OPERATORS, Filter
 from ..common import get_words_from_document
 from ..op_fusion import INTER_WORDS
 
@@ -38,9 +38,8 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
-        super().__init__(extra_requirements=['sentencepiece', 'kenlm'],
-                         *args,
-                         **kwargs)
+        super().__init__(*args, **kwargs)
+        AUTOINSTALL.check(['sentencepiece', 'kenlm'])
         self.max_ppl = max_ppl
         self.lang = lang
         self.sp_model_key = prepare_model(model_type='sentencepiece',

diff --git a/data_juicer/ops/filter/phrase_grounding_recall_filter.py b/data_juicer/ops/filter/phrase_grounding_recall_filter.py
@@ -12,7 +12,7 @@
                                         remove_special_tokens)
 from data_juicer.utils.model_utils import get_model, prepare_model
 
-from ..base_op import OPERATORS, Filter
+from ..base_op import AUTOINSTALL, OPERATORS, Filter
 from ..op_fusion import LOADED_IMAGES
 
 OP_NAME = 'phrase_grounding_recall_filter'
@@ -116,9 +116,8 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
-        super().__init__(extra_requirements=['torch', 'transformers', 'nltk'],
-                         *args,
-                         **kwargs)
+        super().__init__(*args, **kwargs)
+        AUTOINSTALL.check(['torch', 'transformers', 'nltk'])
         self.min_recall = min_recall
         self.max_recall = max_recall
         if reduce_mode not in ['avg', 'max', 'min']:

diff --git a/data_juicer/ops/filter/stopwords_filter.py b/data_juicer/ops/filter/stopwords_filter.py
@@ -9,7 +9,7 @@
 from data_juicer.utils.lazy_loader import LazyLoader
 from data_juicer.utils.model_utils import get_model, prepare_model
 
-from ..base_op import OPERATORS, Filter
+from ..base_op import AUTOINSTALL, OPERATORS, Filter
 from ..common import (SPECIAL_CHARACTERS, get_words_from_document,
                       words_refinement)
 from ..op_fusion import INTER_WORDS
@@ -53,7 +53,8 @@ def __init__(self,
         :param args: extra args
         :param kwargs: extra args
         """
-        super().__init__(extra_requirements=['sentencepiece'], *args, **kwargs)
+        super().__init__(*args, **kwargs)
+        AUTOINSTALL.check(['sentencepiece'])
         self.lang = lang
         self.min_ratio = min_ratio
         self.use_words_aug = use_words_aug

diff --git a/data_juicer/ops/filter/text_action_filter.py b/data_juicer/ops/filter/text_action_filter.py
@@ -2,7 +2,7 @@
 from data_juicer.utils.mm_utils import remove_special_tokens
 from data_juicer.utils.model_utils import get_model, prepare_model
 
-from ..base_op import OPERATORS, Filter
+from ..base_op import AUTOINSTALL, OPERATORS, Filter
 
 OP_NAME = 'text_action_filter'
 
@@ -27,7 +27,8 @@ def __init__(self,
             will be filtered if their action number in the text is below this
             parameter.
         """
-        super().__init__(extra_requirements=['spacy-pkuseg'], *args, **kwargs)
+        super().__init__(*args, **kwargs)
+        AUTOINSTALL.check(['spacy-pkuseg'])
 
         if lang not in ['en', 'zh']:
             raise ValueError(

diff --git a/data_juicer/ops/filter/text_entity_dependency_filter.py b/data_juicer/ops/filter/text_entity_dependency_filter.py
@@ -4,7 +4,7 @@
 from data_juicer.utils.mm_utils import remove_special_tokens
 from data_juicer.utils.model_utils import get_model, prepare_model
 
-from ..base_op import OPERATORS, Filter
+from ..base_op import AUTOINSTALL, OPERATORS, Filter
 
 OP_NAME = 'text_entity_dependency_filter'
 
@@ -34,7 +34,8 @@ def __init__(self,
             'any': keep this sample if any object is dependent. 'all': keep
             this sample only if all images are dependent.
         """
-        super().__init__(extra_requirements=['spacy-pkuseg'], *args, **kwargs)
+        super().__init__(*args, **kwargs)
+        AUTOINSTALL.check(['spacy-pkuseg'])
 
         if lang not in ['en', 'zh']:
             raise ValueError(