autoinstall check

modelscope · Aug 23, 2024 · 2a6651d
1 parent 705065b
commit 2a6651d
Show file tree

Hide file tree

Showing 48 changed files with 57 additions and 2 deletions.
diff --git a/data_juicer/ops/deduplicator/document_minhash_deduplicator.py b/data_juicer/ops/deduplicator/document_minhash_deduplicator.py
@@ -105,6 +105,7 @@ class DocumentMinhashDeduplicator(Deduplicator):
     kept in the final dataset.
     """
 
+    @AUTOINSTALL.check(['scipy'])
     def __init__(
         self,
         tokenization: str = 'space',

diff --git a/data_juicer/ops/deduplicator/document_simhash_deduplicator.py b/data_juicer/ops/deduplicator/document_simhash_deduplicator.py
@@ -27,6 +27,7 @@
 class DocumentSimhashDeduplicator(Deduplicator):
     """Deduplicator to deduplicate samples at document-level using SimHash."""
 
+    @AUTOINSTALL.check(['simhash-pybind'])
     def __init__(self,
                  tokenization: str = 'space',
                  window_size: PositiveInt = 6,

diff --git a/data_juicer/ops/deduplicator/image_deduplicator.py b/data_juicer/ops/deduplicator/image_deduplicator.py
@@ -40,6 +40,7 @@ class ImageDeduplicator(Deduplicator):
     of images between documents.
     """
 
+    @AUTOINSTALL.check(['imagededup'])
     def __init__(self,
                  method: str = 'phash',
                  consider_text: bool = False,

diff --git a/data_juicer/ops/deduplicator/ray_image_deduplicator.py b/data_juicer/ops/deduplicator/ray_image_deduplicator.py
@@ -37,6 +37,7 @@ class RayImageDeduplicator(RayBasicDeduplicator):
     of images between documents.
     """
 
+    @AUTOINSTALL.check(['imagededup'])
     def __init__(self,
                  redis_host: str = 'localhost',
                  redis_port: PositiveInt = 6380,

diff --git a/data_juicer/ops/filter/alphanumeric_filter.py b/data_juicer/ops/filter/alphanumeric_filter.py
@@ -21,6 +21,7 @@ class AlphanumericFilter(Filter):
     """Filter to keep samples with alphabet/numeric ratio within a specific
     range."""
 
+    @AUTOINSTALL.check(['transformers'])
     def __init__(self,
                  tokenization: bool = False,
                  min_ratio: float = 0.25,

diff --git a/data_juicer/ops/filter/flagged_words_filter.py b/data_juicer/ops/filter/flagged_words_filter.py
@@ -27,6 +27,7 @@ class FlaggedWordFilter(Filter):
     """Filter to keep samples with flagged-word ratio less than a specific max
     value."""
 
+    @AUTOINSTALL.check(['sentencepiece'])
     def __init__(self,
                  lang: str = 'en',
                  tokenization: bool = False,

diff --git a/data_juicer/ops/filter/image_aesthetics_filter.py b/data_juicer/ops/filter/image_aesthetics_filter.py
@@ -32,6 +32,7 @@ class ImageAestheticsFilter(Filter):
 
     _accelerator = 'cuda'
 
+    @AUTOINSTALL.check(['torch', 'transformers', 'simple-aesthetics-predictor'])
     def __init__(self,
                  hf_scorer_model='',
                  trust_remote_code=False,

diff --git a/data_juicer/ops/filter/image_face_ratio_filter.py b/data_juicer/ops/filter/image_face_ratio_filter.py
@@ -34,6 +34,7 @@ class ImageFaceRatioFilter(Filter):
         'maxSize': None,
     }
 
+    @AUTOINSTALL.check(['opencv-python'])
     def __init__(self,
                  cv_classifier='',
                  min_ratio: ClosedUnitInterval = 0.0,

diff --git a/data_juicer/ops/filter/image_nsfw_filter.py b/data_juicer/ops/filter/image_nsfw_filter.py
@@ -27,6 +27,7 @@ class ImageNSFWFilter(Filter):
 
     _accelerator = 'cuda'
 
+    @AUTOINSTALL.check(['torch', 'transformers'])
     def __init__(self,
                  hf_nsfw_model='Falconsai/nsfw_image_detection',
                  trust_remote_code=False,

diff --git a/data_juicer/ops/filter/image_text_matching_filter.py b/data_juicer/ops/filter/image_text_matching_filter.py
@@ -30,6 +30,7 @@ class ImageTextMatchingFilter(Filter):
 
     _accelerator = 'cuda'
 
+    @AUTOINSTALL.check(['torch', 'transformers'])
     def __init__(self,
                  hf_blip='Salesforce/blip-itm-base-coco',
                  trust_remote_code=False,

diff --git a/data_juicer/ops/filter/image_text_similarity_filter.py b/data_juicer/ops/filter/image_text_similarity_filter.py
@@ -31,6 +31,7 @@ class ImageTextSimilarityFilter(Filter):
 
     _accelerator = 'cuda'
 
+    @AUTOINSTALL.check(['torch', 'transformers'])
     def __init__(self,
                  hf_clip='openai/clip-vit-base-patch32',
                  trust_remote_code=False,

diff --git a/data_juicer/ops/filter/image_watermark_filter.py b/data_juicer/ops/filter/image_watermark_filter.py
@@ -30,6 +30,7 @@ class ImageWatermarkFilter(Filter):
 
     _accelerator = 'cuda'
 
+    @AUTOINSTALL.check(['torch', 'transformers'])
     def __init__(self,
                  hf_watermark_model='amrul-hzz/watermark_detector',
                  trust_remote_code=False,

diff --git a/data_juicer/ops/filter/language_id_score_filter.py b/data_juicer/ops/filter/language_id_score_filter.py
@@ -21,6 +21,7 @@ class LanguageIDScoreFilter(Filter):
     """Filter to keep samples in a specific language with confidence score
     larger than a specific min value."""
 
+    @AUTOINSTALL.check(['fasttext-wheel'])
     def __init__(self,
                  lang: Union[str, List[str], Tuple[str]] = '',
                  min_score: ClosedUnitInterval = 0.8,

diff --git a/data_juicer/ops/filter/perplexity_filter.py b/data_juicer/ops/filter/perplexity_filter.py
@@ -26,6 +26,7 @@ class PerplexityFilter(Filter):
     """Filter to keep samples with perplexity score less than a specific max
     value."""
 
+    @AUTOINSTALL.check(['perplexity_filter'])
     def __init__(self,
                  lang: str = 'en',
                  max_ppl: PositiveFloat = 1500,

diff --git a/data_juicer/ops/filter/phrase_grounding_recall_filter.py b/data_juicer/ops/filter/phrase_grounding_recall_filter.py
@@ -77,6 +77,7 @@ class PhraseGroundingRecallFilter(Filter):
 
     _accelerator = 'cuda'
 
+    @AUTOINSTALL.check(['torch', 'transformers', 'nltk'])
     def __init__(self,
                  hf_owlvit='google/owlvit-base-patch32',
                  trust_remote_code=False,

diff --git a/data_juicer/ops/filter/stopwords_filter.py b/data_juicer/ops/filter/stopwords_filter.py
@@ -27,6 +27,7 @@ class StopWordsFilter(Filter):
     """Filter to keep samples with stopword ratio larger than a specific min
     value."""
 
+    @AUTOINSTALL.check(['sentencepiece'])
     def __init__(self,
                  lang: str = 'en',
                  tokenization: bool = False,

diff --git a/data_juicer/ops/filter/text_entity_dependency_filter.py b/data_juicer/ops/filter/text_entity_dependency_filter.py
@@ -21,6 +21,7 @@ class TextEntityDependencyFilter(Filter):
     and filter them. The text containing no entities will be omitted.
     """
 
+    @AUTOINSTALL.check(['spacy-pkuseg'])
     def __init__(self,
                  lang: str = 'en',
                  min_dependency_num: int = 1,

diff --git a/data_juicer/ops/filter/token_num_filter.py b/data_juicer/ops/filter/token_num_filter.py
@@ -21,6 +21,7 @@ class TokenNumFilter(Filter):
     """Filter to keep samples with total token number within a specific
     range."""
 
+    @AUTOINSTALL.check(['transformers'])
     def __init__(self,
                  hf_tokenizer: str = 'EleutherAI/pythia-6.9b-deduped',
                  min_num: PositiveInt = 10,

diff --git a/data_juicer/ops/filter/video_aesthetics_filter.py b/data_juicer/ops/filter/video_aesthetics_filter.py
@@ -10,7 +10,7 @@
                                         load_data_with_context, load_video)
 
 from ...utils.model_utils import get_model, prepare_model
-from ..base_op import AUTOINSTALL, AUTOINSTALL, OPERATORS, Filter
+from ..base_op import AUTOINSTALL, OPERATORS, Filter
 from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS
 
 OP_NAME = 'video_aesthetics_filter'

diff --git a/data_juicer/ops/filter/video_frames_text_similarity_filter.py b/data_juicer/ops/filter/video_frames_text_similarity_filter.py
@@ -35,6 +35,7 @@ class VideoFramesTextSimilarityFilter(Filter):
 
     _accelerator = 'cuda'
 
+    @AUTOINSTALL.check(['torch', 'transformers'])
     def __init__(self,
                  hf_clip='openai/clip-vit-base-patch32',
                  trust_remote_code=False,

diff --git a/data_juicer/ops/filter/video_motion_score_filter.py b/data_juicer/ops/filter/video_motion_score_filter.py
@@ -43,6 +43,7 @@ class VideoMotionScoreFilter(Filter):
         'flags': 0
     }
 
+    @AUTOINSTALL.check(['opencv-python'])
     def __init__(self,
                  min_score: float = 0.25,
                  max_score: float = sys.float_info.max,

diff --git a/data_juicer/ops/filter/video_nsfw_filter.py b/data_juicer/ops/filter/video_nsfw_filter.py
@@ -31,6 +31,7 @@ class VideoNSFWFilter(Filter):
 
     _accelerator = 'cuda'
 
+    @AUTOINSTALL.check(['torch', 'transformers'])
     def __init__(self,
                  hf_nsfw_model='Falconsai/nsfw_image_detection',
                  trust_remote_code=False,

diff --git a/data_juicer/ops/filter/video_ocr_area_ratio_filter.py b/data_juicer/ops/filter/video_ocr_area_ratio_filter.py
@@ -43,6 +43,7 @@ class VideoOcrAreaRatioFilter(Filter):
 
     _accelerator = 'cuda'
 
+    @AUTOINSTALL.check(['easyocr'])
     def __init__(self,
                  min_area_ratio: ClosedUnitInterval = 0,
                  max_area_ratio: ClosedUnitInterval = 1.0,

diff --git a/data_juicer/ops/filter/video_tagging_from_frames_filter.py b/data_juicer/ops/filter/video_tagging_from_frames_filter.py
@@ -31,6 +31,7 @@ class VideoTaggingFromFramesFilter(Filter):
 
     _accelerator = 'cuda'
 
+    @AUTOINSTALL.check(['torch', 'git+https://github.com/xinyu1205/recognize-anything.git'])
     def __init__(self,
                  tags: List[str] = ['people'],
                  contain: str = 'any',

diff --git a/data_juicer/ops/filter/video_watermark_filter.py b/data_juicer/ops/filter/video_watermark_filter.py
@@ -34,6 +34,7 @@ class VideoWatermarkFilter(Filter):
 
     _accelerator = 'cuda'
 
+    @AUTOINSTALL.check(['torch', 'transformers'])
     def __init__(self,
                  hf_watermark_model='amrul-hzz/watermark_detector',
                  trust_remote_code=False,

diff --git a/data_juicer/ops/filter/word_repetition_filter.py b/data_juicer/ops/filter/word_repetition_filter.py
@@ -26,6 +26,7 @@ class WordRepetitionFilter(Filter):
     """Filter to keep samples with word-level n-gram repetition ratio within a
     specific range."""
 
+    @AUTOINSTALL.check(['sentencepiece'])
     def __init__(self,
                  lang: str = 'en',
                  tokenization: bool = False,

diff --git a/data_juicer/ops/filter/words_num_filter.py b/data_juicer/ops/filter/words_num_filter.py
@@ -24,6 +24,7 @@ class WordsNumFilter(Filter):
     """Filter to keep samples with total words number within a specific
     range."""
 
+    @AUTOINSTALL.check(['sentencepiece'])
     def __init__(self,
                  lang: str = 'en',
                  tokenization: bool = False,

diff --git a/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py b/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py
@@ -19,6 +19,7 @@ class AudioFFmpegWrappedMapper(Mapper):
     """Simple wrapper for FFmpeg audio filters.
     """
 
+    @AUTOINSTALL.check(['ffmpeg-python'])
     def __init__(
         self,
         filter_name: Optional[str] = None,

diff --git a/data_juicer/ops/mapper/chinese_convert_mapper.py b/data_juicer/ops/mapper/chinese_convert_mapper.py
@@ -28,6 +28,7 @@ class ChineseConvertMapper(Mapper):
     """Mapper to convert Chinese between Traditional Chinese, Simplified Chinese
     and Japanese Kanji."""
 
+    @AUTOINSTALL.check(['opencc'])
     def __init__(self, mode: str = 's2t', *args, **kwargs):
         """
         Initialization method.

diff --git a/data_juicer/ops/mapper/clean_html_mapper.py b/data_juicer/ops/mapper/clean_html_mapper.py
@@ -17,6 +17,7 @@
 class CleanHtmlMapper(Mapper):
     """Mapper to clean html code in text samples."""
 
+    @AUTOINSTALL.check(['selectolax'])
     def __init__(self, *args, **kwargs):
         """
         Initialization method.

diff --git a/data_juicer/ops/mapper/fix_unicode_mapper.py b/data_juicer/ops/mapper/fix_unicode_mapper.py
@@ -13,6 +13,7 @@
 class FixUnicodeMapper(Mapper):
     """Mapper to fix unicode errors in text samples."""
 
+    @AUTOINSTALL.check(['ftfy'])
     def __init__(self, normalization: str = None, *args, **kwargs):
         """
         Initialization method.

diff --git a/data_juicer/ops/mapper/image_captioning_mapper.py b/data_juicer/ops/mapper/image_captioning_mapper.py
@@ -38,6 +38,7 @@ class ImageCaptioningMapper(Mapper):
     _accelerator = 'cuda'
     _batched_op = True
 
+    @AUTOINSTALL.check(['torch', 'transformers', 'simhash-pybind'])
     def __init__(self,
                  hf_img2seq='Salesforce/blip2-opt-2.7b',
                  trust_remote_code=False,

diff --git a/data_juicer/ops/mapper/image_diffusion_mapper.py b/data_juicer/ops/mapper/image_diffusion_mapper.py
@@ -37,6 +37,7 @@ class ImageDiffusionMapper(Mapper):
     _accelerator = 'cuda'
     _batched_op = True
 
+    @AUTOINSTALL.check(['diffusers', 'torch', 'transformers', 'simhash-pybind'])
     def __init__(self,
                  hf_diffusion: str = 'CompVis/stable-diffusion-v1-4',
                  trust_remote_code=False,

diff --git a/data_juicer/ops/mapper/image_face_blur_mapper.py b/data_juicer/ops/mapper/image_face_blur_mapper.py
@@ -34,6 +34,7 @@ class ImageFaceBlurMapper(Mapper):
         'maxSize': None,
     }
 
+    @AUTOINSTALL.check(['opencv-python', 'Pillow'])
     def __init__(self,
                  cv_classifier='',
                  blur_type: str = 'gaussian',

diff --git a/data_juicer/ops/mapper/nlpaug_en_mapper.py b/data_juicer/ops/mapper/nlpaug_en_mapper.py
@@ -22,6 +22,7 @@ class NlpaugEnMapper(Mapper):
 
     _batched_op = True
 
+    @AUTOINSTALL.check(['nlpaug'])
     def __init__(self,
                  sequential: bool = False,
                  aug_num: int = 1,

diff --git a/data_juicer/ops/mapper/nlpcda_zh_mapper.py b/data_juicer/ops/mapper/nlpcda_zh_mapper.py
@@ -20,6 +20,7 @@ class NlpcdaZhMapper(Mapper):
 
     _batched_op = True
 
+    @AUTOINSTALL.check(['nlpaug'])
     def __init__(self,
                  sequential: bool = False,
                  aug_num: int = 1,

diff --git a/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.py b/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.py
@@ -19,6 +19,7 @@
 class RemoveWordsWithIncorrectSubstringsMapper(Mapper):
     """Mapper to remove words with incorrect substrings."""
 
+    @AUTOINSTALL.check(['sentencepiece'])
     def __init__(self,
                  lang: str = 'en',
                  tokenization: bool = False,

diff --git a/data_juicer/ops/mapper/sentence_split_mapper.py b/data_juicer/ops/mapper/sentence_split_mapper.py
@@ -15,6 +15,7 @@
 class SentenceSplitMapper(Mapper):
     """Mapper to split text samples to sentences."""
 
+    @AUTOINSTALL.check(['nltk'])
     def __init__(self, lang: str = 'en', *args, **kwargs):
         """
         Initialization method.

diff --git a/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py b/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py
@@ -33,6 +33,7 @@ class VideoCaptioningFromAudioMapper(Mapper):
     _accelerator = 'cuda'
     _batched_op = True
 
+    @AUTOINSTALL.check(['transformers', 'transformers_stream_generator', 'einops', 'accelerate', 'tiktoken'])
     def __init__(self, keep_original_sample: bool = True, *args, **kwargs):
         """
         Initialization method.

diff --git a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py
@@ -45,6 +45,7 @@ class VideoCaptioningFromFramesMapper(Mapper):
     _accelerator = 'cuda'
     _batched_op = True
 
+    @AUTOINSTALL.check(['torch', 'transformers', 'simhash-pybind'])
     def __init__(
         self,
         hf_img2seq='Salesforce/blip2-opt-2.7b',

diff --git a/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py b/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py
@@ -53,6 +53,15 @@ class VideoCaptioningFromSummarizerMapper(Mapper):
     _accelerator = 'cuda'
     _batched_op = True
 
+    @AUTOINSTALL.check(['torch',
+        'transformers',
+        'simhash-pybind',  # by video caption
+        'transformers_stream_generator',
+        'einops',
+        'accelerate',
+        'tiktoken',  # by audio caption
+        'torchaudio',  # by audio tag
+        'git+https://github.com/xinyu1205/recognize-anything.git'])
     def __init__(self,
                  hf_summarizer: str = None,
                  trust_remote_code=False,

diff --git a/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py b/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py
@@ -19,6 +19,7 @@ class VideoFFmpegWrappedMapper(Mapper):
     """Simple wrapper for FFmpeg video filters.
     """
 
+    @AUTOINSTALL.check(['ffmpeg-python'])
     def __init__(
         self,
         filter_name: Optional[str] = None,

diff --git a/data_juicer/ops/mapper/video_remove_watermark_mapper.py b/data_juicer/ops/mapper/video_remove_watermark_mapper.py
@@ -31,6 +31,7 @@ class VideoRemoveWatermarkMapper(Mapper):
         Remove the watermarks in videos given regions.
     """
 
+    @AUTOINSTALL.check(['opencv-python'])
     def __init__(self,
                  roi_strings: List[str] = ['0,0,0.1,0.1'],
                  roi_type: str = 'ratio',

diff --git a/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py b/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py
@@ -62,6 +62,7 @@ class VideoResizeAspectRatioMapper(Mapper):
 
     STRATEGY = ['decrease', 'increase']
 
+    @AUTOINSTALL.check(['ffmpeg-python'])
     def __init__(
         self,
         min_ratio: str = '9/21',

diff --git a/data_juicer/ops/mapper/video_resize_resolution_mapper.py b/data_juicer/ops/mapper/video_resize_resolution_mapper.py
@@ -11,7 +11,7 @@
 from data_juicer.utils.logger_utils import HiddenPrints
 from data_juicer.utils.mm_utils import close_video, load_video
 
-from ..base_op import AUTOINSTALL, AUTOINSTALL, OPERATORS, Mapper
+from ..base_op import AUTOINSTALL, OPERATORS, Mapper
 from ..op_fusion import LOADED_VIDEOS
 
 OP_NAME = 'video_resize_resolution_mapper'
@@ -28,6 +28,7 @@ class VideoResizeResolutionMapper(Mapper):
         with deep learning for future works.
     """
 
+    @AUTOINSTALL.check(['ffmpeg-python'])
     def __init__(self,
                  min_width: PositiveInt = 1,
                  max_width: PositiveInt = sys.maxsize,

diff --git a/data_juicer/ops/mapper/video_split_by_scene_mapper.py b/data_juicer/ops/mapper/video_split_by_scene_mapper.py
@@ -44,6 +44,7 @@ class VideoSplitBySceneMapper(Mapper):
         ['fade_bias', 'add_final_scene', 'method', 'block_size']
     }
 
+    @AUTOINSTALL.check(['scenedetect[opencv]'])
     def __init__(self,
                  detector: str = 'ContentDetector',
                  threshold: NonNegativeFloat = 27.0,

diff --git a/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py b/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py
@@ -27,6 +27,7 @@ class VideoTaggingFromAudioMapper(Mapper):
 
     _accelerator = 'cuda'
 
+    @AUTOINSTALL.check(['torch', 'transformers', 'torchaudio'])
     def __init__(self,
                  hf_ast='MIT/ast-finetuned-audioset-10-10-0.4593',
                  trust_remote_code=False,