diff --git a/data_juicer/ops/deduplicator/document_minhash_deduplicator.py b/data_juicer/ops/deduplicator/document_minhash_deduplicator.py index 205118d24..c183c2715 100644 --- a/data_juicer/ops/deduplicator/document_minhash_deduplicator.py +++ b/data_juicer/ops/deduplicator/document_minhash_deduplicator.py @@ -105,6 +105,7 @@ class DocumentMinhashDeduplicator(Deduplicator): kept in the final dataset. """ + @AUTOINSTALL.check(['scipy']) def __init__( self, tokenization: str = 'space', diff --git a/data_juicer/ops/deduplicator/document_simhash_deduplicator.py b/data_juicer/ops/deduplicator/document_simhash_deduplicator.py index 0b0283ab3..fdcaf4448 100644 --- a/data_juicer/ops/deduplicator/document_simhash_deduplicator.py +++ b/data_juicer/ops/deduplicator/document_simhash_deduplicator.py @@ -27,6 +27,7 @@ class DocumentSimhashDeduplicator(Deduplicator): """Deduplicator to deduplicate samples at document-level using SimHash.""" + @AUTOINSTALL.check(['simhash-pybind']) def __init__(self, tokenization: str = 'space', window_size: PositiveInt = 6, diff --git a/data_juicer/ops/deduplicator/image_deduplicator.py b/data_juicer/ops/deduplicator/image_deduplicator.py index a667a45cb..65f3d3fc5 100644 --- a/data_juicer/ops/deduplicator/image_deduplicator.py +++ b/data_juicer/ops/deduplicator/image_deduplicator.py @@ -40,6 +40,7 @@ class ImageDeduplicator(Deduplicator): of images between documents. """ + @AUTOINSTALL.check(['imagededup']) def __init__(self, method: str = 'phash', consider_text: bool = False, diff --git a/data_juicer/ops/deduplicator/ray_image_deduplicator.py b/data_juicer/ops/deduplicator/ray_image_deduplicator.py index d9d80c540..d6aad6eca 100644 --- a/data_juicer/ops/deduplicator/ray_image_deduplicator.py +++ b/data_juicer/ops/deduplicator/ray_image_deduplicator.py @@ -37,6 +37,7 @@ class RayImageDeduplicator(RayBasicDeduplicator): of images between documents. 
""" + @AUTOINSTALL.check(['imagededup']) def __init__(self, redis_host: str = 'localhost', redis_port: PositiveInt = 6380, diff --git a/data_juicer/ops/filter/alphanumeric_filter.py b/data_juicer/ops/filter/alphanumeric_filter.py index 1b3bfe902..80d2e6ca8 100644 --- a/data_juicer/ops/filter/alphanumeric_filter.py +++ b/data_juicer/ops/filter/alphanumeric_filter.py @@ -21,6 +21,7 @@ class AlphanumericFilter(Filter): """Filter to keep samples with alphabet/numeric ratio within a specific range.""" + @AUTOINSTALL.check(['transformers']) def __init__(self, tokenization: bool = False, min_ratio: float = 0.25, diff --git a/data_juicer/ops/filter/flagged_words_filter.py b/data_juicer/ops/filter/flagged_words_filter.py index d08c6242a..d3bfe04e4 100644 --- a/data_juicer/ops/filter/flagged_words_filter.py +++ b/data_juicer/ops/filter/flagged_words_filter.py @@ -27,6 +27,7 @@ class FlaggedWordFilter(Filter): """Filter to keep samples with flagged-word ratio less than a specific max value.""" + @AUTOINSTALL.check(['sentencepiece']) def __init__(self, lang: str = 'en', tokenization: bool = False, diff --git a/data_juicer/ops/filter/image_aesthetics_filter.py b/data_juicer/ops/filter/image_aesthetics_filter.py index b228427b5..71382d1b9 100644 --- a/data_juicer/ops/filter/image_aesthetics_filter.py +++ b/data_juicer/ops/filter/image_aesthetics_filter.py @@ -32,6 +32,7 @@ class ImageAestheticsFilter(Filter): _accelerator = 'cuda' + @AUTOINSTALL.check(['torch', 'transformers', 'simple-aesthetics-predictor']) def __init__(self, hf_scorer_model='', trust_remote_code=False, diff --git a/data_juicer/ops/filter/image_face_ratio_filter.py b/data_juicer/ops/filter/image_face_ratio_filter.py index f7ff76487..07eed294d 100644 --- a/data_juicer/ops/filter/image_face_ratio_filter.py +++ b/data_juicer/ops/filter/image_face_ratio_filter.py @@ -34,6 +34,7 @@ class ImageFaceRatioFilter(Filter): 'maxSize': None, } + @AUTOINSTALL.check(['opencv-python']) def __init__(self, cv_classifier='', 
min_ratio: ClosedUnitInterval = 0.0, diff --git a/data_juicer/ops/filter/image_nsfw_filter.py b/data_juicer/ops/filter/image_nsfw_filter.py index 3bc24509a..eee847161 100644 --- a/data_juicer/ops/filter/image_nsfw_filter.py +++ b/data_juicer/ops/filter/image_nsfw_filter.py @@ -27,6 +27,7 @@ class ImageNSFWFilter(Filter): _accelerator = 'cuda' + @AUTOINSTALL.check(['torch', 'transformers']) def __init__(self, hf_nsfw_model='Falconsai/nsfw_image_detection', trust_remote_code=False, diff --git a/data_juicer/ops/filter/image_text_matching_filter.py b/data_juicer/ops/filter/image_text_matching_filter.py index a64e1aeff..c1dbb73cd 100644 --- a/data_juicer/ops/filter/image_text_matching_filter.py +++ b/data_juicer/ops/filter/image_text_matching_filter.py @@ -30,6 +30,7 @@ class ImageTextMatchingFilter(Filter): _accelerator = 'cuda' + @AUTOINSTALL.check(['torch', 'transformers']) def __init__(self, hf_blip='Salesforce/blip-itm-base-coco', trust_remote_code=False, diff --git a/data_juicer/ops/filter/image_text_similarity_filter.py b/data_juicer/ops/filter/image_text_similarity_filter.py index 8a7aea4e7..c6f8160cf 100644 --- a/data_juicer/ops/filter/image_text_similarity_filter.py +++ b/data_juicer/ops/filter/image_text_similarity_filter.py @@ -31,6 +31,7 @@ class ImageTextSimilarityFilter(Filter): _accelerator = 'cuda' + @AUTOINSTALL.check(['torch', 'transformers']) def __init__(self, hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, diff --git a/data_juicer/ops/filter/image_watermark_filter.py b/data_juicer/ops/filter/image_watermark_filter.py index a7e7834ae..bcf1d2646 100644 --- a/data_juicer/ops/filter/image_watermark_filter.py +++ b/data_juicer/ops/filter/image_watermark_filter.py @@ -30,6 +30,7 @@ class ImageWatermarkFilter(Filter): _accelerator = 'cuda' + @AUTOINSTALL.check(['torch', 'transformers']) def __init__(self, hf_watermark_model='amrul-hzz/watermark_detector', trust_remote_code=False, diff --git 
a/data_juicer/ops/filter/language_id_score_filter.py b/data_juicer/ops/filter/language_id_score_filter.py index 81f882e6e..79a204c7e 100644 --- a/data_juicer/ops/filter/language_id_score_filter.py +++ b/data_juicer/ops/filter/language_id_score_filter.py @@ -21,6 +21,7 @@ class LanguageIDScoreFilter(Filter): """Filter to keep samples in a specific language with confidence score larger than a specific min value.""" + @AUTOINSTALL.check(['fasttext-wheel']) def __init__(self, lang: Union[str, List[str], Tuple[str]] = '', min_score: ClosedUnitInterval = 0.8, diff --git a/data_juicer/ops/filter/perplexity_filter.py b/data_juicer/ops/filter/perplexity_filter.py index 9bc4be3b0..b83328875 100644 --- a/data_juicer/ops/filter/perplexity_filter.py +++ b/data_juicer/ops/filter/perplexity_filter.py @@ -26,6 +26,7 @@ class PerplexityFilter(Filter): """Filter to keep samples with perplexity score less than a specific max value.""" + @AUTOINSTALL.check(['sentencepiece', 'kenlm']) def __init__(self, lang: str = 'en', max_ppl: PositiveFloat = 1500, diff --git a/data_juicer/ops/filter/phrase_grounding_recall_filter.py b/data_juicer/ops/filter/phrase_grounding_recall_filter.py index 3a5e28c12..ad45d2fb5 100644 --- a/data_juicer/ops/filter/phrase_grounding_recall_filter.py +++ b/data_juicer/ops/filter/phrase_grounding_recall_filter.py @@ -77,6 +77,7 @@ class PhraseGroundingRecallFilter(Filter): _accelerator = 'cuda' + @AUTOINSTALL.check(['torch', 'transformers', 'nltk']) def __init__(self, hf_owlvit='google/owlvit-base-patch32', trust_remote_code=False, diff --git a/data_juicer/ops/filter/stopwords_filter.py b/data_juicer/ops/filter/stopwords_filter.py index 82e78d1cf..58b386219 100644 --- a/data_juicer/ops/filter/stopwords_filter.py +++ b/data_juicer/ops/filter/stopwords_filter.py @@ -27,6 +27,7 @@ class StopWordsFilter(Filter): """Filter to keep samples with stopword ratio larger than a specific min value.""" + @AUTOINSTALL.check(['sentencepiece']) def __init__(self, lang: str = 'en', 
tokenization: bool = False, diff --git a/data_juicer/ops/filter/text_entity_dependency_filter.py b/data_juicer/ops/filter/text_entity_dependency_filter.py index 7db20ab43..39f3d973e 100644 --- a/data_juicer/ops/filter/text_entity_dependency_filter.py +++ b/data_juicer/ops/filter/text_entity_dependency_filter.py @@ -21,6 +21,7 @@ class TextEntityDependencyFilter(Filter): and filter them. The text containing no entities will be omitted. """ + @AUTOINSTALL.check(['spacy-pkuseg']) def __init__(self, lang: str = 'en', min_dependency_num: int = 1, diff --git a/data_juicer/ops/filter/token_num_filter.py b/data_juicer/ops/filter/token_num_filter.py index 6c743e216..83704f08b 100644 --- a/data_juicer/ops/filter/token_num_filter.py +++ b/data_juicer/ops/filter/token_num_filter.py @@ -21,6 +21,7 @@ class TokenNumFilter(Filter): """Filter to keep samples with total token number within a specific range.""" + @AUTOINSTALL.check(['transformers']) def __init__(self, hf_tokenizer: str = 'EleutherAI/pythia-6.9b-deduped', min_num: PositiveInt = 10, diff --git a/data_juicer/ops/filter/video_aesthetics_filter.py b/data_juicer/ops/filter/video_aesthetics_filter.py index 4adaae6ac..55ed66fda 100644 --- a/data_juicer/ops/filter/video_aesthetics_filter.py +++ b/data_juicer/ops/filter/video_aesthetics_filter.py @@ -10,7 +10,7 @@ load_data_with_context, load_video) from ...utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, AUTOINSTALL, OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS OP_NAME = 'video_aesthetics_filter' diff --git a/data_juicer/ops/filter/video_frames_text_similarity_filter.py b/data_juicer/ops/filter/video_frames_text_similarity_filter.py index f52cdc484..e399d5abd 100644 --- a/data_juicer/ops/filter/video_frames_text_similarity_filter.py +++ b/data_juicer/ops/filter/video_frames_text_similarity_filter.py @@ -35,6 +35,7 @@ class 
VideoFramesTextSimilarityFilter(Filter): _accelerator = 'cuda' + @AUTOINSTALL.check(['torch', 'transformers']) def __init__(self, hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, diff --git a/data_juicer/ops/filter/video_motion_score_filter.py b/data_juicer/ops/filter/video_motion_score_filter.py index d4393a673..e54589a32 100644 --- a/data_juicer/ops/filter/video_motion_score_filter.py +++ b/data_juicer/ops/filter/video_motion_score_filter.py @@ -43,6 +43,7 @@ class VideoMotionScoreFilter(Filter): 'flags': 0 } + @AUTOINSTALL.check(['opencv-python']) def __init__(self, min_score: float = 0.25, max_score: float = sys.float_info.max, diff --git a/data_juicer/ops/filter/video_nsfw_filter.py b/data_juicer/ops/filter/video_nsfw_filter.py index 8ecc77c27..91e409676 100644 --- a/data_juicer/ops/filter/video_nsfw_filter.py +++ b/data_juicer/ops/filter/video_nsfw_filter.py @@ -31,6 +31,7 @@ class VideoNSFWFilter(Filter): _accelerator = 'cuda' + @AUTOINSTALL.check(['torch', 'transformers']) def __init__(self, hf_nsfw_model='Falconsai/nsfw_image_detection', trust_remote_code=False, diff --git a/data_juicer/ops/filter/video_ocr_area_ratio_filter.py b/data_juicer/ops/filter/video_ocr_area_ratio_filter.py index 4909614a3..07c419858 100644 --- a/data_juicer/ops/filter/video_ocr_area_ratio_filter.py +++ b/data_juicer/ops/filter/video_ocr_area_ratio_filter.py @@ -43,6 +43,7 @@ class VideoOcrAreaRatioFilter(Filter): _accelerator = 'cuda' + @AUTOINSTALL.check(['easyocr']) def __init__(self, min_area_ratio: ClosedUnitInterval = 0, max_area_ratio: ClosedUnitInterval = 1.0, diff --git a/data_juicer/ops/filter/video_tagging_from_frames_filter.py b/data_juicer/ops/filter/video_tagging_from_frames_filter.py index 16183c26d..49a46ee65 100644 --- a/data_juicer/ops/filter/video_tagging_from_frames_filter.py +++ b/data_juicer/ops/filter/video_tagging_from_frames_filter.py @@ -31,6 +31,7 @@ class VideoTaggingFromFramesFilter(Filter): _accelerator = 'cuda' + 
@AUTOINSTALL.check(['torch', 'git+https://github.com/xinyu1205/recognize-anything.git']) def __init__(self, tags: List[str] = ['people'], contain: str = 'any', diff --git a/data_juicer/ops/filter/video_watermark_filter.py b/data_juicer/ops/filter/video_watermark_filter.py index 64bfa2189..2c022826d 100644 --- a/data_juicer/ops/filter/video_watermark_filter.py +++ b/data_juicer/ops/filter/video_watermark_filter.py @@ -34,6 +34,7 @@ class VideoWatermarkFilter(Filter): _accelerator = 'cuda' + @AUTOINSTALL.check(['torch', 'transformers']) def __init__(self, hf_watermark_model='amrul-hzz/watermark_detector', trust_remote_code=False, diff --git a/data_juicer/ops/filter/word_repetition_filter.py b/data_juicer/ops/filter/word_repetition_filter.py index 3e1487520..5b2eb0402 100644 --- a/data_juicer/ops/filter/word_repetition_filter.py +++ b/data_juicer/ops/filter/word_repetition_filter.py @@ -26,6 +26,7 @@ class WordRepetitionFilter(Filter): """Filter to keep samples with word-level n-gram repetition ratio within a specific range.""" + @AUTOINSTALL.check(['sentencepiece']) def __init__(self, lang: str = 'en', tokenization: bool = False, diff --git a/data_juicer/ops/filter/words_num_filter.py b/data_juicer/ops/filter/words_num_filter.py index 87c7032c8..2d2ddb07e 100644 --- a/data_juicer/ops/filter/words_num_filter.py +++ b/data_juicer/ops/filter/words_num_filter.py @@ -24,6 +24,7 @@ class WordsNumFilter(Filter): """Filter to keep samples with total words number within a specific range.""" + @AUTOINSTALL.check(['sentencepiece']) def __init__(self, lang: str = 'en', tokenization: bool = False, diff --git a/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py b/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py index e2b4bb3fc..2e8804b75 100644 --- a/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py +++ b/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py @@ -19,6 +19,7 @@ class AudioFFmpegWrappedMapper(Mapper): """Simple wrapper for FFmpeg audio filters. 
""" + @AUTOINSTALL.check(['ffmpeg-python']) def __init__( self, filter_name: Optional[str] = None, diff --git a/data_juicer/ops/mapper/chinese_convert_mapper.py b/data_juicer/ops/mapper/chinese_convert_mapper.py index 906e16fad..13cc4c59c 100644 --- a/data_juicer/ops/mapper/chinese_convert_mapper.py +++ b/data_juicer/ops/mapper/chinese_convert_mapper.py @@ -28,6 +28,7 @@ class ChineseConvertMapper(Mapper): """Mapper to convert Chinese between Traditional Chinese, Simplified Chinese and Japanese Kanji.""" + @AUTOINSTALL.check(['opencc']) def __init__(self, mode: str = 's2t', *args, **kwargs): """ Initialization method. diff --git a/data_juicer/ops/mapper/clean_html_mapper.py b/data_juicer/ops/mapper/clean_html_mapper.py index fbde43483..b6b816e45 100644 --- a/data_juicer/ops/mapper/clean_html_mapper.py +++ b/data_juicer/ops/mapper/clean_html_mapper.py @@ -17,6 +17,7 @@ class CleanHtmlMapper(Mapper): """Mapper to clean html code in text samples.""" + @AUTOINSTALL.check(['selectolax']) def __init__(self, *args, **kwargs): """ Initialization method. diff --git a/data_juicer/ops/mapper/fix_unicode_mapper.py b/data_juicer/ops/mapper/fix_unicode_mapper.py index ca15571bb..e9b3188eb 100644 --- a/data_juicer/ops/mapper/fix_unicode_mapper.py +++ b/data_juicer/ops/mapper/fix_unicode_mapper.py @@ -13,6 +13,7 @@ class FixUnicodeMapper(Mapper): """Mapper to fix unicode errors in text samples.""" + @AUTOINSTALL.check(['ftfy']) def __init__(self, normalization: str = None, *args, **kwargs): """ Initialization method. 
diff --git a/data_juicer/ops/mapper/image_captioning_mapper.py b/data_juicer/ops/mapper/image_captioning_mapper.py index 48ed03afa..326f29155 100644 --- a/data_juicer/ops/mapper/image_captioning_mapper.py +++ b/data_juicer/ops/mapper/image_captioning_mapper.py @@ -38,6 +38,7 @@ class ImageCaptioningMapper(Mapper): _accelerator = 'cuda' _batched_op = True + @AUTOINSTALL.check(['torch', 'transformers', 'simhash-pybind']) def __init__(self, hf_img2seq='Salesforce/blip2-opt-2.7b', trust_remote_code=False, diff --git a/data_juicer/ops/mapper/image_diffusion_mapper.py b/data_juicer/ops/mapper/image_diffusion_mapper.py index 511040c7d..8079ddaaa 100644 --- a/data_juicer/ops/mapper/image_diffusion_mapper.py +++ b/data_juicer/ops/mapper/image_diffusion_mapper.py @@ -37,6 +37,7 @@ class ImageDiffusionMapper(Mapper): _accelerator = 'cuda' _batched_op = True + @AUTOINSTALL.check(['diffusers', 'torch', 'transformers', 'simhash-pybind']) def __init__(self, hf_diffusion: str = 'CompVis/stable-diffusion-v1-4', trust_remote_code=False, diff --git a/data_juicer/ops/mapper/image_face_blur_mapper.py b/data_juicer/ops/mapper/image_face_blur_mapper.py index 9da102a23..c835c658e 100644 --- a/data_juicer/ops/mapper/image_face_blur_mapper.py +++ b/data_juicer/ops/mapper/image_face_blur_mapper.py @@ -34,6 +34,7 @@ class ImageFaceBlurMapper(Mapper): 'maxSize': None, } + @AUTOINSTALL.check(['opencv-python', 'Pillow']) def __init__(self, cv_classifier='', blur_type: str = 'gaussian', diff --git a/data_juicer/ops/mapper/nlpaug_en_mapper.py b/data_juicer/ops/mapper/nlpaug_en_mapper.py index 60d0735f6..c658c9f49 100644 --- a/data_juicer/ops/mapper/nlpaug_en_mapper.py +++ b/data_juicer/ops/mapper/nlpaug_en_mapper.py @@ -22,6 +22,7 @@ class NlpaugEnMapper(Mapper): _batched_op = True + @AUTOINSTALL.check(['nlpaug']) def __init__(self, sequential: bool = False, aug_num: int = 1, diff --git a/data_juicer/ops/mapper/nlpcda_zh_mapper.py b/data_juicer/ops/mapper/nlpcda_zh_mapper.py index 
1a17c9c83..2ebac8949 100644 --- a/data_juicer/ops/mapper/nlpcda_zh_mapper.py +++ b/data_juicer/ops/mapper/nlpcda_zh_mapper.py @@ -20,6 +20,7 @@ class NlpcdaZhMapper(Mapper): _batched_op = True + @AUTOINSTALL.check(['nlpcda']) def __init__(self, sequential: bool = False, aug_num: int = 1, diff --git a/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.py b/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.py index 150936fa7..7486d0eb0 100644 --- a/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.py +++ b/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.py @@ -19,6 +19,7 @@ class RemoveWordsWithIncorrectSubstringsMapper(Mapper): """Mapper to remove words with incorrect substrings.""" + @AUTOINSTALL.check(['sentencepiece']) def __init__(self, lang: str = 'en', tokenization: bool = False, diff --git a/data_juicer/ops/mapper/sentence_split_mapper.py b/data_juicer/ops/mapper/sentence_split_mapper.py index 1a0b91c0a..d474b3386 100644 --- a/data_juicer/ops/mapper/sentence_split_mapper.py +++ b/data_juicer/ops/mapper/sentence_split_mapper.py @@ -15,6 +15,7 @@ class SentenceSplitMapper(Mapper): """Mapper to split text samples to sentences.""" + @AUTOINSTALL.check(['nltk']) def __init__(self, lang: str = 'en', *args, **kwargs): """ Initialization method. diff --git a/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py b/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py index f7ccda96c..96ce2df0d 100644 --- a/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py @@ -33,6 +33,7 @@ class VideoCaptioningFromAudioMapper(Mapper): _accelerator = 'cuda' _batched_op = True + @AUTOINSTALL.check(['transformers', 'transformers_stream_generator', 'einops', 'accelerate', 'tiktoken']) def __init__(self, keep_original_sample: bool = True, *args, **kwargs): """ Initialization method. 
diff --git a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py index 93091e9d9..ddf057bf7 100644 --- a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py @@ -45,6 +45,7 @@ class VideoCaptioningFromFramesMapper(Mapper): _accelerator = 'cuda' _batched_op = True + @AUTOINSTALL.check(['torch', 'transformers', 'simhash-pybind']) def __init__( self, hf_img2seq='Salesforce/blip2-opt-2.7b', diff --git a/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py b/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py index 8d5a3cb09..7c2fcd898 100644 --- a/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py @@ -53,6 +53,15 @@ class VideoCaptioningFromSummarizerMapper(Mapper): _accelerator = 'cuda' _batched_op = True + @AUTOINSTALL.check(['torch', + 'transformers', + 'simhash-pybind', # by video caption + 'transformers_stream_generator', + 'einops', + 'accelerate', + 'tiktoken', # by audio caption + 'torchaudio', # by audio tag + 'git+https://github.com/xinyu1205/recognize-anything.git']) def __init__(self, hf_summarizer: str = None, trust_remote_code=False, diff --git a/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py b/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py index 98948d450..0246851eb 100644 --- a/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py +++ b/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py @@ -19,6 +19,7 @@ class VideoFFmpegWrappedMapper(Mapper): """Simple wrapper for FFmpeg video filters. 
""" + @AUTOINSTALL.check(['ffmpeg-python']) def __init__( self, filter_name: Optional[str] = None, diff --git a/data_juicer/ops/mapper/video_remove_watermark_mapper.py b/data_juicer/ops/mapper/video_remove_watermark_mapper.py index 9dd654e19..43e5bce17 100644 --- a/data_juicer/ops/mapper/video_remove_watermark_mapper.py +++ b/data_juicer/ops/mapper/video_remove_watermark_mapper.py @@ -31,6 +31,7 @@ class VideoRemoveWatermarkMapper(Mapper): Remove the watermarks in videos given regions. """ + @AUTOINSTALL.check(['opencv-python']) def __init__(self, roi_strings: List[str] = ['0,0,0.1,0.1'], roi_type: str = 'ratio', diff --git a/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py b/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py index a2a91c5da..03d63babd 100644 --- a/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py +++ b/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py @@ -62,6 +62,7 @@ class VideoResizeAspectRatioMapper(Mapper): STRATEGY = ['decrease', 'increase'] + @AUTOINSTALL.check(['ffmpeg-python']) def __init__( self, min_ratio: str = '9/21', diff --git a/data_juicer/ops/mapper/video_resize_resolution_mapper.py b/data_juicer/ops/mapper/video_resize_resolution_mapper.py index 34e451ce9..eaffa4636 100644 --- a/data_juicer/ops/mapper/video_resize_resolution_mapper.py +++ b/data_juicer/ops/mapper/video_resize_resolution_mapper.py @@ -11,7 +11,7 @@ from data_juicer.utils.logger_utils import HiddenPrints from data_juicer.utils.mm_utils import close_video, load_video -from ..base_op import AUTOINSTALL, AUTOINSTALL, OPERATORS, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, Mapper from ..op_fusion import LOADED_VIDEOS OP_NAME = 'video_resize_resolution_mapper' @@ -28,6 +28,7 @@ class VideoResizeResolutionMapper(Mapper): with deep learning for future works. 
""" + @AUTOINSTALL.check(['ffmpeg-python']) def __init__(self, min_width: PositiveInt = 1, max_width: PositiveInt = sys.maxsize, diff --git a/data_juicer/ops/mapper/video_split_by_scene_mapper.py b/data_juicer/ops/mapper/video_split_by_scene_mapper.py index 16853669d..c3898c69c 100644 --- a/data_juicer/ops/mapper/video_split_by_scene_mapper.py +++ b/data_juicer/ops/mapper/video_split_by_scene_mapper.py @@ -44,6 +44,7 @@ class VideoSplitBySceneMapper(Mapper): ['fade_bias', 'add_final_scene', 'method', 'block_size'] } + @AUTOINSTALL.check(['scenedetect[opencv]']) def __init__(self, detector: str = 'ContentDetector', threshold: NonNegativeFloat = 27.0, diff --git a/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py b/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py index 5cc7f834e..59015aa4e 100644 --- a/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py +++ b/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py @@ -27,6 +27,7 @@ class VideoTaggingFromAudioMapper(Mapper): _accelerator = 'cuda' + @AUTOINSTALL.check(['torch', 'transformers', 'torchaudio']) def __init__(self, hf_ast='MIT/ast-finetuned-audioset-10-10-0.4593', trust_remote_code=False, diff --git a/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py b/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py index ded08f2e1..ba5817d3c 100644 --- a/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py +++ b/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py @@ -34,6 +34,7 @@ class VideoTaggingFromFramesMapper(Mapper): _accelerator = 'cuda' + @AUTOINSTALL.check(['torch', 'git+https://github.com/xinyu1205/recognize-anything.git']) def __init__(self, frame_sampling_method: str = 'all_keyframes', frame_num: PositiveInt = 3,