diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py index 5f602e165..eb88bbd06 100644 --- a/data_juicer/ops/base_op.py +++ b/data_juicer/ops/base_op.py @@ -1,4 +1,5 @@ import copy +import os import traceback from functools import wraps @@ -6,6 +7,7 @@ from loguru import logger from data_juicer import is_cuda_available +from data_juicer.utils.auto_install_utils import AutoInstaller from data_juicer.utils.constant import Fields from data_juicer.utils.mm_utils import size_to_bytes from data_juicer.utils.process_utils import calculate_np @@ -13,7 +15,9 @@ OPERATORS = Registry('Operators') UNFORKABLE = Registry('Unforkable') - +current_path = os.path.dirname(os.path.realpath(__file__)) +version_file_path = os.path.join(current_path, '../../environments/science_requires.txt') +AUTOINSTALL = AutoInstaller(version_file_path) def convert_list_dict_to_dict_list(samples): # reconstruct samples from "list of dicts" to "dict of lists" diff --git a/data_juicer/ops/deduplicator/document_minhash_deduplicator.py b/data_juicer/ops/deduplicator/document_minhash_deduplicator.py index c183c2715..24c62486d 100644 --- a/data_juicer/ops/deduplicator/document_minhash_deduplicator.py +++ b/data_juicer/ops/deduplicator/document_minhash_deduplicator.py @@ -12,7 +12,6 @@ from loguru import logger from tqdm import tqdm -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import HashKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import prepare_sentencepiece_model @@ -22,8 +21,7 @@ OP_NAME = 'document_minhash_deduplicator' -with AvailabilityChecking(['scipy'], OP_NAME): - from scipy.integrate import quad as integrate +integrate = LazyLoader('integrate', globals(), 'scipy.integrate') MERSENNE_PRIME = np.uint64((1 << 61) - 1) MAX_HASH = np.uint64((1 << 32) - 1) @@ -69,7 +67,7 @@ def false_positive_probability(th: float, band: int, rows: int): def proba(s): return 1 - (1 - s**float(rows))**float(band) 
- a, _ = integrate(proba, 0.0, th) + a, _ = integrate.quad(proba, 0.0, th) return a def false_negative_probability(th: float, band: int, rows: int): @@ -78,7 +76,7 @@ def false_negative_probability(th: float, band: int, rows: int): def proba(s): return 1 - (1 - (1 - s**float(rows))**float(band)) - a, _ = integrate(proba, th, 1.0) + a, _ = integrate.quad(proba, th, 1.0) return a # object: minimize the weighted FP and FN ratio diff --git a/data_juicer/ops/deduplicator/document_simhash_deduplicator.py b/data_juicer/ops/deduplicator/document_simhash_deduplicator.py index fdcaf4448..02eb0042b 100644 --- a/data_juicer/ops/deduplicator/document_simhash_deduplicator.py +++ b/data_juicer/ops/deduplicator/document_simhash_deduplicator.py @@ -10,7 +10,6 @@ from jsonargparse.typing import PositiveInt from loguru import logger -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import HashKeys from data_juicer.utils.lazy_loader import LazyLoader @@ -19,8 +18,7 @@ OP_NAME = 'document_simhash_deduplicator' -with AvailabilityChecking(['simhash-pybind'], OP_NAME): - import simhash +simhash = LazyLoader('simhash', globals(), 'simhash') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/deduplicator/image_deduplicator.py b/data_juicer/ops/deduplicator/image_deduplicator.py index 65f3d3fc5..960eedf22 100644 --- a/data_juicer/ops/deduplicator/image_deduplicator.py +++ b/data_juicer/ops/deduplicator/image_deduplicator.py @@ -3,7 +3,6 @@ import numpy as np -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import HashKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import load_data_with_context, load_image @@ -14,22 +13,21 @@ OP_NAME = 'image_deduplicator' -with AvailabilityChecking(['imagededup'], OP_NAME): - import imagededup # noqa: F401 +imagededup = LazyLoader('imagededup', globals(), 'imagededup') - HASH_METHOD = {'phash', 
'dhash', 'whash', 'ahash'} +HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'} - def get_hash_method(method_name): - from imagededup.methods import AHash, DHash, PHash, WHash +def get_hash_method(method_name): + from imagededup.methods import AHash, DHash, PHash, WHash - mapping = { - 'phash': PHash, - 'dhash': DHash, - 'whash': WHash, - 'ahash': AHash - } + mapping = { + 'phash': PHash, + 'dhash': DHash, + 'whash': WHash, + 'ahash': AHash + } - return mapping[method_name] + return mapping[method_name] @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/deduplicator/ray_image_deduplicator.py b/data_juicer/ops/deduplicator/ray_image_deduplicator.py index d6aad6eca..cac805e85 100644 --- a/data_juicer/ops/deduplicator/ray_image_deduplicator.py +++ b/data_juicer/ops/deduplicator/ray_image_deduplicator.py @@ -1,7 +1,6 @@ import numpy as np from jsonargparse.typing import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import load_data_with_context, load_image @@ -11,22 +10,21 @@ OP_NAME = 'ray_image_deduplicator' -with AvailabilityChecking(['imagededup'], OP_NAME): - import imagededup # noqa: F401 +imagededup = LazyLoader('imagededup', globals(), 'imagededup') - HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'} +HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'} - def get_hash_method(method_name): - from imagededup.methods import AHash, DHash, PHash, WHash +def get_hash_method(method_name): + from imagededup.methods import AHash, DHash, PHash, WHash - mapping = { - 'phash': PHash, - 'dhash': DHash, - 'whash': WHash, - 'ahash': AHash - } + mapping = { + 'phash': PHash, + 'dhash': DHash, + 'whash': WHash, + 'ahash': AHash + } - return mapping[method_name] + return mapping[method_name] @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/alphanumeric_filter.py b/data_juicer/ops/filter/alphanumeric_filter.py index 
80d2e6ca8..6b94dbb5e 100644 --- a/data_juicer/ops/filter/alphanumeric_filter.py +++ b/data_juicer/ops/filter/alphanumeric_filter.py @@ -2,9 +2,7 @@ from jsonargparse.typing import PositiveFloat -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model from ..base_op import AUTOINSTALL, OPERATORS, Filter @@ -12,9 +10,6 @@ OP_NAME = 'alphanumeric_filter' -with AvailabilityChecking(['transformers'], OP_NAME): - import transformers # noqa: F401 - @OPERATORS.register_module('alphanumeric_filter') class AlphanumericFilter(Filter): diff --git a/data_juicer/ops/filter/flagged_words_filter.py b/data_juicer/ops/filter/flagged_words_filter.py index d3bfe04e4..6be79fa25 100644 --- a/data_juicer/ops/filter/flagged_words_filter.py +++ b/data_juicer/ops/filter/flagged_words_filter.py @@ -4,9 +4,7 @@ from jsonargparse.typing import ClosedUnitInterval, List -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, InterVars, StatsKeys -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model from ...utils.asset_utils import ASSET_DIR, load_words_asset @@ -17,9 +15,6 @@ OP_NAME = 'flagged_words_filter' -with AvailabilityChecking(['sentencepiece'], OP_NAME): - import sentencepiece # noqa: F401 - @OPERATORS.register_module(OP_NAME) @INTER_WORDS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/image_aesthetics_filter.py b/data_juicer/ops/filter/image_aesthetics_filter.py index 71382d1b9..f8ac5b55d 100644 --- a/data_juicer/ops/filter/image_aesthetics_filter.py +++ b/data_juicer/ops/filter/image_aesthetics_filter.py @@ -2,7 +2,6 @@ from jsonargparse.typing import ClosedUnitInterval from loguru import logger -from data_juicer.utils.availability_utils import 
AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import load_data_with_context, load_image @@ -14,14 +13,7 @@ OP_NAME = 'image_aesthetics_filter' CHECK_PKGs = ['torch', 'transformers', 'simple-aesthetics-predictor'] -with AvailabilityChecking(CHECK_PKGs, OP_NAME): - - import aesthetics_predictor # noqa: F401 - import torch - import transformers # noqa: F401 - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) +torch = LazyLoader('torch', globals(), 'torch') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/image_face_ratio_filter.py b/data_juicer/ops/filter/image_face_ratio_filter.py index 07eed294d..b7cd5c8ce 100644 --- a/data_juicer/ops/filter/image_face_ratio_filter.py +++ b/data_juicer/ops/filter/image_face_ratio_filter.py @@ -4,7 +4,6 @@ from jsonargparse.typing import ClosedUnitInterval from loguru import logger -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (detect_faces, load_data_with_context, @@ -16,8 +15,7 @@ OP_NAME = 'image_face_ratio_filter' -with AvailabilityChecking(['opencv-python'], OP_NAME): - import cv2 +cv2 = LazyLoader('cv2', globals(), 'cv2') @UNFORKABLE.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/image_nsfw_filter.py b/data_juicer/ops/filter/image_nsfw_filter.py index eee847161..40df5b8c3 100644 --- a/data_juicer/ops/filter/image_nsfw_filter.py +++ b/data_juicer/ops/filter/image_nsfw_filter.py @@ -1,7 +1,6 @@ import numpy as np from jsonargparse.typing import ClosedUnitInterval -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils 
import load_data_with_context, load_image @@ -12,12 +11,8 @@ OP_NAME = 'image_nsfw_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - import torch - import transformers # noqa: F401 - - # avoid hanging when calling nsfw detection in multiprocessing - torch.set_num_threads(1) +torch = LazyLoader('torch', globals(), 'torch') +transformers = LazyLoader('transformers', globals(), 'transformers') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/image_text_matching_filter.py b/data_juicer/ops/filter/image_text_matching_filter.py index c1dbb73cd..acc7f1f4b 100644 --- a/data_juicer/ops/filter/image_text_matching_filter.py +++ b/data_juicer/ops/filter/image_text_matching_filter.py @@ -2,7 +2,6 @@ from jsonargparse.typing import ClosedUnitInterval from PIL import ImageOps -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (SpecialTokens, load_data_with_context, @@ -14,12 +13,8 @@ OP_NAME = 'image_text_matching_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - import torch - import transformers # noqa: F401 - - # avoid hanging when calling blip in multiprocessing - torch.set_num_threads(1) +torch = LazyLoader('torch', globals(), 'torch') +transformers = LazyLoader('transformers', globals(), 'transformers') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/image_text_similarity_filter.py b/data_juicer/ops/filter/image_text_similarity_filter.py index c6f8160cf..921cd3517 100644 --- a/data_juicer/ops/filter/image_text_similarity_filter.py +++ b/data_juicer/ops/filter/image_text_similarity_filter.py @@ -2,7 +2,6 @@ from jsonargparse.typing import ClosedUnitInterval from PIL import ImageOps -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from 
data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (SpecialTokens, load_data_with_context, @@ -14,13 +13,8 @@ OP_NAME = 'image_text_similarity_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - - import torch - import transformers # noqa: F401 - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) +torch = LazyLoader('torch', globals(), 'torch') +transformers = LazyLoader('transformers', globals(), 'transformers') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/image_watermark_filter.py b/data_juicer/ops/filter/image_watermark_filter.py index bcf1d2646..cee31caf9 100644 --- a/data_juicer/ops/filter/image_watermark_filter.py +++ b/data_juicer/ops/filter/image_watermark_filter.py @@ -1,7 +1,6 @@ import numpy as np from jsonargparse.typing import ClosedUnitInterval -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import load_data_with_context, load_image @@ -12,12 +11,8 @@ OP_NAME = 'image_watermark_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - import torch - import transformers # noqa: F401 - - # avoid hanging when calling watermark detection in multiprocessing - torch.set_num_threads(1) +torch = LazyLoader('torch', globals(), 'torch') +transformers = LazyLoader('transformers', globals(), 'transformers') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/language_id_score_filter.py b/data_juicer/ops/filter/language_id_score_filter.py index 79a204c7e..6e4a771e3 100644 --- a/data_juicer/ops/filter/language_id_score_filter.py +++ b/data_juicer/ops/filter/language_id_score_filter.py @@ -3,7 +3,6 @@ from jsonargparse.typing import ClosedUnitInterval from loguru import logger -from data_juicer.utils.availability_utils import AvailabilityChecking from 
data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model @@ -12,8 +11,7 @@ OP_NAME = 'language_id_score_filter' -with AvailabilityChecking(['fasttext-wheel'], OP_NAME): - import fasttext # noqa: F401 +fasttext = LazyLoader('fasttext', globals(), 'fasttext') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/perplexity_filter.py b/data_juicer/ops/filter/perplexity_filter.py index b83328875..1921fcf65 100644 --- a/data_juicer/ops/filter/perplexity_filter.py +++ b/data_juicer/ops/filter/perplexity_filter.py @@ -4,7 +4,6 @@ from jsonargparse.typing import PositiveFloat -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, InterVars, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model @@ -15,10 +14,8 @@ OP_NAME = 'perplexity_filter' -with AvailabilityChecking(['sentencepiece', 'kenlm'], OP_NAME): - import kenlm # noqa: F401 - import sentencepiece # noqa: F401 - +kenlm = LazyLoader('kenlm', globals(), 'kenlm') +sentencepiece = LazyLoader('sentencepiece', globals(), 'sentencepiece') @OPERATORS.register_module(OP_NAME) @INTER_WORDS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/phrase_grounding_recall_filter.py b/data_juicer/ops/filter/phrase_grounding_recall_filter.py index ad45d2fb5..196db04e3 100644 --- a/data_juicer/ops/filter/phrase_grounding_recall_filter.py +++ b/data_juicer/ops/filter/phrase_grounding_recall_filter.py @@ -5,7 +5,6 @@ from loguru import logger from PIL import ImageOps -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (SpecialTokens, iou, @@ -18,15 +17,9 @@ OP_NAME = 
'phrase_grounding_recall_filter' -with AvailabilityChecking(['torch', 'transformers', 'nltk'], OP_NAME): - - import torch - import transformers # noqa: F401 - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) - - import nltk +torch = LazyLoader('torch', globals(), 'torch') +transformers = LazyLoader('transformers', globals(), 'transformers') +nltk = LazyLoader('nltk', globals(), 'nltk') # NER algorithm adapted from GLIP starts diff --git a/data_juicer/ops/filter/stopwords_filter.py b/data_juicer/ops/filter/stopwords_filter.py index 58b386219..21c753282 100644 --- a/data_juicer/ops/filter/stopwords_filter.py +++ b/data_juicer/ops/filter/stopwords_filter.py @@ -5,7 +5,6 @@ from jsonargparse.typing import ClosedUnitInterval, List from data_juicer.utils.asset_utils import ASSET_DIR, load_words_asset -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, InterVars, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model @@ -17,8 +16,7 @@ OP_NAME = 'stopwords_filter' -with AvailabilityChecking(['sentencepiece'], OP_NAME): - import sentencepiece # noqa: F401 +sentencepiece = LazyLoader('sentencepiece', globals(), 'sentencepiece') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/text_action_filter.py b/data_juicer/ops/filter/text_action_filter.py index d39ab88e9..aea26c19e 100644 --- a/data_juicer/ops/filter/text_action_filter.py +++ b/data_juicer/ops/filter/text_action_filter.py @@ -1,6 +1,4 @@ -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import remove_special_tokens from data_juicer.utils.model_utils import get_model, prepare_model @@ -8,9 +6,6 @@ OP_NAME = 'text_action_filter' -with 
AvailabilityChecking(['spacy-pkuseg'], OP_NAME): - import spacy_pkuseg # noqa: F401 - @OPERATORS.register_module(OP_NAME) class TextActionFilter(Filter): diff --git a/data_juicer/ops/filter/text_entity_dependency_filter.py b/data_juicer/ops/filter/text_entity_dependency_filter.py index 39f3d973e..49e5dd8a4 100644 --- a/data_juicer/ops/filter/text_entity_dependency_filter.py +++ b/data_juicer/ops/filter/text_entity_dependency_filter.py @@ -1,8 +1,6 @@ import numpy as np -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import remove_special_tokens from data_juicer.utils.model_utils import get_model, prepare_model @@ -10,9 +8,6 @@ OP_NAME = 'text_entity_dependency_filter' -with AvailabilityChecking(['spacy-pkuseg'], OP_NAME): - import spacy_pkuseg # noqa: F401 - @OPERATORS.register_module(OP_NAME) class TextEntityDependencyFilter(Filter): diff --git a/data_juicer/ops/filter/token_num_filter.py b/data_juicer/ops/filter/token_num_filter.py index 83704f08b..b90f9572f 100644 --- a/data_juicer/ops/filter/token_num_filter.py +++ b/data_juicer/ops/filter/token_num_filter.py @@ -2,7 +2,6 @@ from jsonargparse.typing import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model @@ -12,8 +11,7 @@ OP_NAME = 'token_num_filter' -with AvailabilityChecking(['transformers'], OP_NAME): - import transformers # noqa: F401 +transformers = LazyLoader('transformers', globals(), 'transformers') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/video_aesthetics_filter.py b/data_juicer/ops/filter/video_aesthetics_filter.py index 55ed66fda..aacf44158 100644 --- 
a/data_juicer/ops/filter/video_aesthetics_filter.py +++ b/data_juicer/ops/filter/video_aesthetics_filter.py @@ -2,7 +2,6 @@ from jsonargparse.typing import ClosedUnitInterval, PositiveInt from loguru import logger -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (close_video, extract_key_frames, @@ -14,11 +13,8 @@ from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS OP_NAME = 'video_aesthetics_filter' -CHECK_PKGS = ['torch', 'transformers', 'simple-aesthetics-predictor'] torch = LazyLoader('torch', globals(), 'torch') -transformers = LazyLoader('transformers', globals(), 'transformers') -aesthetics_predictor = LazyLoader('aesthetics_predictor', globals(), 'aesthetics_predictor') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/video_frames_text_similarity_filter.py b/data_juicer/ops/filter/video_frames_text_similarity_filter.py index e399d5abd..75bb8a374 100644 --- a/data_juicer/ops/filter/video_frames_text_similarity_filter.py +++ b/data_juicer/ops/filter/video_frames_text_similarity_filter.py @@ -2,7 +2,6 @@ from jsonargparse.typing import ClosedUnitInterval, PositiveInt from PIL import ImageOps -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (SpecialTokens, close_video, @@ -17,13 +16,8 @@ OP_NAME = 'video_frames_text_similarity_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - - import torch - import transformers # noqa: F401 - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) +torch = LazyLoader('torch', globals(), 'torch') +transformers = LazyLoader('transformers', globals(), 'transformers') @OPERATORS.register_module(OP_NAME) diff --git 
a/data_juicer/ops/filter/video_motion_score_filter.py b/data_juicer/ops/filter/video_motion_score_filter.py index e54589a32..572c24960 100644 --- a/data_juicer/ops/filter/video_motion_score_filter.py +++ b/data_juicer/ops/filter/video_motion_score_filter.py @@ -5,7 +5,6 @@ import numpy as np from jsonargparse.typing import PositiveFloat, PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader @@ -13,8 +12,7 @@ OP_NAME = 'video_motion_score_filter' -with AvailabilityChecking(['opencv-python'], OP_NAME): - import cv2 +cv2 = LazyLoader('cv2', globals(), 'cv2') @contextmanager diff --git a/data_juicer/ops/filter/video_nsfw_filter.py b/data_juicer/ops/filter/video_nsfw_filter.py index 91e409676..108add7db 100644 --- a/data_juicer/ops/filter/video_nsfw_filter.py +++ b/data_juicer/ops/filter/video_nsfw_filter.py @@ -1,7 +1,6 @@ import numpy as np from jsonargparse.typing import ClosedUnitInterval, PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (close_video, extract_key_frames, @@ -14,13 +13,8 @@ OP_NAME = 'video_nsfw_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - - import torch - import transformers # noqa: F401 - - # avoid hanging when calling nsfw detection in multiprocessing - torch.set_num_threads(1) +torch = LazyLoader('torch', globals(), 'torch') +transformers = LazyLoader('transformers', globals(), 'transformers') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/video_ocr_area_ratio_filter.py b/data_juicer/ops/filter/video_ocr_area_ratio_filter.py index 07c419858..f47186e45 100644 --- a/data_juicer/ops/filter/video_ocr_area_ratio_filter.py +++ b/data_juicer/ops/filter/video_ocr_area_ratio_filter.py 
@@ -4,7 +4,6 @@ from jsonargparse.typing import ClosedUnitInterval, PositiveInt from data_juicer import cuda_device_count -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (close_video, @@ -16,8 +15,7 @@ OP_NAME = 'video_ocr_area_ratio_filter' -with AvailabilityChecking(['easyocr'], OP_NAME): - import easyocr +easyocr = LazyLoader('easyocr', globals(), 'easyocr') def triangle_area(p1, p2, p3): diff --git a/data_juicer/ops/filter/video_tagging_from_frames_filter.py b/data_juicer/ops/filter/video_tagging_from_frames_filter.py index 49a46ee65..4b0e0e2e1 100644 --- a/data_juicer/ops/filter/video_tagging_from_frames_filter.py +++ b/data_juicer/ops/filter/video_tagging_from_frames_filter.py @@ -1,9 +1,7 @@ import numpy as np from jsonargparse.typing import List, PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields -from data_juicer.utils.lazy_loader import LazyLoader from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Filter from ..mapper.video_tagging_from_frames_mapper import \ @@ -12,15 +10,6 @@ OP_NAME = 'video_tagging_from_frames_filter' -with AvailabilityChecking( - ['torch', 'git+https://github.com/xinyu1205/recognize-anything.git'], - OP_NAME): - import ram # noqa: F401 - import torch - - # avoid hanging when calling recognizeAnything in multiprocessing - torch.set_num_threads(1) - @UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/video_watermark_filter.py b/data_juicer/ops/filter/video_watermark_filter.py index 2c022826d..0406040a8 100644 --- a/data_juicer/ops/filter/video_watermark_filter.py +++ b/data_juicer/ops/filter/video_watermark_filter.py @@ -1,7 +1,6 @@ import numpy as np from jsonargparse.typing import ClosedUnitInterval, PositiveInt -from 
data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (close_video, extract_key_frames, @@ -14,13 +13,7 @@ OP_NAME = 'video_watermark_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - - import torch - import transformers # noqa: F401 - - # avoid hanging when calling watermark detection in multiprocessing - torch.set_num_threads(1) +torch = LazyLoader('torch', globals(), 'torch') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/word_repetition_filter.py b/data_juicer/ops/filter/word_repetition_filter.py index 5b2eb0402..d5223c3da 100644 --- a/data_juicer/ops/filter/word_repetition_filter.py +++ b/data_juicer/ops/filter/word_repetition_filter.py @@ -4,9 +4,7 @@ from jsonargparse.typing import ClosedUnitInterval, PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, InterVars, StatsKeys -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model from ..base_op import AUTOINSTALL, OPERATORS, Filter @@ -16,9 +14,6 @@ OP_NAME = 'word_repetition_filter' -with AvailabilityChecking(['sentencepiece'], OP_NAME): - import sentencepiece # noqa: F401 - @OPERATORS.register_module(OP_NAME) @INTER_WORDS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/words_num_filter.py b/data_juicer/ops/filter/words_num_filter.py index 2d2ddb07e..f272b6ce8 100644 --- a/data_juicer/ops/filter/words_num_filter.py +++ b/data_juicer/ops/filter/words_num_filter.py @@ -2,9 +2,7 @@ from jsonargparse.typing import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, InterVars, StatsKeys -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils 
import get_model, prepare_model from ..base_op import AUTOINSTALL, OPERATORS, Filter @@ -14,9 +12,6 @@ OP_NAME = 'words_num_filter' -with AvailabilityChecking(['sentencepiece'], OP_NAME): - import sentencepiece # noqa: F401 - @OPERATORS.register_module(OP_NAME) @INTER_WORDS.register_module(OP_NAME) diff --git a/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py b/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py index 2e8804b75..0737c4b4b 100644 --- a/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py +++ b/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py @@ -1,6 +1,5 @@ from typing import Dict, List, Optional -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.file_utils import transfer_filename @@ -10,8 +9,7 @@ OP_NAME = 'audio_ffmpeg_wrapped_mapper' -with AvailabilityChecking(['ffmpeg-python'], OP_NAME), HiddenPrints(): - import ffmpeg +ffmpeg = LazyLoader('ffmpeg', globals(), 'ffmpeg') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/mapper/chinese_convert_mapper.py b/data_juicer/ops/mapper/chinese_convert_mapper.py index 13cc4c59c..97620a335 100644 --- a/data_juicer/ops/mapper/chinese_convert_mapper.py +++ b/data_juicer/ops/mapper/chinese_convert_mapper.py @@ -1,12 +1,10 @@ -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.lazy_loader import LazyLoader from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'chinese_convert_mapper' -with AvailabilityChecking(['opencc'], OP_NAME): - import opencc # noqa: F401 +opencc = LazyLoader('opencc', globals(), 'opencc') OPENCC_CONVERTER = None diff --git a/data_juicer/ops/mapper/clean_html_mapper.py b/data_juicer/ops/mapper/clean_html_mapper.py index b6b816e45..907bee65a 100644 --- a/data_juicer/ops/mapper/clean_html_mapper.py +++ b/data_juicer/ops/mapper/clean_html_mapper.py @@ -2,15 +2,13 @@ # 
https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/ # -------------------------------------------------------- -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.lazy_loader import LazyLoader from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'clean_html_mapper' -with AvailabilityChecking(['selectolax'], OP_NAME): - from selectolax.parser import HTMLParser +selectolax = LazyLoader('selectolax', globals(), 'selectolax') @OPERATORS.register_module(OP_NAME) @@ -34,7 +32,7 @@ def _clean_html(raw_html): raw_html = raw_html.replace('</li>', '') raw_html = raw_html.replace('<ol>', '\n*') raw_html = raw_html.replace('</ol>', '') - parser = HTMLParser(raw_html) + parser = selectolax.parser.HTMLParser(raw_html) return parser.text() sample[self.text_key] = _clean_html(sample[self.text_key]) diff --git a/data_juicer/ops/mapper/fix_unicode_mapper.py b/data_juicer/ops/mapper/fix_unicode_mapper.py index e9b3188eb..8f55ab4cd 100644 --- a/data_juicer/ops/mapper/fix_unicode_mapper.py +++ b/data_juicer/ops/mapper/fix_unicode_mapper.py @@ -1,12 +1,10 @@ -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.lazy_loader import LazyLoader from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'fix_unicode_mapper' -with AvailabilityChecking(['ftfy'], OP_NAME): - import ftfy +ftfy = LazyLoader('ftfy', globals(), 'ftfy') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/mapper/image_captioning_mapper.py b/data_juicer/ops/mapper/image_captioning_mapper.py index 326f29155..f042e5961 100644 --- a/data_juicer/ops/mapper/image_captioning_mapper.py +++ b/data_juicer/ops/mapper/image_captioning_mapper.py @@ -5,9 +5,7 @@ from jsonargparse.typing import PositiveInt from loguru import logger -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import HashKeys -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (SpecialTokens, insert_texts_after_placeholders, load_image, remove_non_special_tokens, @@ -19,15 +17,6 @@ OP_NAME = 'image_captioning_mapper' -with AvailabilityChecking(['torch', 'transformers', 'simhash-pybind'], - OP_NAME): - import simhash # noqa: F401 - import torch - import transformers # noqa: F401 - - # avoid hanging when calling model in multiprocessing - torch.set_num_threads(1) - @OPERATORS.register_module(OP_NAME) @LOADED_IMAGES.register_module(OP_NAME) diff --git a/data_juicer/ops/mapper/image_diffusion_mapper.py b/data_juicer/ops/mapper/image_diffusion_mapper.py index 8079ddaaa..f82e3ab68 100644 --- 
a/data_juicer/ops/mapper/image_diffusion_mapper.py +++ b/data_juicer/ops/mapper/image_diffusion_mapper.py @@ -3,9 +3,7 @@ from PIL import Image -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.file_utils import transfer_filename from data_juicer.utils.mm_utils import (SpecialTokens, load_data_with_context, load_image, remove_special_tokens) @@ -16,16 +14,6 @@ OP_NAME = 'image_diffusion_mapper' -check_list = ['diffusers', 'torch', 'transformers', 'simhash-pybind'] -with AvailabilityChecking(check_list, OP_NAME): - import diffusers # noqa: F401 - import simhash # noqa: F401 - import torch - import transformers # noqa: F401 - - # avoid hanging when calling stable diffusion in multiprocessing - torch.set_num_threads(1) - @OPERATORS.register_module(OP_NAME) @LOADED_IMAGES.register_module(OP_NAME) diff --git a/data_juicer/ops/mapper/image_face_blur_mapper.py b/data_juicer/ops/mapper/image_face_blur_mapper.py index c835c658e..d5c8cb363 100644 --- a/data_juicer/ops/mapper/image_face_blur_mapper.py +++ b/data_juicer/ops/mapper/image_face_blur_mapper.py @@ -2,7 +2,6 @@ from loguru import logger -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.file_utils import transfer_filename @@ -15,9 +14,8 @@ OP_NAME = 'image_face_blur_mapper' -with AvailabilityChecking(['opencv-python', 'Pillow'], OP_NAME): - import cv2 - from PIL import ImageFilter +cv2 = LazyLoader('cv2', globals(), 'cv2') +PIL = LazyLoader('PIL', globals(), 'PIL') @UNFORKABLE.register_module(OP_NAME) @@ -66,11 +64,11 @@ def __init__(self, raise ValueError('Radius must be >= 0. 
') if blur_type == 'mean': - self.blur = ImageFilter.BLUR + self.blur = PIL.ImageFilter.BLUR elif blur_type == 'box': - self.blur = ImageFilter.BoxBlur(radius) + self.blur = PIL.ImageFilter.BoxBlur(radius) else: - self.blur = ImageFilter.GaussianBlur(radius) + self.blur = PIL.ImageFilter.GaussianBlur(radius) self.blur_type = blur_type self.radius = radius diff --git a/data_juicer/ops/mapper/nlpaug_en_mapper.py b/data_juicer/ops/mapper/nlpaug_en_mapper.py index c658c9f49..6c7a6b0a8 100644 --- a/data_juicer/ops/mapper/nlpaug_en_mapper.py +++ b/data_juicer/ops/mapper/nlpaug_en_mapper.py @@ -2,18 +2,13 @@ from loguru import logger -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.lazy_loader import LazyLoader from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'nlpaug_en_mapper' -with AvailabilityChecking(['nlpaug'], OP_NAME): - import nlpaug.augmenter.char as nac - import nlpaug.augmenter.word as naw - import nlpaug.flow as naf - from nlpaug.util import Action +nlpaug = LazyLoader('nlpaug', globals(), 'nlpaug') @OPERATORS.register_module(OP_NAME) @@ -99,6 +94,8 @@ def __init__(self, aug_pipeline = [] # word level + naw = nlpaug.augmenter.word + Action = nlpaug.util.Action if delete_random_word: aug_pipeline.append(naw.RandomWordAug(action=Action.DELETE)) if swap_random_word: @@ -109,6 +106,7 @@ def __init__(self, aug_pipeline.append(naw.SplitAug()) # char level + nac = nlpaug.augmenter.char if keyboard_error_char: aug_pipeline.append(nac.KeyboardAug()) if ocr_error_char: @@ -121,7 +119,7 @@ def __init__(self, aug_pipeline.append(nac.RandomCharAug(action=Action.INSERT)) if self.sequential: - self.aug = naf.Sequential(aug_pipeline) + self.aug = nlpaug.flow.Sequential(aug_pipeline) else: self.aug = aug_pipeline diff --git a/data_juicer/ops/mapper/nlpcda_zh_mapper.py b/data_juicer/ops/mapper/nlpcda_zh_mapper.py index 2ebac8949..e52977dd2 100644 --- a/data_juicer/ops/mapper/nlpcda_zh_mapper.py +++ 
b/data_juicer/ops/mapper/nlpcda_zh_mapper.py @@ -2,7 +2,6 @@ from loguru import logger -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.logger_utils import HiddenPrints @@ -10,8 +9,7 @@ OP_NAME = 'nlpcda_zh_mapper' -with AvailabilityChecking(['nlpcda'], OP_NAME), HiddenPrints(): - import nlpcda +nlpcda = LazyLoader('nlpcda', globals(), 'nlpcda') @OPERATORS.register_module(OP_NAME) @@ -20,7 +18,7 @@ class NlpcdaZhMapper(Mapper): _batched_op = True - @AUTOINSTALL.check(['nlpaug']) + @AUTOINSTALL.check(['nlpcda']) def __init__(self, sequential: bool = False, aug_num: int = 1, diff --git a/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.py b/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.py index 7486d0eb0..1a617fca7 100644 --- a/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.py +++ b/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.py @@ -1,7 +1,5 @@ from jsonargparse.typing import List -from data_juicer.utils.availability_utils import AvailabilityChecking -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model from ..base_op import AUTOINSTALL, OPERATORS, Mapper @@ -11,9 +9,6 @@ OP_NAME = 'remove_words_with_incorrect_substrings_mapper' -with AvailabilityChecking(['sentencepiece'], OP_NAME): - import sentencepiece # noqa: F401 - @OPERATORS.register_module(OP_NAME) class RemoveWordsWithIncorrectSubstringsMapper(Mapper): diff --git a/data_juicer/ops/mapper/sentence_split_mapper.py b/data_juicer/ops/mapper/sentence_split_mapper.py index d474b3386..5dca3a002 100644 --- a/data_juicer/ops/mapper/sentence_split_mapper.py +++ b/data_juicer/ops/mapper/sentence_split_mapper.py @@ -1,5 +1,3 @@ -from data_juicer.utils.availability_utils import AvailabilityChecking -from data_juicer.utils.lazy_loader import LazyLoader from 
data_juicer.utils.model_utils import get_model, prepare_model from ..base_op import AUTOINSTALL, OPERATORS, Mapper @@ -7,9 +5,6 @@ OP_NAME = 'sentence_split_mapper' -with AvailabilityChecking(['nltk'], OP_NAME): - import nltk # noqa: F401 - @OPERATORS.register_module(OP_NAME) class SentenceSplitMapper(Mapper): diff --git a/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py b/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py index 96ce2df0d..0fd2ad6a8 100644 --- a/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py @@ -3,25 +3,12 @@ import regex as re -from data_juicer.utils.availability_utils import AvailabilityChecking -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import SpecialTokens, extract_audio_from_video from data_juicer.utils.model_utils import get_model, prepare_model from ..base_op import AUTOINSTALL, OPERATORS, Mapper NAME = 'video_captioning_from_audio_mapper' -CHECK_PKGS = [ - 'transformers', 'transformers_stream_generator', 'einops', 'accelerate', - 'tiktoken' -] - -with AvailabilityChecking(CHECK_PKGS, NAME): - import accelerate # noqa: F401 - import einops # noqa: F401 - import tiktoken # noqa: F401 - import transformers # noqa: F401 - import transformers_stream_generator # noqa: F401 @OPERATORS.register_module(NAME) diff --git a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py index ddf057bf7..38bc3e9c9 100644 --- a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py @@ -7,9 +7,7 @@ from loguru import logger from PIL import ImageOps -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import HashKeys -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (SpecialTokens, 
close_video, extract_key_frames, extract_video_frames_uniformly, @@ -24,16 +22,6 @@ OP_NAME = 'video_captioning_from_frames_mapper' -with AvailabilityChecking(['torch', 'transformers', 'simhash-pybind'], - OP_NAME): - - import simhash # noqa: F401 - import torch - import transformers # noqa: F401 - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) - @OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) diff --git a/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py b/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py index 7c2fcd898..6f0bcfad8 100644 --- a/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py @@ -3,44 +3,13 @@ from jsonargparse.typing import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import SpecialTokens, remove_special_tokens from data_juicer.utils.model_utils import get_model, prepare_model from ..base_op import AUTOINSTALL, OPERATORS, Mapper NAME = 'video_captioning_from_summarizer_mapper' -CHECK_PKGS = [ - 'torch', - 'transformers', - 'simhash-pybind', # by video caption - 'transformers_stream_generator', - 'einops', - 'accelerate', - 'tiktoken', # by audio caption - 'torchaudio', # by audio tag - 'git+https://github.com/xinyu1205/recognize-anything.git', # by frame tag -] - -with AvailabilityChecking(CHECK_PKGS, NAME): - # video caption - # audio caption - import accelerate # noqa: F401 - import einops # noqa: F401 - # frame tag - import ram # noqa: F401 - import simhash # noqa: F401 - import tiktoken # noqa: F401 - import torch - # audio tag - import torchaudio # noqa: F401 - import transformers # noqa: F401 - import transformers_stream_generator # noqa: F401 - - # avoid hanging when calling clip in 
multiprocessing - torch.set_num_threads(1) @OPERATORS.register_module(NAME) diff --git a/data_juicer/ops/mapper/video_captioning_from_video_mapper.py b/data_juicer/ops/mapper/video_captioning_from_video_mapper.py index 6661ed76d..57db603a7 100644 --- a/data_juicer/ops/mapper/video_captioning_from_video_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_video_mapper.py @@ -7,9 +7,7 @@ from loguru import logger from PIL import ImageOps -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import HashKeys -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (SpecialTokens, close_video, extract_key_frames, extract_video_frames_uniformly, @@ -24,16 +22,6 @@ OP_NAME = 'video_captioning_from_video_mapper' -with AvailabilityChecking(['torch', 'transformers', 'simhash-pybind'], - OP_NAME): - - import simhash # noqa: F401 - import torch - import transformers # noqa: F401 - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) - @OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) @@ -44,6 +32,7 @@ class VideoCaptioningFromVideoMapper(Mapper): _accelerator = 'cuda' _batched_op = True + @AUTOINSTALL.check(['torch', 'transformers', 'simhash-pybind']) def __init__( self, hf_video_blip='kpyu/video-blip-opt-2.7b-ego4d', diff --git a/data_juicer/ops/mapper/video_face_blur_mapper.py b/data_juicer/ops/mapper/video_face_blur_mapper.py index dd9d603cf..cda5c33f1 100644 --- a/data_juicer/ops/mapper/video_face_blur_mapper.py +++ b/data_juicer/ops/mapper/video_face_blur_mapper.py @@ -2,7 +2,6 @@ import av -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.file_utils import transfer_filename @@ -16,9 +15,8 @@ OP_NAME = 'video_face_blur_mapper' -with AvailabilityChecking(['opencv-python', 'Pillow'], 
OP_NAME): - import cv2 - from PIL import ImageFilter +cv2 = LazyLoader('cv2', globals(), 'cv2') +PIL = LazyLoader('PIL', globals(), 'PIL') @UNFORKABLE.register_module(OP_NAME) @@ -35,6 +33,7 @@ class VideoFaceBlurMapper(Mapper): 'maxSize': None, } + @AUTOINSTALL.check(['opencv-python', 'Pillow']) def __init__(self, cv_classifier='', blur_type: str = 'gaussian', @@ -66,11 +65,11 @@ def __init__(self, raise ValueError('Radius must be >= 0. ') if blur_type == 'mean': - self.blur = ImageFilter.BLUR + self.blur = PIL.ImageFilter.BLUR elif blur_type == 'box': - self.blur = ImageFilter.BoxBlur(radius) + self.blur = PIL.ImageFilter.BoxBlur(radius) else: - self.blur = ImageFilter.GaussianBlur(radius) + self.blur = PIL.ImageFilter.GaussianBlur(radius) self.blur_type = blur_type self.radius = radius diff --git a/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py b/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py index 0246851eb..ed84c7e5b 100644 --- a/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py +++ b/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py @@ -1,6 +1,5 @@ from typing import Dict, List, Optional -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.file_utils import transfer_filename @@ -10,8 +9,7 @@ OP_NAME = 'video_ffmpeg_wrapped_mapper' -with AvailabilityChecking(['ffmpeg-python'], OP_NAME), HiddenPrints(): - import ffmpeg +ffmpeg = LazyLoader('ffmpeg', globals(), 'ffmpeg') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/mapper/video_remove_watermark_mapper.py b/data_juicer/ops/mapper/video_remove_watermark_mapper.py index 43e5bce17..798a70560 100644 --- a/data_juicer/ops/mapper/video_remove_watermark_mapper.py +++ b/data_juicer/ops/mapper/video_remove_watermark_mapper.py @@ -4,7 +4,6 @@ import numpy as np from jsonargparse.typing import List, PositiveInt -from 
data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.file_utils import transfer_filename @@ -20,8 +19,7 @@ OP_NAME = 'video_remove_watermark_mapper' -with AvailabilityChecking(['opencv-python'], OP_NAME), HiddenPrints(): - import cv2 as cv +cv2 = LazyLoader('cv2', globals(), 'cv2') @OPERATORS.register_module(OP_NAME) @@ -115,9 +113,9 @@ def _detect_watermark_via_pixel_value(self, frames, rois): for roi in rois: # dimension of ndarray frame: height x width x channel roi_frame = frame[roi[1]:roi[3], roi[0]:roi[2]] - gray_frame = cv.cvtColor(roi_frame, cv.COLOR_BGR2GRAY) - _, binary_frame = cv.threshold( - gray_frame, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU) + gray_frame = cv2.cvtColor(roi_frame, cv2.COLOR_BGR2GRAY) + _, binary_frame = cv2.threshold( + gray_frame, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) # assume the watermark is located in the box, so the pixel in # the edge must be 0, if not, reverse binary_frame @@ -156,8 +154,8 @@ def _detect_watermark_via_pixel_diversity(self, frames, rois): else: scaled_diversity = np.zeros_like(pixel_diversity) scaled_diversity = scaled_diversity.astype(np.uint8) - _, binary_frame = cv.threshold(scaled_diversity, 0, 255, - cv.THRESH_BINARY + cv.THRESH_OTSU) + _, binary_frame = cv2.threshold(scaled_diversity, 0, 255, + cv2.THRESH_BINARY + cv2.THRESH_OTSU) # the watermark pixels have less diversity binary_frame = ~binary_frame mask[roi[1]:roi[3], @@ -196,11 +194,11 @@ def _generate_watermark_mask(self, video, sample): mask = self._detect_watermark_via_pixel_diversity(frames, rois) kernel = np.ones((5, 5), np.uint8) - return cv.dilate(mask, kernel) + return cv2.dilate(mask, kernel) def _clean_watermark(self, frame, watermark_mask): np_frame = frame.to_ndarray(format='bgr24') - new_np_frame = cv.inpaint(np_frame, watermark_mask, 3, cv.INPAINT_NS) + new_np_frame = cv2.inpaint(np_frame, 
watermark_mask, 3, cv2.INPAINT_NS) return av.VideoFrame.from_ndarray(new_np_frame, format='bgr24') def process(self, sample, context=False): diff --git a/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py b/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py index 03d63babd..f89897830 100644 --- a/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py +++ b/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py @@ -2,7 +2,6 @@ import os from fractions import Fraction -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.file_utils import transfer_filename @@ -13,8 +12,7 @@ OP_NAME = 'video_resize_aspect_ratio_mapper' -with AvailabilityChecking(['ffmpeg-python'], OP_NAME), HiddenPrints(): - import ffmpeg +ffmpeg = LazyLoader('ffmpeg', globals(), 'ffmpeg') def rescale(width, height, ori_ratio, min_ratio, max_ratio, strategy): diff --git a/data_juicer/ops/mapper/video_resize_resolution_mapper.py b/data_juicer/ops/mapper/video_resize_resolution_mapper.py index eaffa4636..bac221a9c 100644 --- a/data_juicer/ops/mapper/video_resize_resolution_mapper.py +++ b/data_juicer/ops/mapper/video_resize_resolution_mapper.py @@ -4,7 +4,6 @@ from jsonargparse.typing import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.file_utils import transfer_filename @@ -16,8 +15,7 @@ OP_NAME = 'video_resize_resolution_mapper' -with AvailabilityChecking(['ffmpeg-python'], OP_NAME), HiddenPrints(): - import ffmpeg +ffmpeg = LazyLoader('ffmpeg', globals(), 'ffmpeg') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/mapper/video_split_by_scene_mapper.py b/data_juicer/ops/mapper/video_split_by_scene_mapper.py index c3898c69c..5d59b01ff 100644 --- 
a/data_juicer/ops/mapper/video_split_by_scene_mapper.py +++ b/data_juicer/ops/mapper/video_split_by_scene_mapper.py @@ -4,7 +4,6 @@ from jsonargparse.typing import NonNegativeFloat, NonNegativeInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.file_utils import (add_suffix_to_filename, @@ -15,9 +14,7 @@ OP_NAME = 'video_split_by_scene_mapper' -with AvailabilityChecking(['scenedetect[opencv]'], OP_NAME): - import scenedetect.detectors - from scenedetect import detect, split_video_ffmpeg +scenedetect = LazyLoader('scenedetect', globals(), 'scenedetect') def replace_func(match, scene_counts_iter): @@ -109,7 +106,7 @@ def process(self, sample, context=False): # detect scenes detector = self.detector_class(self.threshold, self.min_scene_len, **self.detector_kwargs) - scene_list = detect(video_key, + scene_list = scenedetect.detect(video_key, detector, show_progress=self.show_progress, start_in_scene=True) @@ -124,7 +121,7 @@ def process(self, sample, context=False): for i in range(len(scene_list)) ] # split video into clips - split_video_ffmpeg(input_video_path=video_key, + scenedetect.split_video_ffmpeg(input_video_path=video_key, scene_list=scene_list, output_file_template=output_template, show_progress=self.show_progress) diff --git a/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py b/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py index 59015aa4e..e9a8d78b8 100644 --- a/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py +++ b/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py @@ -1,6 +1,5 @@ import librosa -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import extract_audio_from_video @@ -10,13 +9,7 @@ OP_NAME = 
'video_tagging_from_audio_mapper' -with AvailabilityChecking(['torch', 'transformers', 'torchaudio'], OP_NAME): - import torch - import torchaudio # noqa: F401 - import transformers # noqa: F401 - - # avoid hanging when calling recognizeAnything in multiprocessing - torch.set_num_threads(1) +torch = LazyLoader('torch', globals(), 'torch') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py b/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py index ba5817d3c..820b779eb 100644 --- a/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py +++ b/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py @@ -2,7 +2,6 @@ from jsonargparse.typing import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (close_video, extract_key_frames, @@ -15,14 +14,8 @@ OP_NAME = 'video_tagging_from_frames_mapper' -with AvailabilityChecking( - ['torch', 'git+https://github.com/xinyu1205/recognize-anything.git'], - OP_NAME): - import ram # noqa: F401 - import torch - - # avoid hanging when calling recognizeAnything in multiprocessing - torch.set_num_threads(1) +ram = LazyLoader('ram', globals(), 'ram') +torch = LazyLoader('torch', globals(), 'torch') @UNFORKABLE.register_module(OP_NAME) @@ -70,8 +63,7 @@ def __init__(self, input_size=384) self.frame_sampling_method = frame_sampling_method self.frame_num = frame_num - from ram import get_transform - self.transform = get_transform(image_size=384) + self.transform = ram.get_transform(image_size=384) def process(self, sample, rank=None, context=False): # check if it's generated already diff --git a/data_juicer/utils/auto_install_utils.py b/data_juicer/utils/auto_install_utils.py index 307fcbc35..a4f285bec 100644 --- a/data_juicer/utils/auto_install_utils.py +++ b/data_juicer/utils/auto_install_utils.py @@ 
-50,7 +50,7 @@ def inner_wrapper(*args, **kwargs): pkg = self.version_map[pkg] subprocess.check_call(["pip", "install", pkg]) logger.info(f"The {pkg} installed.") - if pkg == torch: + if pkg == 'torch': _torch_check_and_set() return func(*args, **kwargs) return inner_wrapper