diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py index 5f602e165..eb88bbd06 100644 --- a/data_juicer/ops/base_op.py +++ b/data_juicer/ops/base_op.py @@ -1,4 +1,5 @@ import copy +import os import traceback from functools import wraps @@ -6,6 +7,7 @@ from loguru import logger from data_juicer import is_cuda_available +from data_juicer.utils.auto_install_utils import AutoInstaller from data_juicer.utils.constant import Fields from data_juicer.utils.mm_utils import size_to_bytes from data_juicer.utils.process_utils import calculate_np @@ -13,7 +15,9 @@ OPERATORS = Registry('Operators') UNFORKABLE = Registry('Unforkable') - +current_path = os.path.dirname(os.path.realpath(__file__)) +version_file_path = os.path.join(current_path, '../../environments/science_requires.txt') +AUTOINSTALL = AutoInstaller(version_file_path) def convert_list_dict_to_dict_list(samples): # reconstruct samples from "list of dicts" to "dict of lists" diff --git a/data_juicer/ops/deduplicator/document_minhash_deduplicator.py b/data_juicer/ops/deduplicator/document_minhash_deduplicator.py index c183c2715..24c62486d 100644 --- a/data_juicer/ops/deduplicator/document_minhash_deduplicator.py +++ b/data_juicer/ops/deduplicator/document_minhash_deduplicator.py @@ -12,7 +12,6 @@ from loguru import logger from tqdm import tqdm -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import HashKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import prepare_sentencepiece_model @@ -22,8 +21,7 @@ OP_NAME = 'document_minhash_deduplicator' -with AvailabilityChecking(['scipy'], OP_NAME): - from scipy.integrate import quad as integrate +integrate = LazyLoader('integrate', globals(), 'scipy.integrate') MERSENNE_PRIME = np.uint64((1 << 61) - 1) MAX_HASH = np.uint64((1 << 32) - 1) @@ -69,7 +67,7 @@ def false_positive_probability(th: float, band: int, rows: int): def proba(s): return 1 - (1 - s**float(rows))**float(band) 
- a, _ = integrate(proba, 0.0, th) + a, _ = integrate.quad(proba, 0.0, th) return a def false_negative_probability(th: float, band: int, rows: int): @@ -78,7 +76,7 @@ def false_negative_probability(th: float, band: int, rows: int): def proba(s): return 1 - (1 - (1 - s**float(rows))**float(band)) - a, _ = integrate(proba, th, 1.0) + a, _ = integrate.quad(proba, th, 1.0) return a # object: minimize the weighted FP and FN ratio diff --git a/data_juicer/ops/deduplicator/document_simhash_deduplicator.py b/data_juicer/ops/deduplicator/document_simhash_deduplicator.py index fdcaf4448..02eb0042b 100644 --- a/data_juicer/ops/deduplicator/document_simhash_deduplicator.py +++ b/data_juicer/ops/deduplicator/document_simhash_deduplicator.py @@ -10,7 +10,6 @@ from jsonargparse.typing import PositiveInt from loguru import logger -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import HashKeys from data_juicer.utils.lazy_loader import LazyLoader @@ -19,8 +18,7 @@ OP_NAME = 'document_simhash_deduplicator' -with AvailabilityChecking(['simhash-pybind'], OP_NAME): - import simhash +simhash = LazyLoader('simhash', globals(), 'simhash') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/deduplicator/image_deduplicator.py b/data_juicer/ops/deduplicator/image_deduplicator.py index 65f3d3fc5..960eedf22 100644 --- a/data_juicer/ops/deduplicator/image_deduplicator.py +++ b/data_juicer/ops/deduplicator/image_deduplicator.py @@ -3,7 +3,6 @@ import numpy as np -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import HashKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import load_data_with_context, load_image @@ -14,22 +13,21 @@ OP_NAME = 'image_deduplicator' -with AvailabilityChecking(['imagededup'], OP_NAME): - import imagededup # noqa: F401 +imagededup = LazyLoader('imagededup', globals(), 'imagededup') - HASH_METHOD = {'phash', 
'dhash', 'whash', 'ahash'} +HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'} - def get_hash_method(method_name): - from imagededup.methods import AHash, DHash, PHash, WHash +def get_hash_method(method_name): + from imagededup.methods import AHash, DHash, PHash, WHash - mapping = { - 'phash': PHash, - 'dhash': DHash, - 'whash': WHash, - 'ahash': AHash - } + mapping = { + 'phash': PHash, + 'dhash': DHash, + 'whash': WHash, + 'ahash': AHash + } - return mapping[method_name] + return mapping[method_name] @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/deduplicator/ray_image_deduplicator.py b/data_juicer/ops/deduplicator/ray_image_deduplicator.py index d6aad6eca..cac805e85 100644 --- a/data_juicer/ops/deduplicator/ray_image_deduplicator.py +++ b/data_juicer/ops/deduplicator/ray_image_deduplicator.py @@ -1,7 +1,6 @@ import numpy as np from jsonargparse.typing import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import load_data_with_context, load_image @@ -11,22 +10,21 @@ OP_NAME = 'ray_image_deduplicator' -with AvailabilityChecking(['imagededup'], OP_NAME): - import imagededup # noqa: F401 +imagededup = LazyLoader('imagededup', globals(), 'imagededup') - HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'} +HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'} - def get_hash_method(method_name): - from imagededup.methods import AHash, DHash, PHash, WHash +def get_hash_method(method_name): + from imagededup.methods import AHash, DHash, PHash, WHash - mapping = { - 'phash': PHash, - 'dhash': DHash, - 'whash': WHash, - 'ahash': AHash - } + mapping = { + 'phash': PHash, + 'dhash': DHash, + 'whash': WHash, + 'ahash': AHash + } - return mapping[method_name] + return mapping[method_name] @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/alphanumeric_filter.py b/data_juicer/ops/filter/alphanumeric_filter.py index 
80d2e6ca8..6b94dbb5e 100644 --- a/data_juicer/ops/filter/alphanumeric_filter.py +++ b/data_juicer/ops/filter/alphanumeric_filter.py @@ -2,9 +2,7 @@ from jsonargparse.typing import PositiveFloat -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model from ..base_op import AUTOINSTALL, OPERATORS, Filter @@ -12,9 +10,6 @@ OP_NAME = 'alphanumeric_filter' -with AvailabilityChecking(['transformers'], OP_NAME): - import transformers # noqa: F401 - @OPERATORS.register_module('alphanumeric_filter') class AlphanumericFilter(Filter): diff --git a/data_juicer/ops/filter/flagged_words_filter.py b/data_juicer/ops/filter/flagged_words_filter.py index d3bfe04e4..6be79fa25 100644 --- a/data_juicer/ops/filter/flagged_words_filter.py +++ b/data_juicer/ops/filter/flagged_words_filter.py @@ -4,9 +4,7 @@ from jsonargparse.typing import ClosedUnitInterval, List -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, InterVars, StatsKeys -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model from ...utils.asset_utils import ASSET_DIR, load_words_asset @@ -17,9 +15,6 @@ OP_NAME = 'flagged_words_filter' -with AvailabilityChecking(['sentencepiece'], OP_NAME): - import sentencepiece # noqa: F401 - @OPERATORS.register_module(OP_NAME) @INTER_WORDS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/image_aesthetics_filter.py b/data_juicer/ops/filter/image_aesthetics_filter.py index 71382d1b9..f8ac5b55d 100644 --- a/data_juicer/ops/filter/image_aesthetics_filter.py +++ b/data_juicer/ops/filter/image_aesthetics_filter.py @@ -2,7 +2,6 @@ from jsonargparse.typing import ClosedUnitInterval from loguru import logger -from data_juicer.utils.availability_utils import 
AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import load_data_with_context, load_image @@ -14,14 +13,7 @@ OP_NAME = 'image_aesthetics_filter' CHECK_PKGs = ['torch', 'transformers', 'simple-aesthetics-predictor'] -with AvailabilityChecking(CHECK_PKGs, OP_NAME): - - import aesthetics_predictor # noqa: F401 - import torch - import transformers # noqa: F401 - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) +torch = LazyLoader('torch', globals(), 'torch') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/image_face_ratio_filter.py b/data_juicer/ops/filter/image_face_ratio_filter.py index 07eed294d..b7cd5c8ce 100644 --- a/data_juicer/ops/filter/image_face_ratio_filter.py +++ b/data_juicer/ops/filter/image_face_ratio_filter.py @@ -4,7 +4,6 @@ from jsonargparse.typing import ClosedUnitInterval from loguru import logger -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (detect_faces, load_data_with_context, @@ -16,8 +15,7 @@ OP_NAME = 'image_face_ratio_filter' -with AvailabilityChecking(['opencv-python'], OP_NAME): - import cv2 +cv2 = LazyLoader('cv2', globals(), 'cv2') @UNFORKABLE.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/image_nsfw_filter.py b/data_juicer/ops/filter/image_nsfw_filter.py index eee847161..40df5b8c3 100644 --- a/data_juicer/ops/filter/image_nsfw_filter.py +++ b/data_juicer/ops/filter/image_nsfw_filter.py @@ -1,7 +1,6 @@ import numpy as np from jsonargparse.typing import ClosedUnitInterval -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils 
import load_data_with_context, load_image @@ -12,12 +11,8 @@ OP_NAME = 'image_nsfw_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - import torch - import transformers # noqa: F401 - - # avoid hanging when calling nsfw detection in multiprocessing - torch.set_num_threads(1) +torch = LazyLoader('torch', globals(), 'torch') +transformers = LazyLoader('transformers', globals(), 'transformers') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/image_text_matching_filter.py b/data_juicer/ops/filter/image_text_matching_filter.py index c1dbb73cd..acc7f1f4b 100644 --- a/data_juicer/ops/filter/image_text_matching_filter.py +++ b/data_juicer/ops/filter/image_text_matching_filter.py @@ -2,7 +2,6 @@ from jsonargparse.typing import ClosedUnitInterval from PIL import ImageOps -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (SpecialTokens, load_data_with_context, @@ -14,12 +13,8 @@ OP_NAME = 'image_text_matching_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - import torch - import transformers # noqa: F401 - - # avoid hanging when calling blip in multiprocessing - torch.set_num_threads(1) +torch = LazyLoader('torch', globals(), 'torch') +transformers = LazyLoader('transformers', globals(), 'transformers') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/image_text_similarity_filter.py b/data_juicer/ops/filter/image_text_similarity_filter.py index c6f8160cf..921cd3517 100644 --- a/data_juicer/ops/filter/image_text_similarity_filter.py +++ b/data_juicer/ops/filter/image_text_similarity_filter.py @@ -2,7 +2,6 @@ from jsonargparse.typing import ClosedUnitInterval from PIL import ImageOps -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from 
data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (SpecialTokens, load_data_with_context, @@ -14,13 +13,8 @@ OP_NAME = 'image_text_similarity_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - - import torch - import transformers # noqa: F401 - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) +torch = LazyLoader('torch', globals(), 'torch') +transformers = LazyLoader('transformers', globals(), 'transformers') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/image_watermark_filter.py b/data_juicer/ops/filter/image_watermark_filter.py index bcf1d2646..cee31caf9 100644 --- a/data_juicer/ops/filter/image_watermark_filter.py +++ b/data_juicer/ops/filter/image_watermark_filter.py @@ -1,7 +1,6 @@ import numpy as np from jsonargparse.typing import ClosedUnitInterval -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import load_data_with_context, load_image @@ -12,12 +11,8 @@ OP_NAME = 'image_watermark_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - import torch - import transformers # noqa: F401 - - # avoid hanging when calling watermark detection in multiprocessing - torch.set_num_threads(1) +torch = LazyLoader('torch', globals(), 'torch') +transformers = LazyLoader('transformers', globals(), 'transformers') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/language_id_score_filter.py b/data_juicer/ops/filter/language_id_score_filter.py index 79a204c7e..6e4a771e3 100644 --- a/data_juicer/ops/filter/language_id_score_filter.py +++ b/data_juicer/ops/filter/language_id_score_filter.py @@ -3,7 +3,6 @@ from jsonargparse.typing import ClosedUnitInterval from loguru import logger -from data_juicer.utils.availability_utils import AvailabilityChecking from 
data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model @@ -12,8 +11,7 @@ OP_NAME = 'language_id_score_filter' -with AvailabilityChecking(['fasttext-wheel'], OP_NAME): - import fasttext # noqa: F401 +fasttext = LazyLoader('fasttext', globals(), 'fasttext') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/perplexity_filter.py b/data_juicer/ops/filter/perplexity_filter.py index b83328875..1921fcf65 100644 --- a/data_juicer/ops/filter/perplexity_filter.py +++ b/data_juicer/ops/filter/perplexity_filter.py @@ -4,7 +4,6 @@ from jsonargparse.typing import PositiveFloat -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, InterVars, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model @@ -15,10 +14,8 @@ OP_NAME = 'perplexity_filter' -with AvailabilityChecking(['sentencepiece', 'kenlm'], OP_NAME): - import kenlm # noqa: F401 - import sentencepiece # noqa: F401 - +kenlm = LazyLoader('kenlm', globals(), 'kenlm') +sentencepiece = LazyLoader('sentencepiece', globals(), 'sentencepiece') @OPERATORS.register_module(OP_NAME) @INTER_WORDS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/phrase_grounding_recall_filter.py b/data_juicer/ops/filter/phrase_grounding_recall_filter.py index ad45d2fb5..196db04e3 100644 --- a/data_juicer/ops/filter/phrase_grounding_recall_filter.py +++ b/data_juicer/ops/filter/phrase_grounding_recall_filter.py @@ -5,7 +5,6 @@ from loguru import logger from PIL import ImageOps -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (SpecialTokens, iou, @@ -18,15 +17,9 @@ OP_NAME = 
'phrase_grounding_recall_filter' -with AvailabilityChecking(['torch', 'transformers', 'nltk'], OP_NAME): - - import torch - import transformers # noqa: F401 - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) - - import nltk +torch = LazyLoader('torch', globals(), 'torch') +transformers = LazyLoader('transformers', globals(), 'transformers') +nltk = LazyLoader('nltk', globals(), 'nltk') # NER algorithm adapted from GLIP starts diff --git a/data_juicer/ops/filter/stopwords_filter.py b/data_juicer/ops/filter/stopwords_filter.py index 58b386219..21c753282 100644 --- a/data_juicer/ops/filter/stopwords_filter.py +++ b/data_juicer/ops/filter/stopwords_filter.py @@ -5,7 +5,6 @@ from jsonargparse.typing import ClosedUnitInterval, List from data_juicer.utils.asset_utils import ASSET_DIR, load_words_asset -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, InterVars, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model @@ -17,8 +16,7 @@ OP_NAME = 'stopwords_filter' -with AvailabilityChecking(['sentencepiece'], OP_NAME): - import sentencepiece # noqa: F401 +sentencepiece = LazyLoader('sentencepiece', globals(), 'sentencepiece') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/text_action_filter.py b/data_juicer/ops/filter/text_action_filter.py index d39ab88e9..aea26c19e 100644 --- a/data_juicer/ops/filter/text_action_filter.py +++ b/data_juicer/ops/filter/text_action_filter.py @@ -1,6 +1,4 @@ -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import remove_special_tokens from data_juicer.utils.model_utils import get_model, prepare_model @@ -8,9 +6,6 @@ OP_NAME = 'text_action_filter' -with 
AvailabilityChecking(['spacy-pkuseg'], OP_NAME): - import spacy_pkuseg # noqa: F401 - @OPERATORS.register_module(OP_NAME) class TextActionFilter(Filter): diff --git a/data_juicer/ops/filter/text_entity_dependency_filter.py b/data_juicer/ops/filter/text_entity_dependency_filter.py index 39f3d973e..49e5dd8a4 100644 --- a/data_juicer/ops/filter/text_entity_dependency_filter.py +++ b/data_juicer/ops/filter/text_entity_dependency_filter.py @@ -1,8 +1,6 @@ import numpy as np -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import remove_special_tokens from data_juicer.utils.model_utils import get_model, prepare_model @@ -10,9 +8,6 @@ OP_NAME = 'text_entity_dependency_filter' -with AvailabilityChecking(['spacy-pkuseg'], OP_NAME): - import spacy_pkuseg # noqa: F401 - @OPERATORS.register_module(OP_NAME) class TextEntityDependencyFilter(Filter): diff --git a/data_juicer/ops/filter/token_num_filter.py b/data_juicer/ops/filter/token_num_filter.py index 83704f08b..b90f9572f 100644 --- a/data_juicer/ops/filter/token_num_filter.py +++ b/data_juicer/ops/filter/token_num_filter.py @@ -2,7 +2,6 @@ from jsonargparse.typing import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model @@ -12,8 +11,7 @@ OP_NAME = 'token_num_filter' -with AvailabilityChecking(['transformers'], OP_NAME): - import transformers # noqa: F401 +transformers = LazyLoader('transformers', globals(), 'transformers') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/video_aesthetics_filter.py b/data_juicer/ops/filter/video_aesthetics_filter.py index 55ed66fda..aacf44158 100644 --- 
a/data_juicer/ops/filter/video_aesthetics_filter.py +++ b/data_juicer/ops/filter/video_aesthetics_filter.py @@ -2,7 +2,6 @@ from jsonargparse.typing import ClosedUnitInterval, PositiveInt from loguru import logger -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (close_video, extract_key_frames, @@ -14,11 +13,8 @@ from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS OP_NAME = 'video_aesthetics_filter' -CHECK_PKGS = ['torch', 'transformers', 'simple-aesthetics-predictor'] torch = LazyLoader('torch', globals(), 'torch') -transformers = LazyLoader('transformers', globals(), 'transformers') -aesthetics_predictor = LazyLoader('aesthetics_predictor', globals(), 'aesthetics_predictor') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/video_frames_text_similarity_filter.py b/data_juicer/ops/filter/video_frames_text_similarity_filter.py index e399d5abd..75bb8a374 100644 --- a/data_juicer/ops/filter/video_frames_text_similarity_filter.py +++ b/data_juicer/ops/filter/video_frames_text_similarity_filter.py @@ -2,7 +2,6 @@ from jsonargparse.typing import ClosedUnitInterval, PositiveInt from PIL import ImageOps -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (SpecialTokens, close_video, @@ -17,13 +16,8 @@ OP_NAME = 'video_frames_text_similarity_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - - import torch - import transformers # noqa: F401 - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) +torch = LazyLoader('torch', globals(), 'torch') +transformers = LazyLoader('transformers', globals(), 'transformers') @OPERATORS.register_module(OP_NAME) diff --git 
a/data_juicer/ops/filter/video_motion_score_filter.py b/data_juicer/ops/filter/video_motion_score_filter.py index e54589a32..572c24960 100644 --- a/data_juicer/ops/filter/video_motion_score_filter.py +++ b/data_juicer/ops/filter/video_motion_score_filter.py @@ -5,7 +5,6 @@ import numpy as np from jsonargparse.typing import PositiveFloat, PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader @@ -13,8 +12,7 @@ OP_NAME = 'video_motion_score_filter' -with AvailabilityChecking(['opencv-python'], OP_NAME): - import cv2 +cv2 = LazyLoader('cv2', globals(), 'cv2') @contextmanager diff --git a/data_juicer/ops/filter/video_nsfw_filter.py b/data_juicer/ops/filter/video_nsfw_filter.py index 91e409676..108add7db 100644 --- a/data_juicer/ops/filter/video_nsfw_filter.py +++ b/data_juicer/ops/filter/video_nsfw_filter.py @@ -1,7 +1,6 @@ import numpy as np from jsonargparse.typing import ClosedUnitInterval, PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (close_video, extract_key_frames, @@ -14,13 +13,8 @@ OP_NAME = 'video_nsfw_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - - import torch - import transformers # noqa: F401 - - # avoid hanging when calling nsfw detection in multiprocessing - torch.set_num_threads(1) +torch = LazyLoader('torch', globals(), 'torch') +transformers = LazyLoader('transformers', globals(), 'transformers') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/video_ocr_area_ratio_filter.py b/data_juicer/ops/filter/video_ocr_area_ratio_filter.py index 07c419858..f47186e45 100644 --- a/data_juicer/ops/filter/video_ocr_area_ratio_filter.py +++ b/data_juicer/ops/filter/video_ocr_area_ratio_filter.py 
@@ -4,7 +4,6 @@ from jsonargparse.typing import ClosedUnitInterval, PositiveInt from data_juicer import cuda_device_count -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (close_video, @@ -16,8 +15,7 @@ OP_NAME = 'video_ocr_area_ratio_filter' -with AvailabilityChecking(['easyocr'], OP_NAME): - import easyocr +easyocr = LazyLoader('easyocr', globals(), 'easyocr') def triangle_area(p1, p2, p3): diff --git a/data_juicer/ops/filter/video_tagging_from_frames_filter.py b/data_juicer/ops/filter/video_tagging_from_frames_filter.py index 49a46ee65..4b0e0e2e1 100644 --- a/data_juicer/ops/filter/video_tagging_from_frames_filter.py +++ b/data_juicer/ops/filter/video_tagging_from_frames_filter.py @@ -1,9 +1,7 @@ import numpy as np from jsonargparse.typing import List, PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields -from data_juicer.utils.lazy_loader import LazyLoader from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Filter from ..mapper.video_tagging_from_frames_mapper import \ @@ -12,15 +10,6 @@ OP_NAME = 'video_tagging_from_frames_filter' -with AvailabilityChecking( - ['torch', 'git+https://github.com/xinyu1205/recognize-anything.git'], - OP_NAME): - import ram # noqa: F401 - import torch - - # avoid hanging when calling recognizeAnything in multiprocessing - torch.set_num_threads(1) - @UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/video_watermark_filter.py b/data_juicer/ops/filter/video_watermark_filter.py index 2c022826d..0406040a8 100644 --- a/data_juicer/ops/filter/video_watermark_filter.py +++ b/data_juicer/ops/filter/video_watermark_filter.py @@ -1,7 +1,6 @@ import numpy as np from jsonargparse.typing import ClosedUnitInterval, PositiveInt -from 
data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (close_video, extract_key_frames, @@ -14,13 +13,7 @@ OP_NAME = 'video_watermark_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - - import torch - import transformers # noqa: F401 - - # avoid hanging when calling watermark detection in multiprocessing - torch.set_num_threads(1) +torch = LazyLoader('torch', globals(), 'torch') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/word_repetition_filter.py b/data_juicer/ops/filter/word_repetition_filter.py index 5b2eb0402..d5223c3da 100644 --- a/data_juicer/ops/filter/word_repetition_filter.py +++ b/data_juicer/ops/filter/word_repetition_filter.py @@ -4,9 +4,7 @@ from jsonargparse.typing import ClosedUnitInterval, PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, InterVars, StatsKeys -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model from ..base_op import AUTOINSTALL, OPERATORS, Filter @@ -16,9 +14,6 @@ OP_NAME = 'word_repetition_filter' -with AvailabilityChecking(['sentencepiece'], OP_NAME): - import sentencepiece # noqa: F401 - @OPERATORS.register_module(OP_NAME) @INTER_WORDS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/words_num_filter.py b/data_juicer/ops/filter/words_num_filter.py index 2d2ddb07e..f272b6ce8 100644 --- a/data_juicer/ops/filter/words_num_filter.py +++ b/data_juicer/ops/filter/words_num_filter.py @@ -2,9 +2,7 @@ from jsonargparse.typing import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, InterVars, StatsKeys -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils 
import get_model, prepare_model from ..base_op import AUTOINSTALL, OPERATORS, Filter @@ -14,9 +12,6 @@ OP_NAME = 'words_num_filter' -with AvailabilityChecking(['sentencepiece'], OP_NAME): - import sentencepiece # noqa: F401 - @OPERATORS.register_module(OP_NAME) @INTER_WORDS.register_module(OP_NAME) diff --git a/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py b/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py index 2e8804b75..0737c4b4b 100644 --- a/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py +++ b/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py @@ -1,6 +1,5 @@ from typing import Dict, List, Optional -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.file_utils import transfer_filename @@ -10,8 +9,7 @@ OP_NAME = 'audio_ffmpeg_wrapped_mapper' -with AvailabilityChecking(['ffmpeg-python'], OP_NAME), HiddenPrints(): - import ffmpeg +ffmpeg = LazyLoader('ffmpeg', globals(), 'ffmpeg') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/mapper/chinese_convert_mapper.py b/data_juicer/ops/mapper/chinese_convert_mapper.py index 13cc4c59c..97620a335 100644 --- a/data_juicer/ops/mapper/chinese_convert_mapper.py +++ b/data_juicer/ops/mapper/chinese_convert_mapper.py @@ -1,12 +1,10 @@ -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.lazy_loader import LazyLoader from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'chinese_convert_mapper' -with AvailabilityChecking(['opencc'], OP_NAME): - import opencc # noqa: F401 +opencc = LazyLoader('opencc', globals(), 'opencc') OPENCC_CONVERTER = None diff --git a/data_juicer/ops/mapper/clean_html_mapper.py b/data_juicer/ops/mapper/clean_html_mapper.py index b6b816e45..907bee65a 100644 --- a/data_juicer/ops/mapper/clean_html_mapper.py +++ b/data_juicer/ops/mapper/clean_html_mapper.py @@ -2,15 +2,13 @@ # 
https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/ # -------------------------------------------------------- -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.lazy_loader import LazyLoader from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'clean_html_mapper' -with AvailabilityChecking(['selectolax'], OP_NAME): - from selectolax.parser import HTMLParser +selectolax = LazyLoader('selectolax', globals(), 'selectolax') @OPERATORS.register_module(OP_NAME) @@ -34,7 +32,7 @@ def _clean_html(raw_html): raw_html = raw_html.replace('', '') raw_html = raw_html.replace('