From fcf8748192613a822034b2c09d83aac2a23b5430 Mon Sep 17 00:00:00 2001 From: Haibin Date: Thu, 17 Oct 2024 19:25:35 +0800 Subject: [PATCH] autoinstall in lazyloader --- data_juicer/analysis/collector.py | 19 +-- data_juicer/analysis/measure.py | 23 +-- data_juicer/core/ray_data.py | 9 +- data_juicer/core/ray_executor.py | 7 +- data_juicer/format/empty_formatter.py | 5 +- data_juicer/ops/base_op.py | 6 - .../document_minhash_deduplicator.py | 7 +- .../document_simhash_deduplicator.py | 7 +- .../ops/deduplicator/image_deduplicator.py | 15 +- .../deduplicator/ray_basic_deduplicator.py | 12 +- .../deduplicator/ray_image_deduplicator.py | 15 +- data_juicer/ops/filter/alphanumeric_filter.py | 3 +- .../ops/filter/flagged_words_filter.py | 3 +- .../ops/filter/image_aesthetics_filter.py | 9 +- .../ops/filter/image_face_ratio_filter.py | 7 +- data_juicer/ops/filter/image_nsfw_filter.py | 8 +- .../filter/image_pair_similarity_filter.py | 12 +- .../ops/filter/image_text_matching_filter.py | 7 +- .../filter/image_text_similarity_filter.py | 7 +- .../ops/filter/image_watermark_filter.py | 8 +- .../ops/filter/language_id_score_filter.py | 7 +- data_juicer/ops/filter/perplexity_filter.py | 7 +- .../filter/phrase_grounding_recall_filter.py | 8 +- data_juicer/ops/filter/stopwords_filter.py | 6 +- data_juicer/ops/filter/text_action_filter.py | 6 +- .../filter/text_entity_dependency_filter.py | 6 +- data_juicer/ops/filter/token_num_filter.py | 6 +- .../ops/filter/video_aesthetics_filter.py | 8 +- .../video_frames_text_similarity_filter.py | 7 +- .../ops/filter/video_motion_score_filter.py | 7 +- data_juicer/ops/filter/video_nsfw_filter.py | 8 +- .../ops/filter/video_ocr_area_ratio_filter.py | 7 +- .../video_tagging_from_frames_filter.py | 6 +- .../ops/filter/video_watermark_filter.py | 7 +- .../ops/filter/word_repetition_filter.py | 3 +- data_juicer/ops/filter/words_num_filter.py | 3 +- data_juicer/ops/load.py | 9 -- .../ops/mapper/audio_ffmpeg_wrapped_mapper.py | 7 +- .../ops/mapper/chinese_convert_mapper.py | 7 +- data_juicer/ops/mapper/clean_html_mapper.py | 7 +- data_juicer/ops/mapper/extract_qa_mapper.py | 17 +-- data_juicer/ops/mapper/fix_unicode_mapper.py | 7 +- .../ops/mapper/generate_instruction_mapper.py | 17 +-- .../ops/mapper/image_captioning_mapper.py | 9 +- .../ops/mapper/image_diffusion_mapper.py | 4 +- .../ops/mapper/image_face_blur_mapper.py | 15 +- .../ops/mapper/image_tagging_mapper.py | 17 +-- data_juicer/ops/mapper/nlpaug_en_mapper.py | 7 +- data_juicer/ops/mapper/nlpcda_zh_mapper.py | 7 +- .../ops/mapper/optimize_instruction_mapper.py | 18 +-- ..._words_with_incorrect_substrings_mapper.py | 3 +- .../ops/mapper/sentence_split_mapper.py | 3 +- .../video_captioning_from_audio_mapper.py | 3 +- .../video_captioning_from_frames_mapper.py | 10 +- ...video_captioning_from_summarizer_mapper.py | 5 +- .../video_captioning_from_video_mapper.py | 10 +- .../ops/mapper/video_face_blur_mapper.py | 15 +- .../ops/mapper/video_ffmpeg_wrapped_mapper.py | 7 +- .../mapper/video_remove_watermark_mapper.py | 7 +- .../video_resize_aspect_ratio_mapper.py | 7 +- .../mapper/video_resize_resolution_mapper.py | 7 +- .../ops/mapper/video_split_by_scene_mapper.py | 7 +- .../mapper/video_tagging_from_audio_mapper.py | 10 +- .../video_tagging_from_frames_mapper.py | 10 +- data_juicer/utils/auto_install_mapping.py | 103 ++++++++++++++ data_juicer/utils/auto_install_utils.py | 68 +++++---- data_juicer/utils/availability_utils.py | 104 ++++---------- data_juicer/utils/lazy_loader.py | 27 +++- data_juicer/utils/mm_utils.py 
| 5 +- data_juicer/utils/model_utils.py | 133 ++++++++---------- data_juicer/utils/unittest_utils.py | 6 +- docs/DeveloperGuide.md | 7 +- docs/DeveloperGuide_ZH.md | 7 +- environments/minimal_requires.txt | 1 + environments/science_requires.txt | 3 +- tests/ops/mapper/test_extract_qa_mapper.py | 1 + .../test_generate_instruction_mapper.py | 1 + tests/ops/mapper/test_image_blur_mapper.py | 7 +- .../test_video_resize_resolution_mapper.py | 4 +- 79 files changed, 491 insertions(+), 529 deletions(-) create mode 100644 data_juicer/utils/auto_install_mapping.py diff --git a/data_juicer/analysis/collector.py b/data_juicer/analysis/collector.py index c55cde352..d36b8a532 100644 --- a/data_juicer/analysis/collector.py +++ b/data_juicer/analysis/collector.py @@ -1,10 +1,10 @@ from itertools import chain -import torch -from torch.distributions import Categorical -from transformers import AutoTokenizer - from data_juicer.format import load_formatter +from data_juicer.utils.lazy_loader import LazyLoader + +torch = LazyLoader('torch', 'torch') +transformers = LazyLoader('transformers', 'transformers') class TextTokenDistCollector(object): @@ -18,11 +18,14 @@ def __init__(self, tokenizer): :param tokenizer: tokenizer name on huggingface """ - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer, - trust_remote_code=True) + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + tokenizer, trust_remote_code=True) self.vocab_size = len(self.tokenizer) - def collect(self, data_path, text_key, num_proc=1) -> 'Categorical': + def collect(self, + data_path, + text_key, + num_proc=1) -> 'torch.distributions.Categorical': """ Tokenize and collect tokens distribution of input dataset :param data_path: path to input dataset. @@ -63,5 +66,5 @@ def _tokenize_fn(example, ): list(chain.from_iterable(dataset['input_ids']))) indices, counts = token_ids.unique(return_counts=True) token_count.scatter_(0, indices, counts.to(token_count.dtype)) - dist = Categorical(token_count) + dist = torch.distributions.Categorical(token_count) return dist diff --git a/data_juicer/analysis/measure.py b/data_juicer/analysis/measure.py index 80e7c42f8..fe54cdabd 100644 --- a/data_juicer/analysis/measure.py +++ b/data_juicer/analysis/measure.py @@ -1,7 +1,8 @@ -import torch -import torch.nn.functional as F -from torch import Tensor -from torch.distributions import Categorical +from data_juicer.utils.lazy_loader import LazyLoader + +torch = LazyLoader('torch', 'torch') +td = LazyLoader('td', 'torch.distributions') +F = LazyLoader('F', 'torch.nn.functional') class Measure(object): @@ -22,9 +23,9 @@ def _convert_to_tensor(self, p): [`scalar`,`list`, `tuple`, `torch binary file`, and `Categorical`]. :return: torch tensor """ - if isinstance(p, Tensor): + if isinstance(p, torch.Tensor): return p - elif isinstance(p, Categorical): + elif isinstance(p, td.Categorical): return p.probs elif isinstance(p, str): return torch.load(p) @@ -38,14 +39,14 @@ def _convert_to_categorical(self, p): [`scalar`,`list`, `tuple`, `torch binary file`, and `Categorical`]. 
:return: torch Categorical """ - if isinstance(p, Categorical): + if isinstance(p, td.Categorical): return p - elif isinstance(p, Tensor): - return Categorical(p) + elif isinstance(p, torch.Tensor): + return td.Categorical(p) elif isinstance(p, str): - return Categorical(torch.load(p)) + return td.Categorical(torch.load(p)) else: - return Categorical(torch.tensor(p)) + return td.Categorical(torch.tensor(p)) class KLDivMeasure(Measure): diff --git a/data_juicer/core/ray_data.py b/data_juicer/core/ray_data.py index ce964a3af..0c131561e 100644 --- a/data_juicer/core/ray_data.py +++ b/data_juicer/core/ray_data.py @@ -6,12 +6,11 @@ from data_juicer import cuda_device_count from data_juicer.core.data import DJDataset from data_juicer.ops import Filter, Mapper -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.process_utils import calculate_np -with AvailabilityChecking(['ray'], requires_type='dist'): - from ray.data import Dataset +rd = LazyLoader('rd', 'ray.data') def is_valid_path(item, dataset_dir): @@ -54,7 +53,7 @@ def set_dataset_to_absolute_path(dataset, dataset_path, cfg): return dataset -def preprocess_dataset(dataset: Dataset, dataset_path, cfg) -> Dataset: +def preprocess_dataset(dataset: rd.Dataset, dataset_path, cfg) -> rd.Dataset: if dataset_path: dataset = set_dataset_to_absolute_path(dataset, dataset_path, cfg) columns = dataset.columns() @@ -81,7 +80,7 @@ def get_num_gpus(op, op_proc): class RayDataset(DJDataset): def __init__(self, - dataset: Dataset, + dataset: rd.Dataset, dataset_path: str = None, cfg=None) -> None: self.data = preprocess_dataset(dataset, dataset_path, cfg) diff --git a/data_juicer/core/ray_executor.py b/data_juicer/core/ray_executor.py index a071c2dea..6b93fd3dd 100644 --- a/data_juicer/core/ray_executor.py +++ b/data_juicer/core/ray_executor.py @@ -5,11 +5,10 @@ from data_juicer.config import init_configs from data_juicer.core.ray_data import RayDataset from data_juicer.ops import load_ops -from data_juicer.utils.availability_utils import AvailabilityChecking +from data_juicer.utils.lazy_loader import LazyLoader -with AvailabilityChecking(['ray'], requires_type='dist'): - import ray - import ray.data as rd +ray = LazyLoader('ray', 'ray') +rd = LazyLoader('rd', 'ray.data') class RayExecutor: diff --git a/data_juicer/format/empty_formatter.py b/data_juicer/format/empty_formatter.py index 527cd9851..5fe9a6999 100644 --- a/data_juicer/format/empty_formatter.py +++ b/data_juicer/format/empty_formatter.py @@ -1,11 +1,14 @@ from typing import List import pandas as pd -import ray from datasets import Dataset, Features, Value +from data_juicer.utils.lazy_loader import LazyLoader + from .formatter import FORMATTERS, BaseFormatter +ray = LazyLoader('ray', 'ray') + @FORMATTERS.register_module() class EmptyFormatter(BaseFormatter): diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py index 901e8523e..a3f5c17e4 100644 --- a/data_juicer/ops/base_op.py +++ b/data_juicer/ops/base_op.py @@ -1,5 +1,4 @@ import copy -import os import traceback from functools import wraps @@ -7,7 +6,6 @@ from loguru import logger from data_juicer import is_cuda_available -from data_juicer.utils.auto_install_utils import AutoInstaller from data_juicer.utils.constant import Fields from data_juicer.utils.mm_utils import size_to_bytes from data_juicer.utils.process_utils import calculate_np @@ -15,10 +13,6 @@ OPERATORS = Registry('Operators') 
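
Throughout this patch, eager top-level imports are replaced by module-level proxies such as torch = LazyLoader('torch', 'torch'). The project's actual LazyLoader lives in data_juicer/utils/lazy_loader.py (listed in the diffstat above but not shown in this hunk); what follows is only a minimal sketch, under the assumption that the proxy defers the import to the first attribute access and pip-installs the package when that import fails.

    # Minimal sketch of a lazy module proxy matching the call sites in this
    # patch, e.g. torch = LazyLoader('torch', 'torch'). The pip auto-install
    # on ImportError is an assumption based on the patch title, not the real
    # implementation.
    import importlib
    import subprocess
    import sys
    import types


    class LazyLoader(types.ModuleType):

        def __init__(self, local_name, module_name):
            self._local_name = local_name
            self._module_name = module_name
            self._module = None
            super().__init__(local_name)

        def _load(self):
            if self._module is None:
                try:
                    self._module = importlib.import_module(self._module_name)
                except ImportError:
                    # hypothetical install hook; the real pip-package lookup
                    # lives in data_juicer/utils/auto_install_mapping.py
                    subprocess.check_call([
                        sys.executable, '-m', 'pip', 'install',
                        self._module_name.split('.')[0]
                    ])
                    self._module = importlib.import_module(self._module_name)
            return self._module

        def __getattr__(self, name):
            return getattr(self._load(), name)

With the proxy in place, annotations that need the real class are either written as strings ('torch.distributions.Categorical' in collector.py above) or reached through the proxy, so importing an op module no longer pulls in its heavy dependencies up front.
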
UNFORKABLE = Registry('Unforkable') -current_path = os.path.dirname(os.path.realpath(__file__)) -version_file_path = os.path.join(current_path, - '../../environments/science_requires.txt') -AUTOINSTALL = AutoInstaller([version_file_path]) def convert_list_dict_to_dict_list(samples): diff --git a/data_juicer/ops/deduplicator/document_minhash_deduplicator.py b/data_juicer/ops/deduplicator/document_minhash_deduplicator.py index 4d524de7d..4982eb2c9 100644 --- a/data_juicer/ops/deduplicator/document_minhash_deduplicator.py +++ b/data_juicer/ops/deduplicator/document_minhash_deduplicator.py @@ -18,13 +18,13 @@ from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import prepare_sentencepiece_model -from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator +from ..base_op import OPERATORS, Deduplicator from ..common.helper_func import UnionFind, split_on_whitespace -OP_NAME = 'document_minhash_deduplicator' - integrate = LazyLoader('integrate', 'scipy.integrate') +OP_NAME = 'document_minhash_deduplicator' + MERSENNE_PRIME = np.uint64((1 << 61) - 1) MAX_HASH = np.uint64((1 << 32) - 1) @@ -151,7 +151,6 @@ def __init__( sentencepiece tokenization. """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['scipy']) # about minhash computation self.tokenization = tokenization self.window_size = window_size diff --git a/data_juicer/ops/deduplicator/document_simhash_deduplicator.py b/data_juicer/ops/deduplicator/document_simhash_deduplicator.py index 4228fba1f..901bbe2f2 100644 --- a/data_juicer/ops/deduplicator/document_simhash_deduplicator.py +++ b/data_juicer/ops/deduplicator/document_simhash_deduplicator.py @@ -13,13 +13,13 @@ from data_juicer.utils.constant import HashKeys from data_juicer.utils.lazy_loader import LazyLoader -from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator +from ..base_op import OPERATORS, Deduplicator from ..common.helper_func import split_on_whitespace -OP_NAME = 'document_simhash_deduplicator' - simhash = LazyLoader('simhash', 'simhash') +OP_NAME = 'document_simhash_deduplicator' + @OPERATORS.register_module(OP_NAME) class DocumentSimhashDeduplicator(Deduplicator): @@ -56,7 +56,6 @@ def __init__(self, """ # about simhash computation super().__init__(*args, **kwargs) - AUTOINSTALL.check(['simhash-pybind']) self.tokenization = tokenization self.window_size = window_size self.lowercase = lowercase diff --git a/data_juicer/ops/deduplicator/image_deduplicator.py b/data_juicer/ops/deduplicator/image_deduplicator.py index 0a217a889..20b201aaf 100644 --- a/data_juicer/ops/deduplicator/image_deduplicator.py +++ b/data_juicer/ops/deduplicator/image_deduplicator.py @@ -7,21 +7,25 @@ from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import load_data_with_context, load_image -from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator +from ..base_op import OPERATORS, Deduplicator from ..op_fusion import LOADED_IMAGES from .document_deduplicator import DocumentDeduplicator -OP_NAME = 'image_deduplicator' +imgdedup_methods = LazyLoader('imgdedup_methods', 'imagededup.methods') -imagededup = LazyLoader('imagededup', 'imagededup') +OP_NAME = 'image_deduplicator' HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'} def get_hash_method(method_name): - from imagededup.methods import AHash, DHash, PHash, WHash - mapping = {'phash': PHash, 'dhash': DHash, 'whash': WHash, 'ahash': AHash} + mapping = { + 'phash': imgdedup_methods.PHash, + 'dhash': imgdedup_methods.DHash, + 'whash': imgdedup_methods.WHash, + 'ahash': 
imgdedup_methods.AHash + } return mapping[method_name] @@ -49,7 +53,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['imagededup']) if method not in HASH_METHOD: raise ValueError(f'Keep strategy [{method}] is not supported. ' f'Can only be one of {HASH_METHOD}.') diff --git a/data_juicer/ops/deduplicator/ray_basic_deduplicator.py b/data_juicer/ops/deduplicator/ray_basic_deduplicator.py index f8c40525e..bb107ef88 100644 --- a/data_juicer/ops/deduplicator/ray_basic_deduplicator.py +++ b/data_juicer/ops/deduplicator/ray_basic_deduplicator.py @@ -1,19 +1,11 @@ -from typing import Any - from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import HashKeys +from data_juicer.utils.lazy_loader import LazyLoader from ..base_op import Filter -with AvailabilityChecking(['redis'], - op_name='ray_xxx_deduplicator', - requires_type='dist'): - try: - import redis - except Exception: - redis = Any +redis = LazyLoader('redis', 'redis') class RayBasicDeduplicator(Filter): diff --git a/data_juicer/ops/deduplicator/ray_image_deduplicator.py b/data_juicer/ops/deduplicator/ray_image_deduplicator.py index 22ebca47f..7ca0d10f2 100644 --- a/data_juicer/ops/deduplicator/ray_image_deduplicator.py +++ b/data_juicer/ops/deduplicator/ray_image_deduplicator.py @@ -4,21 +4,25 @@ from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import load_data_with_context, load_image -from ..base_op import AUTOINSTALL, OPERATORS +from ..base_op import OPERATORS from ..op_fusion import LOADED_IMAGES from .ray_basic_deduplicator import RayBasicDeduplicator -OP_NAME = 'ray_image_deduplicator' +imgdedup_methods = LazyLoader('imgdedup_methods', 'imagededup.methods') -imagededup = LazyLoader('imagededup', 'imagededup') +OP_NAME = 'ray_image_deduplicator' HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'} def get_hash_method(method_name): - from imagededup.methods import AHash, DHash, PHash, WHash - mapping = {'phash': PHash, 'dhash': DHash, 'whash': WHash, 'ahash': AHash} + mapping = { + 'phash': imgdedup_methods.PHash, + 'dhash': imgdedup_methods.DHash, + 'whash': imgdedup_methods.WHash, + 'ahash': imgdedup_methods.AHash + } return mapping[method_name] @@ -48,7 +52,6 @@ def __init__(self, redis_port=redis_port, *args, **kwargs) - AUTOINSTALL.check(['imagededup']) if method not in HASH_METHOD: raise ValueError(f'Keep strategy [{method}] is not supported. 
' f'Can only be one of {HASH_METHOD}.') diff --git a/data_juicer/ops/filter/alphanumeric_filter.py b/data_juicer/ops/filter/alphanumeric_filter.py index e6ea7cc7e..dbe96f37a 100644 --- a/data_juicer/ops/filter/alphanumeric_filter.py +++ b/data_juicer/ops/filter/alphanumeric_filter.py @@ -3,7 +3,7 @@ from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Filter +from ..base_op import OPERATORS, Filter from ..common import get_words_from_document OP_NAME = 'alphanumeric_filter' @@ -39,7 +39,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['transformers']) self.tokenization = tokenization self.min_ratio = min_ratio self.max_ratio = max_ratio diff --git a/data_juicer/ops/filter/flagged_words_filter.py b/data_juicer/ops/filter/flagged_words_filter.py index 2966313fc..8fe7aa80b 100644 --- a/data_juicer/ops/filter/flagged_words_filter.py +++ b/data_juicer/ops/filter/flagged_words_filter.py @@ -10,7 +10,7 @@ from data_juicer.utils.model_utils import get_model, prepare_model from ...utils.asset_utils import ASSET_DIR, load_words_asset -from ..base_op import AUTOINSTALL, OPERATORS, Filter +from ..base_op import OPERATORS, Filter from ..common import (SPECIAL_CHARACTERS, get_words_from_document, words_refinement) from ..op_fusion import INTER_WORDS @@ -54,7 +54,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['sentencepiece']) self.lang = lang self.max_ratio = max_ratio self.use_words_aug = use_words_aug diff --git a/data_juicer/ops/filter/image_aesthetics_filter.py b/data_juicer/ops/filter/image_aesthetics_filter.py index bcb4284ef..699aae3b1 100644 --- a/data_juicer/ops/filter/image_aesthetics_filter.py +++ b/data_juicer/ops/filter/image_aesthetics_filter.py @@ -6,14 +6,13 @@ from data_juicer.utils.mm_utils import load_data_with_context, load_image from ...utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Filter +from ..base_op import OPERATORS, Filter from ..op_fusion import LOADED_IMAGES -OP_NAME = 'image_aesthetics_filter' -CHECK_PKGs = ['torch', 'transformers', 'simple-aesthetics-predictor'] - torch = LazyLoader('torch', 'torch') +OP_NAME = 'image_aesthetics_filter' + @OPERATORS.register_module(OP_NAME) @LOADED_IMAGES.register_module(OP_NAME) @@ -49,8 +48,6 @@ def __init__(self, """ super().__init__(*args, **kwargs) - AUTOINSTALL.check( - ['torch', 'transformers', 'simple-aesthetics-predictor']) if hf_scorer_model == '': hf_scorer_model = \ 'shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE' diff --git a/data_juicer/ops/filter/image_face_ratio_filter.py b/data_juicer/ops/filter/image_face_ratio_filter.py index 6cc73e32c..dc5c3edb2 100644 --- a/data_juicer/ops/filter/image_face_ratio_filter.py +++ b/data_juicer/ops/filter/image_face_ratio_filter.py @@ -9,13 +9,13 @@ load_image) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Filter +from ..base_op import OPERATORS, UNFORKABLE, Filter from ..op_fusion import LOADED_IMAGES -OP_NAME = 'image_face_ratio_filter' - cv2 = LazyLoader('cv2', 'cv2') +OP_NAME = 'image_face_ratio_filter' + @UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) @@ -53,7 +53,6 @@ def __init__(self, :param kwargs: Extra keyword arguments. 
""" super().__init__(*args, **kwargs) - AUTOINSTALL.check(['opencv-python']) if cv_classifier == '': cv_classifier = os.path.join(cv2.data.haarcascades, diff --git a/data_juicer/ops/filter/image_nsfw_filter.py b/data_juicer/ops/filter/image_nsfw_filter.py index 357b81a10..03d0160d3 100644 --- a/data_juicer/ops/filter/image_nsfw_filter.py +++ b/data_juicer/ops/filter/image_nsfw_filter.py @@ -5,13 +5,12 @@ from data_juicer.utils.mm_utils import load_data_with_context, load_image from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Filter +from ..base_op import OPERATORS, Filter from ..op_fusion import LOADED_IMAGES -OP_NAME = 'image_nsfw_filter' - torch = LazyLoader('torch', 'torch') -transformers = LazyLoader('transformers', 'transformers') + +OP_NAME = 'image_nsfw_filter' @OPERATORS.register_module(OP_NAME) @@ -43,7 +42,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['torch', 'transformers']) self.score_threshold = score_threshold if any_or_all not in ['any', 'all']: raise ValueError(f'Keep strategy [{any_or_all}] is not supported. ' diff --git a/data_juicer/ops/filter/image_pair_similarity_filter.py b/data_juicer/ops/filter/image_pair_similarity_filter.py index dcb1b7059..e43e3b214 100644 --- a/data_juicer/ops/filter/image_pair_similarity_filter.py +++ b/data_juicer/ops/filter/image_pair_similarity_filter.py @@ -3,20 +3,14 @@ from data_juicer.ops.base_op import OPERATORS, Filter from data_juicer.ops.op_fusion import LOADED_IMAGES -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import load_data_with_context, load_image from data_juicer.utils.model_utils import get_model, prepare_model -OP_NAME = 'image_pair_similarity_filter' - -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): +torch = LazyLoader('torch', 'torch') - import torch - import transformers # noqa: F401 - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) +OP_NAME = 'image_pair_similarity_filter' @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/image_text_matching_filter.py b/data_juicer/ops/filter/image_text_matching_filter.py index c47982eae..205e0484c 100644 --- a/data_juicer/ops/filter/image_text_matching_filter.py +++ b/data_juicer/ops/filter/image_text_matching_filter.py @@ -2,19 +2,15 @@ from PIL import ImageOps from data_juicer.utils.constant import Fields, StatsKeys -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (SpecialTokens, load_data_with_context, load_image, remove_special_tokens) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Filter +from ..base_op import OPERATORS, Filter from ..op_fusion import LOADED_IMAGES OP_NAME = 'image_text_matching_filter' -torch = LazyLoader('torch', 'torch') -transformers = LazyLoader('transformers', 'transformers') - @OPERATORS.register_module(OP_NAME) @LOADED_IMAGES.register_module(OP_NAME) @@ -57,7 +53,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['torch', 'transformers']) self.min_score = min_score self.max_score = max_score if reduce_mode not in ['avg', 'max', 'min']: diff --git a/data_juicer/ops/filter/image_text_similarity_filter.py 
b/data_juicer/ops/filter/image_text_similarity_filter.py index d70568ba6..de90659e6 100644 --- a/data_juicer/ops/filter/image_text_similarity_filter.py +++ b/data_juicer/ops/filter/image_text_similarity_filter.py @@ -2,19 +2,15 @@ from PIL import ImageOps from data_juicer.utils.constant import Fields, StatsKeys -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (SpecialTokens, load_data_with_context, load_image, remove_special_tokens) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Filter +from ..base_op import OPERATORS, Filter from ..op_fusion import LOADED_IMAGES OP_NAME = 'image_text_similarity_filter' -torch = LazyLoader('torch', 'torch') -transformers = LazyLoader('transformers', 'transformers') - @OPERATORS.register_module(OP_NAME) @LOADED_IMAGES.register_module(OP_NAME) @@ -57,7 +53,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['torch', 'transformers']) self.min_score = min_score self.max_score = max_score if reduce_mode not in ['avg', 'max', 'min']: diff --git a/data_juicer/ops/filter/image_watermark_filter.py b/data_juicer/ops/filter/image_watermark_filter.py index 1d34d8ad6..568ec9d83 100644 --- a/data_juicer/ops/filter/image_watermark_filter.py +++ b/data_juicer/ops/filter/image_watermark_filter.py @@ -5,13 +5,12 @@ from data_juicer.utils.mm_utils import load_data_with_context, load_image from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Filter +from ..base_op import OPERATORS, Filter from ..op_fusion import LOADED_IMAGES -OP_NAME = 'image_watermark_filter' - torch = LazyLoader('torch', 'torch') -transformers = LazyLoader('transformers', 'transformers') + +OP_NAME = 'image_watermark_filter' @OPERATORS.register_module(OP_NAME) @@ -47,7 +46,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['torch', 'transformers']) self.prob_threshold = prob_threshold if any_or_all not in ['any', 'all']: raise ValueError(f'Keep strategy [{any_or_all}] is not supported. 
' diff --git a/data_juicer/ops/filter/language_id_score_filter.py b/data_juicer/ops/filter/language_id_score_filter.py index 6aa892274..8e23a51bd 100644 --- a/data_juicer/ops/filter/language_id_score_filter.py +++ b/data_juicer/ops/filter/language_id_score_filter.py @@ -6,12 +6,12 @@ from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Filter - -OP_NAME = 'language_id_score_filter' +from ..base_op import OPERATORS, Filter fasttext = LazyLoader('fasttext', 'fasttext') +OP_NAME = 'language_id_score_filter' + @OPERATORS.register_module(OP_NAME) class LanguageIDScoreFilter(Filter): @@ -33,7 +33,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['fasttext', 'fasttext-wheel']) if not lang: # lang is [], '' or None self.lang = None diff --git a/data_juicer/ops/filter/perplexity_filter.py b/data_juicer/ops/filter/perplexity_filter.py index 97fb3fc01..7092ed95f 100644 --- a/data_juicer/ops/filter/perplexity_filter.py +++ b/data_juicer/ops/filter/perplexity_filter.py @@ -3,18 +3,14 @@ # -------------------------------------------------------- from data_juicer.utils.constant import Fields, InterVars, StatsKeys -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Filter +from ..base_op import OPERATORS, Filter from ..common import get_words_from_document from ..op_fusion import INTER_WORDS OP_NAME = 'perplexity_filter' -kenlm = LazyLoader('kenlm', 'kenlm') -sentencepiece = LazyLoader('sentencepiece', 'sentencepiece') - @OPERATORS.register_module(OP_NAME) @INTER_WORDS.register_module(OP_NAME) @@ -39,7 +35,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['sentencepiece', 'kenlm']) self.max_ppl = max_ppl self.lang = lang self.sp_model_key = prepare_model(model_type='sentencepiece', diff --git a/data_juicer/ops/filter/phrase_grounding_recall_filter.py b/data_juicer/ops/filter/phrase_grounding_recall_filter.py index 0f00781f0..36210c9c0 100644 --- a/data_juicer/ops/filter/phrase_grounding_recall_filter.py +++ b/data_juicer/ops/filter/phrase_grounding_recall_filter.py @@ -11,15 +11,14 @@ remove_special_tokens) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Filter +from ..base_op import OPERATORS, Filter from ..op_fusion import LOADED_IMAGES -OP_NAME = 'phrase_grounding_recall_filter' - torch = LazyLoader('torch', 'torch') -transformers = LazyLoader('transformers', 'transformers') nltk = LazyLoader('nltk', 'nltk') +OP_NAME = 'phrase_grounding_recall_filter' + # NER algorithm adapted from GLIP starts # https://github.com/microsoft/GLIP/blob/main/maskrcnn_benchmark/engine/predictor_glip.py#L107-L127 @@ -116,7 +115,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['torch', 'transformers', 'nltk']) self.min_recall = min_recall self.max_recall = max_recall if reduce_mode not in ['avg', 'max', 'min']: diff --git a/data_juicer/ops/filter/stopwords_filter.py b/data_juicer/ops/filter/stopwords_filter.py index fde4c2321..4a9014945 100644 --- a/data_juicer/ops/filter/stopwords_filter.py +++ b/data_juicer/ops/filter/stopwords_filter.py @@ -8,18 +8,15 @@ from data_juicer.utils.asset_utils import ASSET_DIR, load_words_asset from data_juicer.utils.constant 
import Fields, InterVars, StatsKeys -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Filter +from ..base_op import OPERATORS, Filter from ..common import (SPECIAL_CHARACTERS, get_words_from_document, words_refinement) from ..op_fusion import INTER_WORDS OP_NAME = 'stopwords_filter' -sentencepiece = LazyLoader('sentencepiece', 'sentencepiece') - @OPERATORS.register_module(OP_NAME) @INTER_WORDS.register_module(OP_NAME) @@ -56,7 +53,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['sentencepiece']) self.lang = lang self.min_ratio = min_ratio self.use_words_aug = use_words_aug diff --git a/data_juicer/ops/filter/text_action_filter.py b/data_juicer/ops/filter/text_action_filter.py index 44c67920d..c2734f492 100644 --- a/data_juicer/ops/filter/text_action_filter.py +++ b/data_juicer/ops/filter/text_action_filter.py @@ -1,8 +1,9 @@ from data_juicer.utils.constant import Fields, StatsKeys +from data_juicer.utils.lazy_loader import AUTOINSTALL from data_juicer.utils.mm_utils import remove_special_tokens from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Filter +from ..base_op import OPERATORS, Filter OP_NAME = 'text_action_filter' @@ -28,7 +29,8 @@ def __init__(self, parameter. """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['spacy-pkuseg']) + # '--no-deps' do not update numpy + AUTOINSTALL.check(['spacy-pkuseg'], '--no-deps') if lang not in ['en', 'zh']: raise ValueError( diff --git a/data_juicer/ops/filter/text_entity_dependency_filter.py b/data_juicer/ops/filter/text_entity_dependency_filter.py index 6e4ec9f36..23d6e237d 100644 --- a/data_juicer/ops/filter/text_entity_dependency_filter.py +++ b/data_juicer/ops/filter/text_entity_dependency_filter.py @@ -1,10 +1,11 @@ import numpy as np from data_juicer.utils.constant import Fields, StatsKeys +from data_juicer.utils.lazy_loader import AUTOINSTALL from data_juicer.utils.mm_utils import remove_special_tokens from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Filter +from ..base_op import OPERATORS, Filter OP_NAME = 'text_entity_dependency_filter' @@ -35,7 +36,8 @@ def __init__(self, sample only if all images are dependent. 
""" super().__init__(*args, **kwargs) - AUTOINSTALL.check(['spacy-pkuseg']) + # '--no-deps' do not update numpy + AUTOINSTALL.check(['spacy-pkuseg'], '--no-deps') if lang not in ['en', 'zh']: raise ValueError( diff --git a/data_juicer/ops/filter/token_num_filter.py b/data_juicer/ops/filter/token_num_filter.py index 3f9a72a40..30fc18c80 100644 --- a/data_juicer/ops/filter/token_num_filter.py +++ b/data_juicer/ops/filter/token_num_filter.py @@ -1,16 +1,13 @@ import sys from data_juicer.utils.constant import Fields, StatsKeys -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Filter +from ..base_op import OPERATORS, Filter from ..common import get_words_from_document OP_NAME = 'token_num_filter' -transformers = LazyLoader('transformers', 'transformers') - @OPERATORS.register_module(OP_NAME) class TokenNumFilter(Filter): @@ -37,7 +34,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['transformers']) self.min_num = min_num self.max_num = max_num self.hf_tokenizer = hf_tokenizer diff --git a/data_juicer/ops/filter/video_aesthetics_filter.py b/data_juicer/ops/filter/video_aesthetics_filter.py index ccdcbffce..77972a47e 100644 --- a/data_juicer/ops/filter/video_aesthetics_filter.py +++ b/data_juicer/ops/filter/video_aesthetics_filter.py @@ -9,13 +9,13 @@ load_data_with_context, load_video) from ...utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Filter +from ..base_op import OPERATORS, Filter from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS -OP_NAME = 'video_aesthetics_filter' - torch = LazyLoader('torch', 'torch') +OP_NAME = 'video_aesthetics_filter' + @OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) @@ -75,8 +75,6 @@ def __init__(self, """ super().__init__(*args, **kwargs) - AUTOINSTALL.check( - ['torch', 'transformers', 'simple-aesthetics-predictor']) if hf_scorer_model == '': hf_scorer_model = \ 'shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE' diff --git a/data_juicer/ops/filter/video_frames_text_similarity_filter.py b/data_juicer/ops/filter/video_frames_text_similarity_filter.py index 6b0d1d2c6..990944490 100644 --- a/data_juicer/ops/filter/video_frames_text_similarity_filter.py +++ b/data_juicer/ops/filter/video_frames_text_similarity_filter.py @@ -3,7 +3,6 @@ from pydantic import PositiveInt from data_juicer.utils.constant import Fields, StatsKeys -from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (SpecialTokens, close_video, extract_key_frames, extract_video_frames_uniformly, @@ -11,14 +10,11 @@ remove_special_tokens) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Filter +from ..base_op import OPERATORS, Filter from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS OP_NAME = 'video_frames_text_similarity_filter' -torch = LazyLoader('torch', 'torch') -transformers = LazyLoader('transformers', 'transformers') - @OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) @@ -79,7 +75,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['torch', 'transformers']) self.min_score = min_score self.max_score = max_score if frame_sampling_method not in ['all_keyframes', 'uniform']: diff --git a/data_juicer/ops/filter/video_motion_score_filter.py 
b/data_juicer/ops/filter/video_motion_score_filter.py index 30bbc48ee..59b573202 100644 --- a/data_juicer/ops/filter/video_motion_score_filter.py +++ b/data_juicer/ops/filter/video_motion_score_filter.py @@ -8,12 +8,12 @@ from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.lazy_loader import LazyLoader -from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Filter - -OP_NAME = 'video_motion_score_filter' +from ..base_op import OPERATORS, UNFORKABLE, Filter cv2 = LazyLoader('cv2', 'cv2') +OP_NAME = 'video_motion_score_filter' + @contextmanager def VideoCapture(*args, **kwargs): @@ -79,7 +79,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['opencv-python']) self.min_score = min_score self.max_score = max_score self.sampling_fps = sampling_fps diff --git a/data_juicer/ops/filter/video_nsfw_filter.py b/data_juicer/ops/filter/video_nsfw_filter.py index a4de77aa0..17193bd21 100644 --- a/data_juicer/ops/filter/video_nsfw_filter.py +++ b/data_juicer/ops/filter/video_nsfw_filter.py @@ -8,13 +8,12 @@ load_data_with_context, load_video) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Filter +from ..base_op import OPERATORS, Filter from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS -OP_NAME = 'video_nsfw_filter' - torch = LazyLoader('torch', 'torch') -transformers = LazyLoader('transformers', 'transformers') + +OP_NAME = 'video_nsfw_filter' @OPERATORS.register_module(OP_NAME) @@ -67,7 +66,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['torch', 'transformers']) self.score_threshold = score_threshold if frame_sampling_method not in ['all_keyframes', 'uniform']: raise ValueError( diff --git a/data_juicer/ops/filter/video_ocr_area_ratio_filter.py b/data_juicer/ops/filter/video_ocr_area_ratio_filter.py index bb069f4c5..b774e9f62 100644 --- a/data_juicer/ops/filter/video_ocr_area_ratio_filter.py +++ b/data_juicer/ops/filter/video_ocr_area_ratio_filter.py @@ -10,13 +10,13 @@ extract_video_frames_uniformly, load_data_with_context, load_video) -from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Filter +from ..base_op import OPERATORS, UNFORKABLE, Filter from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS -OP_NAME = 'video_ocr_area_ratio_filter' - easyocr = LazyLoader('easyocr', 'easyocr') +OP_NAME = 'video_ocr_area_ratio_filter' + def triangle_area(p1, p2, p3): """ @@ -72,7 +72,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['easyocr']) self.min_area_ratio = min_area_ratio self.max_area_ratio = max_area_ratio self.frame_sample_num = frame_sample_num diff --git a/data_juicer/ops/filter/video_tagging_from_frames_filter.py b/data_juicer/ops/filter/video_tagging_from_frames_filter.py index f85cfaa54..ad44d5995 100644 --- a/data_juicer/ops/filter/video_tagging_from_frames_filter.py +++ b/data_juicer/ops/filter/video_tagging_from_frames_filter.py @@ -5,7 +5,7 @@ from data_juicer.utils.constant import Fields -from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Filter +from ..base_op import OPERATORS, UNFORKABLE, Filter from ..mapper.video_tagging_from_frames_mapper import \ VideoTaggingFromFramesMapper from ..op_fusion import LOADED_VIDEOS @@ -62,10 +62,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check([ - 'torch', - 
'ram@git+https://github.com/xinyu1205/recognize-anything.git' - ]) if contain not in ['any', 'all']: raise ValueError(f'the containing type [{contain}] is not ' f'supported. Can only be one of ["any", "all"].') diff --git a/data_juicer/ops/filter/video_watermark_filter.py b/data_juicer/ops/filter/video_watermark_filter.py index 2642bce2d..a2e926a52 100644 --- a/data_juicer/ops/filter/video_watermark_filter.py +++ b/data_juicer/ops/filter/video_watermark_filter.py @@ -8,13 +8,13 @@ load_data_with_context, load_video) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Filter +from ..base_op import OPERATORS, Filter from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS -OP_NAME = 'video_watermark_filter' - torch = LazyLoader('torch', 'torch') +OP_NAME = 'video_watermark_filter' + @OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) @@ -70,7 +70,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['torch', 'transformers']) self.prob_threshold = prob_threshold if frame_sampling_method not in ['all_keyframes', 'uniform']: raise ValueError( diff --git a/data_juicer/ops/filter/word_repetition_filter.py b/data_juicer/ops/filter/word_repetition_filter.py index 41a081694..8aaa994b0 100644 --- a/data_juicer/ops/filter/word_repetition_filter.py +++ b/data_juicer/ops/filter/word_repetition_filter.py @@ -7,7 +7,7 @@ from data_juicer.utils.constant import Fields, InterVars, StatsKeys from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Filter +from ..base_op import OPERATORS, Filter from ..common import (SPECIAL_CHARACTERS, get_words_from_document, words_refinement) from ..op_fusion import INTER_WORDS @@ -47,7 +47,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['sentencepiece']) self.n = rep_len self.min_ratio = min_ratio self.max_ratio = max_ratio diff --git a/data_juicer/ops/filter/words_num_filter.py b/data_juicer/ops/filter/words_num_filter.py index 413a2171d..6913678d6 100644 --- a/data_juicer/ops/filter/words_num_filter.py +++ b/data_juicer/ops/filter/words_num_filter.py @@ -3,7 +3,7 @@ from data_juicer.utils.constant import Fields, InterVars, StatsKeys from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Filter +from ..base_op import OPERATORS, Filter from ..common import (SPECIAL_CHARACTERS, get_words_from_document, words_refinement) from ..op_fusion import INTER_WORDS @@ -41,7 +41,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['sentencepiece']) self.min_num = min_num self.max_num = max_num self.model_key = None diff --git a/data_juicer/ops/load.py b/data_juicer/ops/load.py index e82ebb16a..cf10cc51a 100644 --- a/data_juicer/ops/load.py +++ b/data_juicer/ops/load.py @@ -1,9 +1,3 @@ -import sys - -from loguru import logger - -from data_juicer.utils.availability_utils import UNAVAILABLE_OPERATORS - from .base_op import OPERATORS from .op_fusion import fuse_operators @@ -22,9 +16,6 @@ def load_ops(process_list, op_fusion=False): new_process_list = [] for process in process_list: op_name, args = list(process.items())[0] - if op_name in UNAVAILABLE_OPERATORS: - logger.error(UNAVAILABLE_OPERATORS[op_name].get_warning_msg()) - sys.exit(UNAVAILABLE_OPERATORS[op_name].get_warning_msg()) 
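
Most per-op AUTOINSTALL.check([...]) calls are dropped because the lazy proxies now install missing packages on first use; the few calls that remain (for example AUTOINSTALL.check(['spacy-pkuseg'], '--no-deps') above, now imported from data_juicer.utils.lazy_loader) pass extra pip flags. Below is a rough, non-authoritative sketch of such a checker; the package-to-module mapping is illustrative only.

    # Hedged sketch of a checker behind AUTOINSTALL.check(pkgs, flag); the
    # real implementation (AutoInstaller in auto_install_utils.py) is not
    # shown in this section.
    import importlib.util
    import subprocess
    import sys

    # the real code maps pip package names to importable module names via
    # auto_install_mapping.py; this toy mapping is illustrative only
    PKG_TO_MODULE = {'spacy-pkuseg': 'spacy_pkuseg', 'simhash-pybind': 'simhash'}


    class AutoInstaller:

        def check(self, pkg_names, extra_flag=None):
            for pkg in pkg_names:
                module = PKG_TO_MODULE.get(pkg, pkg.split('@')[0])
                if importlib.util.find_spec(module) is not None:
                    continue
                cmd = [sys.executable, '-m', 'pip', 'install', pkg]
                if extra_flag:
                    # e.g. '--no-deps', used above to keep pip from touching
                    # an already pinned numpy
                    cmd.append(extra_flag)
                subprocess.check_call(cmd)


    AUTOINSTALL = AutoInstaller()
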
ops.append(OPERATORS.modules[op_name](**args)) new_process_list.append(process) diff --git a/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py b/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py index a640b11da..73eba57a0 100644 --- a/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py +++ b/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py @@ -5,13 +5,13 @@ from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.logger_utils import HiddenPrints -from ..base_op import AUTOINSTALL, OPERATORS, Mapper - -OP_NAME = 'audio_ffmpeg_wrapped_mapper' +from ..base_op import OPERATORS, Mapper with HiddenPrints(): ffmpeg = LazyLoader('ffmpeg', 'ffmpeg') +OP_NAME = 'audio_ffmpeg_wrapped_mapper' + @OPERATORS.register_module(OP_NAME) class AudioFFmpegWrappedMapper(Mapper): @@ -40,7 +40,6 @@ def __init__( :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['ffmpeg-python']) self._init_parameters = self.remove_extra_parameters(locals()) self.filter_name = filter_name diff --git a/data_juicer/ops/mapper/chinese_convert_mapper.py b/data_juicer/ops/mapper/chinese_convert_mapper.py index e44127c4d..3e6ba4d33 100644 --- a/data_juicer/ops/mapper/chinese_convert_mapper.py +++ b/data_juicer/ops/mapper/chinese_convert_mapper.py @@ -1,11 +1,11 @@ from data_juicer.utils.lazy_loader import LazyLoader -from ..base_op import AUTOINSTALL, OPERATORS, Mapper - -OP_NAME = 'chinese_convert_mapper' +from ..base_op import OPERATORS, Mapper opencc = LazyLoader('opencc', 'opencc') +OP_NAME = 'chinese_convert_mapper' + OPENCC_CONVERTER = None @@ -74,7 +74,6 @@ def __init__(self, mode: str = 's2t', *args, **kwargs): :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['opencc']) mode_list = [ 's2t', 't2s', 's2tw', 'tw2s', 's2hk', 'hk2s', 's2twp', 'tw2sp', 't2tw', 'tw2t', 'hk2t', 't2hk', 't2jp', 'jp2t' diff --git a/data_juicer/ops/mapper/clean_html_mapper.py b/data_juicer/ops/mapper/clean_html_mapper.py index b088394d0..8653c907a 100644 --- a/data_juicer/ops/mapper/clean_html_mapper.py +++ b/data_juicer/ops/mapper/clean_html_mapper.py @@ -4,12 +4,12 @@ from data_juicer.utils.lazy_loader import LazyLoader -from ..base_op import AUTOINSTALL, OPERATORS, Mapper - -OP_NAME = 'clean_html_mapper' +from ..base_op import OPERATORS, Mapper selectolax = LazyLoader('selectolax', 'selectolax') +OP_NAME = 'clean_html_mapper' + @OPERATORS.register_module(OP_NAME) class CleanHtmlMapper(Mapper): @@ -25,7 +25,6 @@ def __init__(self, *args, **kwargs): :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['selectolax']) def process(self, samples): diff --git a/data_juicer/ops/mapper/extract_qa_mapper.py b/data_juicer/ops/mapper/extract_qa_mapper.py index 8a41efeb4..749a64486 100644 --- a/data_juicer/ops/mapper/extract_qa_mapper.py +++ b/data_juicer/ops/mapper/extract_qa_mapper.py @@ -5,18 +5,13 @@ from loguru import logger from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper -from data_juicer.utils.availability_utils import AvailabilityChecking +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model -OP_NAME = 'extract_qa_mapper' - -with AvailabilityChecking(['torch', 'transformers', 'vllm'], OP_NAME): - import torch - import transformers # noqa: F401 - import vllm # noqa: F401 +torch = LazyLoader('torch', 'torch') +vllm = LazyLoader('vllm', 'vllm') - # avoid hanging when calling model in multiprocessing - torch.set_num_threads(1) +OP_NAME = 
'extract_qa_mapper' # TODO: Extend LLM-based OPs into API-based implementation. @@ -97,8 +92,6 @@ def __init__(self, self.enable_vllm = enable_vllm if enable_vllm: - import torch - from vllm import SamplingParams assert torch.cuda.device_count() >= 1, 'must be executed in CUDA' if not tensor_parallel_size: @@ -112,7 +105,7 @@ def __init__(self, tensor_parallel_size=tensor_parallel_size, max_model_len=max_model_len, max_num_seqs=max_num_seqs) - self.sampling_params = SamplingParams(**sampling_params) + self.sampling_params = vllm.SamplingParams(**sampling_params) else: self.model_key = prepare_model( model_type='huggingface', diff --git a/data_juicer/ops/mapper/fix_unicode_mapper.py b/data_juicer/ops/mapper/fix_unicode_mapper.py index daa98a47b..13843873c 100644 --- a/data_juicer/ops/mapper/fix_unicode_mapper.py +++ b/data_juicer/ops/mapper/fix_unicode_mapper.py @@ -1,11 +1,11 @@ from data_juicer.utils.lazy_loader import LazyLoader -from ..base_op import AUTOINSTALL, OPERATORS, Mapper - -OP_NAME = 'fix_unicode_mapper' +from ..base_op import OPERATORS, Mapper ftfy = LazyLoader('ftfy', 'ftfy') +OP_NAME = 'fix_unicode_mapper' + @OPERATORS.register_module(OP_NAME) class FixUnicodeMapper(Mapper): @@ -24,7 +24,6 @@ def __init__(self, normalization: str = None, *args, **kwargs): :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['ftfy']) if normalization and len(normalization) > 0: self.normalization = normalization.upper() else: diff --git a/data_juicer/ops/mapper/generate_instruction_mapper.py b/data_juicer/ops/mapper/generate_instruction_mapper.py index 9fafa94e3..537230d07 100644 --- a/data_juicer/ops/mapper/generate_instruction_mapper.py +++ b/data_juicer/ops/mapper/generate_instruction_mapper.py @@ -6,11 +6,14 @@ from loguru import logger from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model from ..base_op import OPERATORS, UNFORKABLE, Mapper +torch = LazyLoader('torch', 'torch') +vllm = LazyLoader('vllm', 'vllm') + DEFAULT_PROMPT_TEMPLATE = """ 请你仔细观察多个示例数据的输入和输出,按照你的理解,总结出相应规矩,然后写出一个新的【问题】和【回答】。注意,新生成的【问题】和【回答】需要满足如下要求: 1. 生成的【问题】和【回答】不能与输入的【问题】和【回答】一致,但是需要保持格式相同。 @@ -25,14 +28,6 @@ OP_NAME = 'generate_instruction_mapper' -with AvailabilityChecking(['torch', 'transformers', 'vllm'], OP_NAME): - import torch - import transformers # noqa: F401 - import vllm # noqa: F401 - - # avoid hanging when calling model in multiprocessing - torch.set_num_threads(1) - # TODO: Extend LLM-based OPs into API-based implementation. 
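
The same pattern covers the heavyweight optional backends: extract_qa_mapper above declares vllm = LazyLoader('vllm', 'vllm') at module level and only touches it (vllm.SamplingParams(...)) on the enable_vllm branch, so the import and install cost is paid only when that branch runs. A small usage sketch of that behaviour follows; the helper name is hypothetical and not part of the patch.

    # Usage sketch only; build_sampling_params is a hypothetical helper.
    # Importing this module stays cheap because the proxy below does not
    # import vllm until an attribute is accessed.
    from data_juicer.utils.lazy_loader import LazyLoader

    vllm = LazyLoader('vllm', 'vllm')  # no import happens at this line


    def build_sampling_params(enable_vllm, **sampling_params):
        if not enable_vllm:
            return None
        # first attribute access resolves (and, if needed, installs) vllm
        return vllm.SamplingParams(**sampling_params)
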
@UNFORKABLE.register_module(OP_NAME) @@ -138,8 +133,6 @@ def __init__(self, self.enable_vllm = enable_vllm if enable_vllm: - import torch - from vllm import SamplingParams assert torch.cuda.device_count() >= 1, 'must be executed in CUDA' if not tensor_parallel_size: @@ -153,7 +146,7 @@ def __init__(self, tensor_parallel_size=tensor_parallel_size, max_model_len=max_model_len, max_num_seqs=max_num_seqs) - self.sampling_params = SamplingParams(**sampling_params) + self.sampling_params = vllm.SamplingParams(**sampling_params) else: self.model_key = prepare_model( model_type='huggingface', diff --git a/data_juicer/ops/mapper/image_captioning_mapper.py b/data_juicer/ops/mapper/image_captioning_mapper.py index 5f04fa97f..adbd5d337 100644 --- a/data_juicer/ops/mapper/image_captioning_mapper.py +++ b/data_juicer/ops/mapper/image_captioning_mapper.py @@ -7,15 +7,18 @@ from pydantic import PositiveInt from data_juicer.utils.constant import HashKeys +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (SpecialTokens, insert_texts_after_placeholders, load_image, remove_non_special_tokens, remove_special_tokens) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Mapper +from ..base_op import OPERATORS, Mapper from ..op_fusion import LOADED_IMAGES +simhash = LazyLoader('simhash', 'simhash') + OP_NAME = 'image_captioning_mapper' @@ -79,7 +82,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['torch', 'transformers', 'simhash-pybind']) if keep_candidate_mode not in [ 'random_any', 'similar_one_simhash', 'all' @@ -237,7 +239,6 @@ def _reduce_captions_per_image(self, chunk, new_generated_text_per_chunk.extend( generated_text_candidates_single_chunk) elif self.keep_candidate_mode == 'similar_one_simhash': - from simhash import num_differing_bits from ..deduplicator.document_simhash_deduplicator import \ DocumentSimhashDeduplicator @@ -259,7 +260,7 @@ def _reduce_captions_per_image(self, chunk, for candidate_text in generated_text_candidates_single_chunk ] hamming_distances = [ - num_differing_bits(ori_text_hash, generated_text_hash) + simhash.num_differing_bits(ori_text_hash, generated_text_hash) for generated_text_hash in generated_text_hashes ] max_index = min(range(len(hamming_distances)), diff --git a/data_juicer/ops/mapper/image_diffusion_mapper.py b/data_juicer/ops/mapper/image_diffusion_mapper.py index a69d8ac6a..df2cb9eb7 100644 --- a/data_juicer/ops/mapper/image_diffusion_mapper.py +++ b/data_juicer/ops/mapper/image_diffusion_mapper.py @@ -12,7 +12,7 @@ load_image, remove_special_tokens) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Mapper +from ..base_op import OPERATORS, Mapper from ..op_fusion import LOADED_IMAGES OP_NAME = 'image_diffusion_mapper' @@ -92,8 +92,6 @@ def __init__(self, caption_key is None. 
""" super().__init__(*args, **kwargs) - AUTOINSTALL.check( - ['diffusers', 'torch', 'transformers', 'simhash-pybind']) self._init_parameters = self.remove_extra_parameters(locals()) self.strength = strength self.guidance_scale = guidance_scale diff --git a/data_juicer/ops/mapper/image_face_blur_mapper.py b/data_juicer/ops/mapper/image_face_blur_mapper.py index 0afbf7bbb..f1b53ebc9 100644 --- a/data_juicer/ops/mapper/image_face_blur_mapper.py +++ b/data_juicer/ops/mapper/image_face_blur_mapper.py @@ -1,6 +1,7 @@ import os from loguru import logger +from PIL import ImageFilter from pydantic import NonNegativeFloat from data_juicer.utils.constant import Fields @@ -10,13 +11,12 @@ load_image) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Mapper +from ..base_op import OPERATORS, UNFORKABLE, Mapper from ..op_fusion import LOADED_IMAGES -OP_NAME = 'image_face_blur_mapper' - cv2 = LazyLoader('cv2', 'cv2') -PIL = LazyLoader('PIL', 'PIL') + +OP_NAME = 'image_face_blur_mapper' @UNFORKABLE.register_module(OP_NAME) @@ -51,7 +51,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['opencv-python', 'Pillow']) self._init_parameters = self.remove_extra_parameters(locals()) if cv_classifier == '': @@ -65,11 +64,11 @@ def __init__(self, raise ValueError('Radius must be >= 0. ') if blur_type == 'mean': - self.blur = PIL.ImageFilter.BLUR + self.blur = ImageFilter.BLUR elif blur_type == 'box': - self.blur = PIL.ImageFilter.BoxBlur(radius) + self.blur = ImageFilter.BoxBlur(radius) else: - self.blur = PIL.ImageFilter.GaussianBlur(radius) + self.blur = ImageFilter.GaussianBlur(radius) self.blur_type = blur_type self.radius = radius diff --git a/data_juicer/ops/mapper/image_tagging_mapper.py b/data_juicer/ops/mapper/image_tagging_mapper.py index 0bd2b89e2..bbdeea914 100644 --- a/data_juicer/ops/mapper/image_tagging_mapper.py +++ b/data_juicer/ops/mapper/image_tagging_mapper.py @@ -2,24 +2,18 @@ import numpy as np -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import load_data_with_context, load_image from data_juicer.utils.model_utils import get_model, prepare_model from ..base_op import OPERATORS, UNFORKABLE, Mapper from ..op_fusion import LOADED_IMAGES -OP_NAME = 'image_tagging_mapper' - -with AvailabilityChecking( - ['torch', 'git+https://github.com/xinyu1205/recognize-anything.git'], - OP_NAME): - import ram # noqa: F401 - import torch +torch = LazyLoader('torch', 'torch') +ram = LazyLoader('ram', 'ram') - # avoid hanging when calling recognizeAnything in multiprocessing - torch.set_num_threads(1) +OP_NAME = 'image_tagging_mapper' @UNFORKABLE.register_module(OP_NAME) @@ -47,8 +41,7 @@ def __init__(self, model_type='recognizeAnything', pretrained_model_name_or_path='ram_plus_swin_large_14m.pth', input_size=384) - from ram import get_transform - self.transform = get_transform(image_size=384) + self.transform = ram.get_transform(image_size=384) self.tag_field_name = tag_field_name def process(self, sample, rank=None, context=False): diff --git a/data_juicer/ops/mapper/nlpaug_en_mapper.py b/data_juicer/ops/mapper/nlpaug_en_mapper.py index fffb3488b..22ecc4b62 100644 --- a/data_juicer/ops/mapper/nlpaug_en_mapper.py +++ b/data_juicer/ops/mapper/nlpaug_en_mapper.py @@ -5,15 +5,15 @@ from data_juicer.utils.lazy_loader import 
LazyLoader -from ..base_op import AUTOINSTALL, OPERATORS, Mapper - -OP_NAME = 'nlpaug_en_mapper' +from ..base_op import OPERATORS, Mapper nlpaug = LazyLoader('nlpaug', 'nlpaug') nac = LazyLoader('nac', 'nlpaug.augmenter.char') naw = LazyLoader('naw', 'nlpaug.augmenter.word') naf = LazyLoader('naf', 'nlpaug.flow') +OP_NAME = 'nlpaug_en_mapper' + @OPERATORS.register_module(OP_NAME) class NlpaugEnMapper(Mapper): @@ -86,7 +86,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['nlpaug']) self.aug_num = aug_num if aug_num >= 10: diff --git a/data_juicer/ops/mapper/nlpcda_zh_mapper.py b/data_juicer/ops/mapper/nlpcda_zh_mapper.py index 32f33368b..8a72ec613 100644 --- a/data_juicer/ops/mapper/nlpcda_zh_mapper.py +++ b/data_juicer/ops/mapper/nlpcda_zh_mapper.py @@ -6,12 +6,12 @@ from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.logger_utils import HiddenPrints -from ..base_op import AUTOINSTALL, OPERATORS, Mapper - -OP_NAME = 'nlpcda_zh_mapper' +from ..base_op import OPERATORS, Mapper nlpcda = LazyLoader('nlpcda', 'nlpcda') +OP_NAME = 'nlpcda_zh_mapper' + @OPERATORS.register_module(OP_NAME) class NlpcdaZhMapper(Mapper): @@ -70,7 +70,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['nlpcda']) self.aug_num = aug_num if aug_num >= 10: diff --git a/data_juicer/ops/mapper/optimize_instruction_mapper.py b/data_juicer/ops/mapper/optimize_instruction_mapper.py index a9ec0564c..752aec408 100644 --- a/data_juicer/ops/mapper/optimize_instruction_mapper.py +++ b/data_juicer/ops/mapper/optimize_instruction_mapper.py @@ -3,21 +3,16 @@ from loguru import logger from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper -from data_juicer.utils.availability_utils import AvailabilityChecking +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model +torch = LazyLoader('torch', 'torch') +vllm = LazyLoader('vllm', 'vllm') + DEFAULT_SYSTEM_PROMPT = '请优化这个指令,将其修改为一个更详细具体的指令。' OP_NAME = 'optimize_instruction_mapper' -with AvailabilityChecking(['torch', 'transformers', 'vllm'], OP_NAME): - import torch - import transformers # noqa: F401 - import vllm # noqa: F401 - - # avoid hanging when calling model in multiprocessing - torch.set_num_threads(1) - # TODO: Extend LLM-based OPs into API-based implementation. 
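
Submodules get the same treatment as top-level packages: nlpaug_en_mapper above keeps its short aliases (nac, naw, naf) but binds each one to its own lazy proxy, so nothing under nlpaug is imported until an augmenter is actually constructed. A brief sketch of how those aliases behave; the helper function is hypothetical.

    # Sketch of the lazy submodule aliases used above; make_char_augmenter is
    # a hypothetical helper for illustration, not part of the patch.
    from data_juicer.utils.lazy_loader import LazyLoader

    nac = LazyLoader('nac', 'nlpaug.augmenter.char')
    naw = LazyLoader('naw', 'nlpaug.augmenter.word')


    def make_char_augmenter():
        # importing nlpaug.augmenter.char is deferred to this first access
        return nac.KeyboardAug()
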
@UNFORKABLE.register_module(OP_NAME) @@ -70,9 +65,6 @@ def __init__(self, self.enable_vllm = enable_vllm if enable_vllm: - import torch - from vllm import SamplingParams - assert torch.cuda.device_count() >= 1, 'must be executed in CUDA' if not tensor_parallel_size: tensor_parallel_size = torch.cuda.device_count() @@ -85,7 +77,7 @@ def __init__(self, tensor_parallel_size=tensor_parallel_size, max_model_len=max_model_len, max_num_seqs=max_num_seqs) - self.sampling_params = SamplingParams(**sampling_params) + self.sampling_params = vllm.SamplingParams(**sampling_params) else: self.model_key = prepare_model( model_type='huggingface', diff --git a/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.py b/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.py index eea948097..bfc564433 100644 --- a/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.py +++ b/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.py @@ -2,7 +2,7 @@ from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Mapper +from ..base_op import OPERATORS, Mapper from ..common import (SPECIAL_CHARACTERS, get_words_from_document, merge_on_whitespace_tab_newline, split_on_newline_tab_whitespace, strip) @@ -34,7 +34,6 @@ def __init__(self, if substrings is None: substrings = ['http', 'www', '.com', 'href', '//'] super().__init__(*args, **kwargs) - AUTOINSTALL.check(['sentencepiece']) self.tokenization = tokenization self.substrings = substrings self.lang = lang diff --git a/data_juicer/ops/mapper/sentence_split_mapper.py b/data_juicer/ops/mapper/sentence_split_mapper.py index c71479de4..9a811d40d 100644 --- a/data_juicer/ops/mapper/sentence_split_mapper.py +++ b/data_juicer/ops/mapper/sentence_split_mapper.py @@ -1,6 +1,6 @@ from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Mapper +from ..base_op import OPERATORS, Mapper from ..common import get_sentences_from_document OP_NAME = 'sentence_split_mapper' @@ -21,7 +21,6 @@ def __init__(self, lang: str = 'en', *args, **kwargs): :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['nltk']) self.lang = lang self.model_key = prepare_model(model_type='nltk', lang=lang) diff --git a/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py b/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py index 9eaa960e7..2b216f308 100644 --- a/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py @@ -3,10 +3,11 @@ import regex as re +from data_juicer.utils.lazy_loader import AUTOINSTALL from data_juicer.utils.mm_utils import SpecialTokens, extract_audio_from_video from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Mapper +from ..base_op import OPERATORS, Mapper NAME = 'video_captioning_from_audio_mapper' diff --git a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py index 08eee2add..d583b7fc2 100644 --- a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py @@ -9,6 +9,7 @@ from pydantic import PositiveInt from data_juicer.utils.constant import HashKeys +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (SpecialTokens, close_video, 
extract_key_frames, extract_video_frames_uniformly, @@ -18,9 +19,11 @@ remove_special_tokens) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Mapper +from ..base_op import OPERATORS, Mapper from ..op_fusion import LOADED_VIDEOS +simhash = LazyLoader('simhash', 'simhash') + OP_NAME = 'video_captioning_from_frames_mapper' @@ -106,7 +109,6 @@ def __init__( :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['torch', 'transformers', 'simhash-pybind']) if keep_candidate_mode not in [ 'random_any', 'similar_one_simhash', 'all' @@ -292,8 +294,6 @@ def _reduce_captions(self, chunk, generated_text_candidates_single_chunk): generated_text_per_chunk.extend( generated_text_candidates_single_chunk) elif self.keep_candidate_mode == 'similar_one_simhash': - from simhash import num_differing_bits - from ..deduplicator.document_simhash_deduplicator import \ DocumentSimhashDeduplicator @@ -315,7 +315,7 @@ def _reduce_captions(self, chunk, generated_text_candidates_single_chunk): for candidate_text in generated_text_candidates_single_chunk ] hamming_distances = [ - num_differing_bits(ori_text_hash, generated_text_hash) + simhash.num_differing_bits(ori_text_hash, generated_text_hash) for generated_text_hash in generated_text_hashes ] max_index = min(range(len(hamming_distances)), diff --git a/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py b/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py index 02cf781a0..4ab2e0ff4 100644 --- a/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py @@ -4,10 +4,11 @@ from pydantic import PositiveInt from data_juicer.utils.constant import Fields +from data_juicer.utils.lazy_loader import AUTOINSTALL from data_juicer.utils.mm_utils import SpecialTokens, remove_special_tokens from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Mapper +from ..base_op import OPERATORS, Mapper NAME = 'video_captioning_from_summarizer_mapper' @@ -84,13 +85,11 @@ def __init__(self, AUTOINSTALL.check([ 'torch', 'transformers', - 'simhash-pybind', # by video caption 'transformers_stream_generator', 'einops', 'accelerate', 'tiktoken', # by audio caption 'torchaudio', # by audio tag - 'ram@git+https://github.com/xinyu1205/recognize-anything.git' ]) self.keep_original_sample = keep_original_sample diff --git a/data_juicer/ops/mapper/video_captioning_from_video_mapper.py b/data_juicer/ops/mapper/video_captioning_from_video_mapper.py index ebb26c53b..8000780bc 100644 --- a/data_juicer/ops/mapper/video_captioning_from_video_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_video_mapper.py @@ -9,6 +9,7 @@ from pydantic import PositiveInt from data_juicer.utils.constant import HashKeys +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (SpecialTokens, close_video, extract_key_frames, extract_video_frames_uniformly, @@ -18,9 +19,11 @@ remove_special_tokens) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Mapper +from ..base_op import OPERATORS, Mapper from ..op_fusion import LOADED_VIDEOS +simhash = LazyLoader('simhash', 'simhash') + OP_NAME = 'video_captioning_from_video_mapper' @@ -106,7 +109,6 @@ def __init__( :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['torch', 'transformers', 
'simhash-pybind']) if keep_candidate_mode not in [ 'random_any', 'similar_one_simhash', 'all' @@ -299,8 +301,6 @@ def _reduce_captions(self, chunk, generated_text_candidates_single_chunk): generated_text_per_chunk.extend( generated_text_candidates_single_chunk) elif self.keep_candidate_mode == 'similar_one_simhash': - from simhash import num_differing_bits - from ..deduplicator.document_simhash_deduplicator import \ DocumentSimhashDeduplicator @@ -322,7 +322,7 @@ def _reduce_captions(self, chunk, generated_text_candidates_single_chunk): for candidate_text in generated_text_candidates_single_chunk ] hamming_distances = [ - num_differing_bits(ori_text_hash, generated_text_hash) + simhash.num_differing_bits(ori_text_hash, generated_text_hash) for generated_text_hash in generated_text_hashes ] max_index = min(range(len(hamming_distances)), diff --git a/data_juicer/ops/mapper/video_face_blur_mapper.py b/data_juicer/ops/mapper/video_face_blur_mapper.py index 48fa0ce7b..2122a3eed 100644 --- a/data_juicer/ops/mapper/video_face_blur_mapper.py +++ b/data_juicer/ops/mapper/video_face_blur_mapper.py @@ -1,6 +1,7 @@ import os import av +from PIL import ImageFilter from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import transfer_filename @@ -10,13 +11,12 @@ process_each_frame) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Mapper +from ..base_op import OPERATORS, UNFORKABLE, Mapper from ..op_fusion import LOADED_VIDEOS -OP_NAME = 'video_face_blur_mapper' - cv2 = LazyLoader('cv2', 'cv2') -PIL = LazyLoader('PIL', 'PIL') + +OP_NAME = 'video_face_blur_mapper' @UNFORKABLE.register_module(OP_NAME) @@ -51,7 +51,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['opencv-python', 'Pillow']) self._init_parameters = self.remove_extra_parameters(locals()) if cv_classifier == '': @@ -65,11 +64,11 @@ def __init__(self, raise ValueError('Radius must be >= 0. 
') if blur_type == 'mean': - self.blur = PIL.ImageFilter.BLUR + self.blur = ImageFilter.BLUR elif blur_type == 'box': - self.blur = PIL.ImageFilter.BoxBlur(radius) + self.blur = ImageFilter.BoxBlur(radius) else: - self.blur = PIL.ImageFilter.GaussianBlur(radius) + self.blur = ImageFilter.GaussianBlur(radius) self.blur_type = blur_type self.radius = radius diff --git a/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py b/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py index 4a3cf0053..b9688bb9d 100644 --- a/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py +++ b/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py @@ -5,13 +5,13 @@ from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.logger_utils import HiddenPrints -from ..base_op import AUTOINSTALL, OPERATORS, Mapper - -OP_NAME = 'video_ffmpeg_wrapped_mapper' +from ..base_op import OPERATORS, Mapper with HiddenPrints(): ffmpeg = LazyLoader('ffmpeg', 'ffmpeg') +OP_NAME = 'video_ffmpeg_wrapped_mapper' + @OPERATORS.register_module(OP_NAME) class VideoFFmpegWrappedMapper(Mapper): @@ -40,7 +40,6 @@ def __init__( :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['ffmpeg-python']) self._init_parameters = self.remove_extra_parameters(locals()) self.filter_name = filter_name diff --git a/data_juicer/ops/mapper/video_remove_watermark_mapper.py b/data_juicer/ops/mapper/video_remove_watermark_mapper.py index 054a4f9f0..2deb50adb 100644 --- a/data_juicer/ops/mapper/video_remove_watermark_mapper.py +++ b/data_juicer/ops/mapper/video_remove_watermark_mapper.py @@ -15,14 +15,14 @@ parse_string_to_roi, process_each_frame) -from ..base_op import AUTOINSTALL, OPERATORS, Mapper +from ..base_op import OPERATORS, Mapper from ..op_fusion import LOADED_VIDEOS -OP_NAME = 'video_remove_watermark_mapper' - with HiddenPrints(): cv2 = LazyLoader('cv2', 'cv2') +OP_NAME = 'video_remove_watermark_mapper' + @OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) @@ -69,7 +69,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['opencv-python']) self._init_parameters = self.remove_extra_parameters(locals()) if roi_type not in ['ratio', 'pixel']: diff --git a/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py b/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py index ac49e29e7..04a658452 100644 --- a/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py +++ b/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py @@ -8,13 +8,13 @@ from data_juicer.utils.logger_utils import HiddenPrints from data_juicer.utils.mm_utils import close_video, load_video -from ..base_op import AUTOINSTALL, OPERATORS, Mapper - -OP_NAME = 'video_resize_aspect_ratio_mapper' +from ..base_op import OPERATORS, Mapper with HiddenPrints(): ffmpeg = LazyLoader('ffmpeg', 'ffmpeg') +OP_NAME = 'video_resize_aspect_ratio_mapper' + def rescale(width, height, ori_ratio, min_ratio, max_ratio, strategy): @@ -88,7 +88,6 @@ def __init__( :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['ffmpeg-python']) self._init_parameters = self.remove_extra_parameters(locals()) strategy = strategy.lower() diff --git a/data_juicer/ops/mapper/video_resize_resolution_mapper.py b/data_juicer/ops/mapper/video_resize_resolution_mapper.py index 961c755c1..065bf483a 100644 --- a/data_juicer/ops/mapper/video_resize_resolution_mapper.py +++ b/data_juicer/ops/mapper/video_resize_resolution_mapper.py @@ -10,14 +10,14 @@ from 
data_juicer.utils.logger_utils import HiddenPrints from data_juicer.utils.mm_utils import close_video, load_video -from ..base_op import AUTOINSTALL, OPERATORS, Mapper +from ..base_op import OPERATORS, Mapper from ..op_fusion import LOADED_VIDEOS -OP_NAME = 'video_resize_resolution_mapper' - with HiddenPrints(): ffmpeg = LazyLoader('ffmpeg', 'ffmpeg') +OP_NAME = 'video_resize_resolution_mapper' + @OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) @@ -59,7 +59,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['ffmpeg-python']) self._init_parameters = self.remove_extra_parameters(locals()) force_original_aspect_ratio = force_original_aspect_ratio.lower() diff --git a/data_juicer/ops/mapper/video_split_by_scene_mapper.py b/data_juicer/ops/mapper/video_split_by_scene_mapper.py index 6c042d254..7a4d038ce 100644 --- a/data_juicer/ops/mapper/video_split_by_scene_mapper.py +++ b/data_juicer/ops/mapper/video_split_by_scene_mapper.py @@ -10,12 +10,12 @@ from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import SpecialTokens -from ..base_op import AUTOINSTALL, OPERATORS, Mapper - -OP_NAME = 'video_split_by_scene_mapper' +from ..base_op import OPERATORS, Mapper scenedetect = LazyLoader('scenedetect', 'scenedetect') +OP_NAME = 'video_split_by_scene_mapper' + def replace_func(match, scene_counts_iter): try: @@ -60,7 +60,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['scenedetect[opencv]']) self._init_parameters = self.remove_extra_parameters(locals()) if detector not in self.avaliable_detectors: diff --git a/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py b/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py index 5dcf3b71a..1b251d272 100644 --- a/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py +++ b/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py @@ -2,16 +2,16 @@ import numpy as np from data_juicer.utils.constant import Fields -from data_juicer.utils.lazy_loader import LazyLoader +from data_juicer.utils.lazy_loader import AUTOINSTALL, LazyLoader from data_juicer.utils.mm_utils import extract_audio_from_video from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, Mapper - -OP_NAME = 'video_tagging_from_audio_mapper' +from ..base_op import OPERATORS, Mapper torch = LazyLoader('torch', 'torch') +OP_NAME = 'video_tagging_from_audio_mapper' + @OPERATORS.register_module(OP_NAME) class VideoTaggingFromAudioMapper(Mapper): @@ -38,7 +38,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check(['torch', 'transformers', 'torchaudio']) + AUTOINSTALL.check(['torchaudio']) self.model_key = prepare_model(model_type='huggingface', pretrained_model_name_or_path=hf_ast, trust_remote_code=trust_remote_code) diff --git a/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py b/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py index a8a70fb82..55fcc2ca1 100644 --- a/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py +++ b/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py @@ -10,14 +10,14 @@ load_data_with_context, load_video) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Mapper +from ..base_op import OPERATORS, UNFORKABLE, Mapper from ..op_fusion import LOADED_VIDEOS -OP_NAME = 
'video_tagging_from_frames_mapper' - ram = LazyLoader('ram', 'ram') torch = LazyLoader('torch', 'torch') +OP_NAME = 'video_tagging_from_frames_mapper' + @UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) @@ -56,10 +56,6 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) - AUTOINSTALL.check([ - 'torch', - 'ram@git+https://github.com/xinyu1205/recognize-anything.git' - ]) if frame_sampling_method not in ['all_keyframes', 'uniform']: raise ValueError( f'Frame sampling method [{frame_sampling_method}] is not ' diff --git a/data_juicer/utils/auto_install_mapping.py b/data_juicer/utils/auto_install_mapping.py new file mode 100644 index 000000000..1705afa18 --- /dev/null +++ b/data_juicer/utils/auto_install_mapping.py @@ -0,0 +1,103 @@ +# Map the imported module to the require package we need to install +MODULE_TO_PKGS = { + 'aesthetics_predictor': ['simple-aesthetics-predictor'], + 'cv2': ['opencv-python'], + 'fasttext': ['fasttext', 'fasttext-wheel'], + 'ffmpeg': ['ffmpeg-python'], + 'PIL': ['Pillow'], + 'ram': ['ram@git+https://github.com/xinyu1205/recognize-anything.git'], + 'scenedetect': ['scenedetect[opencv]'], + 'simhash': ['simhash-pybind'], +} + +# Packages to corresponding ops that require them +PKG_TO_OPS = { + 'torch': [ + 'image_aesthetics_filter', 'image_nsfw_filter', + 'image_text_matching_filter', 'image_text_similarity_filter', + 'image_watermark_filter', 'phrase_grounding_recall_filter', + 'video_aesthetics_filter', 'video_frames_text_similarity_filter', + 'video_nsfw_filter', 'video_tagging_from_frames_filter', + 'video_watermark_filter', 'extract_qa_mapper', + 'generate_instruction_mapper', 'image_captioning_mapper', + 'image_diffusion_mapper', 'image_tagging_mapper', + 'optimize_instruction_mapper', 'video_captioning_from_frames_mapper', + 'video_captioning_from_summarizer_mapper', + 'video_captioning_from_video_mapper', + 'video_tagging_from_audio_mapper', 'video_tagging_from_frames_mapper' + ], + 'torchaudio': [ + 'video_captioning_from_summarizer_mapper', + 'video_tagging_from_audio_mapper' + ], + 'easyocr': ['video_ocr_area_ratio_filter'], + 'fasttext': ['language_id_score_filter'], + 'fasttext-wheel': ['language_id_score_filter'], + 'kenlm': ['perplexity_filter'], + 'sentencepiece': [ + 'flagged_words_filter', 'perplexity_filter', 'stopwords_filter', + 'word_repetition_filter', 'words_num_filter' + ], + 'scipy': ['document_minhash_deduplicator'], + 'ftfy': ['fix_unicode_mapper'], + 'simhash-pybind': [ + 'document_simhash_deduplicator', 'image_captioning_mapper', + 'image_diffusion_mapper', 'video_captioning_from_frames_mapper', + 'video_captioning_from_summarizer_mapper', + 'video_captioning_from_video_mapper' + ], + 'selectolax': ['clean_html_mapper'], + 'nlpaug': ['nlpaug_en_mapper'], + 'nlpcda': ['nlpcda'], + 'nltk': ['phrase_grounding_recall_filter', 'sentence_split_mapper'], + 'transformers': [ + 'alphanumeric_filter', 'image_aesthetics_filter', 'image_nsfw_filter', + 'image_text_matching_filter', 'image_text_similarity_filter', + 'image_watermark_filter', 'phrase_grounding_recall_filter', + 'token_num_filter', 'video_aesthetics_filter', + 'video_frames_text_similarity_filter', 'video_nsfw_filter', + 'extract_qa_mapper', 'generate_instruction_mapper', + 'image_captioning_mapper', 'image_diffusion_mapper', + 'optimize_instruction_mapper', 'video_captioning_from_audio_mapper', + 'video_captioning_from_frames_mapper', + 'video_captioning_from_summarizer_mapper', + 'video_captioning_from_video_mapper', 
'video_tagging_from_audio_mapper' + ], + 'transformers_stream_generator': [ + 'video_captioning_from_audio_mapper', + 'video_captioning_from_summarizer_mapper' + ], + 'einops': [ + 'video_captioning_from_audio_mapper', + 'video_captioning_from_summarizer_mapper' + ], + 'accelerate': [ + 'video_captioning_from_audio_mapper', + 'video_captioning_from_summarizer_mapper' + ], + 'tiktoken': [ + 'video_captioning_from_audio_mapper', + 'video_captioning_from_summarizer_mapper' + ], + 'opencc': ['chinese_convert_mapper'], + 'imagededup': ['image_deduplicator', 'ray_image_deduplicator'], + 'spacy-pkuseg': ['text_action_filter', 'text_entity_dependency_filter'], + 'diffusers': ['image_diffusion_mapper'], + 'simple-aesthetics-predictor': + ['image_aesthetics_filter', 'video_aesthetics_filter'], + 'scenedetect[opencv]': ['video_split_by_scene_mapper'], + 'ffmpeg-python': [ + 'audio_ffmpeg_wrapped_mapper', 'video_ffmpeg_wrapped_mapper', + 'video_resize_aspect_ratio_mapper', 'video_resize_resolution_mapper' + ], + 'opencv-python': [ + 'image_face_ratio_filter', 'video_motion_score_filter', + 'image_face_blur_mapper', 'video_face_blur_mapper', + 'video_remove_watermark_mapper' + ], + 'vllm': [ + 'extract_qa_mapper', 'generate_instruction_mapper', + 'optimize_instruction_mapper' + ], + 'ram': ['image_tagging_mapper', 'video_tagging_from_frames_mapper'] +} diff --git a/data_juicer/utils/auto_install_utils.py b/data_juicer/utils/auto_install_utils.py index 038637879..9d8091eae 100644 --- a/data_juicer/utils/auto_install_utils.py +++ b/data_juicer/utils/auto_install_utils.py @@ -1,12 +1,22 @@ -import importlib import os -import platform import subprocess import sys from loguru import logger -CHECK_SYSTEM_INFO_ONCE = True +from data_juicer.utils.auto_install_mapping import MODULE_TO_PKGS +from data_juicer.utils.availability_utils import _torch_check_and_set + + +def _is_module_installed(module_name): + if module_name in MODULE_TO_PKGS: + pkgs = MODULE_TO_PKGS[module_name] + else: + pkgs = [module_name] + for pkg in pkgs: + if not _is_package_installed(pkg): + return False + return True def _is_package_installed(package_name): @@ -16,34 +26,13 @@ def _is_package_installed(package_name): package_name = package_name.split('[')[0] try: subprocess.check_output( - [sys.executable, '-m', 'pip', 'show', package_name], + [sys.executable, '-m', 'pip', 'show', '-q', package_name], stderr=subprocess.STDOUT) return True except subprocess.CalledProcessError: return False -def _torch_check_and_set(): - # only for python3.8 on mac - global CHECK_SYSTEM_INFO_ONCE - if CHECK_SYSTEM_INFO_ONCE and importlib.util.find_spec( - 'torch') is not None: - major, minor = sys.version_info[:2] - system = platform.system() - if major == 3 and minor == 8 and system == 'Darwin': - logger.warning( - 'The torch.set_num_threads function does not ' - 'work in python3.8 version on Mac systems. We will set ' - 'OMP_NUM_THREADS to 1 manually before importing torch') - - os.environ['OMP_NUM_THREADS'] = str(1) - CHECK_SYSTEM_INFO_ONCE = False - import torch - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) - - class AutoInstaller(object): """ This class is used to install the required @@ -68,17 +57,38 @@ def __init__(self, require_f_paths=[]): '=', ' ').split(' ')[0] self.version_map[clean_req] = req - def check(self, check_pkgs): + def check(self, check_pkgs, param=None): """ - install if the package is not importable. + install if the package is not installed. 
""" for pkg in check_pkgs: if not _is_package_installed(pkg): - logger.info(f'Installing {pkg} ...') + logger.warning(f'Installing {pkg} ...') if pkg in self.version_map: pkg = self.version_map[pkg] - pip_cmd = [sys.executable, '-m', 'pip', 'install', pkg] + # not install the dependency of this pkg + if param is None: + pip_cmd = [sys.executable, '-m', 'pip', 'install', pkg] + else: + pip_cmd = [ + sys.executable, '-m', 'pip', 'install', param, pkg + ] subprocess.check_call(pip_cmd) logger.info(f'The {pkg} installed.') if pkg == 'torch': _torch_check_and_set() + + def install(self, module): + """ + install package for given module. + """ + if module in MODULE_TO_PKGS: + pkgs = MODULE_TO_PKGS[module] + else: + pkgs = [module] + for pkg in pkgs: + if pkg in self.version_map: + pkg = self.version_map[pkg] + pip_cmd = [sys.executable, '-m', 'pip', 'install', pkg] + subprocess.check_call(pip_cmd) + logger.info(f'The {pkg} installed.') diff --git a/data_juicer/utils/availability_utils.py b/data_juicer/utils/availability_utils.py index 5f16939af..d31a564a4 100644 --- a/data_juicer/utils/availability_utils.py +++ b/data_juicer/utils/availability_utils.py @@ -1,88 +1,13 @@ import importlib.metadata import importlib.util +import os +import platform +import sys from typing import Tuple, Union from loguru import logger -from data_juicer.utils.auto_install_utils import _torch_check_and_set - -UNAVAILABLE_OPERATORS = {} - - -class UnavailableOperator: - - def __init__(self, op_name, requires): - self.op_name = op_name - self.requires = requires - - def get_warning_msg(self): - return f'This OP [{self.op_name}] is unavailable due to importing ' \ - f'third-party requirements of this OP failure: ' \ - f'{self.requires}. You can either run ' \ - f'`pip install -v -e .[sci]` to install all requirements for ' \ - f'all OPs, or run `pip install {" ".join(self.requires)}` ' \ - f'with library version specified by ' \ - f'`environments/science_requires.txt` to install libraries ' \ - f'required by this OP. Data processing will skip this OP later.' - - -class AvailabilityChecking: - """Define a range that checks the availability of third-party libraries for - OPs or other situations. If the checking failed, add corresponding OP to - the unavailable OP - list and skip them when initializing OPs with warnings. - """ - - def __init__( - self, - requires_list, - op_name=None, - requires_type=None, - ): - """ - Initialization method. - - :param requires_list: libraries to import in this range - :param op_name: which op requires these libraries. In default, it's - None, which means the importing process is not in an OP. - """ - self.requires_list = requires_list - self.op_name = op_name - self.requires_type = requires_type - - self.error_msg = f'No module named {self.requires_list}. You might ' \ - f'need to install it by running `pip install ' \ - f'{" ".join(self.requires_list)}`.' 
- if self.requires_type: - self.error_msg += f' Or install all related requires by running ' \ - f'`pip install -v -e .[{self.requires_type}]`' - - def __enter__(self): - _torch_check_and_set() - - def __exit__(self, exc_type, exc_val, exc_tb): - if exc_type is ModuleNotFoundError: - if self.op_name: - # ModuleNotFoundError for OP: register to UNAVAILABLE_OPERATORS - UNAVAILABLE_OPERATORS[self.op_name] = UnavailableOperator( - op_name=self.op_name, - requires=self.requires_list, - ) - else: - # other situations: print error message and exit - logger.error(f'{exc_type.__name__}: {exc_val}') - logger.error(f'{exc_tb.tb_frame}') - logger.error(self.error_msg) - exit(0) - elif exc_type is None: - # import libs successfully - pass - else: - # other exceptions: raise the exception directly - return False - - # return True to suppress the exception - return True +CHECK_SYSTEM_INFO_ONCE = True def _is_package_available( @@ -103,3 +28,24 @@ def _is_package_available( return package_exists, package_version else: return package_exists + + +def _torch_check_and_set(): + # only for python3.8 on mac + global CHECK_SYSTEM_INFO_ONCE + if CHECK_SYSTEM_INFO_ONCE and importlib.util.find_spec( + 'torch') is not None: + major, minor = sys.version_info[:2] + system = platform.system() + if major == 3 and minor == 8 and system == 'Darwin': + logger.warning( + 'The torch.set_num_threads function does not ' + 'work in python3.8 version on Mac systems. We will set ' + 'OMP_NUM_THREADS to 1 manually before importing torch') + + os.environ['OMP_NUM_THREADS'] = str(1) + CHECK_SYSTEM_INFO_ONCE = False + import torch + + # avoid hanging when calling clip in multiprocessing + torch.set_num_threads(1) diff --git a/data_juicer/utils/lazy_loader.py b/data_juicer/utils/lazy_loader.py index cc8b392ce..9e5c414f4 100644 --- a/data_juicer/utils/lazy_loader.py +++ b/data_juicer/utils/lazy_loader.py @@ -3,8 +3,22 @@ import importlib import inspect +import os import types +from loguru import logger + +from data_juicer.utils.auto_install_utils import (AutoInstaller, + _is_module_installed) +from data_juicer.utils.availability_utils import _torch_check_and_set + +current_path = os.path.dirname(os.path.realpath(__file__)) +science_file_path = os.path.join(current_path, + '../../environments/science_requires.txt') +dist_file_path = os.path.join(current_path, + '../../environments/dist_requires.txt') +AUTOINSTALL = AutoInstaller([science_file_path, dist_file_path]) + class LazyLoader(types.ModuleType): """ @@ -14,18 +28,29 @@ class LazyLoader(types.ModuleType): """ # The lint error here is incorrect. 
- def __init__(self, local_name, name): + def __init__(self, local_name, name, auto_install=True): self._local_name = local_name # get last frame in the stack frame = inspect.currentframe().f_back # get the globals of module who calls LazyLoader self._parent_module_globals = frame.f_globals + self.auto_install = auto_install super(LazyLoader, self).__init__(name) def _load(self): + if self.__name__ == 'torch': + _torch_check_and_set() + # Auto install if necessary + module_name = self.__name__.split('.')[0] + if not _is_module_installed(module_name): + logger.warning( + f"Module '{module_name}' not installed or fully installed.") + logger.warning(f"Auto installing '{module_name}' ...") + AUTOINSTALL.install(module_name) # Import the target module and insert it into the parent's namespace module = importlib.import_module(self.__name__) + self._parent_module_globals[self._local_name] = module # Update this object's dict so that if someone keeps a reference to the diff --git a/data_juicer/utils/mm_utils.py b/data_juicer/utils/mm_utils.py index 5b3ec0430..873b363e6 100644 --- a/data_juicer/utils/mm_utils.py +++ b/data_juicer/utils/mm_utils.py @@ -13,6 +13,9 @@ from data_juicer.utils.constant import DEFAULT_PREFIX, Fields from data_juicer.utils.file_utils import add_suffix_to_filename +from data_juicer.utils.lazy_loader import LazyLoader + +cv2 = LazyLoader('cv2', 'cv2') # suppress most warnings from av av.logging.set_level(av.logging.PANIC) @@ -129,8 +132,6 @@ def pil_to_opencv(pil_image): def detect_faces(image, detector, **extra_kwargs): - import cv2 - img = pil_to_opencv(image) gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) dets = detector.detectMultiScale(gray, **extra_kwargs) diff --git a/data_juicer/utils/model_utils.py b/data_juicer/utils/model_utils.py index 03ddd2639..cda046b81 100644 --- a/data_juicer/utils/model_utils.py +++ b/data_juicer/utils/model_utils.py @@ -9,9 +9,23 @@ from loguru import logger from data_juicer import cuda_device_count, is_cuda_available +from data_juicer.utils.lazy_loader import AUTOINSTALL, LazyLoader from .cache_utils import DATA_JUICER_MODELS_CACHE as DJMC +torch = LazyLoader('torch', 'torch') +transformers = LazyLoader('transformers', 'transformers') +nn = LazyLoader('nn', 'torch.nn') +fasttext = LazyLoader('fasttext', 'fasttext') +sentencepiece = LazyLoader('sentencepiece', 'sentencepiece') +kenlm = LazyLoader('kenlm', 'kenlm') +nltk = LazyLoader('nltk', 'nltk') +aes_pre = LazyLoader('aes_pre', 'aesthetics_predictor') +vllm = LazyLoader('vllm', 'vllm') +diffusers = LazyLoader('diffusers', 'diffusers') +ram = LazyLoader('ram', 'ram.models') +cv2 = LazyLoader('cv2', 'cv2') + MODEL_ZOO = {} # Default cached models links for downloading @@ -98,8 +112,6 @@ def prepare_fasttext_model(model_name='lid.176.bin'): :param model_name: input model name :return: model instance. """ - import fasttext - logger.info('Loading fasttext language identification model...') try: ft_model = fasttext.load_model(check_model(model_name)) @@ -115,8 +127,6 @@ def prepare_sentencepiece_model(model_path): :param model_path: input model path :return: model instance """ - import sentencepiece - logger.info('Loading sentencepiece model...') sentencepiece_model = sentencepiece.SentencePieceProcessor() try: @@ -147,8 +157,6 @@ def prepare_kenlm_model(lang, name_pattern='{}.arpa.bin'): :param lang: language to render model name :return: model instance. 
""" - import kenlm - model_name = name_pattern.format(lang) logger.info('Loading kenlm language model...') @@ -167,8 +175,6 @@ def prepare_nltk_model(lang, name_pattern='punkt.{}.pickle'): :param lang: language to render model name :return: model instance. """ - from nltk.data import load - nltk_to_punkt = { 'en': 'english', 'fr': 'french', @@ -182,9 +188,9 @@ def prepare_nltk_model(lang, name_pattern='punkt.{}.pickle'): logger.info('Loading nltk punkt split model...') try: - nltk_model = load(check_model(model_name)) + nltk_model = nltk.data.load(check_model(model_name)) except: # noqa: E722 - nltk_model = load(check_model(model_name, force=True)) + nltk_model = nltk.data.load(check_model(model_name, force=True)) return nltk_model @@ -200,14 +206,8 @@ def prepare_video_blip_model(pretrained_model_name_or_path, :return: a tuple (model, input processor) if `return_model` is True; otherwise, only the processor is returned. """ - import torch - import torch.nn as nn - from transformers import (AutoModelForCausalLM, AutoModelForSeq2SeqLM, - Blip2Config, Blip2ForConditionalGeneration, - Blip2QFormerModel, Blip2VisionModel) - from transformers.modeling_outputs import BaseModelOutputWithPooling - - class VideoBlipVisionModel(Blip2VisionModel): + + class VideoBlipVisionModel(transformers.Blip2VisionModel): """A simple, augmented version of Blip2VisionModel to handle videos.""" @@ -217,7 +217,8 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[tuple, BaseModelOutputWithPooling]: + ) -> Union[tuple, + transformers.modeling_outputs.BaseModelOutputWithPooling]: """Flatten `pixel_values` along the batch and time dimension, pass it through the original vision model, then unflatten it back. 
@@ -251,7 +252,8 @@ def forward( flat_pixel_values = pixel_values.permute(0, 2, 1, 3, 4).flatten(end_dim=1) - vision_outputs: BaseModelOutputWithPooling = super().forward( + vision_outputs: transformers.modeling_outputs.BaseModelOutputWithPooling = super( # noqa: E501 + ).forward( pixel_values=flat_pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -281,7 +283,7 @@ def forward( for hidden in vision_outputs.attentions) if vision_outputs.attentions is not None else None) if return_dict: - return BaseModelOutputWithPooling( + return transformers.modeling_outputs.BaseModelOutputWithPooling( # noqa: E501 last_hidden_state=last_hidden_state, pooler_output=pooler_output, hidden_states=hidden_states, @@ -290,37 +292,39 @@ def forward( return (last_hidden_state, pooler_output, hidden_states, attentions) - class VideoBlipForConditionalGeneration(Blip2ForConditionalGeneration): + class VideoBlipForConditionalGeneration( + transformers.Blip2ForConditionalGeneration): - def __init__(self, config: Blip2Config) -> None: + def __init__(self, config: transformers.Blip2Config) -> None: # HACK: we call the grandparent super().__init__() to bypass - # Blip2ForConditionalGeneration.__init__() so we can replace - # self.vision_model - super(Blip2ForConditionalGeneration, self).__init__(config) + # transformers.Blip2ForConditionalGeneration.__init__() so we can + # replace self.vision_model + super(transformers.Blip2ForConditionalGeneration, + self).__init__(config) self.vision_model = VideoBlipVisionModel(config.vision_config) self.query_tokens = nn.Parameter( torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size)) - self.qformer = Blip2QFormerModel(config.qformer_config) + self.qformer = transformers.Blip2QFormerModel( + config.qformer_config) self.language_projection = nn.Linear( config.qformer_config.hidden_size, config.text_config.hidden_size) if config.use_decoder_only_language_model: - language_model = AutoModelForCausalLM.from_config( + language_model = transformers.AutoModelForCausalLM.from_config( config.text_config) else: - language_model = AutoModelForSeq2SeqLM.from_config( + language_model = transformers.AutoModelForSeq2SeqLM.from_config( # noqa: E501 config.text_config) self.language_model = language_model # Initialize weights and apply final processing self.post_init() - from transformers import AutoProcessor - processor = AutoProcessor.from_pretrained( + processor = transformers.AutoProcessor.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code) if return_model: model_class = VideoBlipForConditionalGeneration @@ -340,28 +344,23 @@ def prepare_simple_aesthetics_model(pretrained_model_name_or_path, :return: a tuple (model, input processor) if `return_model` is True; otherwise, only the processor is returned. 
""" - from aesthetics_predictor import (AestheticsPredictorV1, - AestheticsPredictorV2Linear, - AestheticsPredictorV2ReLU) - from transformers import CLIPProcessor - - processor = CLIPProcessor.from_pretrained( + processor = transformers.CLIPProcessor.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code) if not return_model: return processor else: if 'v1' in pretrained_model_name_or_path: - model = AestheticsPredictorV1.from_pretrained( + model = aes_pre.AestheticsPredictorV1.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code) elif ('v2' in pretrained_model_name_or_path and 'linear' in pretrained_model_name_or_path): - model = AestheticsPredictorV2Linear.from_pretrained( + model = aes_pre.AestheticsPredictorV2Linear.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code) elif ('v2' in pretrained_model_name_or_path and 'relu' in pretrained_model_name_or_path): - model = AestheticsPredictorV2ReLU.from_pretrained( + model = aes_pre.AestheticsPredictorV2ReLU.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code) else: @@ -382,14 +381,14 @@ def prepare_huggingface_model(pretrained_model_name_or_path, :return: a tuple (model, input processor) if `return_model` is True; otherwise, only the processor is returned. """ - import transformers - from transformers import AutoConfig, AutoProcessor + # require torch for transformer model + AUTOINSTALL.check(['torch']) - processor = AutoProcessor.from_pretrained( + processor = transformers.AutoProcessor.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code) if return_model: - config = AutoConfig.from_pretrained( + config = transformers.AutoConfig.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code) if hasattr(config, 'auto_map'): class_name = next( @@ -427,20 +426,16 @@ def prepare_vllm_model(pretrained_model_name_or_path, :return: a tuple (model, input processor) if `return_model` is True; otherwise, only the processor is returned. """ - from transformers import AutoProcessor - from vllm import LLM as vLLM - - processor = AutoProcessor.from_pretrained( + processor = transformers.AutoProcessor.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code) if return_model: - import torch - model = vLLM(model=pretrained_model_name_or_path, - trust_remote_code=trust_remote_code, - dtype=torch.float16, - tensor_parallel_size=tensor_parallel_size, - max_model_len=max_model_len, - max_num_seqs=max_num_seqs) + model = vllm.LLM(model=pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + dtype=torch.float16, + tensor_parallel_size=tensor_parallel_size, + max_model_len=max_model_len, + max_num_seqs=max_num_seqs) return (model, processor) if return_model else processor @@ -522,15 +517,12 @@ def prepare_diffusion_model(pretrained_model_name_or_path, by Git. :return: a Diffusion model. 
""" - import torch - from diffusers import (AutoPipelineForImage2Image, - AutoPipelineForInpainting, - AutoPipelineForText2Image) + AUTOINSTALL.check(['torch', 'transformers']) diffusion_type_to_pipeline = { - 'image2image': AutoPipelineForImage2Image, - 'text2image': AutoPipelineForText2Image, - 'inpainting': AutoPipelineForInpainting + 'image2image': diffusers.AutoPipelineForImage2Image, + 'text2image': diffusers.AutoPipelineForText2Image, + 'inpainting': diffusers.AutoPipelineForInpainting } if diffusion_type not in diffusion_type_to_pipeline.keys(): @@ -576,24 +568,23 @@ def prepare_recognizeAnything_model( :param model_name: input model name. :param input_size: the input size of the model. """ - from ram.models import ram_plus logger.info('Loading recognizeAnything model...') try: - model = ram_plus(pretrained=check_model(pretrained_model_name_or_path), - image_size=input_size, - vit='swin_l') + model = ram.ram_plus( + pretrained=check_model(pretrained_model_name_or_path), + image_size=input_size, + vit='swin_l') except (RuntimeError, UnpicklingError) as e: # noqa: E722 logger.warning(e) - model = ram_plus(pretrained=check_model(pretrained_model_name_or_path, - force=True), - image_size=input_size, - vit='swin_l') + model = ram.ram_plus(pretrained=check_model( + pretrained_model_name_or_path, force=True), + image_size=input_size, + vit='swin_l') model.eval() return model def prepare_opencv_classifier(model_path): - import cv2 model = cv2.CascadeClassifier(model_path) return model diff --git a/data_juicer/utils/unittest_utils.py b/data_juicer/utils/unittest_utils.py index 039e48eab..81033b224 100644 --- a/data_juicer/utils/unittest_utils.py +++ b/data_juicer/utils/unittest_utils.py @@ -3,14 +3,17 @@ import unittest import numpy -import ray.data as rd from data_juicer import is_cuda_available from data_juicer.core.data import DJDataset, NestedDataset from data_juicer.core.ray_data import RayDataset +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import free_models from data_juicer.utils.registry import Registry +rd = LazyLoader('rd', 'ray.data') +transformers = LazyLoader('transformers', 'transformers') + SKIPPED_TESTS = Registry('SkippedTests') @@ -45,7 +48,6 @@ def tearDownClass(cls, hf_model_name=None) -> None: multiprocess.set_start_method(cls.original_mp_method, force=True) # clean the huggingface model cache files - import transformers if hf_model_name: # given the hf model name, remove this model only model_dir = os.path.join( diff --git a/docs/DeveloperGuide.md b/docs/DeveloperGuide.md index e786c8378..bef46ec92 100644 --- a/docs/DeveloperGuide.md +++ b/docs/DeveloperGuide.md @@ -375,12 +375,11 @@ else: ... ``` -5. As the number of OPs increases, Data-Juicer's dependencies also multiply. To prevent Data-Juicer from becoming excessively burdened with dependencies, we've implemented a strategy that incorporates lazy importing and on-demand installation of additional dependencies required by OPs. Below is an example illustrating this approach: +5. As the number of OPs increases, Data-Juicer's dependencies also multiply. To prevent Data-Juicer from becoming excessively burdened with dependencies, we've implemented a strategy that incorporates lazy importing and on-demand installation of additional dependencies required by OPs. `LazyLoader` will check if the packages corresponding to the module being loaded are installed, and if not, it will dynamically install them automatically. `AUTOINSTALL` is used for installing additional patches. 
Below is an example illustrating this approach: ```python # ... (import some library) -from ..base_op import AUTOINSTALL -from data_juicer.utils.lazy_loader import LazyLoader +from data_juicer.utils.lazy_loader import LazyLoader, AUTOINSTALL # lazy import kenlm = LazyLoader('kenlm', 'kenlm') @@ -393,7 +392,7 @@ class PerplexityFilter(Filter): **kwargs): # auto install before init super().__init__(*args, **kwargs) - AUTOINSTALL.check(['sentencepiece', 'kenlm']) + AUTOINSTALL.check(['fasttext-wheel']) # ... (some codes) def process(self, sample): diff --git a/docs/DeveloperGuide_ZH.md b/docs/DeveloperGuide_ZH.md index ec87d180c..52b2c8533 100644 --- a/docs/DeveloperGuide_ZH.md +++ b/docs/DeveloperGuide_ZH.md @@ -352,12 +352,11 @@ else: ... ``` -5. 随着算子数量的增加,Data-Juicer的依赖也不断增多。为了防止Data-Juicer的依赖越来越重,我们为算子额外增加的依赖提供了一套lazy import加上使用时安装依赖的策略。如下样例: +5. 随着算子数量的增加,Data-Juicer的依赖也不断增多。为了防止Data-Juicer的依赖越来越重,我们为算子额外增加的依赖提供了一套延迟加载加上使用时安装依赖的策略。`LazyLoader`会检查加载的module对应的package有没有都安装,没有的话会动态自动安装。`AUTOINSTALL`用于安装额外的补丁。如下样例: ```python # ... (import some library) -from ..base_op import AUTOINSTALL -from data_juicer.utils.lazy_loader import LazyLoader +from data_juicer.utils.lazy_loader import LazyLoader, AUTOINSTALL # lazy import kenlm = LazyLoader('kenlm', 'kenlm') @@ -370,7 +369,7 @@ class PerplexityFilter(Filter): **kwargs): # auto install before init super().__init__(*args, **kwargs) - AUTOINSTALL.check(['sentencepiece', 'kenlm']) + AUTOINSTALL.check(['fasttext-wheel']) # ... (some codes) def process(self, sample): diff --git a/environments/minimal_requires.txt b/environments/minimal_requires.txt index 7e614159b..80fd1769a 100644 --- a/environments/minimal_requires.txt +++ b/environments/minimal_requires.txt @@ -26,3 +26,4 @@ multiprocess==0.70.12 dill==0.3.4 psutil pydantic>=2.0 +Pillow diff --git a/environments/science_requires.txt b/environments/science_requires.txt index e51e94f02..f0b4ddc0f 100644 --- a/environments/science_requires.txt +++ b/environments/science_requires.txt @@ -1,6 +1,6 @@ torch>=1.11.0 torchaudio -easyocr +easyocr==1.7.1 fasttext fasttext-wheel kenlm @@ -26,3 +26,4 @@ scenedetect[opencv] ffmpeg-python opencv-python vllm +ram@git+https://github.com/xinyu1205/recognize-anything.git diff --git a/tests/ops/mapper/test_extract_qa_mapper.py b/tests/ops/mapper/test_extract_qa_mapper.py index 648996a9f..2e1b59a78 100644 --- a/tests/ops/mapper/test_extract_qa_mapper.py +++ b/tests/ops/mapper/test_extract_qa_mapper.py @@ -13,6 +13,7 @@ class ExtractQAMapperTest(DataJuicerTestCaseBase): def _run_extract_qa(self, samples, enable_vllm=False, sampling_params={}, **kwargs): op = ExtractQAMapper( hf_model='alibaba-pai/pai-qwen1_5-7b-doc2qa', + trust_remote_code=True, qa_format='chatml', enable_vllm=enable_vllm, sampling_params=sampling_params, diff --git a/tests/ops/mapper/test_generate_instruction_mapper.py b/tests/ops/mapper/test_generate_instruction_mapper.py index 0bd7a1099..43bd31262 100644 --- a/tests/ops/mapper/test_generate_instruction_mapper.py +++ b/tests/ops/mapper/test_generate_instruction_mapper.py @@ -16,6 +16,7 @@ def _run_generate_instruction(self, enable_vllm=False): hf_model='Qwen/Qwen-7B-Chat', seed_file='demos/data/demo-dataset-chatml.jsonl', instruct_num=2, + trust_remote_code=True, enable_vllm=enable_vllm ) diff --git a/tests/ops/mapper/test_image_blur_mapper.py b/tests/ops/mapper/test_image_blur_mapper.py index 98046c867..00f93759a 100644 --- a/tests/ops/mapper/test_image_blur_mapper.py +++ b/tests/ops/mapper/test_image_blur_mapper.py @@ -1,14 +1,14 @@ import os 
import unittest - import numpy as np -from data_juicer.core.data import NestedDataset as Dataset +from PIL import ImageFilter + +from data_juicer.core.data import NestedDataset as Dataset from data_juicer.ops.mapper.image_blur_mapper import ImageBlurMapper from data_juicer.utils.mm_utils import load_image from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase - class ImageBlurMapperTest(DataJuicerTestCaseBase): data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', @@ -18,7 +18,6 @@ class ImageBlurMapperTest(DataJuicerTestCaseBase): img3_path = os.path.join(data_path, 'img3.jpg') def _get_blur_kernel(self, blur_type='gaussian', radius=2): - from PIL import ImageFilter if blur_type == 'mean': return ImageFilter.BLUR elif blur_type == 'box': diff --git a/tests/ops/mapper/test_video_resize_resolution_mapper.py b/tests/ops/mapper/test_video_resize_resolution_mapper.py index 9bedffb61..68c0495f7 100644 --- a/tests/ops/mapper/test_video_resize_resolution_mapper.py +++ b/tests/ops/mapper/test_video_resize_resolution_mapper.py @@ -1,13 +1,15 @@ import os import unittest -import ffmpeg from data_juicer.core.data import NestedDataset as Dataset from data_juicer.ops.mapper.video_resize_resolution_mapper import \ VideoResizeResolutionMapper from data_juicer.utils.constant import Fields from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase +from data_juicer.utils.lazy_loader import LazyLoader + +ffmpeg = LazyLoader('ffmpeg', 'ffmpeg') class VideoResizeResolutionMapperTest(DataJuicerTestCaseBase):