From 0fe4c2d243ba183473d12203ac9b705e8ca681ce Mon Sep 17 00:00:00 2001 From: Haibin Date: Thu, 10 Oct 2024 14:34:25 +0800 Subject: [PATCH] lazy_loader to LazyLoader --- .../document_minhash_deduplicator.py | 4 +- .../document_simhash_deduplicator.py | 4 +- .../ops/deduplicator/image_deduplicator.py | 4 +- .../deduplicator/ray_image_deduplicator.py | 4 +- .../ops/filter/image_aesthetics_filter.py | 4 +- .../ops/filter/image_face_ratio_filter.py | 4 +- data_juicer/ops/filter/image_nsfw_filter.py | 6 +-- .../ops/filter/image_text_matching_filter.py | 6 +-- .../filter/image_text_similarity_filter.py | 6 +-- .../ops/filter/image_watermark_filter.py | 6 +-- .../ops/filter/language_id_score_filter.py | 4 +- data_juicer/ops/filter/perplexity_filter.py | 7 ++- .../filter/phrase_grounding_recall_filter.py | 8 ++-- data_juicer/ops/filter/stopwords_filter.py | 4 +- data_juicer/ops/filter/token_num_filter.py | 5 +-- .../ops/filter/video_aesthetics_filter.py | 4 +- .../video_frames_text_similarity_filter.py | 6 +-- .../ops/filter/video_motion_score_filter.py | 4 +- data_juicer/ops/filter/video_nsfw_filter.py | 6 +-- .../ops/filter/video_ocr_area_ratio_filter.py | 4 +- .../ops/filter/video_watermark_filter.py | 4 +- .../ops/mapper/audio_ffmpeg_wrapped_mapper.py | 5 +-- .../ops/mapper/chinese_convert_mapper.py | 4 +- data_juicer/ops/mapper/clean_html_mapper.py | 4 +- data_juicer/ops/mapper/fix_unicode_mapper.py | 4 +- .../ops/mapper/image_face_blur_mapper.py | 6 +-- data_juicer/ops/mapper/nlpaug_en_mapper.py | 11 ++--- data_juicer/ops/mapper/nlpcda_zh_mapper.py | 4 +- .../ops/mapper/video_face_blur_mapper.py | 6 +-- .../ops/mapper/video_ffmpeg_wrapped_mapper.py | 5 +-- .../mapper/video_remove_watermark_mapper.py | 4 +- .../video_resize_aspect_ratio_mapper.py | 5 +-- .../mapper/video_resize_resolution_mapper.py | 4 +- .../ops/mapper/video_split_by_scene_mapper.py | 4 +- .../mapper/video_tagging_from_audio_mapper.py | 4 +- .../video_tagging_from_frames_mapper.py | 6 +-- data_juicer/utils/lazy_loader.py | 44 +++++++++++++++++++ docs/DeveloperGuide.md | 6 +-- docs/DeveloperGuide_ZH.md | 6 +-- 39 files changed, 138 insertions(+), 98 deletions(-) create mode 100644 data_juicer/utils/lazy_loader.py diff --git a/data_juicer/ops/deduplicator/document_minhash_deduplicator.py b/data_juicer/ops/deduplicator/document_minhash_deduplicator.py index 54b2edc4f..4d524de7d 100644 --- a/data_juicer/ops/deduplicator/document_minhash_deduplicator.py +++ b/data_juicer/ops/deduplicator/document_minhash_deduplicator.py @@ -7,7 +7,6 @@ from collections import defaultdict from typing import Optional -import lazy_loader as lazy import numpy as np import regex from loguru import logger @@ -16,6 +15,7 @@ from typing_extensions import Annotated from data_juicer.utils.constant import HashKeys +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import prepare_sentencepiece_model from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator @@ -23,7 +23,7 @@ OP_NAME = 'document_minhash_deduplicator' -integrate = lazy.load('scipy.integrate') +integrate = LazyLoader('integrate', 'scipy.integrate') MERSENNE_PRIME = np.uint64((1 << 61) - 1) MAX_HASH = np.uint64((1 << 32) - 1) diff --git a/data_juicer/ops/deduplicator/document_simhash_deduplicator.py b/data_juicer/ops/deduplicator/document_simhash_deduplicator.py index e5f994682..4228fba1f 100644 --- a/data_juicer/ops/deduplicator/document_simhash_deduplicator.py +++ b/data_juicer/ops/deduplicator/document_simhash_deduplicator.py @@ -5,20 +5,20 @@ from collections import defaultdict, deque from typing import Dict, Optional, Set -import lazy_loader as lazy import numpy as np import regex from loguru import logger from pydantic import PositiveInt from data_juicer.utils.constant import HashKeys +from data_juicer.utils.lazy_loader import LazyLoader from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator from ..common.helper_func import split_on_whitespace OP_NAME = 'document_simhash_deduplicator' -simhash = lazy.load('simhash') +simhash = LazyLoader('simhash', 'simhash') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/deduplicator/image_deduplicator.py b/data_juicer/ops/deduplicator/image_deduplicator.py index 5d6e8b3ba..0a217a889 100644 --- a/data_juicer/ops/deduplicator/image_deduplicator.py +++ b/data_juicer/ops/deduplicator/image_deduplicator.py @@ -1,10 +1,10 @@ from collections import defaultdict from typing import Dict, Set, Tuple -import lazy_loader as lazy import numpy as np from data_juicer.utils.constant import HashKeys +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import load_data_with_context, load_image from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator @@ -13,7 +13,7 @@ OP_NAME = 'image_deduplicator' -imagededup = lazy.load('imagededup') +imagededup = LazyLoader('imagededup', 'imagededup') HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'} diff --git a/data_juicer/ops/deduplicator/ray_image_deduplicator.py b/data_juicer/ops/deduplicator/ray_image_deduplicator.py index 8444a1f94..22ebca47f 100644 --- a/data_juicer/ops/deduplicator/ray_image_deduplicator.py +++ b/data_juicer/ops/deduplicator/ray_image_deduplicator.py @@ -1,7 +1,7 @@ -import lazy_loader as lazy import numpy as np from pydantic import PositiveInt +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import load_data_with_context, load_image from ..base_op import AUTOINSTALL, OPERATORS @@ -10,7 +10,7 @@ OP_NAME = 'ray_image_deduplicator' -imagededup = lazy.load('imagededup') +imagededup = LazyLoader('imagededup', 'imagededup') HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'} diff --git a/data_juicer/ops/filter/image_aesthetics_filter.py b/data_juicer/ops/filter/image_aesthetics_filter.py index 8924aee8d..bcb4284ef 100644 --- a/data_juicer/ops/filter/image_aesthetics_filter.py +++ b/data_juicer/ops/filter/image_aesthetics_filter.py @@ -1,8 +1,8 @@ -import lazy_loader as lazy import numpy as np from loguru import logger from data_juicer.utils.constant import Fields, StatsKeys +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import load_data_with_context, load_image from ...utils.model_utils import get_model, prepare_model @@ -12,7 +12,7 @@ OP_NAME = 'image_aesthetics_filter' CHECK_PKGs = ['torch', 'transformers', 'simple-aesthetics-predictor'] -torch = lazy.load('torch') +torch = LazyLoader('torch', 'torch') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/image_face_ratio_filter.py b/data_juicer/ops/filter/image_face_ratio_filter.py index 76071f602..6cc73e32c 100644 --- a/data_juicer/ops/filter/image_face_ratio_filter.py +++ b/data_juicer/ops/filter/image_face_ratio_filter.py @@ -1,10 +1,10 @@ import os -import lazy_loader as lazy import numpy as np from loguru import logger from data_juicer.utils.constant import Fields, StatsKeys +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (detect_faces, load_data_with_context, load_image) from data_juicer.utils.model_utils import get_model, prepare_model @@ -14,7 +14,7 @@ OP_NAME = 'image_face_ratio_filter' -cv2 = lazy.load('cv2') +cv2 = LazyLoader('cv2', 'cv2') @UNFORKABLE.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/image_nsfw_filter.py b/data_juicer/ops/filter/image_nsfw_filter.py index 50ac74a78..357b81a10 100644 --- a/data_juicer/ops/filter/image_nsfw_filter.py +++ b/data_juicer/ops/filter/image_nsfw_filter.py @@ -1,7 +1,7 @@ -import lazy_loader as lazy import numpy as np from data_juicer.utils.constant import Fields, StatsKeys +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import load_data_with_context, load_image from data_juicer.utils.model_utils import get_model, prepare_model @@ -10,8 +10,8 @@ OP_NAME = 'image_nsfw_filter' -torch = lazy.load('torch') -transformers = lazy.load('transformers') +torch = LazyLoader('torch', 'torch') +transformers = LazyLoader('transformers', 'transformers') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/image_text_matching_filter.py b/data_juicer/ops/filter/image_text_matching_filter.py index dda7bd153..c47982eae 100644 --- a/data_juicer/ops/filter/image_text_matching_filter.py +++ b/data_juicer/ops/filter/image_text_matching_filter.py @@ -1,8 +1,8 @@ -import lazy_loader as lazy import numpy as np from PIL import ImageOps from data_juicer.utils.constant import Fields, StatsKeys +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (SpecialTokens, load_data_with_context, load_image, remove_special_tokens) from data_juicer.utils.model_utils import get_model, prepare_model @@ -12,8 +12,8 @@ OP_NAME = 'image_text_matching_filter' -torch = lazy.load('torch') -transformers = lazy.load('transformers') +torch = LazyLoader('torch', 'torch') +transformers = LazyLoader('transformers', 'transformers') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/image_text_similarity_filter.py b/data_juicer/ops/filter/image_text_similarity_filter.py index ca74441ca..d70568ba6 100644 --- a/data_juicer/ops/filter/image_text_similarity_filter.py +++ b/data_juicer/ops/filter/image_text_similarity_filter.py @@ -1,8 +1,8 @@ -import lazy_loader as lazy import numpy as np from PIL import ImageOps from data_juicer.utils.constant import Fields, StatsKeys +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (SpecialTokens, load_data_with_context, load_image, remove_special_tokens) from data_juicer.utils.model_utils import get_model, prepare_model @@ -12,8 +12,8 @@ OP_NAME = 'image_text_similarity_filter' -torch = lazy.load('torch') -transformers = lazy.load('transformers') +torch = LazyLoader('torch', 'torch') +transformers = LazyLoader('transformers', 'transformers') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/image_watermark_filter.py b/data_juicer/ops/filter/image_watermark_filter.py index 4369dcafe..1d34d8ad6 100644 --- a/data_juicer/ops/filter/image_watermark_filter.py +++ b/data_juicer/ops/filter/image_watermark_filter.py @@ -1,7 +1,7 @@ -import lazy_loader as lazy import numpy as np from data_juicer.utils.constant import Fields, StatsKeys +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import load_data_with_context, load_image from data_juicer.utils.model_utils import get_model, prepare_model @@ -10,8 +10,8 @@ OP_NAME = 'image_watermark_filter' -torch = lazy.load('torch') -transformers = lazy.load('transformers') +torch = LazyLoader('torch', 'torch') +transformers = LazyLoader('transformers', 'transformers') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/language_id_score_filter.py b/data_juicer/ops/filter/language_id_score_filter.py index 9da08f6a5..6aa892274 100644 --- a/data_juicer/ops/filter/language_id_score_filter.py +++ b/data_juicer/ops/filter/language_id_score_filter.py @@ -1,16 +1,16 @@ from typing import List, Union -import lazy_loader as lazy from loguru import logger from data_juicer.utils.constant import Fields, StatsKeys +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model from ..base_op import AUTOINSTALL, OPERATORS, Filter OP_NAME = 'language_id_score_filter' -fasttext = lazy.load('fasttext') +fasttext = LazyLoader('fasttext', 'fasttext') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/perplexity_filter.py b/data_juicer/ops/filter/perplexity_filter.py index ab031157b..97fb3fc01 100644 --- a/data_juicer/ops/filter/perplexity_filter.py +++ b/data_juicer/ops/filter/perplexity_filter.py @@ -2,9 +2,8 @@ # https://huggingface.co/spaces/huggingface/text-data-filtering # -------------------------------------------------------- -import lazy_loader as lazy - from data_juicer.utils.constant import Fields, InterVars, StatsKeys +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model from ..base_op import AUTOINSTALL, OPERATORS, Filter @@ -13,8 +12,8 @@ OP_NAME = 'perplexity_filter' -kenlm = lazy.load('kenlm') -sentencepiece = lazy.load('sentencepiece') +kenlm = LazyLoader('kenlm', 'kenlm') +sentencepiece = LazyLoader('sentencepiece', 'sentencepiece') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/phrase_grounding_recall_filter.py b/data_juicer/ops/filter/phrase_grounding_recall_filter.py index 9a9ba65dd..0f00781f0 100644 --- a/data_juicer/ops/filter/phrase_grounding_recall_filter.py +++ b/data_juicer/ops/filter/phrase_grounding_recall_filter.py @@ -1,11 +1,11 @@ from typing import List -import lazy_loader as lazy import numpy as np from loguru import logger from PIL import ImageOps from data_juicer.utils.constant import Fields, StatsKeys +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (SpecialTokens, iou, load_data_with_context, load_image, remove_special_tokens) @@ -16,9 +16,9 @@ OP_NAME = 'phrase_grounding_recall_filter' -torch = lazy.load('torch') -transformers = lazy.load('transformers') -nltk = lazy.load('nltk') +torch = LazyLoader('torch', 'torch') +transformers = LazyLoader('transformers', 'transformers') +nltk = LazyLoader('nltk', 'nltk') # NER algorithm adapted from GLIP starts diff --git a/data_juicer/ops/filter/stopwords_filter.py b/data_juicer/ops/filter/stopwords_filter.py index 1d9f59b7b..fde4c2321 100644 --- a/data_juicer/ops/filter/stopwords_filter.py +++ b/data_juicer/ops/filter/stopwords_filter.py @@ -4,11 +4,11 @@ from typing import List -import lazy_loader as lazy from pydantic import PositiveInt from data_juicer.utils.asset_utils import ASSET_DIR, load_words_asset from data_juicer.utils.constant import Fields, InterVars, StatsKeys +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model from ..base_op import AUTOINSTALL, OPERATORS, Filter @@ -18,7 +18,7 @@ OP_NAME = 'stopwords_filter' -sentencepiece = lazy.load('sentencepiece') +sentencepiece = LazyLoader('sentencepiece', 'sentencepiece') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/token_num_filter.py b/data_juicer/ops/filter/token_num_filter.py index de3349315..3f9a72a40 100644 --- a/data_juicer/ops/filter/token_num_filter.py +++ b/data_juicer/ops/filter/token_num_filter.py @@ -1,8 +1,7 @@ import sys -import lazy_loader as lazy - from data_juicer.utils.constant import Fields, StatsKeys +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model from ..base_op import AUTOINSTALL, OPERATORS, Filter @@ -10,7 +9,7 @@ OP_NAME = 'token_num_filter' -transformers = lazy.load('transformers') +transformers = LazyLoader('transformers', 'transformers') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/video_aesthetics_filter.py b/data_juicer/ops/filter/video_aesthetics_filter.py index 31c242473..ccdcbffce 100644 --- a/data_juicer/ops/filter/video_aesthetics_filter.py +++ b/data_juicer/ops/filter/video_aesthetics_filter.py @@ -1,9 +1,9 @@ -import lazy_loader as lazy import numpy as np from loguru import logger from pydantic import PositiveInt from data_juicer.utils.constant import Fields, StatsKeys +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (close_video, extract_key_frames, extract_video_frames_uniformly, load_data_with_context, load_video) @@ -14,7 +14,7 @@ OP_NAME = 'video_aesthetics_filter' -torch = lazy.load('torch') +torch = LazyLoader('torch', 'torch') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/video_frames_text_similarity_filter.py b/data_juicer/ops/filter/video_frames_text_similarity_filter.py index ddcbff1e7..6b0d1d2c6 100644 --- a/data_juicer/ops/filter/video_frames_text_similarity_filter.py +++ b/data_juicer/ops/filter/video_frames_text_similarity_filter.py @@ -1,9 +1,9 @@ -import lazy_loader as lazy import numpy as np from PIL import ImageOps from pydantic import PositiveInt from data_juicer.utils.constant import Fields, StatsKeys +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (SpecialTokens, close_video, extract_key_frames, extract_video_frames_uniformly, @@ -16,8 +16,8 @@ OP_NAME = 'video_frames_text_similarity_filter' -torch = lazy.load('torch') -transformers = lazy.load('transformers') +torch = LazyLoader('torch', 'torch') +transformers = LazyLoader('transformers', 'transformers') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/video_motion_score_filter.py b/data_juicer/ops/filter/video_motion_score_filter.py index e8e63f052..30bbc48ee 100644 --- a/data_juicer/ops/filter/video_motion_score_filter.py +++ b/data_juicer/ops/filter/video_motion_score_filter.py @@ -2,17 +2,17 @@ from contextlib import contextmanager from typing import Optional, Tuple, Union -import lazy_loader as lazy import numpy as np from pydantic import PositiveFloat, PositiveInt from data_juicer.utils.constant import Fields, StatsKeys +from data_juicer.utils.lazy_loader import LazyLoader from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Filter OP_NAME = 'video_motion_score_filter' -cv2 = lazy.load('cv2') +cv2 = LazyLoader('cv2', 'cv2') @contextmanager diff --git a/data_juicer/ops/filter/video_nsfw_filter.py b/data_juicer/ops/filter/video_nsfw_filter.py index a96151f3e..a4de77aa0 100644 --- a/data_juicer/ops/filter/video_nsfw_filter.py +++ b/data_juicer/ops/filter/video_nsfw_filter.py @@ -1,8 +1,8 @@ -import lazy_loader as lazy import numpy as np from pydantic import PositiveInt from data_juicer.utils.constant import Fields, StatsKeys +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (close_video, extract_key_frames, extract_video_frames_uniformly, load_data_with_context, load_video) @@ -13,8 +13,8 @@ OP_NAME = 'video_nsfw_filter' -torch = lazy.load('torch') -transformers = lazy.load('transformers') +torch = LazyLoader('torch', 'torch') +transformers = LazyLoader('transformers', 'transformers') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/filter/video_ocr_area_ratio_filter.py b/data_juicer/ops/filter/video_ocr_area_ratio_filter.py index a36214fbc..bb069f4c5 100644 --- a/data_juicer/ops/filter/video_ocr_area_ratio_filter.py +++ b/data_juicer/ops/filter/video_ocr_area_ratio_filter.py @@ -1,11 +1,11 @@ from typing import List, Union -import lazy_loader as lazy import numpy as np from pydantic import PositiveInt from data_juicer import cuda_device_count from data_juicer.utils.constant import Fields, StatsKeys +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (close_video, extract_video_frames_uniformly, load_data_with_context, load_video) @@ -15,7 +15,7 @@ OP_NAME = 'video_ocr_area_ratio_filter' -easyocr = lazy.load('easyocr') +easyocr = LazyLoader('easyocr', 'easyocr') def triangle_area(p1, p2, p3): diff --git a/data_juicer/ops/filter/video_watermark_filter.py b/data_juicer/ops/filter/video_watermark_filter.py index c5ddfc8b7..2642bce2d 100644 --- a/data_juicer/ops/filter/video_watermark_filter.py +++ b/data_juicer/ops/filter/video_watermark_filter.py @@ -1,8 +1,8 @@ -import lazy_loader as lazy import numpy as np from pydantic import PositiveInt from data_juicer.utils.constant import Fields, StatsKeys +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (close_video, extract_key_frames, extract_video_frames_uniformly, load_data_with_context, load_video) @@ -13,7 +13,7 @@ OP_NAME = 'video_watermark_filter' -torch = lazy.load('torch') +torch = LazyLoader('torch', 'torch') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py b/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py index b6434c0f4..a640b11da 100644 --- a/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py +++ b/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py @@ -1,9 +1,8 @@ from typing import Dict, List, Optional -import lazy_loader as lazy - from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import transfer_filename +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.logger_utils import HiddenPrints from ..base_op import AUTOINSTALL, OPERATORS, Mapper @@ -11,7 +10,7 @@ OP_NAME = 'audio_ffmpeg_wrapped_mapper' with HiddenPrints(): - ffmpeg = lazy.load('ffmpeg') + ffmpeg = LazyLoader('ffmpeg', 'ffmpeg') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/mapper/chinese_convert_mapper.py b/data_juicer/ops/mapper/chinese_convert_mapper.py index e18fa0afc..e44127c4d 100644 --- a/data_juicer/ops/mapper/chinese_convert_mapper.py +++ b/data_juicer/ops/mapper/chinese_convert_mapper.py @@ -1,10 +1,10 @@ -import lazy_loader as lazy +from data_juicer.utils.lazy_loader import LazyLoader from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'chinese_convert_mapper' -opencc = lazy.load('opencc') +opencc = LazyLoader('opencc', 'opencc') OPENCC_CONVERTER = None diff --git a/data_juicer/ops/mapper/clean_html_mapper.py b/data_juicer/ops/mapper/clean_html_mapper.py index 477c46846..b088394d0 100644 --- a/data_juicer/ops/mapper/clean_html_mapper.py +++ b/data_juicer/ops/mapper/clean_html_mapper.py @@ -2,13 +2,13 @@ # https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/ # -------------------------------------------------------- -import lazy_loader as lazy +from data_juicer.utils.lazy_loader import LazyLoader from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'clean_html_mapper' -selectolax = lazy.load('selectolax') +selectolax = LazyLoader('selectolax', 'selectolax') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/mapper/fix_unicode_mapper.py b/data_juicer/ops/mapper/fix_unicode_mapper.py index e2323c3b9..daa98a47b 100644 --- a/data_juicer/ops/mapper/fix_unicode_mapper.py +++ b/data_juicer/ops/mapper/fix_unicode_mapper.py @@ -1,10 +1,10 @@ -import lazy_loader as lazy +from data_juicer.utils.lazy_loader import LazyLoader from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'fix_unicode_mapper' -ftfy = lazy.load('ftfy') +ftfy = LazyLoader('ftfy', 'ftfy') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/mapper/image_face_blur_mapper.py b/data_juicer/ops/mapper/image_face_blur_mapper.py index e3d37e21b..0afbf7bbb 100644 --- a/data_juicer/ops/mapper/image_face_blur_mapper.py +++ b/data_juicer/ops/mapper/image_face_blur_mapper.py @@ -1,11 +1,11 @@ import os -import lazy_loader as lazy from loguru import logger from pydantic import NonNegativeFloat from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import transfer_filename +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (detect_faces, load_data_with_context, load_image) from data_juicer.utils.model_utils import get_model, prepare_model @@ -15,8 +15,8 @@ OP_NAME = 'image_face_blur_mapper' -cv2 = lazy.load('cv2') -PIL = lazy.load('PIL') +cv2 = LazyLoader('cv2', 'cv2') +PIL = LazyLoader('PIL', 'PIL') @UNFORKABLE.register_module(OP_NAME) diff --git a/data_juicer/ops/mapper/nlpaug_en_mapper.py b/data_juicer/ops/mapper/nlpaug_en_mapper.py index 9a253c9c2..fffb3488b 100644 --- a/data_juicer/ops/mapper/nlpaug_en_mapper.py +++ b/data_juicer/ops/mapper/nlpaug_en_mapper.py @@ -1,17 +1,18 @@ from copy import deepcopy -import lazy_loader as lazy from loguru import logger from pydantic import PositiveInt +from data_juicer.utils.lazy_loader import LazyLoader + from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'nlpaug_en_mapper' -nlpaug = lazy.load('nlpaug') -nac = lazy.load('nlpaug.augmenter.char') -naw = lazy.load('nlpaug.augmenter.word') -naf = lazy.load('nlpaug.flow') +nlpaug = LazyLoader('nlpaug', 'nlpaug') +nac = LazyLoader('nac', 'nlpaug.augmenter.char') +naw = LazyLoader('naw', 'nlpaug.augmenter.word') +naf = LazyLoader('naf', 'nlpaug.flow') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/mapper/nlpcda_zh_mapper.py b/data_juicer/ops/mapper/nlpcda_zh_mapper.py index adc718beb..32f33368b 100644 --- a/data_juicer/ops/mapper/nlpcda_zh_mapper.py +++ b/data_juicer/ops/mapper/nlpcda_zh_mapper.py @@ -1,16 +1,16 @@ from copy import deepcopy -import lazy_loader as lazy from loguru import logger from pydantic import PositiveInt +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.logger_utils import HiddenPrints from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'nlpcda_zh_mapper' -nlpcda = lazy.load('nlpcda') +nlpcda = LazyLoader('nlpcda', 'nlpcda') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/mapper/video_face_blur_mapper.py b/data_juicer/ops/mapper/video_face_blur_mapper.py index f30917536..48fa0ce7b 100644 --- a/data_juicer/ops/mapper/video_face_blur_mapper.py +++ b/data_juicer/ops/mapper/video_face_blur_mapper.py @@ -1,10 +1,10 @@ import os import av -import lazy_loader as lazy from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import transfer_filename +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (close_video, detect_faces, load_data_with_context, load_video, process_each_frame) @@ -15,8 +15,8 @@ OP_NAME = 'video_face_blur_mapper' -cv2 = lazy.load('cv2') -PIL = lazy.load('PIL') +cv2 = LazyLoader('cv2', 'cv2') +PIL = LazyLoader('PIL', 'PIL') @UNFORKABLE.register_module(OP_NAME) diff --git a/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py b/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py index c711a6ae8..4a3cf0053 100644 --- a/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py +++ b/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py @@ -1,9 +1,8 @@ from typing import Dict, List, Optional -import lazy_loader as lazy - from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import transfer_filename +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.logger_utils import HiddenPrints from ..base_op import AUTOINSTALL, OPERATORS, Mapper @@ -11,7 +10,7 @@ OP_NAME = 'video_ffmpeg_wrapped_mapper' with HiddenPrints(): - ffmpeg = lazy.load('ffmpeg') + ffmpeg = LazyLoader('ffmpeg', 'ffmpeg') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/mapper/video_remove_watermark_mapper.py b/data_juicer/ops/mapper/video_remove_watermark_mapper.py index 2c3166e8b..054a4f9f0 100644 --- a/data_juicer/ops/mapper/video_remove_watermark_mapper.py +++ b/data_juicer/ops/mapper/video_remove_watermark_mapper.py @@ -2,12 +2,12 @@ from typing import List, Optional import av -import lazy_loader as lazy import numpy as np from pydantic import PositiveInt from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import transfer_filename +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.logger_utils import HiddenPrints from data_juicer.utils.mm_utils import (close_video, extract_video_frames_uniformly, @@ -21,7 +21,7 @@ OP_NAME = 'video_remove_watermark_mapper' with HiddenPrints(): - cv2 = lazy.load('cv2') + cv2 = LazyLoader('cv2', 'cv2') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py b/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py index 99192c9c1..ac49e29e7 100644 --- a/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py +++ b/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py @@ -2,10 +2,9 @@ import os from fractions import Fraction -import lazy_loader as lazy - from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import transfer_filename +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.logger_utils import HiddenPrints from data_juicer.utils.mm_utils import close_video, load_video @@ -14,7 +13,7 @@ OP_NAME = 'video_resize_aspect_ratio_mapper' with HiddenPrints(): - ffmpeg = lazy.load('ffmpeg') + ffmpeg = LazyLoader('ffmpeg', 'ffmpeg') def rescale(width, height, ori_ratio, min_ratio, max_ratio, strategy): diff --git a/data_juicer/ops/mapper/video_resize_resolution_mapper.py b/data_juicer/ops/mapper/video_resize_resolution_mapper.py index 574dd04d6..961c755c1 100644 --- a/data_juicer/ops/mapper/video_resize_resolution_mapper.py +++ b/data_juicer/ops/mapper/video_resize_resolution_mapper.py @@ -2,11 +2,11 @@ import os import sys -import lazy_loader as lazy from pydantic import PositiveInt from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import transfer_filename +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.logger_utils import HiddenPrints from data_juicer.utils.mm_utils import close_video, load_video @@ -16,7 +16,7 @@ OP_NAME = 'video_resize_resolution_mapper' with HiddenPrints(): - ffmpeg = lazy.load('ffmpeg') + ffmpeg = LazyLoader('ffmpeg', 'ffmpeg') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/mapper/video_split_by_scene_mapper.py b/data_juicer/ops/mapper/video_split_by_scene_mapper.py index 7ce921e09..6c042d254 100644 --- a/data_juicer/ops/mapper/video_split_by_scene_mapper.py +++ b/data_juicer/ops/mapper/video_split_by_scene_mapper.py @@ -2,19 +2,19 @@ import re from itertools import chain -import lazy_loader as lazy from pydantic import NonNegativeFloat, NonNegativeInt from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import (add_suffix_to_filename, transfer_filename) +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import SpecialTokens from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'video_split_by_scene_mapper' -scenedetect = lazy.load('scenedetect') +scenedetect = LazyLoader('scenedetect', 'scenedetect') def replace_func(match, scene_counts_iter): diff --git a/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py b/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py index 164fc46bd..5dcf3b71a 100644 --- a/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py +++ b/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py @@ -1,8 +1,8 @@ -import lazy_loader as lazy import librosa import numpy as np from data_juicer.utils.constant import Fields +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import extract_audio_from_video from data_juicer.utils.model_utils import get_model, prepare_model @@ -10,7 +10,7 @@ OP_NAME = 'video_tagging_from_audio_mapper' -torch = lazy.load('torch') +torch = LazyLoader('torch', 'torch') @OPERATORS.register_module(OP_NAME) diff --git a/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py b/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py index 99cb96aa4..a8a70fb82 100644 --- a/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py +++ b/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py @@ -1,10 +1,10 @@ from collections import Counter -import lazy_loader as lazy import numpy as np from pydantic import PositiveInt from data_juicer.utils.constant import Fields +from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.mm_utils import (close_video, extract_key_frames, extract_video_frames_uniformly, load_data_with_context, load_video) @@ -15,8 +15,8 @@ OP_NAME = 'video_tagging_from_frames_mapper' -ram = lazy.load('ram') -torch = lazy.load('torch') +ram = LazyLoader('ram', 'ram') +torch = LazyLoader('torch', 'torch') @UNFORKABLE.register_module(OP_NAME) diff --git a/data_juicer/utils/lazy_loader.py b/data_juicer/utils/lazy_loader.py new file mode 100644 index 000000000..cc8b392ce --- /dev/null +++ b/data_juicer/utils/lazy_loader.py @@ -0,0 +1,44 @@ +# Code copied from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/util/lazy_loader.py # noqa: E501 +"""A LazyLoader class.""" + +import importlib +import inspect +import types + + +class LazyLoader(types.ModuleType): + """ + Lazily import a module, mainly to avoid pulling in large dependencies. + `contrib`, and `ffmpeg` are examples of modules that are large and not + always needed, and this allows them to only be loaded when they are used. + """ + + # The lint error here is incorrect. + def __init__(self, local_name, name): + self._local_name = local_name + # get last frame in the stack + frame = inspect.currentframe().f_back + # get the globals of module who calls LazyLoader + self._parent_module_globals = frame.f_globals + + super(LazyLoader, self).__init__(name) + + def _load(self): + # Import the target module and insert it into the parent's namespace + module = importlib.import_module(self.__name__) + self._parent_module_globals[self._local_name] = module + + # Update this object's dict so that if someone keeps a reference to the + # LazyLoader, lookups are efficient (__getattr__ is only called on + # lookups that fail). + self.__dict__.update(module.__dict__) + + return module + + def __getattr__(self, item): + module = self._load() + return getattr(module, item) + + def __dir__(self): + module = self._load() + return dir(module) diff --git a/docs/DeveloperGuide.md b/docs/DeveloperGuide.md index 42fa5d09e..e786c8378 100644 --- a/docs/DeveloperGuide.md +++ b/docs/DeveloperGuide.md @@ -380,11 +380,11 @@ else: ```python # ... (import some library) from ..base_op import AUTOINSTALL -import lazy_loader as lazy +from data_juicer.utils.lazy_loader import LazyLoader # lazy import -kenlm = lazy.load('kenlm') -sentencepiece = lazy.load('sentencepiece') +kenlm = LazyLoader('kenlm', 'kenlm') +sp = LazyLoader('sp', 'sentencepiece') class PerplexityFilter(Filter): def __init__(self, diff --git a/docs/DeveloperGuide_ZH.md b/docs/DeveloperGuide_ZH.md index 1574287f6..ec87d180c 100644 --- a/docs/DeveloperGuide_ZH.md +++ b/docs/DeveloperGuide_ZH.md @@ -357,11 +357,11 @@ else: ```python # ... (import some library) from ..base_op import AUTOINSTALL -import lazy_loader as lazy +from data_juicer.utils.lazy_loader import LazyLoader # lazy import -kenlm = lazy.load('kenlm') -sentencepiece = lazy.load('sentencepiece') +kenlm = LazyLoader('kenlm', 'kenlm') +sp = LazyLoader('sp', 'sentencepiece') class PerplexityFilter(Filter): def __init__(self,