Skip to content

Commit

Permalink
lazy_loader to LazyLoader
Browse files: browse the repository at this point in the history
  • Loading branch information
BeachWang committed Oct 10, 2024
1 parent 5e6a340 commit 0fe4c2d
Show file tree
Hide file tree
Showing 39 changed files with 138 additions and 98 deletions.
4 changes: 2 additions & 2 deletions data_juicer/ops/deduplicator/document_minhash_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from collections import defaultdict
from typing import Optional

import lazy_loader as lazy
import numpy as np
import regex
from loguru import logger
Expand All @@ -16,14 +15,15 @@
from typing_extensions import Annotated

from data_juicer.utils.constant import HashKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.model_utils import prepare_sentencepiece_model

from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator
from ..common.helper_func import UnionFind, split_on_whitespace

OP_NAME = 'document_minhash_deduplicator'

integrate = lazy.load('scipy.integrate')
integrate = LazyLoader('integrate', 'scipy.integrate')

MERSENNE_PRIME = np.uint64((1 << 61) - 1)
MAX_HASH = np.uint64((1 << 32) - 1)
Expand Down
4 changes: 2 additions & 2 deletions data_juicer/ops/deduplicator/document_simhash_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,20 @@
from collections import defaultdict, deque
from typing import Dict, Optional, Set

import lazy_loader as lazy
import numpy as np
import regex
from loguru import logger
from pydantic import PositiveInt

from data_juicer.utils.constant import HashKeys
from data_juicer.utils.lazy_loader import LazyLoader

from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator
from ..common.helper_func import split_on_whitespace

OP_NAME = 'document_simhash_deduplicator'

simhash = lazy.load('simhash')
simhash = LazyLoader('simhash', 'simhash')


@OPERATORS.register_module(OP_NAME)
Expand Down
4 changes: 2 additions & 2 deletions data_juicer/ops/deduplicator/image_deduplicator.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from collections import defaultdict
from typing import Dict, Set, Tuple

import lazy_loader as lazy
import numpy as np

from data_juicer.utils.constant import HashKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import load_data_with_context, load_image

from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator
Expand All @@ -13,7 +13,7 @@

OP_NAME = 'image_deduplicator'

imagededup = lazy.load('imagededup')
imagededup = LazyLoader('imagededup', 'imagededup')

HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'}

Expand Down
4 changes: 2 additions & 2 deletions data_juicer/ops/deduplicator/ray_image_deduplicator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import lazy_loader as lazy
import numpy as np
from pydantic import PositiveInt

from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import load_data_with_context, load_image

from ..base_op import AUTOINSTALL, OPERATORS
Expand All @@ -10,7 +10,7 @@

OP_NAME = 'ray_image_deduplicator'

imagededup = lazy.load('imagededup')
imagededup = LazyLoader('imagededup', 'imagededup')

HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'}

Expand Down
4 changes: 2 additions & 2 deletions data_juicer/ops/filter/image_aesthetics_filter.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import lazy_loader as lazy
import numpy as np
from loguru import logger

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import load_data_with_context, load_image

from ...utils.model_utils import get_model, prepare_model
Expand All @@ -12,7 +12,7 @@
OP_NAME = 'image_aesthetics_filter'
CHECK_PKGs = ['torch', 'transformers', 'simple-aesthetics-predictor']

torch = lazy.load('torch')
torch = LazyLoader('torch', 'torch')


@OPERATORS.register_module(OP_NAME)
Expand Down
4 changes: 2 additions & 2 deletions data_juicer/ops/filter/image_face_ratio_filter.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import os

import lazy_loader as lazy
import numpy as np
from loguru import logger

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import (detect_faces, load_data_with_context,
load_image)
from data_juicer.utils.model_utils import get_model, prepare_model
Expand All @@ -14,7 +14,7 @@

OP_NAME = 'image_face_ratio_filter'

cv2 = lazy.load('cv2')
cv2 = LazyLoader('cv2', 'cv2')


@UNFORKABLE.register_module(OP_NAME)
Expand Down
6 changes: 3 additions & 3 deletions data_juicer/ops/filter/image_nsfw_filter.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import lazy_loader as lazy
import numpy as np

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import load_data_with_context, load_image
from data_juicer.utils.model_utils import get_model, prepare_model

Expand All @@ -10,8 +10,8 @@

OP_NAME = 'image_nsfw_filter'

torch = lazy.load('torch')
transformers = lazy.load('transformers')
torch = LazyLoader('torch', 'torch')
transformers = LazyLoader('transformers', 'transformers')


@OPERATORS.register_module(OP_NAME)
Expand Down
6 changes: 3 additions & 3 deletions data_juicer/ops/filter/image_text_matching_filter.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import lazy_loader as lazy
import numpy as np
from PIL import ImageOps

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import (SpecialTokens, load_data_with_context,
load_image, remove_special_tokens)
from data_juicer.utils.model_utils import get_model, prepare_model
Expand All @@ -12,8 +12,8 @@

OP_NAME = 'image_text_matching_filter'

torch = lazy.load('torch')
transformers = lazy.load('transformers')
torch = LazyLoader('torch', 'torch')
transformers = LazyLoader('transformers', 'transformers')


@OPERATORS.register_module(OP_NAME)
Expand Down
6 changes: 3 additions & 3 deletions data_juicer/ops/filter/image_text_similarity_filter.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import lazy_loader as lazy
import numpy as np
from PIL import ImageOps

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import (SpecialTokens, load_data_with_context,
load_image, remove_special_tokens)
from data_juicer.utils.model_utils import get_model, prepare_model
Expand All @@ -12,8 +12,8 @@

OP_NAME = 'image_text_similarity_filter'

torch = lazy.load('torch')
transformers = lazy.load('transformers')
torch = LazyLoader('torch', 'torch')
transformers = LazyLoader('transformers', 'transformers')


@OPERATORS.register_module(OP_NAME)
Expand Down
6 changes: 3 additions & 3 deletions data_juicer/ops/filter/image_watermark_filter.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import lazy_loader as lazy
import numpy as np

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import load_data_with_context, load_image
from data_juicer.utils.model_utils import get_model, prepare_model

Expand All @@ -10,8 +10,8 @@

OP_NAME = 'image_watermark_filter'

torch = lazy.load('torch')
transformers = lazy.load('transformers')
torch = LazyLoader('torch', 'torch')
transformers = LazyLoader('transformers', 'transformers')


@OPERATORS.register_module(OP_NAME)
Expand Down
4 changes: 2 additions & 2 deletions data_juicer/ops/filter/language_id_score_filter.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
from typing import List, Union

import lazy_loader as lazy
from loguru import logger

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import AUTOINSTALL, OPERATORS, Filter

OP_NAME = 'language_id_score_filter'

fasttext = lazy.load('fasttext')
fasttext = LazyLoader('fasttext', 'fasttext')


@OPERATORS.register_module(OP_NAME)
Expand Down
7 changes: 3 additions & 4 deletions data_juicer/ops/filter/perplexity_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@
# https://huggingface.co/spaces/huggingface/text-data-filtering
# --------------------------------------------------------

import lazy_loader as lazy

from data_juicer.utils.constant import Fields, InterVars, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import AUTOINSTALL, OPERATORS, Filter
Expand All @@ -13,8 +12,8 @@

OP_NAME = 'perplexity_filter'

kenlm = lazy.load('kenlm')
sentencepiece = lazy.load('sentencepiece')
kenlm = LazyLoader('kenlm', 'kenlm')
sentencepiece = LazyLoader('sentencepiece', 'sentencepiece')


@OPERATORS.register_module(OP_NAME)
Expand Down
8 changes: 4 additions & 4 deletions data_juicer/ops/filter/phrase_grounding_recall_filter.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from typing import List

import lazy_loader as lazy
import numpy as np
from loguru import logger
from PIL import ImageOps

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import (SpecialTokens, iou,
load_data_with_context, load_image,
remove_special_tokens)
Expand All @@ -16,9 +16,9 @@

OP_NAME = 'phrase_grounding_recall_filter'

torch = lazy.load('torch')
transformers = lazy.load('transformers')
nltk = lazy.load('nltk')
torch = LazyLoader('torch', 'torch')
transformers = LazyLoader('transformers', 'transformers')
nltk = LazyLoader('nltk', 'nltk')


# NER algorithm adapted from GLIP starts
Expand Down
4 changes: 2 additions & 2 deletions data_juicer/ops/filter/stopwords_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@

from typing import List

import lazy_loader as lazy
from pydantic import PositiveInt

from data_juicer.utils.asset_utils import ASSET_DIR, load_words_asset
from data_juicer.utils.constant import Fields, InterVars, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import AUTOINSTALL, OPERATORS, Filter
Expand All @@ -18,7 +18,7 @@

OP_NAME = 'stopwords_filter'

sentencepiece = lazy.load('sentencepiece')
sentencepiece = LazyLoader('sentencepiece', 'sentencepiece')


@OPERATORS.register_module(OP_NAME)
Expand Down
5 changes: 2 additions & 3 deletions data_juicer/ops/filter/token_num_filter.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
import sys

import lazy_loader as lazy

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..common import get_words_from_document

OP_NAME = 'token_num_filter'

transformers = lazy.load('transformers')
transformers = LazyLoader('transformers', 'transformers')


@OPERATORS.register_module(OP_NAME)
Expand Down
4 changes: 2 additions & 2 deletions data_juicer/ops/filter/video_aesthetics_filter.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import lazy_loader as lazy
import numpy as np
from loguru import logger
from pydantic import PositiveInt

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import (close_video, extract_key_frames,
extract_video_frames_uniformly,
load_data_with_context, load_video)
Expand All @@ -14,7 +14,7 @@

OP_NAME = 'video_aesthetics_filter'

torch = lazy.load('torch')
torch = LazyLoader('torch', 'torch')


@OPERATORS.register_module(OP_NAME)
Expand Down
6 changes: 3 additions & 3 deletions data_juicer/ops/filter/video_frames_text_similarity_filter.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import lazy_loader as lazy
import numpy as np
from PIL import ImageOps
from pydantic import PositiveInt

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import (SpecialTokens, close_video,
extract_key_frames,
extract_video_frames_uniformly,
Expand All @@ -16,8 +16,8 @@

OP_NAME = 'video_frames_text_similarity_filter'

torch = lazy.load('torch')
transformers = lazy.load('transformers')
torch = LazyLoader('torch', 'torch')
transformers = LazyLoader('transformers', 'transformers')


@OPERATORS.register_module(OP_NAME)
Expand Down
4 changes: 2 additions & 2 deletions data_juicer/ops/filter/video_motion_score_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,17 @@
from contextlib import contextmanager
from typing import Optional, Tuple, Union

import lazy_loader as lazy
import numpy as np
from pydantic import PositiveFloat, PositiveInt

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader

from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Filter

OP_NAME = 'video_motion_score_filter'

cv2 = lazy.load('cv2')
cv2 = LazyLoader('cv2', 'cv2')


@contextmanager
Expand Down
6 changes: 3 additions & 3 deletions data_juicer/ops/filter/video_nsfw_filter.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import lazy_loader as lazy
import numpy as np
from pydantic import PositiveInt

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import (close_video, extract_key_frames,
extract_video_frames_uniformly,
load_data_with_context, load_video)
Expand All @@ -13,8 +13,8 @@

OP_NAME = 'video_nsfw_filter'

torch = lazy.load('torch')
transformers = lazy.load('transformers')
torch = LazyLoader('torch', 'torch')
transformers = LazyLoader('transformers', 'transformers')


@OPERATORS.register_module(OP_NAME)
Expand Down
Loading

0 comments on commit 0fe4c2d

Please sign in to comment.