Skip to content

Commit

Permalink
autoinstall in lazyloader
Browse files Browse the repository at this point in the history
  • Loading branch information
BeachWang committed Oct 17, 2024
1 parent 76d2d72 commit fcf8748
Show file tree
Hide file tree
Showing 79 changed files with 491 additions and 529 deletions.
19 changes: 11 additions & 8 deletions data_juicer/analysis/collector.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from itertools import chain

import torch
from torch.distributions import Categorical
from transformers import AutoTokenizer

from data_juicer.format import load_formatter
from data_juicer.utils.lazy_loader import LazyLoader

torch = LazyLoader('torch', 'torch')
transformers = LazyLoader('transformers', 'transformers')


class TextTokenDistCollector(object):
Expand All @@ -18,11 +18,14 @@ def __init__(self, tokenizer):
:param tokenizer: tokenizer name on huggingface
"""
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer,
trust_remote_code=True)
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
tokenizer, trust_remote_code=True)
self.vocab_size = len(self.tokenizer)

def collect(self, data_path, text_key, num_proc=1) -> 'Categorical':
def collect(self,
data_path,
text_key,
num_proc=1) -> 'torch.distributions.Categorical':
"""
Tokenize and collect tokens distribution of input dataset
:param data_path: path to input dataset.
Expand Down Expand Up @@ -63,5 +66,5 @@ def _tokenize_fn(example, ):
list(chain.from_iterable(dataset['input_ids'])))
indices, counts = token_ids.unique(return_counts=True)
token_count.scatter_(0, indices, counts.to(token_count.dtype))
dist = Categorical(token_count)
dist = torch.distributions.Categorical(token_count)
return dist
23 changes: 12 additions & 11 deletions data_juicer/analysis/measure.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import torch
import torch.nn.functional as F
from torch import Tensor
from torch.distributions import Categorical
from data_juicer.utils.lazy_loader import LazyLoader

torch = LazyLoader('torch', 'torch')
td = LazyLoader('td', 'torch.distributions')
F = LazyLoader('F', 'torch.nn.functional')


class Measure(object):
Expand All @@ -22,9 +23,9 @@ def _convert_to_tensor(self, p):
[`scalar`,`list`, `tuple`, `torch binary file`, and `Categorical`].
:return: torch tensor
"""
if isinstance(p, Tensor):
if isinstance(p, torch.Tensor):
return p
elif isinstance(p, Categorical):
elif isinstance(p, td.Categorical):
return p.probs
elif isinstance(p, str):
return torch.load(p)
Expand All @@ -38,14 +39,14 @@ def _convert_to_categorical(self, p):
[`scalar`,`list`, `tuple`, `torch binary file`, and `Categorical`].
:return: torch Categorical
"""
if isinstance(p, Categorical):
if isinstance(p, td.Categorical):
return p
elif isinstance(p, Tensor):
return Categorical(p)
elif isinstance(p, torch.Tensor):
return td.Categorical(p)
elif isinstance(p, str):
return Categorical(torch.load(p))
return td.Categorical(torch.load(p))
else:
return Categorical(torch.tensor(p))
return td.Categorical(torch.tensor(p))


class KLDivMeasure(Measure):
Expand Down
9 changes: 4 additions & 5 deletions data_juicer/core/ray_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,11 @@
from data_juicer import cuda_device_count
from data_juicer.core.data import DJDataset
from data_juicer.ops import Filter, Mapper
from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.process_utils import calculate_np

with AvailabilityChecking(['ray'], requires_type='dist'):
from ray.data import Dataset
rd = LazyLoader('rd', 'ray.data')


def is_valid_path(item, dataset_dir):
Expand Down Expand Up @@ -54,7 +53,7 @@ def set_dataset_to_absolute_path(dataset, dataset_path, cfg):
return dataset


def preprocess_dataset(dataset: Dataset, dataset_path, cfg) -> Dataset:
def preprocess_dataset(dataset: rd.Dataset, dataset_path, cfg) -> rd.Dataset:
if dataset_path:
dataset = set_dataset_to_absolute_path(dataset, dataset_path, cfg)
columns = dataset.columns()
Expand All @@ -81,7 +80,7 @@ def get_num_gpus(op, op_proc):
class RayDataset(DJDataset):

def __init__(self,
dataset: Dataset,
dataset: rd.Dataset,
dataset_path: str = None,
cfg=None) -> None:
self.data = preprocess_dataset(dataset, dataset_path, cfg)
Expand Down
7 changes: 3 additions & 4 deletions data_juicer/core/ray_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,10 @@
from data_juicer.config import init_configs
from data_juicer.core.ray_data import RayDataset
from data_juicer.ops import load_ops
from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.lazy_loader import LazyLoader

with AvailabilityChecking(['ray'], requires_type='dist'):
import ray
import ray.data as rd
ray = LazyLoader('ray', 'ray')
rd = LazyLoader('rd', 'ray.data')


class RayExecutor:
Expand Down
5 changes: 4 additions & 1 deletion data_juicer/format/empty_formatter.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
from typing import List

import pandas as pd
import ray
from datasets import Dataset, Features, Value

from data_juicer.utils.lazy_loader import LazyLoader

from .formatter import FORMATTERS, BaseFormatter

ray = LazyLoader('ray', 'ray')


@FORMATTERS.register_module()
class EmptyFormatter(BaseFormatter):
Expand Down
6 changes: 0 additions & 6 deletions data_juicer/ops/base_op.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,18 @@
import copy
import os
import traceback
from functools import wraps

import pyarrow as pa
from loguru import logger

from data_juicer import is_cuda_available
from data_juicer.utils.auto_install_utils import AutoInstaller
from data_juicer.utils.constant import Fields
from data_juicer.utils.mm_utils import size_to_bytes
from data_juicer.utils.process_utils import calculate_np
from data_juicer.utils.registry import Registry

OPERATORS = Registry('Operators')
UNFORKABLE = Registry('Unforkable')
current_path = os.path.dirname(os.path.realpath(__file__))
version_file_path = os.path.join(current_path,
'../../environments/science_requires.txt')
AUTOINSTALL = AutoInstaller([version_file_path])


def convert_list_dict_to_dict_list(samples):
Expand Down
7 changes: 3 additions & 4 deletions data_juicer/ops/deduplicator/document_minhash_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.model_utils import prepare_sentencepiece_model

from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator
from ..base_op import OPERATORS, Deduplicator
from ..common.helper_func import UnionFind, split_on_whitespace

OP_NAME = 'document_minhash_deduplicator'

integrate = LazyLoader('integrate', 'scipy.integrate')

OP_NAME = 'document_minhash_deduplicator'

MERSENNE_PRIME = np.uint64((1 << 61) - 1)
MAX_HASH = np.uint64((1 << 32) - 1)

Expand Down Expand Up @@ -151,7 +151,6 @@ def __init__(
sentencepiece tokenization.
"""
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['scipy'])
# about minhash computation
self.tokenization = tokenization
self.window_size = window_size
Expand Down
7 changes: 3 additions & 4 deletions data_juicer/ops/deduplicator/document_simhash_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@
from data_juicer.utils.constant import HashKeys
from data_juicer.utils.lazy_loader import LazyLoader

from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator
from ..base_op import OPERATORS, Deduplicator
from ..common.helper_func import split_on_whitespace

OP_NAME = 'document_simhash_deduplicator'

simhash = LazyLoader('simhash', 'simhash')

OP_NAME = 'document_simhash_deduplicator'


@OPERATORS.register_module(OP_NAME)
class DocumentSimhashDeduplicator(Deduplicator):
Expand Down Expand Up @@ -56,7 +56,6 @@ def __init__(self,
"""
# about simhash computation
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['simhash-pybind'])
self.tokenization = tokenization
self.window_size = window_size
self.lowercase = lowercase
Expand Down
15 changes: 9 additions & 6 deletions data_juicer/ops/deduplicator/image_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,25 @@
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import load_data_with_context, load_image

from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator
from ..base_op import OPERATORS, Deduplicator
from ..op_fusion import LOADED_IMAGES
from .document_deduplicator import DocumentDeduplicator

OP_NAME = 'image_deduplicator'
imgdedup_methods = LazyLoader('imgdedup_methods', 'imagededup.methods')

imagededup = LazyLoader('imagededup', 'imagededup')
OP_NAME = 'image_deduplicator'

HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'}


def get_hash_method(method_name):
from imagededup.methods import AHash, DHash, PHash, WHash

mapping = {'phash': PHash, 'dhash': DHash, 'whash': WHash, 'ahash': AHash}
mapping = {
'phash': imgdedup_methods.PHash,
'dhash': imgdedup_methods.DHash,
'whash': imgdedup_methods.WHash,
'ahash': imgdedup_methods.AHash
}

return mapping[method_name]

Expand Down Expand Up @@ -49,7 +53,6 @@ def __init__(self,
:param kwargs: extra args
"""
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['imagededup'])
if method not in HASH_METHOD:
raise ValueError(f'Keep strategy [{method}] is not supported. '
f'Can only be one of {HASH_METHOD}.')
Expand Down
12 changes: 2 additions & 10 deletions data_juicer/ops/deduplicator/ray_basic_deduplicator.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,11 @@
from typing import Any

from pydantic import PositiveInt

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import HashKeys
from data_juicer.utils.lazy_loader import LazyLoader

from ..base_op import Filter

with AvailabilityChecking(['redis'],
op_name='ray_xxx_deduplicator',
requires_type='dist'):
try:
import redis
except Exception:
redis = Any
redis = LazyLoader('redis', 'redis')


class RayBasicDeduplicator(Filter):
Expand Down
15 changes: 9 additions & 6 deletions data_juicer/ops/deduplicator/ray_image_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,25 @@
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import load_data_with_context, load_image

from ..base_op import AUTOINSTALL, OPERATORS
from ..base_op import OPERATORS
from ..op_fusion import LOADED_IMAGES
from .ray_basic_deduplicator import RayBasicDeduplicator

OP_NAME = 'ray_image_deduplicator'
imgdedup_methods = LazyLoader('imgdedup_methods', 'imagededup.methods')

imagededup = LazyLoader('imagededup', 'imagededup')
OP_NAME = 'ray_image_deduplicator'

HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'}


def get_hash_method(method_name):
from imagededup.methods import AHash, DHash, PHash, WHash

mapping = {'phash': PHash, 'dhash': DHash, 'whash': WHash, 'ahash': AHash}
mapping = {
'phash': imgdedup_methods.PHash,
'dhash': imgdedup_methods.DHash,
'whash': imgdedup_methods.WHash,
'ahash': imgdedup_methods.AHash
}

return mapping[method_name]

Expand Down Expand Up @@ -48,7 +52,6 @@ def __init__(self,
redis_port=redis_port,
*args,
**kwargs)
AUTOINSTALL.check(['imagededup'])
if method not in HASH_METHOD:
raise ValueError(f'Keep strategy [{method}] is not supported. '
f'Can only be one of {HASH_METHOD}.')
Expand Down
3 changes: 1 addition & 2 deletions data_juicer/ops/filter/alphanumeric_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..base_op import OPERATORS, Filter
from ..common import get_words_from_document

OP_NAME = 'alphanumeric_filter'
Expand Down Expand Up @@ -39,7 +39,6 @@ def __init__(self,
:param kwargs: extra args
"""
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['transformers'])
self.tokenization = tokenization
self.min_ratio = min_ratio
self.max_ratio = max_ratio
Expand Down
3 changes: 1 addition & 2 deletions data_juicer/ops/filter/flagged_words_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from data_juicer.utils.model_utils import get_model, prepare_model

from ...utils.asset_utils import ASSET_DIR, load_words_asset
from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..base_op import OPERATORS, Filter
from ..common import (SPECIAL_CHARACTERS, get_words_from_document,
words_refinement)
from ..op_fusion import INTER_WORDS
Expand Down Expand Up @@ -54,7 +54,6 @@ def __init__(self,
:param kwargs: extra args
"""
super().__init__(*args, **kwargs)
AUTOINSTALL.check(['sentencepiece'])
self.lang = lang
self.max_ratio = max_ratio
self.use_words_aug = use_words_aug
Expand Down
9 changes: 3 additions & 6 deletions data_juicer/ops/filter/image_aesthetics_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,13 @@
from data_juicer.utils.mm_utils import load_data_with_context, load_image

from ...utils.model_utils import get_model, prepare_model
from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..base_op import OPERATORS, Filter
from ..op_fusion import LOADED_IMAGES

OP_NAME = 'image_aesthetics_filter'
CHECK_PKGs = ['torch', 'transformers', 'simple-aesthetics-predictor']

torch = LazyLoader('torch', 'torch')

OP_NAME = 'image_aesthetics_filter'


@OPERATORS.register_module(OP_NAME)
@LOADED_IMAGES.register_module(OP_NAME)
Expand Down Expand Up @@ -49,8 +48,6 @@ def __init__(self,
"""

super().__init__(*args, **kwargs)
AUTOINSTALL.check(
['torch', 'transformers', 'simple-aesthetics-predictor'])
if hf_scorer_model == '':
hf_scorer_model = \
'shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE'
Expand Down
Loading

0 comments on commit fcf8748

Please sign in to comment.