Skip to content

Commit

Permalink
to be debug
Browse files Browse the repository at this point in the history
  • Loading branch information
BeachWang committed Sep 2, 2024
1 parent 2a6651d commit 97729c9
Show file tree
Hide file tree
Showing 53 changed files with 99 additions and 359 deletions.
6 changes: 5 additions & 1 deletion data_juicer/ops/base_op.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,23 @@
import copy
import os
import traceback
from functools import wraps

import pyarrow as pa
from loguru import logger

from data_juicer import is_cuda_available
from data_juicer.utils.auto_install_utils import AutoInstaller
from data_juicer.utils.constant import Fields
from data_juicer.utils.mm_utils import size_to_bytes
from data_juicer.utils.process_utils import calculate_np
from data_juicer.utils.registry import Registry

OPERATORS = Registry('Operators')
UNFORKABLE = Registry('Unforkable')

current_path = os.path.dirname(os.path.realpath(__file__))
version_file_path = os.path.join(current_path, '../../environments/science_requires.txt')
AUTOINSTALL = AutoInstaller(version_file_path)

def convert_list_dict_to_dict_list(samples):
# reconstruct samples from "list of dicts" to "dict of lists"
Expand Down
8 changes: 3 additions & 5 deletions data_juicer/ops/deduplicator/document_minhash_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
from loguru import logger
from tqdm import tqdm

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import HashKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.model_utils import prepare_sentencepiece_model
Expand All @@ -22,8 +21,7 @@

OP_NAME = 'document_minhash_deduplicator'

with AvailabilityChecking(['scipy'], OP_NAME):
from scipy.integrate import quad as integrate
integrate = LazyLoader('integrate', globals(), 'scipy.integrate')

MERSENNE_PRIME = np.uint64((1 << 61) - 1)
MAX_HASH = np.uint64((1 << 32) - 1)
Expand Down Expand Up @@ -69,7 +67,7 @@ def false_positive_probability(th: float, band: int, rows: int):
def proba(s):
return 1 - (1 - s**float(rows))**float(band)

a, _ = integrate(proba, 0.0, th)
a, _ = integrate.quad(proba, 0.0, th)
return a

def false_negative_probability(th: float, band: int, rows: int):
Expand All @@ -78,7 +76,7 @@ def false_negative_probability(th: float, band: int, rows: int):
def proba(s):
return 1 - (1 - (1 - s**float(rows))**float(band))

a, _ = integrate(proba, th, 1.0)
a, _ = integrate.quad(proba, th, 1.0)
return a

# object: minimize the weighted FP and FN ratio
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from jsonargparse.typing import PositiveInt
from loguru import logger

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import HashKeys
from data_juicer.utils.lazy_loader import LazyLoader

Expand All @@ -19,8 +18,7 @@

OP_NAME = 'document_simhash_deduplicator'

with AvailabilityChecking(['simhash-pybind'], OP_NAME):
import simhash
simhash = LazyLoader('simhash', globals(), 'simhash')


@OPERATORS.register_module(OP_NAME)
Expand Down
24 changes: 11 additions & 13 deletions data_juicer/ops/deduplicator/image_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

import numpy as np

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import HashKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import load_data_with_context, load_image
Expand All @@ -14,22 +13,21 @@

OP_NAME = 'image_deduplicator'

with AvailabilityChecking(['imagededup'], OP_NAME):
import imagededup # noqa: F401
imagededup = LazyLoader('imagededup', globals(), 'imagededup')

HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'}
HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'}

def get_hash_method(method_name):
from imagededup.methods import AHash, DHash, PHash, WHash
def get_hash_method(method_name):
from imagededup.methods import AHash, DHash, PHash, WHash

mapping = {
'phash': PHash,
'dhash': DHash,
'whash': WHash,
'ahash': AHash
}
mapping = {
'phash': PHash,
'dhash': DHash,
'whash': WHash,
'ahash': AHash
}

return mapping[method_name]
return mapping[method_name]


@OPERATORS.register_module(OP_NAME)
Expand Down
24 changes: 11 additions & 13 deletions data_juicer/ops/deduplicator/ray_image_deduplicator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import numpy as np
from jsonargparse.typing import PositiveInt

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import load_data_with_context, load_image

Expand All @@ -11,22 +10,21 @@

OP_NAME = 'ray_image_deduplicator'

with AvailabilityChecking(['imagededup'], OP_NAME):
import imagededup # noqa: F401
imagededup = LazyLoader('imagededup', globals(), 'imagededup')

HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'}
HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'}

def get_hash_method(method_name):
from imagededup.methods import AHash, DHash, PHash, WHash
def get_hash_method(method_name):
from imagededup.methods import AHash, DHash, PHash, WHash

mapping = {
'phash': PHash,
'dhash': DHash,
'whash': WHash,
'ahash': AHash
}
mapping = {
'phash': PHash,
'dhash': DHash,
'whash': WHash,
'ahash': AHash
}

return mapping[method_name]
return mapping[method_name]


@OPERATORS.register_module(OP_NAME)
Expand Down
5 changes: 0 additions & 5 deletions data_juicer/ops/filter/alphanumeric_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,14 @@

from jsonargparse.typing import PositiveFloat

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..common import get_words_from_document

OP_NAME = 'alphanumeric_filter'

with AvailabilityChecking(['transformers'], OP_NAME):
import transformers # noqa: F401


@OPERATORS.register_module('alphanumeric_filter')
class AlphanumericFilter(Filter):
Expand Down
5 changes: 0 additions & 5 deletions data_juicer/ops/filter/flagged_words_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@

from jsonargparse.typing import ClosedUnitInterval, List

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, InterVars, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.model_utils import get_model, prepare_model

from ...utils.asset_utils import ASSET_DIR, load_words_asset
Expand All @@ -17,9 +15,6 @@

OP_NAME = 'flagged_words_filter'

with AvailabilityChecking(['sentencepiece'], OP_NAME):
import sentencepiece # noqa: F401


@OPERATORS.register_module(OP_NAME)
@INTER_WORDS.register_module(OP_NAME)
Expand Down
10 changes: 1 addition & 9 deletions data_juicer/ops/filter/image_aesthetics_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from jsonargparse.typing import ClosedUnitInterval
from loguru import logger

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import load_data_with_context, load_image
Expand All @@ -14,14 +13,7 @@
OP_NAME = 'image_aesthetics_filter'
CHECK_PKGs = ['torch', 'transformers', 'simple-aesthetics-predictor']

with AvailabilityChecking(CHECK_PKGs, OP_NAME):

import aesthetics_predictor # noqa: F401
import torch
import transformers # noqa: F401

# avoid hanging when calling clip in multiprocessing
torch.set_num_threads(1)
torch = LazyLoader('torch', globals(), 'torch')


@OPERATORS.register_module(OP_NAME)
Expand Down
4 changes: 1 addition & 3 deletions data_juicer/ops/filter/image_face_ratio_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from jsonargparse.typing import ClosedUnitInterval
from loguru import logger

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import (detect_faces, load_data_with_context,
Expand All @@ -16,8 +15,7 @@

OP_NAME = 'image_face_ratio_filter'

with AvailabilityChecking(['opencv-python'], OP_NAME):
import cv2
cv2 = LazyLoader('cv2', globals(), 'cv2')


@UNFORKABLE.register_module(OP_NAME)
Expand Down
9 changes: 2 additions & 7 deletions data_juicer/ops/filter/image_nsfw_filter.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import numpy as np
from jsonargparse.typing import ClosedUnitInterval

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import load_data_with_context, load_image
Expand All @@ -12,12 +11,8 @@

OP_NAME = 'image_nsfw_filter'

with AvailabilityChecking(['torch', 'transformers'], OP_NAME):
import torch
import transformers # noqa: F401

# avoid hanging when calling nsfw detection in multiprocessing
torch.set_num_threads(1)
torch = LazyLoader('torch', globals(), 'torch')
transformers = LazyLoader('transformers', globals(), 'transformers')


@OPERATORS.register_module(OP_NAME)
Expand Down
9 changes: 2 additions & 7 deletions data_juicer/ops/filter/image_text_matching_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from jsonargparse.typing import ClosedUnitInterval
from PIL import ImageOps

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import (SpecialTokens, load_data_with_context,
Expand All @@ -14,12 +13,8 @@

OP_NAME = 'image_text_matching_filter'

with AvailabilityChecking(['torch', 'transformers'], OP_NAME):
import torch
import transformers # noqa: F401

# avoid hanging when calling blip in multiprocessing
torch.set_num_threads(1)
torch = LazyLoader('torch', globals(), 'torch')
transformers = LazyLoader('transformers', globals(), 'transformers')


@OPERATORS.register_module(OP_NAME)
Expand Down
10 changes: 2 additions & 8 deletions data_juicer/ops/filter/image_text_similarity_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from jsonargparse.typing import ClosedUnitInterval
from PIL import ImageOps

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import (SpecialTokens, load_data_with_context,
Expand All @@ -14,13 +13,8 @@

OP_NAME = 'image_text_similarity_filter'

with AvailabilityChecking(['torch', 'transformers'], OP_NAME):

import torch
import transformers # noqa: F401

# avoid hanging when calling clip in multiprocessing
torch.set_num_threads(1)
torch = LazyLoader('torch', globals(), 'torch')
transformers = LazyLoader('transformers', globals(), 'transformers')


@OPERATORS.register_module(OP_NAME)
Expand Down
9 changes: 2 additions & 7 deletions data_juicer/ops/filter/image_watermark_filter.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import numpy as np
from jsonargparse.typing import ClosedUnitInterval

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import load_data_with_context, load_image
Expand All @@ -12,12 +11,8 @@

OP_NAME = 'image_watermark_filter'

with AvailabilityChecking(['torch', 'transformers'], OP_NAME):
import torch
import transformers # noqa: F401

# avoid hanging when calling watermark detection in multiprocessing
torch.set_num_threads(1)
torch = LazyLoader('torch', globals(), 'torch')
transformers = LazyLoader('transformers', globals(), 'transformers')


@OPERATORS.register_module(OP_NAME)
Expand Down
4 changes: 1 addition & 3 deletions data_juicer/ops/filter/language_id_score_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from jsonargparse.typing import ClosedUnitInterval
from loguru import logger

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.model_utils import get_model, prepare_model
Expand All @@ -12,8 +11,7 @@

OP_NAME = 'language_id_score_filter'

with AvailabilityChecking(['fasttext-wheel'], OP_NAME):
import fasttext # noqa: F401
fasttext = LazyLoader('fasttext', globals(), 'fasttext')


@OPERATORS.register_module(OP_NAME)
Expand Down
7 changes: 2 additions & 5 deletions data_juicer/ops/filter/perplexity_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

from jsonargparse.typing import PositiveFloat

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, InterVars, StatsKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.model_utils import get_model, prepare_model
Expand All @@ -15,10 +14,8 @@

OP_NAME = 'perplexity_filter'

with AvailabilityChecking(['sentencepiece', 'kenlm'], OP_NAME):
import kenlm # noqa: F401
import sentencepiece # noqa: F401

kenlm = LazyLoader('kenlm', globals(), 'kenlm')
sentencepiece = LazyLoader('sentencepiece', globals(), 'sentencepiece')

@OPERATORS.register_module(OP_NAME)
@INTER_WORDS.register_module(OP_NAME)
Expand Down
Loading

0 comments on commit 97729c9

Please sign in to comment.