Skip to content

Commit

Permalink
autoinstall check
Browse files (browse the repository at this point in the history)
  • Loading branch information
BeachWang committed Aug 23, 2024
1 parent 705065b commit 2a6651d
Show file tree
Hide file tree
Showing 48 changed files with 57 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ class DocumentMinhashDeduplicator(Deduplicator):
kept in the final dataset.
"""

@AUTOINSTALL.check(['scipy'])
def __init__(
self,
tokenization: str = 'space',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
class DocumentSimhashDeduplicator(Deduplicator):
"""Deduplicator to deduplicate samples at document-level using SimHash."""

@AUTOINSTALL.check(['simhash-pybind'])
def __init__(self,
tokenization: str = 'space',
window_size: PositiveInt = 6,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/deduplicator/image_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ class ImageDeduplicator(Deduplicator):
of images between documents.
"""

@AUTOINSTALL.check(['imagededup'])
def __init__(self,
method: str = 'phash',
consider_text: bool = False,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/deduplicator/ray_image_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ class RayImageDeduplicator(RayBasicDeduplicator):
of images between documents.
"""

@AUTOINSTALL.check(['imagededup'])
def __init__(self,
redis_host: str = 'localhost',
redis_port: PositiveInt = 6380,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/filter/alphanumeric_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class AlphanumericFilter(Filter):
"""Filter to keep samples with alphabet/numeric ratio within a specific
range."""

@AUTOINSTALL.check(['transformers'])
def __init__(self,
tokenization: bool = False,
min_ratio: float = 0.25,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/filter/flagged_words_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class FlaggedWordFilter(Filter):
"""Filter to keep samples with flagged-word ratio less than a specific max
value."""

@AUTOINSTALL.check(['sentencepiece'])
def __init__(self,
lang: str = 'en',
tokenization: bool = False,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/filter/image_aesthetics_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class ImageAestheticsFilter(Filter):

_accelerator = 'cuda'

@AUTOINSTALL.check(['torch', 'transformers', 'simple-aesthetics-predictor'])
def __init__(self,
hf_scorer_model='',
trust_remote_code=False,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/filter/image_face_ratio_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class ImageFaceRatioFilter(Filter):
'maxSize': None,
}

@AUTOINSTALL.check(['opencv-python'])
def __init__(self,
cv_classifier='',
min_ratio: ClosedUnitInterval = 0.0,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/filter/image_nsfw_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class ImageNSFWFilter(Filter):

_accelerator = 'cuda'

@AUTOINSTALL.check(['torch', 'transformers'])
def __init__(self,
hf_nsfw_model='Falconsai/nsfw_image_detection',
trust_remote_code=False,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/filter/image_text_matching_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class ImageTextMatchingFilter(Filter):

_accelerator = 'cuda'

@AUTOINSTALL.check(['torch', 'transformers'])
def __init__(self,
hf_blip='Salesforce/blip-itm-base-coco',
trust_remote_code=False,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/filter/image_text_similarity_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class ImageTextSimilarityFilter(Filter):

_accelerator = 'cuda'

@AUTOINSTALL.check(['torch', 'transformers'])
def __init__(self,
hf_clip='openai/clip-vit-base-patch32',
trust_remote_code=False,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/filter/image_watermark_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class ImageWatermarkFilter(Filter):

_accelerator = 'cuda'

@AUTOINSTALL.check(['torch', 'transformers'])
def __init__(self,
hf_watermark_model='amrul-hzz/watermark_detector',
trust_remote_code=False,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/filter/language_id_score_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class LanguageIDScoreFilter(Filter):
"""Filter to keep samples in a specific language with confidence score
larger than a specific min value."""

@AUTOINSTALL.check(['fasttext-wheel'])
def __init__(self,
lang: Union[str, List[str], Tuple[str]] = '',
min_score: ClosedUnitInterval = 0.8,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/filter/perplexity_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ class PerplexityFilter(Filter):
"""Filter to keep samples with perplexity score less than a specific max
value."""

@AUTOINSTALL.check(['perplexity_filter'])
def __init__(self,
lang: str = 'en',
max_ppl: PositiveFloat = 1500,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/filter/phrase_grounding_recall_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ class PhraseGroundingRecallFilter(Filter):

_accelerator = 'cuda'

@AUTOINSTALL.check(['torch', 'transformers', 'nltk'])
def __init__(self,
hf_owlvit='google/owlvit-base-patch32',
trust_remote_code=False,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/filter/stopwords_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class StopWordsFilter(Filter):
"""Filter to keep samples with stopword ratio larger than a specific min
value."""

@AUTOINSTALL.check(['sentencepiece'])
def __init__(self,
lang: str = 'en',
tokenization: bool = False,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/filter/text_entity_dependency_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class TextEntityDependencyFilter(Filter):
and filter them. The text containing no entities will be omitted.
"""

@AUTOINSTALL.check(['spacy-pkuseg'])
def __init__(self,
lang: str = 'en',
min_dependency_num: int = 1,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/filter/token_num_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class TokenNumFilter(Filter):
"""Filter to keep samples with total token number within a specific
range."""

@AUTOINSTALL.check(['transformers'])
def __init__(self,
hf_tokenizer: str = 'EleutherAI/pythia-6.9b-deduped',
min_num: PositiveInt = 10,
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/ops/filter/video_aesthetics_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
load_data_with_context, load_video)

from ...utils.model_utils import get_model, prepare_model
- from ..base_op import AUTOINSTALL, AUTOINSTALL, OPERATORS, Filter
+ from ..base_op import AUTOINSTALL, OPERATORS, Filter
from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS

OP_NAME = 'video_aesthetics_filter'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ class VideoFramesTextSimilarityFilter(Filter):

_accelerator = 'cuda'

@AUTOINSTALL.check(['torch', 'transformers'])
def __init__(self,
hf_clip='openai/clip-vit-base-patch32',
trust_remote_code=False,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/filter/video_motion_score_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ class VideoMotionScoreFilter(Filter):
'flags': 0
}

@AUTOINSTALL.check(['opencv-python'])
def __init__(self,
min_score: float = 0.25,
max_score: float = sys.float_info.max,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/filter/video_nsfw_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class VideoNSFWFilter(Filter):

_accelerator = 'cuda'

@AUTOINSTALL.check(['torch', 'transformers'])
def __init__(self,
hf_nsfw_model='Falconsai/nsfw_image_detection',
trust_remote_code=False,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/filter/video_ocr_area_ratio_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ class VideoOcrAreaRatioFilter(Filter):

_accelerator = 'cuda'

@AUTOINSTALL.check(['easyocr'])
def __init__(self,
min_area_ratio: ClosedUnitInterval = 0,
max_area_ratio: ClosedUnitInterval = 1.0,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/filter/video_tagging_from_frames_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class VideoTaggingFromFramesFilter(Filter):

_accelerator = 'cuda'

@AUTOINSTALL.check(['torch', 'git+https://github.com/xinyu1205/recognize-anything.git'])
def __init__(self,
tags: List[str] = ['people'],
contain: str = 'any',
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/filter/video_watermark_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class VideoWatermarkFilter(Filter):

_accelerator = 'cuda'

@AUTOINSTALL.check(['torch', 'transformers'])
def __init__(self,
hf_watermark_model='amrul-hzz/watermark_detector',
trust_remote_code=False,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/filter/word_repetition_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ class WordRepetitionFilter(Filter):
"""Filter to keep samples with word-level n-gram repetition ratio within a
specific range."""

@AUTOINSTALL.check(['sentencepiece'])
def __init__(self,
lang: str = 'en',
tokenization: bool = False,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/filter/words_num_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class WordsNumFilter(Filter):
"""Filter to keep samples with total words number within a specific
range."""

@AUTOINSTALL.check(['sentencepiece'])
def __init__(self,
lang: str = 'en',
tokenization: bool = False,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ class AudioFFmpegWrappedMapper(Mapper):
"""Simple wrapper for FFmpeg audio filters.
"""

@AUTOINSTALL.check(['ffmpeg-python'])
def __init__(
self,
filter_name: Optional[str] = None,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/mapper/chinese_convert_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class ChineseConvertMapper(Mapper):
"""Mapper to convert Chinese between Traditional Chinese, Simplified Chinese
and Japanese Kanji."""

@AUTOINSTALL.check(['opencc'])
def __init__(self, mode: str = 's2t', *args, **kwargs):
"""
Initialization method.
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/mapper/clean_html_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
class CleanHtmlMapper(Mapper):
"""Mapper to clean html code in text samples."""

@AUTOINSTALL.check(['selectolax'])
def __init__(self, *args, **kwargs):
"""
Initialization method.
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/mapper/fix_unicode_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
class FixUnicodeMapper(Mapper):
"""Mapper to fix unicode errors in text samples."""

@AUTOINSTALL.check(['ftfy'])
def __init__(self, normalization: str = None, *args, **kwargs):
"""
Initialization method.
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/mapper/image_captioning_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ class ImageCaptioningMapper(Mapper):
_accelerator = 'cuda'
_batched_op = True

@AUTOINSTALL.check(['torch', 'transformers', 'simhash-pybind'])
def __init__(self,
hf_img2seq='Salesforce/blip2-opt-2.7b',
trust_remote_code=False,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/mapper/image_diffusion_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ class ImageDiffusionMapper(Mapper):
_accelerator = 'cuda'
_batched_op = True

@AUTOINSTALL.check(['diffusers', 'torch', 'transformers', 'simhash-pybind'])
def __init__(self,
hf_diffusion: str = 'CompVis/stable-diffusion-v1-4',
trust_remote_code=False,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/mapper/image_face_blur_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class ImageFaceBlurMapper(Mapper):
'maxSize': None,
}

@AUTOINSTALL.check(['opencv-python', 'Pillow'])
def __init__(self,
cv_classifier='',
blur_type: str = 'gaussian',
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/mapper/nlpaug_en_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class NlpaugEnMapper(Mapper):

_batched_op = True

@AUTOINSTALL.check(['nlpaug'])
def __init__(self,
sequential: bool = False,
aug_num: int = 1,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/mapper/nlpcda_zh_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class NlpcdaZhMapper(Mapper):

_batched_op = True

@AUTOINSTALL.check(['nlpaug'])
def __init__(self,
sequential: bool = False,
aug_num: int = 1,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
class RemoveWordsWithIncorrectSubstringsMapper(Mapper):
"""Mapper to remove words with incorrect substrings."""

@AUTOINSTALL.check(['sentencepiece'])
def __init__(self,
lang: str = 'en',
tokenization: bool = False,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/mapper/sentence_split_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
class SentenceSplitMapper(Mapper):
"""Mapper to split text samples to sentences."""

@AUTOINSTALL.check(['nltk'])
def __init__(self, lang: str = 'en', *args, **kwargs):
"""
Initialization method.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class VideoCaptioningFromAudioMapper(Mapper):
_accelerator = 'cuda'
_batched_op = True

@AUTOINSTALL.check(['transformers', 'transformers_stream_generator', 'einops', 'accelerate', 'tiktoken'])
def __init__(self, keep_original_sample: bool = True, *args, **kwargs):
"""
Initialization method.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ class VideoCaptioningFromFramesMapper(Mapper):
_accelerator = 'cuda'
_batched_op = True

@AUTOINSTALL.check(['torch', 'transformers', 'simhash-pybind'])
def __init__(
self,
hf_img2seq='Salesforce/blip2-opt-2.7b',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,15 @@ class VideoCaptioningFromSummarizerMapper(Mapper):
_accelerator = 'cuda'
_batched_op = True

@AUTOINSTALL.check(['torch',
'transformers',
'simhash-pybind', # by video caption
'transformers_stream_generator',
'einops',
'accelerate',
'tiktoken', # by audio caption
'torchaudio', # by audio tag
'git+https://github.com/xinyu1205/recognize-anything.git'])
def __init__(self,
hf_summarizer: str = None,
trust_remote_code=False,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ class VideoFFmpegWrappedMapper(Mapper):
"""Simple wrapper for FFmpeg video filters.
"""

@AUTOINSTALL.check(['ffmpeg-python'])
def __init__(
self,
filter_name: Optional[str] = None,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/mapper/video_remove_watermark_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class VideoRemoveWatermarkMapper(Mapper):
Remove the watermarks in videos given regions.
"""

@AUTOINSTALL.check(['opencv-python'])
def __init__(self,
roi_strings: List[str] = ['0,0,0.1,0.1'],
roi_type: str = 'ratio',
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ class VideoResizeAspectRatioMapper(Mapper):

STRATEGY = ['decrease', 'increase']

@AUTOINSTALL.check(['ffmpeg-python'])
def __init__(
self,
min_ratio: str = '9/21',
Expand Down
3 changes: 2 additions & 1 deletion data_juicer/ops/mapper/video_resize_resolution_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from data_juicer.utils.logger_utils import HiddenPrints
from data_juicer.utils.mm_utils import close_video, load_video

- from ..base_op import AUTOINSTALL, AUTOINSTALL, OPERATORS, Mapper
+ from ..base_op import AUTOINSTALL, OPERATORS, Mapper
from ..op_fusion import LOADED_VIDEOS

OP_NAME = 'video_resize_resolution_mapper'
Expand All @@ -28,6 +28,7 @@ class VideoResizeResolutionMapper(Mapper):
with deep learning for future works.
"""

@AUTOINSTALL.check(['ffmpeg-python'])
def __init__(self,
min_width: PositiveInt = 1,
max_width: PositiveInt = sys.maxsize,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/mapper/video_split_by_scene_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ class VideoSplitBySceneMapper(Mapper):
['fade_bias', 'add_final_scene', 'method', 'block_size']
}

@AUTOINSTALL.check(['scenedetect[opencv]'])
def __init__(self,
detector: str = 'ContentDetector',
threshold: NonNegativeFloat = 27.0,
Expand Down
1 change: 1 addition & 0 deletions data_juicer/ops/mapper/video_tagging_from_audio_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class VideoTaggingFromAudioMapper(Mapper):

_accelerator = 'cuda'

@AUTOINSTALL.check(['torch', 'transformers', 'torchaudio'])
def __init__(self,
hf_ast='MIT/ast-finetuned-audioset-10-10-0.4593',
trust_remote_code=False,
Expand Down
Loading

0 comments on commit 2a6651d

Please sign in to comment.