Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

* fix pre-commit failures and add pre-commit action #68

Merged
merged 7 commits into from
Nov 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .github/workflows/pre-commit.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@

name: pre-commit

on: [push, pull_request]

jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
with:
python-version: '3.8'
- uses: pre-commit/[email protected]
10 changes: 8 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ repos:
hooks:
- id: flake8
- repo: https://github.com/PyCQA/isort.git
rev: 4.3.21
rev: 5.12.0
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-yapf
Expand Down Expand Up @@ -34,4 +34,10 @@ repos:
exclude: thirdparty/
args: [ "--fix=lf" ]

exclude: 'docs/.*'
exclude: |
(?x)^(
docs/.*|
tests/.*|
demos/.*|
.*\.md
)$
15 changes: 8 additions & 7 deletions data_juicer/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,15 +113,15 @@ def init_configs(args=None):
type=str,
default=SpecialTokens.image,
help='The special token that represents an image in the text. In '
'default, it\'s "<__dj__image>". You can specify your own special'
' token according to your input dataset.')
'default, it\'s "<__dj__image>". You can specify your own special'
' token according to your input dataset.')
parser.add_argument(
'--eoc_special_token',
type=str,
default=SpecialTokens.eoc,
help='The special token that represents the end of a chunk in the '
'text. In default, it\'s "<|__dj__eoc|>". You can specify your '
'own special token according to your input dataset.')
'text. In default, it\'s "<|__dj__eoc|>". You can specify your '
'own special token according to your input dataset.')
parser.add_argument(
'--suffixes',
type=Union[str, List[str], Tuple[str]],
Expand Down Expand Up @@ -314,8 +314,8 @@ def init_setup_from_cfg(cfg):
if os.path.isdir(cfg.dataset_path):
cfg.dataset_dir = os.path.abspath(cfg.dataset_path)
else:
cfg.dataset_dir = os.path.abspath(
os.path.dirname(cfg.dataset_path))
cfg.dataset_dir = os.path.abspath(os.path.dirname(
cfg.dataset_path))
else:
logger.error(f'Input dataset_path [{cfg.dataset_path}] is invalid. '
f'Please check and retry.')
Expand Down Expand Up @@ -445,8 +445,9 @@ def config_backup(cfg):


def display_config(cfg):
from tabulate import tabulate
import pprint

from tabulate import tabulate
table_header = ['key', 'values']

# remove ops outside the process list for better displaying
Expand Down
5 changes: 4 additions & 1 deletion data_juicer/core/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,10 @@ def to_json(dataset, export_path, num_proc=1, **kwargs):
:param kwargs: extra arguments.
:return:
"""
dataset.to_json(export_path, force_ascii=False, num_proc=num_proc, lines=False)
dataset.to_json(export_path,
force_ascii=False,
num_proc=num_proc,
lines=False)

@staticmethod
def to_parquet(dataset, export_path, **kwargs):
Expand Down
4 changes: 2 additions & 2 deletions data_juicer/core/ray_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,8 @@ def run(self, load_data_np=None):
dataset = dataset.filter(op.process)
else:
logger.error(
'Ray executor only support Filter and Mapper OPs for now'
)
'Ray executor only support Filter and Mapper OPs for '
'now')
raise NotImplementedError
except: # noqa: E722
logger.error(f'An error occurred during Op [{op_name}].')
Expand Down
22 changes: 10 additions & 12 deletions data_juicer/format/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,7 @@ def __init__(
self.data_files = find_files_with_suffix(dataset_path, suffixes)
self.add_suffix = add_suffix

def load_dataset(self,
num_proc: int = 1,
global_cfg=None) -> Dataset:
def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset:
"""
Load a dataset from dataset file or dataset directory, and unify its
format.
Expand Down Expand Up @@ -103,9 +101,7 @@ def __init__(self,
self.text_keys = text_keys
self.kwargs = kwargs

def load_dataset(self,
num_proc: int = 1,
global_cfg=None) -> Dataset:
def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset:
"""
Load a dataset from HuggingFace, and unify its format.

Expand Down Expand Up @@ -226,8 +222,10 @@ def rel2abs(sample, path_keys, dataset_dir):
paths = sample[path_key]
if not paths:
continue
new_paths = [os.path.join(dataset_dir, path)
for path in paths if not os.path.isabs(path)]
new_paths = [
os.path.join(dataset_dir, path) for path in paths
if not os.path.isabs(path)
]
sample[path_key] = new_paths
return sample

Expand All @@ -240,10 +238,10 @@ def rel2abs(sample, path_keys, dataset_dir):
'dataset_dir': ds_dir
})
else:
logger.warning(f'No global config passed into unify_format function. '
f'Relative paths in the dataset might not be converted '
f'to their absolute versions. Data of other modalities '
f'might not be able to find by Data-Juicer.')
logger.warning('No global config passed into unify_format function. '
'Relative paths in the dataset might not be converted '
'to their absolute versions. Data of other modalities '
'might not be able to find by Data-Juicer.')

return dataset

Expand Down
4 changes: 1 addition & 3 deletions data_juicer/format/text_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,7 @@ def __init__(self,
self.dataset_path = dataset_path
self.add_suffix = add_suffix

def load_dataset(self,
num_proc: int = 1,
global_cfg=None) -> Dataset:
def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset:
"""
Load a dataset from local text-type files.

Expand Down
48 changes: 28 additions & 20 deletions data_juicer/ops/base_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@

OPERATORS = Registry('Operators')


class OP:
def __init__(self,
text_key: str = None,
image_key: str = None,
):

def __init__(
self,
text_key: str = None,
image_key: str = None,
):
"""
Base class of operators.

Expand All @@ -29,12 +32,14 @@ def __init__(self,
def process(self, *args, **kwargs):
raise NotImplementedError


class Mapper(OP):

def __init__(self,
text_key: str = None,
image_key: str = None,
):
def __init__(
self,
text_key: str = None,
image_key: str = None,
):
"""
Base class that conducts data editing.

Expand Down Expand Up @@ -63,10 +68,11 @@ def is_batched_op(self):

class Filter(OP):

def __init__(self,
text_key: str = None,
image_key: str = None,
):
def __init__(
self,
text_key: str = None,
image_key: str = None,
):
"""
Base class that removes specific info.

Expand Down Expand Up @@ -104,10 +110,11 @@ def process(self, sample):

class Deduplicator(OP):

def __init__(self,
text_key: str = None,
image_key: str = None,
):
def __init__(
self,
text_key: str = None,
image_key: str = None,
):
"""
Base class that conducts deduplication.

Expand Down Expand Up @@ -144,10 +151,11 @@ def process(self, dataset, show_num=0):

class Selector(OP):

def __init__(self,
text_key: str = None,
image_key: str = None,
):
def __init__(
self,
text_key: str = None,
image_key: str = None,
):
"""
Base class that conducts selection in dataset-level.

Expand Down
2 changes: 1 addition & 1 deletion data_juicer/ops/filter/character_repetition_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
@OPERATORS.register_module('character_repetition_filter')
class CharacterRepetitionFilter(Filter):
"""Filter to keep samples with char-level n-gram repetition ratio within a
\ specific range."""
specific range."""

def __init__(self,
rep_len: PositiveInt = 10,
Expand Down
8 changes: 3 additions & 5 deletions data_juicer/ops/filter/image_aspect_ratio_filter.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@

import numpy as np

from jsonargparse.typing import PositiveFloat

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.mm_utils import load_image

from ..base_op import OPERATORS, Filter
from ..op_fusion import LOADED_IMAGES
from data_juicer.utils.mm_utils import load_image


@OPERATORS.register_module('image_aspect_ratio_filter')
Expand Down Expand Up @@ -85,7 +83,8 @@ def process(self, sample):
aspect_ratios = sample[Fields.stats][StatsKeys.aspect_ratios]
keep_bools = np.array([
self.min_ratio <= aspect_ratio <= self.max_ratio
for aspect_ratio in aspect_ratios])
for aspect_ratio in aspect_ratios
])
if len(keep_bools) <= 0:
return True

Expand All @@ -94,4 +93,3 @@ def process(self, sample):
return keep_bools.any()
else:
return keep_bools.all()

3 changes: 2 additions & 1 deletion data_juicer/ops/filter/language_id_score_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def compute_stats(self, sample):
def process(self, sample):
if self.lang:
return sample[Fields.stats][StatsKeys.lang] == self.lang \
and sample[Fields.stats][StatsKeys.lang_score] >= self.min_score
and sample[Fields.stats][StatsKeys.lang_score] >= \
self.min_score
else:
return sample[Fields.stats][StatsKeys.lang_score] >= self.min_score
3 changes: 2 additions & 1 deletion data_juicer/ops/filter/special_characters_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ def compute_stats(self, sample):
return sample

def process(self, sample):
if self.min_ratio <= sample[Fields.stats][StatsKeys.special_char_ratio] \
if self.min_ratio <= \
sample[Fields.stats][StatsKeys.special_char_ratio] \
<= self.max_ratio:
return True
else:
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/ops/filter/word_repetition_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
@INTER_WORDS.register_module('word_repetition_filter')
class WordRepetitionFilter(Filter):
"""Filter to keep samples with word-level n-gram repetition ratio within a
\ specific range."""
specific range."""

def __init__(self,
lang: str = 'en',
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/ops/mapper/punctuation_normalization_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
@OPERATORS.register_module('punctuation_normalization_mapper')
class PunctuationNormalizationMapper(Mapper):
"""Mapper to normalize unicode punctuations to English punctuations in text
\ samples."""
samples."""

def __init__(self, *args, **kwargs):
"""
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/ops/mapper/remove_comments_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class RemoveCommentsMapper(Mapper):
"""
Mapper to remove comments in different kinds of documents.

Only support 'tex' \ for now.
Only support 'tex' for now.
"""

def __init__(self,
Expand Down
4 changes: 3 additions & 1 deletion data_juicer/utils/mm_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@

from datasets import Image

from data_juicer.utils.constant import DEFAULT_PREFIX


# A class to keep special tokens for multimodal information in the texts
# The tokens in this class can be updated by corresponding arguments in config
class SpecialTokens(object):
Expand All @@ -12,9 +12,11 @@ class SpecialTokens(object):
# others
eoc = f'<|{DEFAULT_PREFIX}eoc|>'


def load_images(paths):
return [load_image(path) for path in paths]


def load_image(path):
img_feature = Image()
img = img_feature.decode_example(img_feature.encode_example(path))
Expand Down
6 changes: 6 additions & 0 deletions docs/DeveloperGuide.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@ pre-commit run --all-files
git commit -m "xxxx"
```

**Note**: We have configured pre-commit checks in the GitHub workflow. If this
check fails in your PR, please do the following locally: ① ensure that the
pre-commit dependencies are consistent with the project configuration
(which can be done via `pre-commit clean` and `pre-commit install`);
and ② run `pre-commit run --all-files` before pushing.

## Build your own ops

- Data-Juicer allows everybody to build their own ops.
Expand Down
2 changes: 2 additions & 0 deletions docs/DeveloperGuide_ZH.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ pre-commit run --all-files
git commit -m "<your_commit_message>"
```

**注意**:我们在github workflow配置了pre-commit的检查。如果您的PR中该检查没通过,请在本地①确保pre-commit 的相关依赖与项目配置一致(可通过`pre-commit clean`和`pre-commit install`完成);②push前执行了`pre-commit run --all-files`.

## 构建自己的算子

- Data-Juicer 支持每个人定义自己的算子。
Expand Down
Loading