* fix pre-commit failures and add pre-commit action (#68)
* fix pre-commit failures
+ add pre-commit action
HYLcool authored Nov 14, 2023
1 parent 16d159f commit db59995
Showing 28 changed files with 310 additions and 255 deletions.
14 changes: 14 additions & 0 deletions .github/workflows/pre-commit.yml
@@ -0,0 +1,14 @@

name: pre-commit

on: [push, pull_request]

jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v4
    - uses: actions/setup-python@v4
      with:
        python-version: '3.8'
    - uses: pre-commit/[email protected]
10 changes: 8 additions & 2 deletions .pre-commit-config.yaml
@@ -4,7 +4,7 @@ repos:
hooks:
- id: flake8
- repo: https://github.com/PyCQA/isort.git
rev: 4.3.21
rev: 5.12.0
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-yapf
@@ -34,4 +34,10 @@ repos:
exclude: thirdparty/
args: [ "--fix=lf" ]

exclude: 'docs/.*'
exclude: |
(?x)^(
docs/.*|
tests/.*|
demos/.*|
.*\.md
)$
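
As a quick illustration of the new `exclude` pattern (not part of the commit), the sketch below applies the same verbose-mode regex to a few made-up repository paths, assuming, as pre-commit does, that the pattern is matched against each file path:

```python
import re

# Same pattern as the new `exclude` block above; (?x) turns on verbose mode,
# so the alternatives can sit on separate lines.
EXCLUDE = re.compile(r"""(?x)^(
    docs/.*|
    tests/.*|
    demos/.*|
    .*\.md
)$""")

# Made-up paths, purely for illustration.
for path in ['docs/index.rst', 'tests/test_config.py', 'README.md',
             'data_juicer/core/exporter.py']:
    status = 'excluded' if EXCLUDE.match(path) else 'checked'
    print(f'{path}: {status}')
```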
15 changes: 8 additions & 7 deletions data_juicer/config/config.py
@@ -113,15 +113,15 @@ def init_configs(args=None):
type=str,
default=SpecialTokens.image,
help='The special token that represents an image in the text. In '
'default, it\'s "<__dj__image>". You can specify your own special'
' token according to your input dataset.')
'default, it\'s "<__dj__image>". You can specify your own special'
' token according to your input dataset.')
parser.add_argument(
'--eoc_special_token',
type=str,
default=SpecialTokens.eoc,
help='The special token that represents the end of a chunk in the '
'text. In default, it\'s "<|__dj__eoc|>". You can specify your '
'own special token according to your input dataset.')
'text. In default, it\'s "<|__dj__eoc|>". You can specify your '
'own special token according to your input dataset.')
parser.add_argument(
'--suffixes',
type=Union[str, List[str], Tuple[str]],
@@ -314,8 +314,8 @@ def init_setup_from_cfg(cfg):
if os.path.isdir(cfg.dataset_path):
cfg.dataset_dir = os.path.abspath(cfg.dataset_path)
else:
cfg.dataset_dir = os.path.abspath(
os.path.dirname(cfg.dataset_path))
cfg.dataset_dir = os.path.abspath(os.path.dirname(
cfg.dataset_path))
else:
logger.error(f'Input dataset_path [{cfg.dataset_path}] is invalid. '
f'Please check and retry.')
@@ -445,8 +445,9 @@ def config_backup(cfg):


def display_config(cfg):
from tabulate import tabulate
import pprint

from tabulate import tabulate
table_header = ['key', 'values']

# remove ops outside the process list for better displaying
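
For context on the `--image_special_token` and `--eoc_special_token` help text touched above: a multimodal sample typically interleaves these tokens with text. A tiny illustration using the default token strings (the caption itself is invented):

```python
# Default token strings from the help text above; the caption is made up.
image_token = '<__dj__image>'
eoc_token = '<|__dj__eoc|>'

sample_text = f'{image_token} a photo of a cat sleeping on a sofa {eoc_token}'
print(sample_text)
# <__dj__image> a photo of a cat sleeping on a sofa <|__dj__eoc|>
```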
5 changes: 4 additions & 1 deletion data_juicer/core/exporter.py
@@ -197,7 +197,10 @@ def to_json(dataset, export_path, num_proc=1, **kwargs):
:param kwargs: extra arguments.
:return:
"""
dataset.to_json(export_path, force_ascii=False, num_proc=num_proc, lines=False)
dataset.to_json(export_path,
force_ascii=False,
num_proc=num_proc,
lines=False)

@staticmethod
def to_parquet(dataset, export_path, **kwargs):
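
The reflowed `to_json` call above is purely a line-length fix. As a hedged usage sketch, exporting a toy HuggingFace dataset with the same keyword arguments looks roughly like this (`export.json` is a placeholder path):

```python
from datasets import Dataset

# lines=False writes one JSON array instead of JSON Lines, and
# force_ascii=False keeps non-ASCII text readable in the output.
ds = Dataset.from_dict({'text': ['hello', '你好']})
ds.to_json('export.json', force_ascii=False, num_proc=1, lines=False)
```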
4 changes: 2 additions & 2 deletions data_juicer/core/ray_executor.py
@@ -67,8 +67,8 @@ def run(self, load_data_np=None):
dataset = dataset.filter(op.process)
else:
logger.error(
'Ray executor only support Filter and Mapper OPs for now'
)
'Ray executor only support Filter and Mapper OPs for '
'now')
raise NotImplementedError
except: # noqa: E722
logger.error(f'An error occurred during Op [{op_name}].')
22 changes: 10 additions & 12 deletions data_juicer/format/formatter.py
@@ -51,9 +51,7 @@ def __init__(
self.data_files = find_files_with_suffix(dataset_path, suffixes)
self.add_suffix = add_suffix

def load_dataset(self,
num_proc: int = 1,
global_cfg=None) -> Dataset:
def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset:
"""
Load a dataset from dataset file or dataset directory, and unify its
format.
@@ -103,9 +101,7 @@ def __init__(self,
self.text_keys = text_keys
self.kwargs = kwargs

def load_dataset(self,
num_proc: int = 1,
global_cfg=None) -> Dataset:
def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset:
"""
Load a dataset from HuggingFace, and unify its format.
@@ -226,8 +222,10 @@ def rel2abs(sample, path_keys, dataset_dir):
paths = sample[path_key]
if not paths:
continue
new_paths = [os.path.join(dataset_dir, path)
for path in paths if not os.path.isabs(path)]
new_paths = [
os.path.join(dataset_dir, path) for path in paths
if not os.path.isabs(path)
]
sample[path_key] = new_paths
return sample

@@ -240,10 +238,10 @@ def rel2abs(sample, path_keys, dataset_dir):
'dataset_dir': ds_dir
})
else:
logger.warning(f'No global config passed into unify_format function. '
f'Relative paths in the dataset might not be converted '
f'to their absolute versions. Data of other modalities '
f'might not be able to find by Data-Juicer.')
logger.warning('No global config passed into unify_format function. '
'Relative paths in the dataset might not be converted '
'to their absolute versions. Data of other modalities '
'might not be able to find by Data-Juicer.')

return dataset

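
A minimal, self-contained sketch (not project code) of the path-unification idea in the `rel2abs` hunk above: relative multimodal paths are joined onto the dataset directory. The sample data and directory below are invented:

```python
import os


def rel2abs_sketch(sample, path_keys, dataset_dir):
    # Mirrors the list comprehension above: join each relative path onto
    # the dataset directory.
    for path_key in path_keys:
        paths = sample[path_key]
        if not paths:
            continue
        sample[path_key] = [
            os.path.join(dataset_dir, path) for path in paths
            if not os.path.isabs(path)
        ]
    return sample


sample = {'images': ['imgs/cat.png', 'imgs/dog.png']}
print(rel2abs_sketch(sample, ['images'], '/data/demo'))
# {'images': ['/data/demo/imgs/cat.png', '/data/demo/imgs/dog.png']}
```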
4 changes: 1 addition & 3 deletions data_juicer/format/text_formatter.py
@@ -96,9 +96,7 @@ def __init__(self,
self.dataset_path = dataset_path
self.add_suffix = add_suffix

def load_dataset(self,
num_proc: int = 1,
global_cfg=None) -> Dataset:
def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset:
"""
Load a dataset from local text-type files.
48 changes: 28 additions & 20 deletions data_juicer/ops/base_op.py
@@ -2,11 +2,14 @@

OPERATORS = Registry('Operators')


class OP:
def __init__(self,
text_key: str = None,
image_key: str = None,
):

def __init__(
self,
text_key: str = None,
image_key: str = None,
):
"""
Base class of operators.
@@ -29,12 +32,14 @@ def __init__(self,
def process(self, *args, **kwargs):
raise NotImplementedError


class Mapper(OP):

def __init__(self,
text_key: str = None,
image_key: str = None,
):
def __init__(
self,
text_key: str = None,
image_key: str = None,
):
"""
Base class that conducts data editing.
@@ -63,10 +68,11 @@ def is_batched_op(self):

class Filter(OP):

def __init__(self,
text_key: str = None,
image_key: str = None,
):
def __init__(
self,
text_key: str = None,
image_key: str = None,
):
"""
Base class that removes specific info.
@@ -104,10 +110,11 @@ def process(self, sample):

class Deduplicator(OP):

def __init__(self,
text_key: str = None,
image_key: str = None,
):
def __init__(
self,
text_key: str = None,
image_key: str = None,
):
"""
Base class that conducts deduplication.
@@ -144,10 +151,11 @@ def process(self, dataset, show_num=0):

class Selector(OP):

def __init__(self,
text_key: str = None,
image_key: str = None,
):
def __init__(
self,
text_key: str = None,
image_key: str = None,
):
"""
Base class that conducts selection in dataset-level.
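
The base classes above all follow the same pattern: `compute_stats` writes per-sample statistics into `sample[Fields.stats]`, and `process` returns a keep decision, as the concrete filters later in this diff do. A hypothetical sketch of a custom operator built on them, in the spirit of the "Build your own ops" section of the DeveloperGuide further down (the operator name, the `min_len` parameter, and the stats key are invented, and `self.text_key` is assumed to be populated by the `OP` constructor):

```python
from data_juicer.ops.base_op import OPERATORS, Filter
from data_juicer.utils.constant import Fields


@OPERATORS.register_module('text_length_filter_sketch')  # hypothetical name
class TextLengthFilterSketch(Filter):
    """Keep samples whose text has at least `min_len` characters."""

    def __init__(self, min_len: int = 10, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.min_len = min_len

    def compute_stats(self, sample):
        sample[Fields.stats]['text_len'] = len(sample[self.text_key])
        return sample

    def process(self, sample):
        # True keeps the sample, False filters it out.
        return sample[Fields.stats]['text_len'] >= self.min_len
```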
2 changes: 1 addition & 1 deletion data_juicer/ops/filter/character_repetition_filter.py
@@ -13,7 +13,7 @@
@OPERATORS.register_module('character_repetition_filter')
class CharacterRepetitionFilter(Filter):
"""Filter to keep samples with char-level n-gram repetition ratio within a
\ specific range."""
specific range."""

def __init__(self,
rep_len: PositiveInt = 10,
8 changes: 3 additions & 5 deletions data_juicer/ops/filter/image_aspect_ratio_filter.py
@@ -1,13 +1,11 @@

import numpy as np

from jsonargparse.typing import PositiveFloat

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.mm_utils import load_image

from ..base_op import OPERATORS, Filter
from ..op_fusion import LOADED_IMAGES
from data_juicer.utils.mm_utils import load_image


@OPERATORS.register_module('image_aspect_ratio_filter')
@@ -85,7 +83,8 @@ def process(self, sample):
aspect_ratios = sample[Fields.stats][StatsKeys.aspect_ratios]
keep_bools = np.array([
self.min_ratio <= aspect_ratio <= self.max_ratio
for aspect_ratio in aspect_ratios])
for aspect_ratio in aspect_ratios
])
if len(keep_bools) <= 0:
return True

@@ -94,4 +93,3 @@ def process(self, sample):
return keep_bools.any()
else:
return keep_bools.all()
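
A toy sketch of the keep decision above, assuming the branch cut off by the diff view switches between an "any" and an "all" keep strategy; the thresholds and ratios are invented:

```python
import numpy as np

min_ratio, max_ratio = 0.5, 2.0  # invented thresholds
aspect_ratios = [1.2, 4.5]       # two images in one hypothetical sample

keep_bools = np.array([min_ratio <= r <= max_ratio for r in aspect_ratios])
print(keep_bools.any())  # True  -> kept under the 'any' strategy
print(keep_bools.all())  # False -> dropped under the 'all' strategy
```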

3 changes: 2 additions & 1 deletion data_juicer/ops/filter/language_id_score_filter.py
@@ -57,6 +57,7 @@ def compute_stats(self, sample):
def process(self, sample):
if self.lang:
return sample[Fields.stats][StatsKeys.lang] == self.lang \
and sample[Fields.stats][StatsKeys.lang_score] >= self.min_score
and sample[Fields.stats][StatsKeys.lang_score] >= \
self.min_score
else:
return sample[Fields.stats][StatsKeys.lang_score] >= self.min_score
3 changes: 2 additions & 1 deletion data_juicer/ops/filter/special_characters_filter.py
@@ -50,7 +50,8 @@ def compute_stats(self, sample):
return sample

def process(self, sample):
if self.min_ratio <= sample[Fields.stats][StatsKeys.special_char_ratio] \
if self.min_ratio <= \
sample[Fields.stats][StatsKeys.special_char_ratio] \
<= self.max_ratio:
return True
else:
2 changes: 1 addition & 1 deletion data_juicer/ops/filter/word_repetition_filter.py
@@ -17,7 +17,7 @@
@INTER_WORDS.register_module('word_repetition_filter')
class WordRepetitionFilter(Filter):
"""Filter to keep samples with word-level n-gram repetition ratio within a
\ specific range."""
specific range."""

def __init__(self,
lang: str = 'en',
2 changes: 1 addition & 1 deletion data_juicer/ops/mapper/punctuation_normalization_mapper.py
@@ -8,7 +8,7 @@
@OPERATORS.register_module('punctuation_normalization_mapper')
class PunctuationNormalizationMapper(Mapper):
"""Mapper to normalize unicode punctuations to English punctuations in text
\ samples."""
samples."""

def __init__(self, *args, **kwargs):
"""
2 changes: 1 addition & 1 deletion data_juicer/ops/mapper/remove_comments_mapper.py
@@ -14,7 +14,7 @@ class RemoveCommentsMapper(Mapper):
"""
Mapper to remove comments in different kinds of documents.
Only support 'tex' \ for now.
Only support 'tex' for now.
"""

def __init__(self,
4 changes: 3 additions & 1 deletion data_juicer/utils/mm_utils.py
@@ -1,8 +1,8 @@

from datasets import Image

from data_juicer.utils.constant import DEFAULT_PREFIX


# A class to keep special tokens for multimodal information in the texts
# The tokens in this class can be updated by corresponding arguments in config
class SpecialTokens(object):
@@ -12,9 +12,11 @@ class SpecialTokens(object):
# others
eoc = f'<|{DEFAULT_PREFIX}eoc|>'


def load_images(paths):
return [load_image(path) for path in paths]


def load_image(path):
img_feature = Image()
img = img_feature.decode_example(img_feature.encode_example(path))
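
A hedged usage sketch of the `load_image` helper above: the `datasets.Image` feature round-trips a file path through encode/decode and yields a PIL image. The path is a placeholder and is assumed to exist:

```python
from datasets import Image

img_feature = Image()
img = img_feature.decode_example(img_feature.encode_example('imgs/cat.png'))
print(type(img), img.size)  # e.g. a PIL image object and its (width, height)
```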
6 changes: 6 additions & 0 deletions docs/DeveloperGuide.md
@@ -29,6 +29,12 @@ pre-commit run --all-files
git commit -m "xxxx"
```

**Note**: We have configured pre-commit checks in the GitHub workflow. If this
check fails for your PR, please ① make sure locally that your pre-commit
dependencies are consistent with the project configuration (this can be done
via `pre-commit clean` and `pre-commit install`), and ② run
`pre-commit run --all-files` before pushing.

## Build your own ops

- Data-Juicer allows everybody to build their own ops.
2 changes: 2 additions & 0 deletions docs/DeveloperGuide_ZH.md
@@ -28,6 +28,8 @@ pre-commit run --all-files
git commit -m "<your_commit_message>"
```

**Note**: We have configured pre-commit checks in the GitHub workflow. If this check fails for your PR, please ① make sure locally that the pre-commit dependencies are consistent with the project configuration (this can be done via `pre-commit clean` and `pre-commit install`), and ② run `pre-commit run --all-files` before pushing.

## Build your own ops

- Data-Juicer allows everybody to build their own ops.