Skip to content

Commit

Permalink
* Merge main into this branch
Browse files: browse the repository at this point in the history
  • Loading branch information
HYLcool committed Nov 17, 2023
2 parents bb1f211 + 62c5fb5 commit 0cce370
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 5 deletions.
9 changes: 7 additions & 2 deletions data_juicer/format/formatter.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -208,11 +208,16 @@ def non_empty_text(sample, target_keys):

# 3. convert relative paths to absolute paths
if global_cfg:
ds_dir = global_cfg.dataset_dir
image_key = global_cfg.image_key

if image_key not in dataset.features:
# no image path list in dataset, no need to convert
return dataset

logger.info('Converting relative paths in the dataset to their '
'absolute version. (Based on the directory of input '
'dataset file)')
ds_dir = global_cfg.dataset_dir
image_key = global_cfg.image_key

# function to convert relative paths to absolute paths
def rel2abs(sample, path_keys, dataset_dir):
Expand Down
11 changes: 8 additions & 3 deletions data_juicer/ops/mapper/chinese_convert_mapper.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -5,7 +5,12 @@
OP_NAME = 'chinese_convert_mapper'

with AvailabilityChecking(['opencc'], OP_NAME):
import opencc
import opencc # noqa: F401


def prepare_converter(mode):
    """
    Create the OpenCC converter for ``mode`` and publish it as the
    module-level global ``OPENCC_CONVERTER``.

    The OpenCC config name is formed by appending ``'.json'`` to ``mode``.
    Each call rebinds the global to a freshly constructed converter.

    :param mode: name of the conversion mode (without the ``.json`` suffix)
    """
    global OPENCC_CONVERTER
    # OpenCC is given the config name, i.e. the mode plus the '.json' suffix.
    config_name = mode + '.json'
    OPENCC_CONVERTER = opencc.OpenCC(config_name)


@OPERATORS.register_module(OP_NAME)
Expand Down Expand Up @@ -44,9 +49,9 @@ def __init__(self, mode: str = 's2t', *args, **kwargs):
]
assert mode in mode_list, 'Please make sure mode is one of {}'.format(
mode_list)
self.converter = opencc.OpenCC(mode + '.json')
prepare_converter(mode)

def process(self, sample):

sample[self.text_key] = self.converter.convert(sample[self.text_key])
sample[self.text_key] = OPENCC_CONVERTER.convert(sample[self.text_key])
return sample

0 comments on commit 0cce370

Please sign in to comment.