From 9497ce51c24d75987930465c2b75dcfa3563f0e1 Mon Sep 17 00:00:00 2001 From: Yilun Huang Date: Fri, 17 Nov 2023 10:46:34 +0800 Subject: [PATCH 1/2] * opt: convert relative paths only when it's necessary (#79) --- data_juicer/format/formatter.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/data_juicer/format/formatter.py b/data_juicer/format/formatter.py index 86ea481db..c47ad2414 100644 --- a/data_juicer/format/formatter.py +++ b/data_juicer/format/formatter.py @@ -208,11 +208,16 @@ def non_empty_text(sample, target_keys): # 3. convert relative paths to absolute paths if global_cfg: + ds_dir = global_cfg.dataset_dir + image_key = global_cfg.image_key + + if image_key not in dataset.features: + # no image path list in dataset, no need to convert + return dataset + logger.info('Converting relative paths in the dataset to their ' 'absolute version. (Based on the directory of input ' 'dataset file)') - ds_dir = global_cfg.dataset_dir - image_key = global_cfg.image_key # function to convert relative paths to absolute paths def rel2abs(sample, path_keys, dataset_dir): From 62c5fb5eebae96a7d79428f99324e3f79be03170 Mon Sep 17 00:00:00 2001 From: chenhesen Date: Fri, 17 Nov 2023 16:26:53 +0800 Subject: [PATCH 2/2] fix opencc serialization error (#83) --- data_juicer/ops/mapper/chinese_convert_mapper.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/data_juicer/ops/mapper/chinese_convert_mapper.py b/data_juicer/ops/mapper/chinese_convert_mapper.py index 7d87a9165..8fc0a41c3 100644 --- a/data_juicer/ops/mapper/chinese_convert_mapper.py +++ b/data_juicer/ops/mapper/chinese_convert_mapper.py @@ -1,8 +1,12 @@ -import opencc - from ..base_op import OPERATORS, Mapper +def prepare_converter(mode): + global OPENCC_CONVERTER + import opencc + OPENCC_CONVERTER = opencc.OpenCC(mode + '.json') + + @OPERATORS.register_module('chinese_convert_mapper') class ChineseConvertMapper(Mapper): """Mapper to convert Chinese between Traditional Chinese, Simplified Chinese @@ -39,9 +43,9 @@ def __init__(self, mode: str = 's2t', *args, **kwargs): ] assert mode in mode_list, 'Please make sure mode is one of {}'.format( mode_list) - self.converter = opencc.OpenCC(mode + '.json') + prepare_converter(mode) def process(self, sample): - sample[self.text_key] = self.converter.convert(sample[self.text_key]) + sample[self.text_key] = OPENCC_CONVERTER.convert(sample[self.text_key]) return sample