# Review notes (commentary placed ahead of the first `diff --git` header).
#
# This record is a unified diff touching two Python files. Its line breaks
# have been collapsed to single spaces, so the patch text below is kept
# verbatim rather than re-flowed: blank context lines and the original
# indentation cannot be recovered with certainty from this form.
#
# 1. data_juicer/format/formatter.py (hunk -208,11 +208,16)
#    - Moves the `ds_dir` / `image_key` reads from `global_cfg` so they happen
#      before the `logger.info(...)` call instead of after it.
#    - Adds an early `return dataset` when `image_key` is not in
#      `dataset.features`; in that case neither the log message nor the
#      relative-to-absolute path conversion that follows is reached.
#
# 2. data_juicer/ops/mapper/chinese_convert_mapper.py (hunks -5,7 and -44,9)
#    - Adds module-level `prepare_converter(mode)`, which builds
#      `opencc.OpenCC(mode + '.json')` and binds it to the module global
#      `OPENCC_CONVERTER`.
#    - `__init__` now calls `prepare_converter(mode)` instead of assigning
#      `self.converter`.
#    - `process` now converts `sample[self.text_key]` through the global
#      `OPENCC_CONVERTER` instead of `self.converter`.
#    - `import opencc` gains a `# noqa: F401` marker.
#
# NOTE(review): `OPENCC_CONVERTER` is rebound on every `prepare_converter`
#   call, so two mappers constructed with different modes would both end up
#   using whichever converter was built last.
# NOTE(review): no module-level initial assignment of `OPENCC_CONVERTER` is
#   visible in these hunks. If none exists elsewhere in the file, `process`
#   raises NameError in any interpreter where `__init__` has not run (for
#   example a worker process that only received an unpickled instance).
#   TODO confirm.
# NOTE(review): presumably the converter was moved off `self` so the operator
#   instance stays picklable -- confirm with the author.
diff --git a/data_juicer/format/formatter.py b/data_juicer/format/formatter.py index 86ea481db..c47ad2414 100644 --- a/data_juicer/format/formatter.py +++ b/data_juicer/format/formatter.py @@ -208,11 +208,16 @@ def non_empty_text(sample, target_keys): # 3. convert relative paths to absolute paths if global_cfg: + ds_dir = global_cfg.dataset_dir + image_key = global_cfg.image_key + + if image_key not in dataset.features: + # no image path list in dataset, no need to convert + return dataset + logger.info('Converting relative paths in the dataset to their ' 'absolute version. (Based on the directory of input ' 'dataset file)') - ds_dir = global_cfg.dataset_dir - image_key = global_cfg.image_key # function to convert relative paths to absolute paths def rel2abs(sample, path_keys, dataset_dir): diff --git a/data_juicer/ops/mapper/chinese_convert_mapper.py b/data_juicer/ops/mapper/chinese_convert_mapper.py index 5f55dedd3..fc321094c 100644 --- a/data_juicer/ops/mapper/chinese_convert_mapper.py +++ b/data_juicer/ops/mapper/chinese_convert_mapper.py @@ -5,7 +5,12 @@ OP_NAME = 'chinese_convert_mapper' with AvailabilityChecking(['opencc'], OP_NAME): - import opencc + import opencc # noqa: F401 + + +def prepare_converter(mode): + global OPENCC_CONVERTER + OPENCC_CONVERTER = opencc.OpenCC(mode + '.json') @OPERATORS.register_module(OP_NAME) @@ -44,9 +49,9 @@ def __init__(self, mode: str = 's2t', *args, **kwargs): ] assert mode in mode_list, 'Please make sure mode is one of {}'.format( mode_list) - self.converter = opencc.OpenCC(mode + '.json') + prepare_converter(mode) def process(self, sample): - sample[self.text_key] = self.converter.convert(sample[self.text_key]) + sample[self.text_key] = OPENCC_CONVERTER.convert(sample[self.text_key]) return sample