From 573a7044c65fdf75b5a89ed76bf0168059a7dc86 Mon Sep 17 00:00:00 2001
From: "hesen.chs"
Date: Thu, 16 Nov 2023 18:45:54 +0800
Subject: [PATCH] fix opencc serialization error

---
 data_juicer/ops/mapper/chinese_convert_mapper.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/data_juicer/ops/mapper/chinese_convert_mapper.py b/data_juicer/ops/mapper/chinese_convert_mapper.py
index 7d87a9165..8fc0a41c3 100644
--- a/data_juicer/ops/mapper/chinese_convert_mapper.py
+++ b/data_juicer/ops/mapper/chinese_convert_mapper.py
@@ -1,8 +1,12 @@
-import opencc
-
 from ..base_op import OPERATORS, Mapper
 
 
+def prepare_converter(mode):
+    global OPENCC_CONVERTER
+    import opencc
+    OPENCC_CONVERTER = opencc.OpenCC(mode + '.json')
+
+
 @OPERATORS.register_module('chinese_convert_mapper')
 class ChineseConvertMapper(Mapper):
     """Mapper to convert Chinese between Traditional Chinese, Simplified Chinese
@@ -39,9 +43,9 @@ def __init__(self, mode: str = 's2t', *args, **kwargs):
         ]
         assert mode in mode_list, 'Please make sure mode is one of {}'.format(
             mode_list)
-        self.converter = opencc.OpenCC(mode + '.json')
+        prepare_converter(mode)
 
     def process(self, sample):
 
-        sample[self.text_key] = self.converter.convert(sample[self.text_key])
+        sample[self.text_key] = OPENCC_CONVERTER.convert(sample[self.text_key])
         return sample