diff --git a/data_juicer/ops/filter/alphanumeric_filter.py b/data_juicer/ops/filter/alphanumeric_filter.py index 4e4112453..17361b29c 100644 --- a/data_juicer/ops/filter/alphanumeric_filter.py +++ b/data_juicer/ops/filter/alphanumeric_filter.py @@ -86,9 +86,10 @@ def process(self, samples): ratio_key = StatsKeys.alpha_token_ratio if self.tokenization \ else StatsKeys.alnum_ratio if isinstance(samples[Fields.stats], list): - return map( - lambda stat: self.min_ratio <= stat[ratio_key] <= self. - max_ratio, samples[Fields.stats]) + return list( + map( + lambda stat: self.min_ratio <= stat[ratio_key] <= self. + max_ratio, samples[Fields.stats])) else: # single sample for ray filter if self.min_ratio <= samples[ diff --git a/data_juicer/ops/filter/average_line_length_filter.py b/data_juicer/ops/filter/average_line_length_filter.py index d2867b774..74d624a82 100644 --- a/data_juicer/ops/filter/average_line_length_filter.py +++ b/data_juicer/ops/filter/average_line_length_filter.py @@ -60,9 +60,11 @@ def compute_stats(self, samples, context=False): def process(self, samples): if isinstance(samples[Fields.stats], list): - return map( - lambda stat: self.min_len <= stat[StatsKeys.avg_line_length] <= - self.max_len, samples[Fields.stats]) + return list( + map( + lambda stat: self.min_len <= stat[StatsKeys.avg_line_length + ] <= self.max_len, + samples[Fields.stats])) else: # single sample for ray filter if self.min_len <= samples[Fields.stats][ diff --git a/data_juicer/ops/filter/character_repetition_filter.py b/data_juicer/ops/filter/character_repetition_filter.py index 965b368d6..a0441334a 100644 --- a/data_juicer/ops/filter/character_repetition_filter.py +++ b/data_juicer/ops/filter/character_repetition_filter.py @@ -80,9 +80,11 @@ def compute_stats(self, samples): def process(self, samples): if isinstance(samples[Fields.stats], list): - return map( - lambda stat: self.min_ratio <= stat[StatsKeys.char_rep_ratio] - <= self.max_ratio, samples[Fields.stats]) + return list( + map( + lambda stat: self.min_ratio <= stat[ + StatsKeys.char_rep_ratio] <= self.max_ratio, + samples[Fields.stats])) else: # single sample for ray filter if self.min_ratio <= samples[Fields.stats][ diff --git a/data_juicer/ops/filter/maximum_line_length_filter.py b/data_juicer/ops/filter/maximum_line_length_filter.py index 16c919406..146cfb0a2 100644 --- a/data_juicer/ops/filter/maximum_line_length_filter.py +++ b/data_juicer/ops/filter/maximum_line_length_filter.py @@ -61,9 +61,11 @@ def compute_stats(self, samples, context=False): def process(self, samples): if isinstance(samples[Fields.stats], list): - return map( - lambda stat: self.min_len <= stat[StatsKeys.max_line_length] <= - self.max_len, samples[Fields.stats]) + return list( + map( + lambda stat: self.min_len <= stat[StatsKeys.max_line_length + ] <= self.max_len, + samples[Fields.stats])) else: # single sample for ray filter if self.min_len <= samples[Fields.stats][ diff --git a/data_juicer/ops/filter/perplexity_filter.py b/data_juicer/ops/filter/perplexity_filter.py index 9b532d7c6..287d15a11 100644 --- a/data_juicer/ops/filter/perplexity_filter.py +++ b/data_juicer/ops/filter/perplexity_filter.py @@ -80,7 +80,8 @@ def compute_stats(self, samples, context=False): def process(self, samples): if isinstance(samples[Fields.stats], list): - return map(lambda stat: stat[StatsKeys.perplexity] <= self.max_ppl, - samples[Fields.stats]) + return list( + map(lambda stat: stat[StatsKeys.perplexity] <= self.max_ppl, + samples[Fields.stats])) else: return samples[Fields.stats][StatsKeys.perplexity] <= self.max_ppl diff --git a/data_juicer/ops/filter/special_characters_filter.py b/data_juicer/ops/filter/special_characters_filter.py index 59fa61f52..0b56f390e 100644 --- a/data_juicer/ops/filter/special_characters_filter.py +++ b/data_juicer/ops/filter/special_characters_filter.py @@ -54,10 +54,11 @@ def compute_stats(self, samples): def process(self, samples): if isinstance(samples[Fields.stats], list): - return map( - lambda stat: self.min_ratio <= stat[ - StatsKeys.special_char_ratio] <= self.max_ratio, - samples[Fields.stats]) + return list( + map( + lambda stat: self.min_ratio <= stat[ + StatsKeys.special_char_ratio] <= self.max_ratio, + samples[Fields.stats])) else: # single sample for ray filter if self.min_ratio <= \ diff --git a/data_juicer/ops/filter/word_repetition_filter.py b/data_juicer/ops/filter/word_repetition_filter.py index 3e9cad251..71f806e25 100644 --- a/data_juicer/ops/filter/word_repetition_filter.py +++ b/data_juicer/ops/filter/word_repetition_filter.py @@ -116,9 +116,11 @@ def compute_stats(self, samples, context=False): def process(self, samples): if isinstance(samples[Fields.stats], list): - return map( - lambda stat: self.min_ratio <= stat[StatsKeys.word_rep_ratio] - <= self.max_ratio, samples[Fields.stats]) + return list( + map( + lambda stat: self.min_ratio <= stat[ + StatsKeys.word_rep_ratio] <= self.max_ratio, + samples[Fields.stats])) else: # single sample for ray filter if self.min_ratio <= samples[Fields.stats][ diff --git a/data_juicer/ops/filter/words_num_filter.py b/data_juicer/ops/filter/words_num_filter.py index 978c252ad..07eb8e2b7 100644 --- a/data_juicer/ops/filter/words_num_filter.py +++ b/data_juicer/ops/filter/words_num_filter.py @@ -80,9 +80,10 @@ def compute_stats(self, samples, context=False): def process(self, samples): if isinstance(samples[Fields.stats], list): - return map( - lambda stat: self.min_num <= stat[StatsKeys.num_words] <= self. - max_num, samples[Fields.stats]) + return list( + map( + lambda stat: self.min_num <= stat[StatsKeys.num_words] <= + self.max_num, samples[Fields.stats])) else: # single sample for ray filter if self.min_num <= samples[Fields.stats][ diff --git a/data_juicer/ops/mapper/chinese_convert_mapper.py b/data_juicer/ops/mapper/chinese_convert_mapper.py index 7b1de9d20..9236ddaa2 100644 --- a/data_juicer/ops/mapper/chinese_convert_mapper.py +++ b/data_juicer/ops/mapper/chinese_convert_mapper.py @@ -87,7 +87,7 @@ def __init__(self, mode: str = 's2t', *args, **kwargs): def process(self, samples): prepare_converter(self.mode) - samples[self.text_key] = map( - lambda text: OPENCC_CONVERTER.convert(text), - samples[self.text_key]) + samples[self.text_key] = list( + map(lambda text: OPENCC_CONVERTER.convert(text), + samples[self.text_key])) return samples diff --git a/data_juicer/ops/mapper/clean_copyright_mapper.py b/data_juicer/ops/mapper/clean_copyright_mapper.py index 5a1ed6ca7..3bf6fcbdf 100644 --- a/data_juicer/ops/mapper/clean_copyright_mapper.py +++ b/data_juicer/ops/mapper/clean_copyright_mapper.py @@ -55,7 +55,7 @@ def _process_single_sample(self, sample): return sample def process(self, samples): - samples[self.text_key] = map( - lambda text: self._process_single_sample(text), - samples[self.text_key]) + samples[self.text_key] = list( + map(lambda text: self._process_single_sample(text), + samples[self.text_key])) return samples diff --git a/data_juicer/ops/mapper/clean_html_mapper.py b/data_juicer/ops/mapper/clean_html_mapper.py index 582137177..d959cc85f 100644 --- a/data_juicer/ops/mapper/clean_html_mapper.py +++ b/data_juicer/ops/mapper/clean_html_mapper.py @@ -37,6 +37,6 @@ def _clean_html(raw_html): parser = HTMLParser(raw_html) return parser.text() - samples[self.text_key] = map(lambda text: _clean_html(text), - samples[self.text_key]) + samples[self.text_key] = list( + map(lambda text: _clean_html(text), samples[self.text_key])) return samples diff --git a/data_juicer/ops/mapper/fix_unicode_mapper.py b/data_juicer/ops/mapper/fix_unicode_mapper.py index e510abac5..4ca71c30a 100644 --- a/data_juicer/ops/mapper/fix_unicode_mapper.py +++ b/data_juicer/ops/mapper/fix_unicode_mapper.py @@ -36,7 +36,9 @@ def __init__(self, normalization: str = None, *args, **kwargs): '["NFC", "NFKC", "NFD", "NFKD"]') def process(self, samples): - samples[self.text_key] = map( - lambda text: ftfy.fix_text(text, normalization=self.normalization), - samples[self.text_key]) + samples[self.text_key] = list( + map( + lambda text: ftfy.fix_text(text, + normalization=self.normalization), + samples[self.text_key])) return samples diff --git a/data_juicer/ops/mapper/punctuation_normalization_mapper.py b/data_juicer/ops/mapper/punctuation_normalization_mapper.py index 4845823b9..6531833a3 100644 --- a/data_juicer/ops/mapper/punctuation_normalization_mapper.py +++ b/data_juicer/ops/mapper/punctuation_normalization_mapper.py @@ -58,8 +58,9 @@ def __init__(self, *args, **kwargs): } def process(self, samples): - samples[self.text_key] = map( - lambda text: ''.join( - [self.punctuation_unicode.get(c, c) for c in text]), - samples[self.text_key]) + samples[self.text_key] = list( + map( + lambda text: ''.join( + [self.punctuation_unicode.get(c, c) for c in text]), + samples[self.text_key])) return samples diff --git a/data_juicer/ops/mapper/remove_bibliography_mapper.py b/data_juicer/ops/mapper/remove_bibliography_mapper.py index 481ba78e2..d2a2bf342 100644 --- a/data_juicer/ops/mapper/remove_bibliography_mapper.py +++ b/data_juicer/ops/mapper/remove_bibliography_mapper.py @@ -30,9 +30,11 @@ def __init__(self, *args, **kwargs): self.pattern += r').*$' def process(self, samples): - samples[self.text_key] = map( - lambda text: re.sub( - pattern=self.pattern, repl=r'', string=text, flags=re.DOTALL), - samples[self.text_key]) + samples[self.text_key] = list( + map( + lambda text: re.sub(pattern=self.pattern, + repl=r'', + string=text, + flags=re.DOTALL), samples[self.text_key])) return samples diff --git a/data_juicer/ops/mapper/remove_specific_chars_mapper.py b/data_juicer/ops/mapper/remove_specific_chars_mapper.py index 2c00e710d..d487efa2f 100644 --- a/data_juicer/ops/mapper/remove_specific_chars_mapper.py +++ b/data_juicer/ops/mapper/remove_specific_chars_mapper.py @@ -34,8 +34,10 @@ def process(self, samples): if self.pattern is None: return samples - samples[self.text_key] = map( - lambda text: re.sub( - pattern=self.pattern, repl=r'', string=text, flags=re.DOTALL), - samples[self.text_key]) + samples[self.text_key] = list( + map( + lambda text: re.sub(pattern=self.pattern, + repl=r'', + string=text, + flags=re.DOTALL), samples[self.text_key])) return samples