Skip to content

Commit

Permalink
add parquet to data preprocessing (swiss-ai#9)
Browse files (browse the repository at this point in the history)
* add parquet to data preprocessing

* Update tools/preprocess_data.py

Co-authored-by: Antoni-Joan Solergibert <[email protected]>

---------

Co-authored-by: Antoni-Joan Solergibert <[email protected]>
  • Loading branch information
jquesnelle and TJ-Solergibert authored Jul 5, 2024
1 parent c104c34 commit 50da275
Showing 1 changed file with 15 additions and 1 deletion.
16 changes: 15 additions & 1 deletion tools/preprocess_data.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -8,7 +8,7 @@
import argparse

from datatrove.executor.local import LocalPipelineExecutor
from datatrove.pipeline.readers import HuggingFaceDatasetReader, JsonlReader
from datatrove.pipeline.readers import HuggingFaceDatasetReader, JsonlReader, ParquetReader
from datatrove.pipeline.tokens import DocumentTokenizer


Expand Down Expand Up @@ -72,6 +72,18 @@ def get_args():
"--glob-pattern", type=str, default=None, help="A glob pattern to filter files to read. Default: None"
)

p3 = sp.add_parser(name="parquet")
p3.add_argument(
"--dataset",
type=str,
required=True,
help="Path to a .parquet file or a folder containing multiple .parquet files",
)
p3.add_argument("--column", type=str, default="text", help="Column to preprocess from the Dataset. Default: text")
p3.add_argument(
"--glob-pattern", type=str, default=None, help="A glob pattern to filter files to read. Default: None"
)

args = parser.parse_args()

return args
Expand All @@ -85,6 +97,8 @@ def main(args):
text_key=args.column,
dataset_options={"split": args.split},
)
elif args.readers == "parquet":
datatrove_reader = ParquetReader(data_folder=args.dataset, text_key=args.column, glob_pattern=args.glob_pattern)
else:
datatrove_reader = JsonlReader(data_folder=args.dataset, text_key=args.column, glob_pattern=args.glob_pattern)

Expand Down

0 comments on commit 50da275

Please sign in to comment.