Skip to content

Commit

Permalink
Add expand_metadata feature in jsonlwriter (#268)
Browse files Browse the repository at this point in the history
  • Loading branch information
justHungryMan authored Aug 28, 2024
1 parent c4f5783 commit 3de36c9
Showing 1 changed file with 3 additions and 0 deletions.
3 changes: 3 additions & 0 deletions src/datatrove/pipeline/writers/jsonl.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ class JsonlWriter(DiskWriter):
output_filename: the filename to use when saving data, including extension. Can contain placeholders such as `${rank}` or metadata tags `${tag}`
compression: if any compression scheme should be used. By default, "gzip". Pass "infer" to have the scheme guessed from the filename
adapter: a custom function to "adapt" the Document format to the desired output format
expand_metadata: save each metadata entry in a different column instead of as a dictionary
"""

default_output_filename: str = "${rank}.jsonl"
Expand All @@ -24,13 +25,15 @@ def __init__(
output_filename: str = None,
compression: str | None = "gzip",
adapter: Callable = None,
expand_metadata: bool = False,
max_file_size: int = -1, # in bytes. -1 for unlimited
):
super().__init__(
output_folder,
output_filename=output_filename,
compression=compression,
adapter=adapter,
expand_metadata=expand_metadata,
mode="wb",
max_file_size=max_file_size,
)
Expand Down

0 comments on commit 3de36c9

Please sign in to comment.