-
Notifications
You must be signed in to change notification settings - Fork 138
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement zstd Compression Support for JSONL and Parquet Files (#230)
* Add zstandard dependency for compression support * feat: Add zstd compression support for jsonl reader * feat: Add zstd compression support for ParquetWriter * feat: Update DiskWriter to handle the other compression for Parquet files * Remove annotation * feat: Update compression handling in DiskWriter and ParquetWriter * Update src/datatrove/pipeline/writers/disk_base.py Handle compression on ParquetWriter directly Co-authored-by: Guilherme Penedo <[email protected]> * Update src/datatrove/pipeline/writers/parquet.py None to out of list Co-authored-by: Guilherme Penedo <[email protected]> * Refactor constructor to explicitly set default compression to None * Add validation for compression parameter in ParquetWriter * Update src/datatrove/pipeline/writers/disk_base.py official extension for zstd is ".zst" Co-authored-by: Guilherme Penedo <[email protected]> --------- Co-authored-by: Guilherme Penedo <[email protected]>
- Loading branch information
1 parent
3b91550
commit d5d1924
Showing
5 changed files
with
78 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
import shutil | ||
import tempfile | ||
import unittest | ||
|
||
from datatrove.data import Document | ||
from datatrove.pipeline.readers.jsonl import JsonlReader | ||
from datatrove.pipeline.writers.jsonl import JsonlWriter | ||
|
||
|
||
class TestZstdCompression(unittest.TestCase):
    """Round-trip test: documents written as zstd-compressed JSONL must read back unchanged."""

    def setUp(self):
        # Fresh scratch directory for each test, removed automatically afterwards.
        self.tmp_dir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, self.tmp_dir)

    def test_jsonl_writer_reader(self):
        """Write a handful of documents with compression="zstd", then verify the reader recovers them."""
        texts = ["hello", "text2", "more text"]
        docs = []
        for idx, body in enumerate(texts):
            meta = {"somedata": 2 * idx, "somefloat": idx * 0.4, "somestring": "hello"}
            docs.append(Document(text=body, id=str(idx), metadata=meta))

        with JsonlWriter(output_folder=self.tmp_dir, compression="zstd") as writer:
            for doc in docs:
                writer.write(doc)

        reader = JsonlReader(self.tmp_dir, compression="zstd")
        seen = 0
        for got, expected in zip(reader(), docs):
            # The reader attaches the source file path to metadata; drop it before comparing.
            got.metadata.pop("file_path", None)
            assert got == expected
            seen += 1
        # Guard against the reader yielding fewer documents than were written.
        assert seen == len(docs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
import shutil | ||
import tempfile | ||
import unittest | ||
|
||
from datatrove.data import Document | ||
from datatrove.pipeline.readers.parquet import ParquetReader | ||
from datatrove.pipeline.writers.parquet import ParquetWriter | ||
|
||
|
||
class TestZstdCompression(unittest.TestCase):
    """Round-trip test: documents written to zstd-compressed Parquet must read back unchanged."""

    def setUp(self):
        # Fresh scratch directory for each test, removed automatically afterwards.
        self.tmp_dir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, self.tmp_dir)

    def test_parquet_writer_reader(self):
        """Write a handful of documents with compression="zstd", then verify the reader recovers them."""
        texts = ["hello", "text2", "more text"]
        docs = []
        for idx, body in enumerate(texts):
            meta = {"somedata": 2 * idx, "somefloat": idx * 0.4, "somestring": "hello"}
            docs.append(Document(text=body, id=str(idx), metadata=meta))

        with ParquetWriter(output_folder=self.tmp_dir, compression="zstd") as writer:
            for doc in docs:
                writer.write(doc)

        # No compression argument on the reader: the parquet file records its own codec.
        reader = ParquetReader(self.tmp_dir)
        seen = 0
        for got, expected in zip(reader(), docs):
            # The reader attaches the source file path to metadata; drop it before comparing.
            got.metadata.pop("file_path", None)
            assert got == expected
            seen += 1
        # Guard against the reader yielding fewer documents than were written.
        assert seen == len(docs)