Make parquet file writing start at 1 rather than 0, for ease of reading
simw committed Nov 8, 2023
1 parent 64fdf46 commit 0012cb3
Showing 4 changed files with 17 additions and 8 deletions.
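
The effect is easiest to see in the output file names: the first file a pipeline writes is now numbered 1 instead of 0. A minimal illustration, assuming the writer fills a zero-padded index placeholder in file_path (the template name here is hypothetical):

# Hypothetical file_path template with a zero-padded index placeholder.
file_path = "test_{i:04d}.parquet"

print(file_path.format(i=0))  # "test_0000.parquet" - first output before this commit
print(file_path.format(i=1))  # "test_0001.parquet" - first output after this commit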
pyproject.toml (2 changes: 1 addition & 1 deletion)

@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "pipedata"
-version = "0.1"
+version = "0.1.1"
 description = "Framework for building pipelines for data processing"
 authors = ["Simon Wicks <[email protected]>"]
 readme = "README.md"
src/pipedata/__init__.py (2 changes: 1 addition & 1 deletion)

@@ -1,4 +1,4 @@
-__version__ = "0.1"
+__version__ = "0.1.1"
 
 __all__ = [
     "__version__",
src/pipedata/ops/storage.py (11 changes: 10 additions & 1 deletion)

@@ -5,6 +5,15 @@
 
 from pipedata.core.chain import batched
 
+# Option: accumulate the pyarrow table more frequently, so that the whole
+# list of dicts and the pyarrow table don't need to be held in memory at
+# the same time.
+
+# Option: make row_group_length and max_file_length depend on the size of
+# the data, as opposed to just the number of rows. This can be combined
+# with the existing settings, so the writer rolls over at the smaller of
+# the two.
+
 
 def parquet_writer(
     file_path: str,
@@ -24,7 +33,7 @@ def parquet_writer(
 
     def parquet_writer_func(records: Iterator[Dict[str, Any]]) -> Iterator[str]:
         writer = None
-        file_number = 0
+        file_number = 1
        file_length = 0
        for batch in batched(records, row_group_length):
            table = pa.Table.from_pylist(batch, schema=schema)
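
The diff shows only a fragment of parquet_writer. Below is a self-contained sketch of the surrounding loop, under stated assumptions: the parameter names schema, row_group_length and max_file_length, an {i}-style placeholder in file_path, and a local batched() standing in for pipedata.core.chain.batched are guesses from the visible context lines, not the actual implementation.

from itertools import islice
from typing import Any, Callable, Dict, Iterator, List, Optional

import pyarrow as pa
import pyarrow.parquet as pq


def batched(records: Iterator[Dict[str, Any]], n: int) -> Iterator[List[Dict[str, Any]]]:
    """Yield successive lists of up to n records (stand-in for pipedata.core.chain.batched)."""
    iterator = iter(records)
    while batch := list(islice(iterator, n)):
        yield batch


def parquet_writer(
    file_path: str,
    schema: Optional[pa.Schema] = None,
    row_group_length: int = 1_000,
    max_file_length: Optional[int] = None,
) -> Callable[[Iterator[Dict[str, Any]]], Iterator[str]]:
    def parquet_writer_func(records: Iterator[Dict[str, Any]]) -> Iterator[str]:
        writer = None
        file_number = 1  # start numbering at 1 rather than 0 (this commit's change)
        file_length = 0
        current_path = ""
        for batch in batched(records, row_group_length):
            table = pa.Table.from_pylist(batch, schema=schema)
            if writer is None:
                current_path = file_path.format(i=file_number)
                writer = pq.ParquetWriter(current_path, table.schema)
            writer.write_table(table)  # each batch becomes one row group
            file_length += len(table)
            if max_file_length is not None and file_length >= max_file_length:
                writer.close()
                writer = None
                file_length = 0
                file_number += 1
                yield current_path
        if writer is not None:
            writer.close()
            yield current_path

    return parquet_writer_func

The only behaviour this commit changes is the starting value file_number = 1; the rest of the sketch is an approximation of the visible context.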
tests/ops/test_storage.py (10 changes: 5 additions & 5 deletions)

@@ -82,17 +82,17 @@ def test_parquet_multiple_files() -> None:
         .to_list()
     )
 
-    assert result == [
-        str(temp_path / "test_0000.parquet"),
+    assert sorted(result) == sorted([
         str(temp_path / "test_0001.parquet"),
-    ]
+        str(temp_path / "test_0002.parquet"),
+    ])
 
     files = list(temp_path.glob("**/*"))
     expected_files = [
-        temp_path / "test_0000.parquet",
         temp_path / "test_0001.parquet",
+        temp_path / "test_0002.parquet",
     ]
-    assert files == expected_files
+    assert sorted(files) == sorted(expected_files)
 
     table1 = pq.read_table(files[0])
     assert table1.to_pydict() == {
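
The switch to sorted() comparisons is there because Path.glob() makes no ordering guarantee, so comparing the raw lists can fail even when exactly the right files exist. A small stand-alone illustration of the order-independent pattern (the directory and file names are hypothetical):

import tempfile
from pathlib import Path

# Create a throwaway directory and touch files in reverse order, to show that
# creation (and glob) order need not match the order of the expected list.
with tempfile.TemporaryDirectory() as tmp:
    temp_path = Path(tmp)
    (temp_path / "test_0002.parquet").touch()
    (temp_path / "test_0001.parquet").touch()

    expected_files = [
        temp_path / "test_0001.parquet",
        temp_path / "test_0002.parquet",
    ]
    files = list(temp_path.glob("**/*"))

    # glob() order is not guaranteed, so compare sorted lists rather than raw lists.
    assert sorted(files) == sorted(expected_files)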
