diff --git a/pyproject.toml b/pyproject.toml index 37dadb6..3beec90 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "pipedata" -version = "0.1" +version = "0.1.1" description = "Framework for building pipelines for data processing" authors = ["Simon Wicks "] readme = "README.md" diff --git a/src/pipedata/__init__.py b/src/pipedata/__init__.py index d091303..21c2c19 100644 --- a/src/pipedata/__init__.py +++ b/src/pipedata/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.1" +__version__ = "0.1.1" __all__ = [ "__version__", diff --git a/src/pipedata/ops/storage.py b/src/pipedata/ops/storage.py index 178f004..581814f 100644 --- a/src/pipedata/ops/storage.py +++ b/src/pipedata/ops/storage.py @@ -5,6 +5,15 @@ from pipedata.core.chain import batched +# Option to accumulate the pyarrow table more frequently +# so that it doesn't need the whole list(dict) and pyarrow table +# in memory at the same time + +# Option to have row_group_length and max_file_length dependent +# on size of data, as opposed to just the number of rows. +# Can combine this with the existing settings, so it runs +# at the smaller of the two. 
+ def parquet_writer( file_path: str, @@ -24,7 +33,7 @@ def parquet_writer( def parquet_writer_func(records: Iterator[Dict[str, Any]]) -> Iterator[str]: writer = None - file_number = 0 + file_number = 1 file_length = 0 for batch in batched(records, row_group_length): table = pa.Table.from_pylist(batch, schema=schema) diff --git a/tests/ops/test_storage.py b/tests/ops/test_storage.py index 326dbce..7d05752 100644 --- a/tests/ops/test_storage.py +++ b/tests/ops/test_storage.py @@ -83,14 +83,14 @@ def test_parquet_multiple_files() -> None: ) assert result == [ - str(temp_path / "test_0000.parquet"), str(temp_path / "test_0001.parquet"), + str(temp_path / "test_0002.parquet"), ] - files = list(temp_path.glob("**/*")) + files = sorted(temp_path.glob("**/*")) expected_files = [ - temp_path / "test_0000.parquet", temp_path / "test_0001.parquet", + temp_path / "test_0002.parquet", ] assert files == expected_files