Skip to content

Commit

Permalink
Make parquet file writing start at 1 rather than 0, for ease of readi…
Browse files Browse the repository at this point in the history
…ng (#8)
  • Loading branch information
simw authored Nov 8, 2023
1 parent 64fdf46 commit 146285e
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 6 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "pipedata"
version = "0.1"
version = "0.1.1"
description = "Framework for building pipelines for data processing"
authors = ["Simon Wicks <[email protected]>"]
readme = "README.md"
Expand Down
2 changes: 1 addition & 1 deletion src/pipedata/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.1"
__version__ = "0.1.1"

__all__ = [
"__version__",
Expand Down
11 changes: 10 additions & 1 deletion src/pipedata/ops/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,15 @@

from pipedata.core.chain import batched

# Option to accumulate the pyarrow table more frequently
# so that doesn't need whole list(dict) and pyarrow table
# in memory at the same time

# Option to hae row_group_length and max_file_length dpendent
# on size of data, as opposed to number of just numbers of rows.
# Can combine this with the existing settings, so runs
# at the smaller of the two.


def parquet_writer(
file_path: str,
Expand All @@ -24,7 +33,7 @@ def parquet_writer(

def parquet_writer_func(records: Iterator[Dict[str, Any]]) -> Iterator[str]:
writer = None
file_number = 0
file_number = 1
file_length = 0
for batch in batched(records, row_group_length):
table = pa.Table.from_pylist(batch, schema=schema)
Expand Down
6 changes: 3 additions & 3 deletions tests/ops/test_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,14 +83,14 @@ def test_parquet_multiple_files() -> None:
)

assert result == [
str(temp_path / "test_0000.parquet"),
str(temp_path / "test_0001.parquet"),
str(temp_path / "test_0002.parquet"),
]

files = list(temp_path.glob("**/*"))
files = sorted(temp_path.glob("**/*"))
expected_files = [
temp_path / "test_0000.parquet",
temp_path / "test_0001.parquet",
temp_path / "test_0002.parquet",
]
assert files == expected_files

Expand Down

0 comments on commit 146285e

Please sign in to comment.