Make parquet file writing start at 1 rather than 0, for ease of reading
simw committed Nov 8, 2023
1 parent 64fdf46 commit 0012cb3
Showing 4 changed files with 17 additions and 8 deletions.
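
The effect is easiest to see in the output file names: the first file a pipeline writes is now numbered 1 instead of 0. A minimal illustration, assuming the writer fills a zero-padded index placeholder in file_path (the template name here is hypothetical):

# Hypothetical file_path template with a zero-padded index placeholder.
file_path = "test_{i:04d}.parquet"

print(file_path.format(i=0))  # "test_0000.parquet" - first output before this commit
print(file_path.format(i=1))  # "test_0001.parquet" - first output after this commit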
pyproject.toml (2 changes: 1 addition & 1 deletion)

@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "pipedata"
-version = "0.1"
+version = "0.1.1"
 description = "Framework for building pipelines for data processing"
 authors = ["Simon Wicks <[email protected]>"]
 readme = "README.md"
src/pipedata/__init__.py (2 changes: 1 addition & 1 deletion)

@@ -1,4 +1,4 @@
-__version__ = "0.1"
+__version__ = "0.1.1"
 
 __all__ = [
     "__version__",
src/pipedata/ops/storage.py (11 changes: 10 additions & 1 deletion)

@@ -5,6 +5,15 @@
 
 from pipedata.core.chain import batched
 
+# Option: accumulate the pyarrow table more frequently, so that the whole
+# list of dicts and the pyarrow table don't need to be held in memory at
+# the same time.
+
+# Option: make row_group_length and max_file_length depend on the size of
+# the data, as opposed to just the number of rows. This can be combined
+# with the existing settings, so the writer rolls over at the smaller of
+# the two.
+
 
 def parquet_writer(
     file_path: str,
@@ -24,7 +33,7 @@ def parquet_writer(
 
     def parquet_writer_func(records: Iterator[Dict[str, Any]]) -> Iterator[str]:
         writer = None
-        file_number = 0
+        file_number = 1
        file_length = 0
        for batch in batched(records, row_group_length):
            table = pa.Table.from_pylist(batch, schema=schema)
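
The diff shows only a fragment of parquet_writer. Below is a self-contained sketch of the surrounding loop, under stated assumptions: the parameter names schema, row_group_length and max_file_length, an {i}-style placeholder in file_path, and a local batched() standing in for pipedata.core.chain.batched are guesses from the visible context lines, not the actual implementation.

from itertools import islice
from typing import Any, Callable, Dict, Iterator, List, Optional

import pyarrow as pa
import pyarrow.parquet as pq


def batched(records: Iterator[Dict[str, Any]], n: int) -> Iterator[List[Dict[str, Any]]]:
    """Yield successive lists of up to n records (stand-in for pipedata.core.chain.batched)."""
    iterator = iter(records)
    while batch := list(islice(iterator, n)):
        yield batch


def parquet_writer(
    file_path: str,
    schema: Optional[pa.Schema] = None,
    row_group_length: int = 1_000,
    max_file_length: Optional[int] = None,
) -> Callable[[Iterator[Dict[str, Any]]], Iterator[str]]:
    def parquet_writer_func(records: Iterator[Dict[str, Any]]) -> Iterator[str]:
        writer = None
        file_number = 1  # start numbering at 1 rather than 0 (this commit's change)
        file_length = 0
        current_path = ""
        for batch in batched(records, row_group_length):
            table = pa.Table.from_pylist(batch, schema=schema)
            if writer is None:
                current_path = file_path.format(i=file_number)
                writer = pq.ParquetWriter(current_path, table.schema)
            writer.write_table(table)  # each batch becomes one row group
            file_length += len(table)
            if max_file_length is not None and file_length >= max_file_length:
                writer.close()
                writer = None
                file_length = 0
                file_number += 1
                yield current_path
        if writer is not None:
            writer.close()
            yield current_path

    return parquet_writer_func

The only behaviour this commit changes is the starting value file_number = 1; the rest of the sketch is an approximation of the visible context.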
tests/ops/test_storage.py (10 changes: 5 additions & 5 deletions)

@@ -82,17 +82,17 @@ def test_parquet_multiple_files() -> None:
         .to_list()
     )
 
-    assert result == [
-        str(temp_path / "test_0000.parquet"),
+    assert sorted(result) == sorted([
         str(temp_path / "test_0001.parquet"),
-    ]
+        str(temp_path / "test_0002.parquet"),
+    ])
 
     files = list(temp_path.glob("**/*"))
     expected_files = [
-        temp_path / "test_0000.parquet",
         temp_path / "test_0001.parquet",
+        temp_path / "test_0002.parquet",
     ]
-    assert files == expected_files
+    assert sorted(files) == sorted(expected_files)
 
     table1 = pq.read_table(files[0])
     assert table1.to_pydict() == {
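
The switch to sorted() comparisons is there because Path.glob() makes no ordering guarantee, so comparing the raw lists can fail even when exactly the right files exist. A small stand-alone illustration of the order-independent pattern (the directory and file names are hypothetical):

import tempfile
from pathlib import Path

# Create a throwaway directory and touch files in reverse order, to show that
# creation (and glob) order need not match the order of the expected list.
with tempfile.TemporaryDirectory() as tmp:
    temp_path = Path(tmp)
    (temp_path / "test_0002.parquet").touch()
    (temp_path / "test_0001.parquet").touch()

    expected_files = [
        temp_path / "test_0001.parquet",
        temp_path / "test_0002.parquet",
    ]
    files = list(temp_path.glob("**/*"))

    # glob() order is not guaranteed, so compare sorted lists rather than raw lists.
    assert sorted(files) == sorted(expected_files)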
