Skip to content

Commit

Permalink
Fix parquet file suffix appending for sort order
Browse files Browse the repository at this point in the history
  • Loading branch information
bmcdonald3 committed Dec 2, 2021
1 parent 3bc41a6 commit fe923b4
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 13 deletions.
6 changes: 3 additions & 3 deletions src/Parquet.chpl
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,12 @@ module Parquet {
return ArrowTypes.notimplemented;
}

proc writeDistArrayToParquet(A, filename, dsetname, rowGroupSize) {
proc writeDistArrayToParquet(A, filename, dsetname, rowGroupSize) throws {
extern proc c_writeColumnToParquet(filename, chpl_arr, colnum,
dsetname, numelems, rowGroupSize);
var filenames: [0..#A.targetLocales().size] string;
for i in 0..#A.targetLocales().size {
var suffix = i: string;
var suffix = '%04i'.format(i): string;
filenames[i] = filename + "_LOCALE" + suffix + ".parquet";
}

Expand All @@ -102,7 +102,7 @@ module Parquet {
}
}

proc write1DDistArrayParquet(filename: string, dsetname, A) {
proc write1DDistArrayParquet(filename: string, dsetname, A) throws {
writeDistArrayToParquet(A, filename, dsetname, ROWGROUPS);
return false;
}
Expand Down
16 changes: 6 additions & 10 deletions tests/parquet_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np
import pytest

SIZE = 50
SIZE = 100
NUMFILES = 5
verbose = True

Expand All @@ -14,29 +14,25 @@ def test_parquet(self):
ak_arr = ak.randint(0, 2**32, SIZE)
ak_arr.save_parquet("pq_testcorrect", "my-dset")
pq_arr = ak.read_parquet("pq_testcorrect*", "my-dset")
# get the dset from the dictionary in multi-locale cases
self.assertTrue((ak_arr == pq_arr).all())

for f in glob.glob('pq_test*'):
os.remove(f)
a = ak_arr.to_ndarray().sort()
b = pq_arr.to_ndarray().sort()
self.assertTrue(a == b)

def test_multi_file(self):
adjusted_size = int(SIZE/NUMFILES)*NUMFILES
test_arrs = []
elems = ak.randint(0, 2**32, adjusted_size)
per_arr = int(adjusted_size/NUMFILES)
print(elems)
for i in range(NUMFILES):
test_arrs.append(elems[(i*per_arr):(i*per_arr)+per_arr])
print(test_arrs[i])
test_arrs[i].save_parquet("pq_test" + str(i), "test-dset")

a = elems.to_ndarray()
a.sort()
pq_arr = ak.read_parquet("pq_test*", "test-dset")
b = pq_arr.to_ndarray()
b.sort()

self.assertTrue((a == b).all())
self.assertTrue((elems == pq_arr).all())

for f in glob.glob('pq_test*'):
os.remove(f)

0 comments on commit fe923b4

Please sign in to comment.