From 3bc41a6edde5b6ba62bf1a4ef1c931a87f321ad3 Mon Sep 17 00:00:00 2001 From: Ben McDonald Date: Wed, 1 Dec 2021 11:46:04 -0600 Subject: [PATCH 1/3] Fix parquet tests for multilocale runs --- tests/parquet_test.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/tests/parquet_test.py b/tests/parquet_test.py index 6a622f586a..031b6cb055 100644 --- a/tests/parquet_test.py +++ b/tests/parquet_test.py @@ -1,6 +1,7 @@ import glob, os from context import arkouda as ak from base_test import ArkoudaTest +import numpy as np import pytest SIZE = 50 @@ -16,20 +17,26 @@ def test_parquet(self): # get the dset from the dictionary in multi-locale cases for f in glob.glob('pq_test*'): os.remove(f) - self.assertTrue((ak_arr == pq_arr).all()) + a = ak_arr.to_ndarray().sort() + b = pq_arr.to_ndarray().sort() + self.assertTrue(a == b) def test_multi_file(self): adjusted_size = int(SIZE/NUMFILES)*NUMFILES test_arrs = [] + elems = ak.randint(0, 2**32, adjusted_size) + per_arr = int(adjusted_size/NUMFILES) for i in range(NUMFILES): - test_arrs.append(ak.randint(0, 2**32, int(adjusted_size/NUMFILES))) + test_arrs.append(elems[(i*per_arr):(i*per_arr)+per_arr]) test_arrs[i].save_parquet("pq_test" + str(i), "test-dset") + a = elems.to_ndarray() + a.sort() pq_arr = ak.read_parquet("pq_test*", "test-dset") - self.assertTrue(len(pq_arr) == adjusted_size) + b = pq_arr.to_ndarray() + b.sort() + + self.assertTrue((a == b).all()) - for i in range(NUMFILES): - sz = len(test_arrs[i]) - self.assertTrue((test_arrs[i] == pq_arr[(i*sz):(i*sz)+sz]).all()) for f in glob.glob('pq_test*'): os.remove(f) From fe923b4bc5d7158cf9d8409c0ff1f5afefd2be1d Mon Sep 17 00:00:00 2001 From: Ben McDonald Date: Wed, 1 Dec 2021 16:46:03 -0800 Subject: [PATCH 2/3] Fix parquet file suffix appending for sort order --- src/Parquet.chpl | 6 +++--- tests/parquet_test.py | 16 ++++++---------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/src/Parquet.chpl b/src/Parquet.chpl index f8c67fae6f..0890997ab0 100644 --- a/src/Parquet.chpl +++ b/src/Parquet.chpl @@ -84,12 +84,12 @@ module Parquet { return ArrowTypes.notimplemented; } - proc writeDistArrayToParquet(A, filename, dsetname, rowGroupSize) { + proc writeDistArrayToParquet(A, filename, dsetname, rowGroupSize) throws { extern proc c_writeColumnToParquet(filename, chpl_arr, colnum, dsetname, numelems, rowGroupSize); var filenames: [0..#A.targetLocales().size] string; for i in 0..#A.targetLocales().size { - var suffix = i: string; + var suffix = '%04i'.format(i): string; filenames[i] = filename + "_LOCALE" + suffix + ".parquet"; } @@ -102,7 +102,7 @@ module Parquet { } } - proc write1DDistArrayParquet(filename: string, dsetname, A) { + proc write1DDistArrayParquet(filename: string, dsetname, A) throws { writeDistArrayToParquet(A, filename, dsetname, ROWGROUPS); return false; } diff --git a/tests/parquet_test.py b/tests/parquet_test.py index 031b6cb055..8efa77f00d 100644 --- a/tests/parquet_test.py +++ b/tests/parquet_test.py @@ -4,7 +4,7 @@ import numpy as np import pytest -SIZE = 50 +SIZE = 100 NUMFILES = 5 verbose = True @@ -14,29 +14,25 @@ def test_parquet(self): ak_arr = ak.randint(0, 2**32, SIZE) ak_arr.save_parquet("pq_testcorrect", "my-dset") pq_arr = ak.read_parquet("pq_testcorrect*", "my-dset") - # get the dset from the dictionary in multi-locale cases + self.assertTrue((ak_arr == pq_arr).all()) + for f in glob.glob('pq_test*'): os.remove(f) - a = ak_arr.to_ndarray().sort() - b = pq_arr.to_ndarray().sort() - self.assertTrue(a == b) def test_multi_file(self): adjusted_size = int(SIZE/NUMFILES)*NUMFILES test_arrs = [] elems = ak.randint(0, 2**32, adjusted_size) per_arr = int(adjusted_size/NUMFILES) + print(elems) for i in range(NUMFILES): test_arrs.append(elems[(i*per_arr):(i*per_arr)+per_arr]) + print(test_arrs[i]) test_arrs[i].save_parquet("pq_test" + str(i), "test-dset") - a = elems.to_ndarray() - a.sort() pq_arr = ak.read_parquet("pq_test*", "test-dset") - b = pq_arr.to_ndarray() - b.sort() - self.assertTrue((a == b).all()) + self.assertTrue((elems == pq_arr).all()) for f in glob.glob('pq_test*'): os.remove(f) From 7925ffdaa90f98447eb8e5b94586f4e1a9f22cc7 Mon Sep 17 00:00:00 2001 From: Ben McDonald Date: Thu, 2 Dec 2021 08:32:23 -0800 Subject: [PATCH 3/3] Remove print statements in Parquet test --- tests/parquet_test.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/parquet_test.py b/tests/parquet_test.py index 8efa77f00d..3b1781b255 100644 --- a/tests/parquet_test.py +++ b/tests/parquet_test.py @@ -24,10 +24,8 @@ def test_multi_file(self): test_arrs = [] elems = ak.randint(0, 2**32, adjusted_size) per_arr = int(adjusted_size/NUMFILES) - print(elems) for i in range(NUMFILES): test_arrs.append(elems[(i*per_arr):(i*per_arr)+per_arr]) - print(test_arrs[i]) test_arrs[i].save_parquet("pq_test" + str(i), "test-dset") pq_arr = ak.read_parquet("pq_test*", "test-dset")