diff --git a/.travis.yml b/.travis.yml
index 357f415..a62423e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,7 +4,7 @@ jobs:
       os: linux
     - dist: focal
       os: linux
-    - osx_image: xcode12.5
+    - osx_image: xcode14
       os: osx
     - os: windows
       language: c
diff --git a/docs/reference.md b/docs/reference.md
index df43315..086c8ce 100755
--- a/docs/reference.md
+++ b/docs/reference.md
@@ -2005,6 +2005,7 @@ Supported options:
 - `COMPRESSION` - Selects the compression type for Arrow to use when writing Parquet files. The libarrow build being used must include the corresponding libraries. Values supported: `UNCOMPRESSED` (default), `SNAPPY`, `GZIP`, `BROTLI`, `ZSTD`, `LZ4_RAW`, `LZ4`, `LZ4_HADOOP`, `LZO`, `BZ2`.
 - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.
 - `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.
+- `ARROW_CHUNK_ROWS` - The number of rows to include in each Arrow array. If the kdb data contains more rows than this, the kdb lists are internally chunked before being passed to the Parquet file writer. This is distinct from row groups (set using `PARQUET_CHUNK_SIZE`), which control how the Parquet file itself is structured. Long, default 0 (not enabled).

 > :warning: **The Parquet format is compressed and designed for for maximum space efficiency which may cause a performance overhead compared to Arrow. Parquet is also less fully featured than Arrow which can result in schema limitations**
 >
@@ -2044,6 +2045,7 @@ Supported options:
 - `PARQUET_VERSION` - Select the Parquet format version: `V1.0`, `V2.0`, `V2.4`, `V2.6` or `V2.LATEST`. Later versions are more fully featured but may be incompatible with older Parquet implementations. Default `V1.0`
 - `COMPRESSION` - Selects the compression type for Arrow to use when writing Parquet files. The libarrow build being used must include the corresponding libraries. Values supported: `UNCOMPRESSED` (default), `SNAPPY`, `GZIP`, `BROTLI`, `ZSTD`, `LZ4_RAW`, `LZ4`, `LZ4_HADOOP`, `LZO`, `BZ2`.
 - `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.
+- `ARROW_CHUNK_ROWS` - The number of rows to include in each Arrow array. If the kdb data contains more rows than this, the kdb lists are internally chunked before being passed to the Parquet file writer. This is distinct from row groups (set using `PARQUET_CHUNK_SIZE`), which control how the Parquet file itself is structured. Long, default 0 (not enabled).

 > :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors**
 >
diff --git a/src/TableData.cpp b/src/TableData.cpp
index 4b16b12..9c00302 100644
--- a/src/TableData.cpp
+++ b/src/TableData.cpp
@@ -96,7 +96,7 @@ std::vector<std::shared_ptr<arrow::ChunkedArray>> MakeChunkedArrays(
 // Create a an arrow table from the arrow schema and mixed list of kdb array objects
 std::shared_ptr<arrow::Table> MakeTable(std::shared_ptr<arrow::Schema> schema, K array_data, kx::arrowkdb::TypeMappingOverride& type_overrides)
 {
-  return arrow::Table::Make(schema, MakeArrays(schema, array_data, type_overrides));
+  return arrow::Table::Make(schema, MakeChunkedArrays(schema, array_data, type_overrides));
 }

 K prettyPrintTable(K schema_id, K array_data, K options)
@@ -222,6 +222,9 @@ K writeParquet(K parquet_file, K schema_id, K array_data, K options)
   // Type mapping overrides
   kx::arrowkdb::TypeMappingOverride type_overrides{ write_options };

+  // Chunk size
+  write_options.GetIntOption( kx::arrowkdb::Options::ARROW_CHUNK_ROWS, type_overrides.chunk_length );
+
   auto parquet_props = parquet_props_builder.compression(getCompressionType(write_options))->build();
   auto arrow_props = arrow_props_builder.build();
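For reviewers, a minimal standalone Arrow C++ sketch (illustrative only, not arrowkdb source; the column name, values and output path are invented) of the two levels of chunking this change separates: `ARROW_CHUNK_ROWS` governs how each kdb column is split into Arrow array chunks, while Parquet row groups remain controlled by the `chunk_size` passed to the writer (arrowkdb's `PARQUET_CHUNK_SIZE`):

```cpp
// Standalone sketch, not arrowkdb source: Arrow-level chunking vs Parquet row groups
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/writer.h>

arrow::Status WriteChunkedExample()
{
  // Build one column as two separate Arrow arrays (two chunks of three rows each),
  // analogous to what splitting a kdb list by ARROW_CHUNK_ROWS would produce
  arrow::Int64Builder builder;
  std::shared_ptr<arrow::Array> chunk1, chunk2;
  ARROW_RETURN_NOT_OK(builder.AppendValues({1, 2, 3}));
  ARROW_RETURN_NOT_OK(builder.Finish(&chunk1));
  ARROW_RETURN_NOT_OK(builder.AppendValues({4, 5, 6}));
  ARROW_RETURN_NOT_OK(builder.Finish(&chunk2));

  // arrow::Table::Make accepts chunked columns directly, which is why MakeTable
  // can switch from MakeArrays to MakeChunkedArrays without further changes
  auto column = std::make_shared<arrow::ChunkedArray>(arrow::ArrayVector{chunk1, chunk2});
  auto schema = arrow::schema({arrow::field("x", arrow::int64())});
  auto table = arrow::Table::Make(schema, {column});

  // Row groups in the output file are controlled separately by the chunk_size
  // argument to WriteTable (the value arrowkdb derives from PARQUET_CHUNK_SIZE)
  ARROW_ASSIGN_OR_RAISE(auto sink, arrow::io::FileOutputStream::Open("example.parquet"));
  return parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), sink,
                                    /*chunk_size=*/1024);
}
```

Writing the table from chunked arrays is what allows a kdb list longer than `ARROW_CHUNK_ROWS` to be fed to the Parquet writer in pieces rather than as a single contiguous Arrow array.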