From b9d52e20f9c7b61a5df47b1afc03a65e46b59ef1 Mon Sep 17 00:00:00 2001 From: nmcdonnell-kx Date: Mon, 31 Oct 2022 19:32:23 +0000 Subject: [PATCH] * Support latest v2 parquet file formats * Upgrade xcode version --- .travis.yml | 2 +- README.md | 2 +- docs/reference.md | 4 ++-- src/TableData.cpp | 9 +++++++++ src/TableData.h | 19 ++++++++++++++----- 5 files changed, 27 insertions(+), 9 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3237bc0..7deb2e9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,7 @@ jobs: os: linux - dist: focal os: linux - - osx_image: xcode10.2 + - osx_image: xcode12.5 os: osx - os: windows language: c diff --git a/README.md b/README.md index c9adc8e..d48f31e 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ Conversely, Arrow is an in-memory format meant for direct and efficient use for - kdb+ ≥ 3.5 64-bit (Linux/MacOS/Windows) - Apache Arrow = 9.0.0 (or ≥ 2.0.0 if building `arrowkdb` from source) -- C++11 or later +- C++14 or later - CMake ≥ 3.1.3 diff --git a/docs/reference.md b/docs/reference.md index 4a042e5..e150c04 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -2061,7 +2061,7 @@ The mixed list of Arrow array data should be ordered in schema field number and Supported options: - `PARQUET_CHUNK_SIZE` - Controls the approximate size of encoded data pages within a column chunk. Long, default 1MB. -- `PARQUET_VERSION` - Select the Parquet format version, either `V1.0` or `V2.0`. `V2.0` is more fully featured but may be incompatible with older Parquet implementations. String, default `V1.0` +- `PARQUET_VERSION` - Select the Parquet format version: `V1.0`, `V2.0`, `V2.4`, `V2.6` or `V2.LATEST`. Later versions are more fully featured but may be incompatible with older Parquet implementations. Default `V1.0` - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. ??? 
warning "The Parquet format is compressed and designed for maximum space efficiency which may cause a performance overhead compared to Arrow. Parquet is also less fully featured than Arrow which can result in schema limitations" @@ -2099,7 +2099,7 @@ returns generic null on success Supported options: - `PARQUET_CHUNK_SIZE` - Controls the approximate size of encoded data pages within a column chunk. Long, default 1MB. -- `PARQUET_VERSION` - Select the Parquet format version, either `V1.0` or `V2.0`. `V2.0` is more fully featured but may be incompatible with older Parquet implementations. String, default `V1.0` +- `PARQUET_VERSION` - Select the Parquet format version: `V1.0`, `V2.0`, `V2.4`, `V2.6` or `V2.LATEST`. Later versions are more fully featured but may be incompatible with older Parquet implementations. Default `V1.0` ??? warning "Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors" diff --git a/src/TableData.cpp b/src/TableData.cpp index 8cbeb73..0deeb88 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -157,6 +157,15 @@ K writeParquet(K parquet_file, K schema_id, K array_data, K options) if (parquet_version == "V2.0") { parquet_props_builder.version(parquet::ParquetVersion::PARQUET_2_0); parquet_props_builder.data_page_version(parquet::ParquetDataPageVersion::V2); + } else if (parquet_version == "V2.4") { + parquet_props_builder.version(parquet::ParquetVersion::PARQUET_2_4); + parquet_props_builder.data_page_version(parquet::ParquetDataPageVersion::V2); + } else if (parquet_version == "V2.6") { + parquet_props_builder.version(parquet::ParquetVersion::PARQUET_2_6); + parquet_props_builder.data_page_version(parquet::ParquetDataPageVersion::V2); + } else if (parquet_version == "V2.LATEST") { + parquet_props_builder.version(parquet::ParquetVersion::PARQUET_2_LATEST); + 
parquet_props_builder.data_page_version(parquet::ParquetDataPageVersion::V2); } else { // Not using v2.0 so map timestamp(ns) to timestamp(us) with truncation arrow_props_builder.coerce_timestamps(arrow::TimeUnit::MICRO); diff --git a/src/TableData.h b/src/TableData.h index 1f837ee..35f42b3 100644 --- a/src/TableData.h +++ b/src/TableData.h @@ -87,9 +87,10 @@ extern "C" * PARQUET_CHUNK_SIZE (long) - Controls the approximate size of encoded data * pages within a column chunk. Default 1MB * - * PARQUET_VERSION (string) - Selects the Parquet format version, either - * `V1.0` or `V2.0`. `V2.0` is more fully featured but may be incompatible - * with older Parquet implementations. Default `V1.0` + * PARQUET_VERSION (string) - Selects the Parquet format version: `V1.0`, + * `V2.0`, `V2.4`, `V2.6` or `V2.LATEST`. Later versions are more fully + * featured but may be incompatible with older Parquet implementations. + * Default `V1.0` * * DECIMAL128_AS_DOUBLE (long) - Flag indicating whether to override the * default type mapping for the arrow decimal128 datatype and instead @@ -170,14 +171,22 @@ extern "C" * * Supported options: * + * PARQUET_MULTITHREADED_READ (long) - Flag indicating whether the parquet + * reader should run in multithreaded mode. This can improve performance by + * processing multiple columns in parallel. Default 0 + * + * USE_MMAP (long) - Flag indicating whether the parquet file should be memory + * mapped in. This can improve performance on systems which support mmap. + * Default 0 + * * DECIMAL128_AS_DOUBLE (long) - Flag indicating whether to override the * default type mapping for the arrow decimal128 datatype and instead * represent it as a double (9h). Default 0. 
* * @param parquet_file String name of the parquet file to read - * @param row_groups Integer list (6h) of row groups indicies to read, or + * @param row_groups Integer list (6h) of row groups indices to read, or * generic null (::) to read all row groups - * @param columns Integer list (6h) of column indicies to read, or + * @param columns Integer list (6h) of column indices to read, or * generic null (::) to read all columns * @param options Dictionary of options or generic null (::) to use * defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or