Merge pull request #26 from KxSystems/KXI-23708-orc-data-loader
Kxi 23708 orc data loader
vgrechin-kx authored Mar 30, 2023
2 parents 0c4e452 + c51ad70 commit c864f83
Showing 17 changed files with 1,158 additions and 58 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
arrowkdb.code-workspace
.vscode/
build/
tests/*.q
test.q
unit.q
*.user
6 changes: 5 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,11 @@ before_install:
script:
- if [[ $TESTS == "True" && "x$OD" != "x" && "x$QLIC_KC" != "x" ]]; then
curl -o test.q -L https://github.com/KxSystems/hdf5/raw/master/test.q;
q test.q tests -q && q test.q tests/null_mapping -q && q test.q tests/null_bitmap -q;
if [[ $TRAVIS_OS_NAME == "windows" ]]; then
q test.q tests -q && q test.q tests/null_mapping -q && q test.q tests/null_bitmap -q;
else
q test.q tests -q && q test.q tests/null_mapping -q && q test.q tests/null_bitmap -q && q test.q tests/orc_dataloader -q;
fi
fi
- if [[ $TRAVIS_OS_NAME == "windows" && $BUILD == "True" ]]; then
7z a -tzip -r $FILE_NAME ./cmake/$FILE_ROOT/*;
Expand Down
14 changes: 9 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,8 @@ endif()
cmake_minimum_required(VERSION 3.1.3)
project(arrowkdb CXX)

set(CMAKE_PREFIX_PATH "${CMAKE_PREFIX_PATH};${ARROW_INSTALL}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -DKXVER=3")
set(CMAKE_CXX_STANDARD 14)
IF(APPLE)
set(CMAKE_CXX_STANDARD 17)
endif()
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

Expand All @@ -28,6 +25,13 @@ include_directories (
${CMAKE_BINARY_DIR} # For 'k.h', downloaded below
)

find_package(Arrow REQUIRED)
if(ARROW_SO_VERSION LESS "1000")
set(CMAKE_CXX_STANDARD 14)
else()
set(CMAKE_CXX_STANDARD 17)
endif()

find_library(ARROW_LIBRARY
NAMES arrow
HINTS "${ARROW_INSTALL}/lib/"
Expand Down Expand Up @@ -64,7 +68,7 @@ else()
set(OSFLAG l)
endif()

target_link_libraries(${MY_LIBRARY_NAME} ${ARROW_LIBRARY} ${PARQUET_LIBRARY} ${LINK_LIBS})
target_link_libraries(${MY_LIBRARY_NAME} ${ARROW_LIBRARY} ${PARQUET_LIBRARY} ${LINK_LIBS})
set_target_properties(${MY_LIBRARY_NAME} PROPERTIES PREFIX "")

# Check if 32-bit/64-bit machine
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,11 @@ Conversely, Arrow is an in-memory format meant for direct and efficient use for
### Requirements

- kdb+ ≥ 3.5 64-bit (Linux/MacOS/Windows)
- Apache Arrow = 9.0.0 (or ≥ 6.0.0 if building `arrowkdb` from source)
- Apache Arrow 9.0.0 (or ≥ 6.0.0 if building `arrowkdb` from source)
- C++14 or later
- CMake ≥ 3.1.3

> :warning: If using the packaged version of `arrowkdb` you should install version 9.0.0 of Apache Arrow

### Third-party library installation
Expand Down
177 changes: 177 additions & 0 deletions docs/reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,12 @@ object | use
[`ipc.parseArrowSchema`](#ipcparsearrowschema) | Parse the schema from an Arrow stream
[`ipc.parseArrowData`](#ipcparsearrowdata) | Parse an Arrow table from an Arrow stream and convert to a kdb+ mixed list of array data
[`ipc.parseArrowToTable`](#ipcparsearrowtotable) | Parse an Arrow table from an Arrow file and convert to a kdb+ table
<br>**[Apache ORC files](#apache-orc-files)**
[`orc.writeOrc`](#orcwriteorc) | Convert a kdb+ mixed list of array data to an Arrow table and write to an Apache ORC file
[`orc.writeOrcFromTable`](#orcwriteorcfromtable) | Convert a kdb+ table to an Arrow table and write to an Apache ORC file, inferring the schema from the kdb+ table structure
[`orc.readOrcSchema`](#orcreadorcschema) | Read the schema from an Apache ORC file
[`orc.readOrcData`](#orcreadorcdata) | Read an Arrow table from an Apache ORC file and convert to a kdb+ mixed list of array data
[`orc.readOrcToTable`](#orcreadorctotable) | Read an Arrow table from an Apache ORC file and convert to a kdb+ table
<br>**[Utilities](#utilities)**
[`util.buildInfo`](#utilbuildinfo) | Return build information about the Arrow library in use

Expand Down Expand Up @@ -2610,6 +2616,177 @@ q)new_table~table
1b
```

## Apache ORC files

### `orc.writeOrc`

*Convert a kdb+ mixed list of array data to an Arrow table and write to an Apache ORC file*
```txt
.arrowkdb.orc.writeOrc[orc_file;schema_id;array_data;options]
```

Where:

- `orc_file` is a string containing the ORC file name
- `schema_id` is the schema identifier to use for the table
- `array_data` is a mixed list of array data
- `options` is a kdb+ dictionary of options, or generic null (`::`) to use the defaults. The dictionary keys must be an `11h` list; the values list can be `7h`, `11h` or a mixed list of `-7|-11|4|99|101h`.

returns generic null on success

> :warning: The Apache ORC file format is not supported on the Windows platform; the function will return with the message **ORC files are not supported on Windows**.

The mixed list of Arrow array data should be ordered by schema field number, and each list item representing one of the arrays must be structured according to the field’s datatype.

Supported options:

- `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB.
- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.
- `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.

> :warning: The Apache ORC file format is [less fully featured](https://arrow.apache.org/docs/cpp/orc.html) than Parquet and consequently the ORC dataloader currently **does not support unsigned datatypes**.

```q
q)i8_fd:.arrowkdb.fd.field[`int8;.arrowkdb.dt.int8[]];
q)i16_fd:.arrowkdb.fd.field[`int16;.arrowkdb.dt.int16[]];
q)i32_fd:.arrowkdb.fd.field[`int32;.arrowkdb.dt.int32[]];
q)orc_schema:.arrowkdb.sc.schema[(i8_fd,i16_fd,i32_fd)];
q)orc_data:(5?0x64;5?100h;5?100i);
q).arrowkdb.orc.writeOrc["dataloader.orc";orc_schema;orc_data;::]
```
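
The options dictionary pairs a leading empty-symbol key with generic null, then maps each option name to its value, as in the repository’s `orc_dataloader.q` example. A minimal sketch building on the example above (the chunk-size value here is hypothetical):

```q
// Build an options dictionary: a leading empty-symbol key mapped to
// generic null, then each option name mapped to its long value
q)orc_options:(``ORC_CHUNK_SIZE`DECIMAL128_AS_DOUBLE)!((::);16384;1);

// Write using the explicit options instead of the defaults
q).arrowkdb.orc.writeOrc["dataloader.orc";orc_schema;orc_data;orc_options]
```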

### `orc.writeOrcFromTable`

*Convert a kdb+ table to an Arrow table and write to an Apache ORC file, inferring the schema from the kdb+ table structure*

```txt
.arrowkdb.orc.writeOrcFromTable[orc_file;table;options]
```

Where:

- `orc_file` is a string containing the ORC file name
- `table` is a kdb+ table
- `options` is a kdb+ dictionary of options, or generic null (`::`) to use the defaults. The dictionary keys must be an `11h` list; the values list can be `7h`, `11h` or a mixed list of `-7|-11|4|99|101h`.

returns generic null on success

> :warning: The Apache ORC file format is not supported on the Windows platform; the function will return with the message **ORC files are not supported on Windows**.

Supported options:

- `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB.
- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.
- `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.

> :warning: The Apache ORC file format is [less fully featured](https://arrow.apache.org/docs/cpp/orc.html) than Parquet and consequently the ORC dataloader currently **does not support unsigned datatypes**.

> :warning: **Inferred schemas only support a subset of the Arrow datatypes and are considerably less flexible than creating them with the datatype/field/schema constructors**
>
> Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as described [here](#inferred-datatypes).

```q
q)table:([] i8_fd:5?0x64; i16_fd:5?100h; i32_fd:5?100i)
q).arrowkdb.orc.writeOrcFromTable["dataloader.orc";table;::]
```
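
To inspect the schema that would be derived from a table before writing, it can be constructed explicitly — a sketch, assuming the `sc.inferSchema` constructor described elsewhere in this reference:

```q
q)table:([] i8_fd:5?0x64; i16_fd:5?100h; i32_fd:5?100i)
// Infer a schema from the kdb+ table structure, then print it
q)inferred:.arrowkdb.sc.inferSchema[table]
q).arrowkdb.sc.printSchema[inferred]
```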

### `orc.readOrcSchema`

*Read the schema from an Apache ORC file*

```txt
.arrowkdb.orc.readOrcSchema[orc_file]
```

Where `orc_file` is a string containing the ORC file name

returns the schema identifier

> :warning: The Apache ORC file format is not supported on the Windows platform; the function will return with the message **ORC files are not supported on Windows**.

```q
q)i8_fd:.arrowkdb.fd.field[`int8;.arrowkdb.dt.int8[]];
q)i16_fd:.arrowkdb.fd.field[`int16;.arrowkdb.dt.int16[]];
q)i32_fd:.arrowkdb.fd.field[`int32;.arrowkdb.dt.int32[]];
q)orc_schema:.arrowkdb.sc.schema[(i8_fd,i16_fd,i32_fd)];
q)orc_data:(5?0x64;5?100h;5?100i);
q).arrowkdb.orc.writeOrc["dataloader.orc";orc_schema;orc_data;::]
q).arrowkdb.sc.equalSchemas[orc_schema;.arrowkdb.orc.readOrcSchema["dataloader.orc"]]
1b
```

### `orc.readOrcData`

*Read an Arrow table from an Apache ORC file and convert to a kdb+ mixed list of array data*

```txt
.arrowkdb.orc.readOrcData[orc_file;options]
```

Where:

- `orc_file` is a string containing the ORC file name
- `options` is a kdb+ dictionary of options, or generic null (`::`) to use the defaults. The dictionary keys must be an `11h` list; the values list can be `7h`, `11h` or a mixed list of `-7|-11|4|99|101h`.

returns the array data

> :warning: The Apache ORC file format is not supported on the Windows platform; the function will return with the message **ORC files are not supported on Windows**.

Supported options:

- `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB.
- `USE_MMAP` - Flag indicating whether the ORC file should be memory-mapped. This can improve performance on systems which support mmap. Long, default 0.
- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.
- `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.
- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0.

```q
q)i8_fd:.arrowkdb.fd.field[`int8;.arrowkdb.dt.int8[]];
q)i16_fd:.arrowkdb.fd.field[`int16;.arrowkdb.dt.int16[]];
q)i32_fd:.arrowkdb.fd.field[`int32;.arrowkdb.dt.int32[]];
q)orc_schema:.arrowkdb.sc.schema[(i8_fd,i16_fd,i32_fd)];
q)orc_data:(5?0x64;5?100h;5?100i);
q).arrowkdb.orc.writeOrc["dataloader.orc";orc_schema;orc_data;::]
q)read_data:.arrowkdb.orc.readOrcData["dataloader.orc";::]
q)orc_data~read_data
1b
```
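
When `WITH_NULL_BITMAP` is set, the result is split into two structures, the data values and the null bitmaps, as covered in the null-bitmap documentation. A sketch, assuming that two-item result shape:

```q
// Read back with the null bitmap returned separately (assumed shape:
// a two-item list of array data and boolean null bitmaps)
q)read_results:.arrowkdb.orc.readOrcData["dataloader.orc";(enlist `WITH_NULL_BITMAP)!enlist 1]
q)array_data:read_results[0]
q)null_bitmaps:read_results[1]
```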

### `orc.readOrcToTable`

*Read an Arrow table from an Apache ORC file and convert to a kdb+ table*

```txt
.arrowkdb.orc.readOrcToTable[orc_file;options]
```

Where:

- `orc_file` is a string containing the ORC file name
- `options` is a kdb+ dictionary of options, or generic null (`::`) to use the defaults. The dictionary keys must be an `11h` list; the values list can be `7h`, `11h` or a mixed list of `-7|-11|4|99|101h`.

returns the kdb+ table

> :warning: The Apache ORC file format is not supported on the Windows platform; the function will return with the message **ORC files are not supported on Windows**.

Each schema field name is used as the column name and the Arrow array data is used as the column data.

Supported options:

- `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB.
- `USE_MMAP` - Flag indicating whether the ORC file should be memory-mapped. This can improve performance on systems which support mmap. Long, default 0.
- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.
- `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.
- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0.

```q
q)table:([] i8_fd:5?0x64; i16_fd:5?100h; i32_fd:5?100i)
q).arrowkdb.orc.writeOrcFromTable["dataloader.orc";table;::]
q)read_table:.arrowkdb.orc.readOrcToTable["dataloader.orc";::]
q)read_table~table
1b
```

## Utilities

### `util.buildInfo`
Expand Down
2 changes: 1 addition & 1 deletion examples/null_bitmap.q
Original file line number Diff line number Diff line change
Expand Up @@ -359,4 +359,4 @@ show nested_union_nulls~stream_union_nulls[1]
-1 "\n+----------------------------------------+\n";

// Process off
//exit 0;
exit 0;
85 changes: 85 additions & 0 deletions examples/orc_dataloader.q
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
// orc_dataloader.q
// Examples of read/write ORC file

-1"\n+----------|| orc_dataloader.q ||----------+\n";

// import the arrowkdb library
\l q/arrowkdb.q

// Filesystem functions for Linux/MacOS/Windows
ls:{[filename] $[.z.o like "w*";system "dir /b ",filename;system "ls ",filename]};
rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]};

/////////////////////////
// CONSTRUCTED SCHEMAS //
/////////////////////////

//-------------------//
// Create the schema //
//-------------------//

// Create the datatype identifiers
ts_dt:.arrowkdb.dt.timestamp[`nano];

i8_dt:.arrowkdb.dt.int8[];
i16_dt:.arrowkdb.dt.int16[];
i32_dt:.arrowkdb.dt.int32[];
i64_dt:.arrowkdb.dt.int64[];

// Create the field identifiers
ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt];

i8_fd:.arrowkdb.fd.field[`int8;i8_dt];
i16_fd:.arrowkdb.fd.field[`int16;i16_dt];
i32_fd:.arrowkdb.fd.field[`int32;i32_dt];
i64_fd:.arrowkdb.fd.field[`int64;i64_dt];

// Create the schemas for the list of fields
dataloader_schema:.arrowkdb.sc.schema[(ts_fd,i8_fd,i16_fd,i32_fd,i64_fd)];

// Print the schemas
.arrowkdb.sc.printSchema[dataloader_schema];

// Number of items in each array
N:10

// Create data for each column in the table
ts_data:asc N?0p;

i8_data:N?0x64;
i16_data:N?100h;
i32_data:N?100i;
i64_data:N?100;

// Combine the data for all columns
dataloader_data:(ts_data;i8_data;i16_data;i32_data;i64_data);

// Pretty print the Arrow table populated from the array data
.arrowkdb.tb.prettyPrintTable[dataloader_schema;dataloader_data;::];

//---------------------------//
// Example-1. Apache ORC file//
//---------------------------//

// Write the schema and array data to a ORC file
orc_options:(``ORC_CHUNK_SIZE)!((::);1024);

orc_dataloader:"orc_dataloader.orc"
.arrowkdb.orc.writeOrc[orc_dataloader;dataloader_schema;dataloader_data;orc_options]
show orc_dataloader;

// Read the schema back and compare
orc_dataloader_schema:.arrowkdb.orc.readOrcSchema[orc_dataloader];
show .arrowkdb.sc.equalSchemas[dataloader_schema;orc_dataloader_schema]
show dataloader_schema~orc_dataloader_schema

// Read the array data back and compare
orc_dataloader_data:.arrowkdb.orc.readOrcData[orc_dataloader;orc_options];
show dataloader_data~orc_dataloader_data
rm orc_dataloader;


-1 "\n+----------------------------------------+\n";

// Process off
exit 0;
