diff --git a/.gitignore b/.gitignore index 701d371..391b645 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ arrowkdb.code-workspace .vscode/ build/ +tests/*.q test.q unit.q *.user diff --git a/.travis.yml b/.travis.yml index 0606daf..357f415 100644 --- a/.travis.yml +++ b/.travis.yml @@ -82,7 +82,11 @@ before_install: script: - if [[ $TESTS == "True" && "x$OD" != "x" && "x$QLIC_KC" != "x" ]]; then curl -o test.q -L https://github.com/KxSystems/hdf5/raw/master/test.q; - q test.q tests -q && q test.q tests/null_mapping -q && q test.q tests/null_bitmap -q; + if [[ $TRAVIS_OS_NAME == "windows" ]]; then + q test.q tests -q && q test.q tests/null_mapping -q && q test.q tests/null_bitmap -q; + else + q test.q tests -q && q test.q tests/null_mapping -q && q test.q tests/null_bitmap -q && q test.q tests/orc_dataloader -q; + fi fi - if [[ $TRAVIS_OS_NAME == "windows" && $BUILD == "True" ]]; then 7z a -tzip -r $FILE_NAME ./cmake/$FILE_ROOT/*; diff --git a/CMakeLists.txt b/CMakeLists.txt index 33bfada..8320421 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,11 +5,8 @@ endif() cmake_minimum_required(VERSION 3.1.3) project(arrowkdb CXX) +set(CMAKE_PREFIX_PATH "${CMAKE_PREFIX_PATH};${ARROW_INSTALL}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -DKXVER=3") -set(CMAKE_CXX_STANDARD 14) -IF(APPLE) - set(CMAKE_CXX_STANDARD 17) -endif() set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) @@ -28,6 +25,13 @@ include_directories ( ${CMAKE_BINARY_DIR} # For 'k.h', downloaded below ) +find_package(Arrow REQUIRED) +if(ARROW_SO_VERSION LESS "1000") + set(CMAKE_CXX_STANDARD 14) +else() + set(CMAKE_CXX_STANDARD 17) +endif() + find_library(ARROW_LIBRARY NAMES arrow HINTS "${ARROW_INSTALL}/lib/" @@ -64,7 +68,7 @@ else() set(OSFLAG l) endif() -target_link_libraries(${MY_LIBRARY_NAME} ${ARROW_LIBRARY} ${PARQUET_LIBRARY} ${LINK_LIBS}) +target_link_libraries(${MY_LIBRARY_NAME} ${ARROW_LIBRARY} ${PARQUET_LIBRARY} ${LINK_LIBS}) set_target_properties(${MY_LIBRARY_NAME} 
PROPERTIES PREFIX "") # Check if 32-bit/64-bit machine diff --git a/README.md b/README.md index dc5e95f..8bbb573 100644 --- a/README.md +++ b/README.md @@ -44,10 +44,11 @@ Conversely, Arrow is an in-memory format meant for direct and efficient use for ### Requirements - kdb+ ≥ 3.5 64-bit (Linux/MacOS/Windows) -- Apache Arrow = 9.0.0 (or ≥ 6.0.0 if building `arrowkdb` from source) +- Apache Arrow ≥ 9.0.0 (or ≥ 6.0.0 if building `arrowkdb` from source) - C++14 or later - CMake ≥ 3.1.3 +> :warning: If using the packaged version of `arrowkdb` you should install version 9.0.0 of Apache Arrow ### Third-party library installation diff --git a/docs/reference.md b/docs/reference.md index 98efb53..66869f6 100755 --- a/docs/reference.md +++ b/docs/reference.md @@ -108,6 +108,12 @@ object | use [`ipc.parseArrowSchema`](#ipcparsearrowschema) | Parse the schema from an Arrow stream [`ipc.parseArrowData`](#ipcparsearrowdata) | Parse an Arrow table from an Arrow stream and convert to a kdb+ mixed list of array data [`ipc.parseArrowToTable`](#ipcparsearrowtotable) | Parse an Arrow table from an Arrow file and convert to a kdb+ table +
**[Apache ORC files](#apache-orc-files)** +[`orc.writeOrc`](#orcwriteorc) | Convert a kdb+ mixed list of array data to an Arrow table and write to an Apache ORC file +[`orc.writeOrcFromTable`](#orcwriteorcfromtable) | Convert a kdb+ table to an Arrow table and write to an Apache ORC file, inferring the schema from the kdb+ table structure +[`orc.readOrcSchema`](#orcreadorcschema) | Read the schema from an Apache ORC file +[`orc.readOrcData`](#orcreadorcdata) | Read an Arrow table from an Apache ORC file and convert to a kdb+ mixed list of array data +[`orc.readOrcToTable`](#orcreadorctotable) | Read an Arrow table from an Apache ORC file and convert to a kdb+ table
**[Utilities](#utilities)** [`util.buildInfo`](#utilbuildinfo) | Return build information regarding the in use Arrow library @@ -2610,6 +2616,177 @@ q)new_table~table 1b ``` +## Apache ORC files + +### `orc.writeOrc` + +*Convert a kdb+ mixed list of array data to an Arrow table and write to an Apache ORC file* +```txt +.arrowkdb.orc.writeOrc[orc_file;schema_id;array_data;options] +``` + +Where: + +- `orc_file` is a string containing the ORC file name +- `schema_id` is the schema identifier to use for the table +- `array_data` is a mixed list of array data +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. + +returns generic null on success + +> :warning: The Apache ORC file format is not supported on Windows platform and will return with the message **ORC files are not supported on Windows**. + +The mixed list of Arrow array data should be ordered in schema field number and each list item representing one of the arrays must be structured according to the field’s datatype. + +Supported options: + +- `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB. +- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. + +> :warning: The Apache ORC file format is [less](https://arrow.apache.org/docs/cpp/orc.html) fully featured compared to Parquet and consequently the ORC dataloader currently **does not support unsigned datatypes**. 
+ +```q +q)i8_fd:.arrowkdb.fd.field[`int8;.arrowkdb.dt.int8[]]; +q)i16_fd:.arrowkdb.fd.field[`int16;.arrowkdb.dt.int16[]]; +q)i32_fd:.arrowkdb.fd.field[`int32;.arrowkdb.dt.int32[]]; +q)orc_schema:.arrowkdb.sc.schema[(i8_fd,i16_fd,i32_fd)]; +q)orc_data:(5?0x64;5?100h;5?100i); +q).arrowkdb.orc.writeOrc["dataloader.orc";orc_schema;orc_data;::] +``` + +### `orc.writeOrcFromTable` + +*Convert a kdb+ table to an Arrow table and write to an Apache ORC file, inferring the schema from the kdb+ table structure* + +```txt +.arrowkdb.orc.writeOrcFromTable[orc_file;table;options] +``` + +Where: + +- `orc_file` is a string containing the ORC file name +- `table` is a kdb+ table +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. + +returns generic null on success + +> :warning: The Apache ORC file format is not supported on Windows platform and will return with the message **ORC files are not supported on Windows**. + +Supported options: + +- `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB. +- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. + +> :warning: The Apache ORC file format is [less](https://arrow.apache.org/docs/cpp/orc.html) fully featured compared to Parquet and consequently the ORC dataloader currently **does not support unsigned datatypes**. + +> :warning: **Inferred schemas only support a subset of the Arrow datatypes and are considerably less flexible than creating them with the datatype/field/schema constructors** +> +> Each column in the table is mapped to a field in the schema. 
The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as described [here](#inferred-datatypes). + +```q +q)table:([] i8_fd:5?0x64; i16_fd:5?100h; i32_fd:5?100i) +q).arrowkdb.orc.writeOrcFromTable["dataloader.orc";table;::] +``` + +### `orc.readOrcSchema` + +*Read the schema from an Apache ORC file* + +```txt +.arrowkdb.orc.readOrcSchema[orc_file] +``` + +Where `orc_file` is a string containing the ORC file name + +returns the schema identifier + +> :warning: The Apache ORC file format is not supported on Windows platform and will return with the message **ORC files are not supported on Windows**. + +```q +q)i8_fd:.arrowkdb.fd.field[`int8;.arrowkdb.dt.int8[]]; +q)i16_fd:.arrowkdb.fd.field[`int16;.arrowkdb.dt.int16[]]; +q)i32_fd:.arrowkdb.fd.field[`int32;.arrowkdb.dt.int32[]]; +q)orc_schema:.arrowkdb.sc.schema[(i8_fd,i16_fd,i32_fd)]; +q)orc_data:(5?0x64;5?100h;5?100i); +q).arrowkdb.orc.writeOrc["dataloader.orc";orc_schema;orc_data;::] +q).arrowkdb.sc.equalSchemas[orc_schema;.arrowkdb.orc.readOrcSchema["dataloader.orc"]] +1b +``` + +### `orc.readOrcData` + +*Read an Arrow table from an Apache ORC file and convert to a kdb+ mixed list of array data* + +```txt +.arrowkdb.orc.readOrcData[orc_file;options] +``` + +Where: + +- `orc_file` is a string containing the ORC file name +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. + +returns the array data + +> :warning: The Apache ORC file format is not supported on Windows platform and will return with the message **ORC files are not supported on Windows**. + +Supported options: + +- `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB. +- `USE_MMAP` - Flag indicating whether the ORC file should be memory mapped in. This can improve performance on systems which support mmap. 
Long, default: 0. +- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0. + +```q +q)i8_fd:.arrowkdb.fd.field[`int8;.arrowkdb.dt.int8[]]; +q)i16_fd:.arrowkdb.fd.field[`int16;.arrowkdb.dt.int16[]]; +q)i32_fd:.arrowkdb.fd.field[`int32;.arrowkdb.dt.int32[]]; +q)orc_schema:.arrowkdb.sc.schema[(i8_fd,i16_fd,i32_fd)]; +q)orc_data:(5?0x64;5?100h;5?100i); +q).arrowkdb.orc.writeOrc["dataloader.orc";orc_schema;orc_data;options] +q)read_data:.arrowkdb.orc.readOrcData["dataloader.orc";::] +q)orc_data~read_data +1b +``` + +### `orc.readOrcToTable` + +*Read an Arrow table from an Apache ORC file and convert to a kdb+ table* + +```txt +.arrowkdb.orc.readOrcToTable[orc_file;options] +``` + +Where: + +- `orc_file` is a string containing the ORC file name +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. + +returns the kdb+ table + +> :warning: The Apache ORC file format is not supported on Windows platform and will return with the message **ORC files are not supported on Windows**. + +Each schema field name is used as the column name and the Arrow array data is used as the column data. + +Supported options: + +- `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB. +- `USE_MMAP` - Flag indicating whether the ORC file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. 
+- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0. + +```q +q)table:([] i8_fd:5?0x64; i16_fd:5?100h; i32_fd:5?100i) +q).arrowkdb.orc.writeOrcFromTable["dataloader.orc";table;::] +q)read_table:.arrowkdb.orc.readOrcToTable["dataloader.orc";::] +q)read_table~table +1b +``` + ## Utilities ### `util.buildInfo` diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index 9dbb7b4..99e75d2 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -359,4 +359,4 @@ show nested_union_nulls~stream_union_nulls[1] -1 "\n+----------------------------------------+\n"; // Process off -//exit 0; +exit 0; diff --git a/examples/orc_dataloader.q b/examples/orc_dataloader.q new file mode 100644 index 0000000..8ed9054 --- /dev/null +++ b/examples/orc_dataloader.q @@ -0,0 +1,85 @@ +// orc_dataloader.q +// Examples of read/write ORC file + +-1"\n+----------|| orc_dataloader.q ||----------+\n"; + +// import the arrowkdb library +\l q/arrowkdb.q + +// Filesystem functions for Linux/MacOS/Windows +ls:{[filename] $[.z.o like "w*";system "dir /b ",filename;system "ls ",filename]}; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +///////////////////////// +// CONSTRUCTED SCHEMAS // +///////////////////////// + +//-------------------// +// Create the schema // +//-------------------// + +// Create the datatype identifiers +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +i8_dt:.arrowkdb.dt.int8[]; +i16_dt:.arrowkdb.dt.int16[]; +i32_dt:.arrowkdb.dt.int32[]; +i64_dt:.arrowkdb.dt.int64[]; + +// Create the field identifiers 
+ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +i8_fd:.arrowkdb.fd.field[`int8;i8_dt]; +i16_fd:.arrowkdb.fd.field[`int16;i16_dt]; +i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; + +// Create the schemas for the list of fields +dataloader_schema:.arrowkdb.sc.schema[(ts_fd,i8_fd,i16_fd,i32_fd,i64_fd)]; + +// Print the schemas +.arrowkdb.sc.printSchema[dataloader_schema]; + +// Number of items in each array +N:10 + +// Create data for each column in the table +ts_data:asc N?0p; + +i8_data:N?0x64; +i16_data:N?100h; +i32_data:N?100i; +i64_data:N?100; + +// Combine the data for all columns +dataloader_data:(ts_data;i8_data;i16_data;i32_data;i64_data); + +// Pretty print the Arrow table populated from the array data +.arrowkdb.tb.prettyPrintTable[dataloader_schema;dataloader_data;::]; + +//---------------------------// +// Example-1. Apache ORC file// +//---------------------------// + +// Write the schema and array data to a ORC file +orc_options:(``ORC_CHUNK_SIZE)!((::);1024); + +orc_dataloader:"orc_dataloader.orc" +.arrowkdb.orc.writeOrc[orc_dataloader;dataloader_schema;dataloader_data;orc_options] +show orc_dataloader; + +// Read the schema back and compare +orc_dataloader_schema:.arrowkdb.orc.readOrcSchema[orc_dataloader]; +show .arrowkdb.sc.equalSchemas[dataloader_schema;orc_dataloader_schema] +show dataloader_schema~orc_dataloader_schema + +// Read the array data back and compare +orc_dataloader_data:.arrowkdb.orc.readOrcData[orc_dataloader;orc_options]; +show dataloader_data~orc_dataloader_data +rm orc_dataloader; + + +-1 "\n+----------------------------------------+\n"; + +// Process off +exit 0; diff --git a/examples/orc_null_support.q b/examples/orc_null_support.q new file mode 100644 index 0000000..53aee10 --- /dev/null +++ b/examples/orc_null_support.q @@ -0,0 +1,217 @@ +// orc_null_support.q +// Examples of creating a schema supporting null mapping and using it to read/write +// Apache ORC file with exposing null bitmap as 
a separate structure to kdb + +-1"\n+----------|| orc_null_support.q ||----------+\n"; + +// import the arrowkdb library +\l q/arrowkdb.q + +// Filesystem functions for Linux/MacOS/Windows +ls:{[filename] $[.z.o like "w*";system "dir /b ",filename;system "ls ",filename]}; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +///////////////////////// +// CONSTRUCTED SCHEMAS // +///////////////////////// + +//-------------------// +// Create the schema // +//-------------------// + +// Support null mapping in Apache ORC +int_opts:(`bool`int8`int16`int32`int64)!(1b;0x02;3h;4i;5); +float_opts:(`float32`float64`decimal)!(9.87e;6.54;3.21f); +cont_opts:(`utf8`binary)!("start";"x"$"alert"); +time_opts:(`date32`timestamp)!(2012.11.10;2011.01.01D00:00:00.000000000); + +compound_options:(``NULL_MAPPING)!((::);int_opts,float_opts,cont_opts,time_opts); + +// Create the datatype identifiers +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +i8_dt:.arrowkdb.dt.int8[]; +i16_dt:.arrowkdb.dt.int16[]; +i32_dt:.arrowkdb.dt.int32[]; +i64_dt:.arrowkdb.dt.int64[]; +f64_dt:.arrowkdb.dt.float64[]; + +str_dt:.arrowkdb.dt.utf8[]; +bin_dt:.arrowkdb.dt.binary[]; +dec_dt:.arrowkdb.dt.decimal128[38i;2i]; + +bool_dt:.arrowkdb.dt.boolean[]; +f32_dt:.arrowkdb.dt.float32[]; +d32_dt:.arrowkdb.dt.date32[]; + +// Create the field identifiers +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +i8_fd:.arrowkdb.fd.field[`int8;i8_dt]; +i16_fd:.arrowkdb.fd.field[`int16;i16_dt]; +i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; + +str_fd:.arrowkdb.fd.field[`string;str_dt]; +bin_fd:.arrowkdb.fd.field[`binary;bin_dt]; +dec_fd:.arrowkdb.fd.field[`decimal;dec_dt]; + +bool_fd:.arrowkdb.fd.field[`bool;bool_dt]; +f32_fd:.arrowkdb.fd.field[`float32;f32_dt]; +d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; + +numeric_schema:.arrowkdb.sc.schema[(ts_fd, i16_fd, i32_fd, i64_fd, f64_fd)]; 
+contiguous_schema:.arrowkdb.sc.schema[(str_fd, bin_fd, dec_fd)]; + +// Create a field containing the list datatype +list_dt:.arrowkdb.dt.list[i8_fd]; +list_fd:.arrowkdb.fd.field[`list_field;list_dt]; + +// Create a field containing the struct datatype +struct_dt:.arrowkdb.dt.struct[(bool_fd,f32_fd,d32_fd)]; +struct_fd:.arrowkdb.fd.field[`struct_field;struct_dt]; + +// Create fields containing the map datatype +map_dt:.arrowkdb.dt.map[i64_dt;f64_dt] +map_fd:.arrowkdb.fd.field[`map;map_dt]; + +// Create the schema containing the list and struct fields +compound_schema:.arrowkdb.sc.schema[(list_fd,struct_fd,map_fd)]; + +// Print the schema +-1"\nNumeric schema:"; +.arrowkdb.sc.printSchema[numeric_schema]; + +-1"\nContiguous schema:"; +.arrowkdb.sc.printSchema[contiguous_schema]; + +-1"\nCompound schema:"; +.arrowkdb.sc.printSchema[compound_schema]; + +// Number of items in each array +N:5 + +// Create data for each column in the table +ts_data:asc N?0p; + +i16_data:N?100h; +i16_data[0]:3h; +i32_data:N?100i; +i32_data[1]:4i; +i64_data:N?100; +i64_data[2]:5; +f64_data:N?100f; +f64_data[3]:6.54f; + +str_data:N?("start";"stop";"alert";"acknowledge";""); +str_data[0]:"start" +bin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +bin_data[1]:"x"$"alert" +dec_data:{"F"$.Q.f[2]x} each N?(10f) +dec_data[2]:3.21f + +N:3 +bool_data:N?(0b;1b); +bool_data[0]:1b; +f32_data:N?100e; +f32_data[1]:9.87e; +d32_data:N?(2012.11.10;2010.07.18;2011.07.16;2014.07.15;2016.07.11); +d32_data[2]:2012.11.10; + +// Combine the data for numeric columns +numeric_data:(ts_data;i16_data;i32_data;i64_data;f64_data); +// Combine the data for contiguous columns +contiguous_data:(str_data;bin_data;dec_data); + +// Combine the array data for the list and struct columns +list_array:(enlist 0x00;(0x0102);(0x030405)); +struct_array:(bool_data;f32_data;d32_data); +map_array:((enlist 1)!(enlist 1.23);(2 2)!(4.56 7.89);(3 3 3)!(9.87 6.54 3.21)) 
+compound_data:(list_array;struct_array;map_array); + +// Pretty print the Arrow table populated from the numeric data +compound_options[`DECIMAL128_AS_DOUBLE]:1 + +-1"\nNumeric table:"; +.arrowkdb.tb.prettyPrintTable[numeric_schema;numeric_data;compound_options]; + +// Show the string data as an arrow table +-1"\nContiguous table:"; +.arrowkdb.tb.prettyPrintTable[contiguous_schema;contiguous_data;compound_options] + +// Show the list data as an arrow table +-1"\nCompound table:"; +.arrowkdb.tb.prettyPrintTable[compound_schema;compound_data;compound_options] + +//---------------------------// +// Example-1. Apache ORC file// +//---------------------------// + +// Write the schema and array data to a ORC file +compound_options[`WITH_NULL_BITMAP]:1; + +orc_numeric:"numeric_bitmap.orc"; +orc_contiguous:"contiguous_bitmap.orc"; +orc_compound:"compound_bitmap.orc"; + +compound_options[`ORC_CHUNK_SIZE]:1024 + +.arrowkdb.orc.writeOrc[orc_numeric;numeric_schema;numeric_data;compound_options] +.arrowkdb.orc.writeOrc[orc_contiguous;contiguous_schema;contiguous_data;compound_options] +.arrowkdb.orc.writeOrc[orc_compound;compound_schema;compound_data;compound_options] + +show ls orc_numeric +show ls orc_contiguous +show ls orc_compound + +// Read the schema back and compare +orc_numeric_schema:.arrowkdb.orc.readOrcSchema[orc_numeric]; +orc_contiguous_schema:.arrowkdb.orc.readOrcSchema[orc_contiguous]; +orc_compound_schema:.arrowkdb.orc.readOrcSchema[orc_compound]; + +show .arrowkdb.sc.equalSchemas[numeric_schema;orc_numeric_schema] +show .arrowkdb.sc.equalSchemas[contiguous_schema;orc_contiguous_schema] +show .arrowkdb.sc.equalSchemas[compound_schema;orc_compound_schema] + +show numeric_schema~orc_numeric_schema +show contiguous_schema~orc_contiguous_schema +show compound_schema~orc_compound_schema + +// Read the array data back and compare +orc_numeric_data:.arrowkdb.orc.readOrcData[orc_numeric;compound_options]; 
+orc_contiguous_data:.arrowkdb.orc.readOrcData[orc_contiguous;compound_options]; +orc_compound_data:.arrowkdb.orc.readOrcData[orc_compound;compound_options]; + +show numeric_data~first orc_numeric_data +show contiguous_data~first orc_contiguous_data +show compound_data~first orc_compound_data + +// Compare null bitmaps of arrow data +numeric_nulls:(00000b;10000b;01000b;00100b;00010b); +contiguous_nulls:(10000b;01000b;00100b); +list_nulls:(enlist 0b;01b;000b); +struct_nulls:(100b;010b;001b); +map_nulls:((enlist 0b)!(enlist 0b);00b!00b;000b!010b) + +orc_numeric_nulls:last orc_numeric_data; +orc_contiguous_nulls:last orc_contiguous_data; +orc_list_nulls:last[orc_compound_data][0] +orc_struct_nulls:last[orc_compound_data][1] +orc_map_nulls:last[orc_compound_data][2] + +show numeric_nulls~numeric_nulls & orc_numeric_nulls +show contiguous_nulls~contiguous_nulls & orc_contiguous_nulls +show list_nulls~orc_list_nulls +show struct_nulls~struct_nulls & orc_struct_nulls +show map_nulls~orc_map_nulls + +rm orc_numeric; +rm orc_contiguous; +rm orc_compound; + + +-1 "\n+----------------------------------------+\n"; + +// Process off +exit 0; diff --git a/q/arrowkdb.q b/q/arrowkdb.q index b3d56cd..d2f92f3 100644 --- a/q/arrowkdb.q +++ b/q/arrowkdb.q @@ -106,6 +106,19 @@ tb.prettyPrintTable_:`arrowkdb 2:(`prettyPrintTable;3); tb.prettyPrintTable:{[x;y;z] -1 tb.prettyPrintTable_[x;y;z];}; tb.prettyPrintTableFromTable:{[table;options] tb.prettyPrintTable[sc.inferSchema[table];value flip table;options]}; +// ORC files +orc.writeOrc:`arrowkdb 2:(`writeORC;4); +orc.writeOrcFromTable:{[filename;table;options] orc.writeOrc[filename;sc.inferSchema[table];value flip table;options]}; +orc.readOrcSchema:`arrowkdb 2:(`readORCSchema;1); +orc.readOrcData:`arrowkdb 2:(`readORCData;2); +orc.readOrcToTable:{[filename;options] + fields:fd.fieldName each sc.schemaFields[orc.readOrcSchema[filename]]; + data:orc.readOrcData[filename;options]; + $[1~options`WITH_NULL_BITMAP; + (flip fields!first 
data;flip fields!last data); + flip fields!data + ] + }; // parquet files pq.writeParquet:`arrowkdb 2:(`writeParquet;4); @@ -132,7 +145,6 @@ pq.readParquetRowGroupsToTable:{[filename;row_groups;columns;options] ] }; - // arrow files ipc.writeArrow:`arrowkdb 2:(`writeArrow;4); ipc.writeArrowFromTable:{[filename;table;options] ipc.writeArrow[filename;sc.inferSchema[table];value flip table;options]}; diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 353d709..1d78225 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -144,176 +144,199 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint8_array = static_pointer_cast(array_data); + auto length = uint8_array->length(); if( type_overrides.null_mapping.have_uint8 && uint8_array->null_count() ){ - for( auto i = 0ll; i < uint8_array->length(); ++i ){ + for( auto i = 0ll; i < length; ++i ){ kG( k_array )[i] = ( uint8_array->IsNull( i ) * type_overrides.null_mapping.uint8_null ) + ( !uint8_array->IsNull( i ) * uint8_array->Value( i ) ); } } else { - memcpy(kG(k_array), uint8_array->raw_values(), uint8_array->length() * sizeof(arrow::UInt8Array::value_type)); + memcpy( &kG( k_array )[index], uint8_array->raw_values(), length * sizeof( arrow::UInt8Array::value_type ) ); } + index += length; } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int8_array = static_pointer_cast(array_data); + auto length = int8_array->length(); if( type_overrides.null_mapping.have_int8 && int8_array->null_count() ){ - for( auto i = 0ll; i < int8_array->length(); ++i ){ + for( auto i = 0ll; i < length; ++i ){ kG( k_array )[i] = ( int8_array->IsNull( i ) * type_overrides.null_mapping.int8_null ) + ( !int8_array->IsNull( i ) * int8_array->Value( i ) ); } } else { - memcpy(kG(k_array), int8_array->raw_values(), int8_array->length() * sizeof(arrow::Int8Array::value_type)); + memcpy( &kG( 
k_array )[index], int8_array->raw_values(), length * sizeof( arrow::Int8Array::value_type ) ); } + index += length; } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint16_array = static_pointer_cast(array_data); + auto length = uint16_array->length(); if( type_overrides.null_mapping.have_uint16 && uint16_array->null_count() ){ - for( auto i = 0ll; i < uint16_array->length(); ++i ){ + for( auto i = 0ll; i < length; ++i ){ kH( k_array )[i] = ( uint16_array->IsNull( i ) * type_overrides.null_mapping.uint16_null ) + ( !uint16_array->IsNull( i ) * uint16_array->Value( i ) ); } } else { - memcpy(kH(k_array), uint16_array->raw_values(), uint16_array->length() * sizeof(arrow::UInt16Array::value_type)); + memcpy( &kH( k_array )[index], uint16_array->raw_values(), length * sizeof( arrow::UInt16Array::value_type ) ); } + index += length; } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int16_array = static_pointer_cast(array_data); + auto length = int16_array->length(); if( type_overrides.null_mapping.have_int16 && int16_array->null_count() ){ - for( auto i = 0ll; i < int16_array->length(); ++i ){ + for( auto i = 0ll; i < length; ++i ){ kH( k_array )[i] = ( int16_array->IsNull( i ) * type_overrides.null_mapping.int16_null ) + ( !int16_array->IsNull( i ) * int16_array->Value( i ) ); } } else { - memcpy(kH(k_array), int16_array->raw_values(), int16_array->length() * sizeof(arrow::Int16Array::value_type)); + memcpy( &kH( k_array )[index], int16_array->raw_values(), length * sizeof( arrow::Int16Array::value_type ) ); } + index += length; } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint32_array = static_pointer_cast(array_data); + auto length = uint32_array->length(); if( type_overrides.null_mapping.have_uint32 && uint32_array->null_count() ){ - for( auto i 
= 0ll; i < uint32_array->length(); ++i ){ + for( auto i = 0ll; i < length; ++i ){ kI( k_array )[i] = ( uint32_array->IsNull( i ) * type_overrides.null_mapping.uint32_null ) + ( !uint32_array->IsNull( i ) * uint32_array->Value( i ) ); } } else { - memcpy(kI(k_array), uint32_array->raw_values(), uint32_array->length() * sizeof(arrow::UInt32Array::value_type)); + memcpy( &kI( k_array )[index], uint32_array->raw_values(), length * sizeof( arrow::UInt32Array::value_type ) ); } + index += length; } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int32_array = static_pointer_cast(array_data); + auto length = int32_array->length(); if( type_overrides.null_mapping.have_int32 && int32_array->null_count() ){ - for( auto i = 0ll; i < int32_array->length(); ++i ){ + for( auto i = 0ll; i < length; ++i ){ kI( k_array )[i] = ( int32_array->IsNull( i ) * type_overrides.null_mapping.int32_null ) + (!int32_array->IsNull( i ) * int32_array->Value( i ) ); } } else { - memcpy(kI(k_array), int32_array->raw_values(), int32_array->length() * sizeof(arrow::Int32Array::value_type)); + memcpy( &kI( k_array )[index], int32_array->raw_values(), length * sizeof( arrow::Int32Array::value_type ) ); } + index += length; } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint64_array = static_pointer_cast(array_data); + auto length = uint64_array->length(); if( type_overrides.null_mapping.have_uint64 && uint64_array->null_count() ){ - for( auto i = 0ll; i < uint64_array->length(); ++i ){ + for( auto i = 0ll; i < length; ++i ){ kJ( k_array )[i] = ( uint64_array->IsNull( i ) * type_overrides.null_mapping.uint64_null ) + ( !uint64_array->IsNull( i ) * uint64_array->Value( i ) ); } } else { - memcpy(kJ(k_array), uint64_array->raw_values(), uint64_array->length() * sizeof(arrow::UInt64Array::value_type)); + memcpy( &kJ( k_array )[index], 
uint64_array->raw_values(), length * sizeof( arrow::UInt64Array::value_type ) ); } + index += length; } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int64_array = static_pointer_cast(array_data); + auto length = int64_array->length(); if( type_overrides.null_mapping.have_int64 && int64_array->null_count() ){ - for( auto i = 0ll; i < int64_array->length(); ++i ){ + for( auto i = 0ll; i < length; ++i ){ kJ( k_array )[i] = ( int64_array->IsNull( i ) * type_overrides.null_mapping.int64_null ) + (!int64_array->IsNull( i ) * int64_array->Value( i ) ); } } else { - memcpy(kJ(k_array), int64_array->raw_values(), int64_array->length() * sizeof(arrow::Int64Array::value_type)); + memcpy( &kJ( k_array )[index], int64_array->raw_values(), length * sizeof( arrow::Int64Array::value_type ) ); } + index += length; } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto hfl_array = static_pointer_cast(array_data); + auto length = hfl_array->length(); if( type_overrides.null_mapping.have_float16 && hfl_array->null_count() ){ - for( auto i = 0ll; i < hfl_array->length(); ++i ){ + for( auto i = 0ll; i < length; ++i ){ kH( k_array )[i] = ( hfl_array->IsNull( i ) * type_overrides.null_mapping.float16_null ) + ( !hfl_array->IsNull( i ) * hfl_array->Value( i ) ); } } else { - memcpy(kH(k_array), hfl_array->raw_values(), hfl_array->length() * sizeof(arrow::HalfFloatArray::value_type)); + memcpy( &kH( k_array )[index], hfl_array->raw_values(), length * sizeof( arrow::HalfFloatArray::value_type ) ); } + index += length; } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto fl_array = static_pointer_cast(array_data); + auto length = fl_array->length(); if( type_overrides.null_mapping.have_float32 && fl_array->null_count() ){ - for( auto i = 0ll; i < fl_array->length(); ++i ){ + for( 
auto i = 0ll; i < length; ++i ){ kE( k_array )[i] = ( fl_array->IsNull( i ) * type_overrides.null_mapping.float32_null ) + ( !fl_array->IsNull( i ) * fl_array->Value( i ) ); } } else { - memcpy(kE(k_array), fl_array->raw_values(), fl_array->length() * sizeof(arrow::FloatArray::value_type)); + memcpy( &kE( k_array )[index], fl_array->raw_values(), length * sizeof( arrow::FloatArray::value_type ) ); } + index += length; } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto dbl_array = static_pointer_cast(array_data); + auto length = dbl_array->length(); if( type_overrides.null_mapping.have_float64 && dbl_array->null_count() ){ - for( auto i = 0ll; i < dbl_array->length(); ++i ){ + for( auto i = 0ll; i < length; ++i ){ kF( k_array )[i] = ( dbl_array->IsNull( i ) * type_overrides.null_mapping.float64_null ) + ( !dbl_array->IsNull( i ) * dbl_array->Value( i ) ); } } else { - memcpy(kF(k_array), dbl_array->raw_values(), dbl_array->length() * sizeof(arrow::DoubleArray::value_type)); + memcpy( &kF( k_array )[index], dbl_array->raw_values(), length * sizeof( arrow::DoubleArray::value_type ) ); } + index += length; } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto str_array = static_pointer_cast(array_data); - for (auto i = 0; i < str_array->length(); ++i) { + auto length = str_array->length(); + for( auto i = 0; i < length; ++i ){ K k_str = nullptr; if( type_overrides.null_mapping.have_string && str_array->IsNull( i ) ){ k_str = ktn( KC, type_overrides.null_mapping.string_null.length() ); - memcpy( kG(k_str), type_overrides.null_mapping.string_null.data(), type_overrides.null_mapping.string_null.length() ); + memcpy( kG( k_str ), type_overrides.null_mapping.string_null.data(), type_overrides.null_mapping.string_null.length() ); } else{ auto str_data = str_array->GetString(i); @@ -328,7 +351,8 @@ template<> void AppendArray(shared_ptr 
array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto str_array = static_pointer_cast(array_data); - for (auto i = 0; i < str_array->length(); ++i) { + auto length = str_array->length(); + for( auto i = 0; i < length; ++i ){ K k_str = nullptr; if( type_overrides.null_mapping.have_large_string && str_array->IsNull( i ) ){ k_str = ktn( KC, type_overrides.null_mapping.large_string_null.length() ); @@ -347,7 +371,8 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto bin_array = static_pointer_cast(array_data); - for (auto i = 0; i < bin_array->length(); ++i) { + auto length = bin_array->length(); + for( auto i = 0; i < length; ++i ){ K k_bin = nullptr; if( type_overrides.null_mapping.have_binary && bin_array->IsNull( i ) ){ k_bin = ktn( KG, type_overrides.null_mapping.binary_null.length() ); @@ -366,7 +391,8 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto bin_array = static_pointer_cast(array_data); - for (auto i = 0; i < bin_array->length(); ++i) { + auto length = bin_array->length(); + for( auto i = 0; i < length; ++i ){ K k_bin = nullptr; if( type_overrides.null_mapping.have_large_binary && bin_array->IsNull( i ) ){ k_bin = ktn( KG, type_overrides.null_mapping.large_binary_null.length() ); @@ -385,7 +411,8 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto fixed_bin_array = static_pointer_cast(array_data); - for (auto i = 0; i < fixed_bin_array->length(); ++i) { + auto length = fixed_bin_array->length(); + for( auto i = 0; i < length; ++i ){ K k_bin = nullptr; if( type_overrides.null_mapping.have_fixed_binary && fixed_bin_array->IsNull( i ) ){ k_bin = ktn( KG, type_overrides.null_mapping.fixed_binary_null.length() ); @@ -405,7 +432,8 @@ void AppendArray(shared_ptr array_data, K k_a { TemporalConversion 
tc(array_data->type()); auto d32_array = static_pointer_cast(array_data); - for (auto i = 0; i < d32_array->length(); ++i){ + auto length = d32_array->length(); + for( auto i = 0; i < length; ++i ){ kI( k_array )[index++] = ( ( type_overrides.null_mapping.have_date32 && d32_array->IsNull( i ) ) * type_overrides.null_mapping.date32_null ) + ( !( type_overrides.null_mapping.have_date32 && d32_array->IsNull( i ) ) * tc.ArrowToKdb( d32_array->Value( i ) ) ); @@ -417,7 +445,8 @@ void AppendArray(shared_ptr array_data, K k_a { TemporalConversion tc(array_data->type()); auto d64_array = static_pointer_cast(array_data); - for (auto i = 0; i < d64_array->length(); ++i){ + auto length = d64_array->length(); + for( auto i = 0; i < length; ++i ){ kJ( k_array )[index++] = ( ( type_overrides.null_mapping.have_date64 && d64_array->IsNull( i ) ) * type_overrides.null_mapping.date64_null ) + ( !( type_overrides.null_mapping.have_date64 && d64_array->IsNull( i ) ) * tc.ArrowToKdb( d64_array->Value( i ) ) ); @@ -430,7 +459,8 @@ void AppendArray(shared_ptr array_data, K TemporalConversion tc(array_data->type()); auto ts_array = static_pointer_cast(array_data); auto timestamp_type = static_pointer_cast(ts_array->type()); - for (auto i = 0; i < ts_array->length(); ++i){ + auto length = ts_array->length(); + for( auto i = 0; i < length; ++i ){ kJ( k_array )[index++] = ( ( type_overrides.null_mapping.have_timestamp && ts_array->IsNull( i ) ) * type_overrides.null_mapping.timestamp_null ) + ( !( type_overrides.null_mapping.have_timestamp && ts_array->IsNull( i ) ) * tc.ArrowToKdb( ts_array->Value( i ) ) ); @@ -443,7 +473,8 @@ void AppendArray(shared_ptr array_data, K k_a TemporalConversion tc(array_data->type()); auto t32_array = static_pointer_cast(array_data); auto time32_type = static_pointer_cast(t32_array->type()); - for (auto i = 0; i < t32_array->length(); ++i){ + auto length = t32_array->length(); + for( auto i = 0; i < length; ++i ){ kI( k_array )[index++] = ( ( 
type_overrides.null_mapping.have_time32 && t32_array->IsNull( i ) ) * type_overrides.null_mapping.time32_null ) + ( !( type_overrides.null_mapping.have_time32 && t32_array->IsNull( i ) ) * tc.ArrowToKdb( t32_array->Value( i ) ) ); @@ -456,7 +487,8 @@ void AppendArray(shared_ptr array_data, K k_a TemporalConversion tc(array_data->type()); auto t64_array = static_pointer_cast(array_data); auto time64_type = static_pointer_cast(t64_array->type()); - for (auto i = 0; i < t64_array->length(); ++i){ + auto length = t64_array->length(); + for( auto i = 0; i < length; ++i ){ kJ( k_array )[index++] = ( ( type_overrides.null_mapping.have_time64 && t64_array->IsNull( i ) ) * type_overrides.null_mapping.time64_null ) + ( !( type_overrides.null_mapping.have_time64 && t64_array->IsNull( i ) ) * tc.ArrowToKdb( t64_array->Value( i ) ) ); @@ -468,7 +500,8 @@ void AppendArray(shared_ptr array_data, K k_ { auto dec_array = static_pointer_cast(array_data); auto dec_type = static_pointer_cast(dec_array->type()); - for (auto i = 0; i < dec_array->length(); ++i) { + auto length = dec_array->length(); + for( auto i = 0; i < length; ++i ){ auto decimal = arrow::Decimal128(dec_array->Value(i)); if (type_overrides.decimal128_as_double) { // Convert the decimal to a double @@ -492,7 +525,8 @@ void AppendArray(shared_ptr array_data, K k TemporalConversion tc(array_data->type()); auto dur_array = static_pointer_cast(array_data); auto duration_type = static_pointer_cast(dur_array->type()); - for (auto i = 0; i < dur_array->length(); ++i){ + auto length = dur_array->length(); + for( auto i = 0; i < length; ++i ){ kJ( k_array )[index++] = ( ( type_overrides.null_mapping.have_duration && dur_array->IsNull( i ) ) * type_overrides.null_mapping.duration_null ) + ( !( type_overrides.null_mapping.have_duration && dur_array->IsNull( i ) ) * tc.ArrowToKdb( dur_array->Value( i ) ) ); @@ -503,22 +537,25 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& 
type_overrides) { auto month_array = static_pointer_cast(array_data); + auto length = month_array->length(); if( type_overrides.null_mapping.have_month_interval && month_array->null_count() ){ - for( auto i = 0ll; i < month_array->length(); ++i ){ + for( auto i = 0ll; i < length; ++i ){ kI( k_array )[i] = ( month_array->IsNull( i ) * type_overrides.null_mapping.month_interval_null ) + ( !month_array->IsNull( i ) * month_array->Value( i ) ); } } else { - memcpy(kI(k_array), month_array->raw_values(), month_array->length() * sizeof(arrow::MonthIntervalArray::value_type)); + memcpy( &kI( k_array )[index], month_array->raw_values(), length * sizeof( arrow::MonthIntervalArray::value_type ) ); } + index += length; } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto dt_array = static_pointer_cast(array_data); - for (auto i = 0; i < dt_array->length(); ++i){ + auto length = dt_array->length(); + for( auto i = 0; i < length; ++i ){ kJ( k_array )[index++] = ( ( type_overrides.null_mapping.have_day_time_interval && dt_array->IsNull( i ) ) * type_overrides.null_mapping.day_time_interval_null ) + ( !( type_overrides.null_mapping.have_day_time_interval && dt_array->IsNull( i ) ) * DayTimeInterval_KTimespan( dt_array->Value( i ) ) ); diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 80254b6..4700fc5 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -21,6 +21,7 @@ namespace Options // Int options const std::string ARROW_CHUNK_ROWS = "ARROW_CHUNK_ROWS"; const std::string PARQUET_CHUNK_SIZE = "PARQUET_CHUNK_SIZE"; + const std::string ORC_CHUNK_SIZE = "ORC_CHUNK_SIZE"; const std::string PARQUET_MULTITHREADED_READ = "PARQUET_MULTITHREADED_READ"; const std::string USE_MMAP = "USE_MMAP"; const std::string DECIMAL128_AS_DOUBLE = "DECIMAL128_AS_DOUBLE"; @@ -63,6 +64,7 @@ namespace Options const static std::set int_options = { ARROW_CHUNK_ROWS, PARQUET_CHUNK_SIZE, + ORC_CHUNK_SIZE, PARQUET_MULTITHREADED_READ, 
USE_MMAP, DECIMAL128_AS_DOUBLE, diff --git a/src/TableData.cpp b/src/TableData.cpp index b9ba2e9..7302038 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -2,6 +2,10 @@ #include #include +#ifndef _WIN32 +#include +#endif + #include #include #include @@ -24,7 +28,7 @@ #include "KdbOptions.h" -// @@@ +// @@@ // It is possible to check a loaded schema (from parquet file/arrow file/arrow // stream) to see if any of the fields have been defined as nullable. But what do you do // with nullable fields in externlly loaded schemas: nothing, warning, error? @@ -52,7 +56,7 @@ std::vector> MakeArrays(std::shared_ptrt == 0 && array_data->n == 0) { // Empty table } else { - // Only count up to the number of schema fields. Additional trailing data + // Only count up to the number of schema fields. Additional trailing data // in the kdb mixed list is ignored (to allow for ::) for (auto i = 0; i < schema->num_fields(); ++i) { auto k_array = kK(array_data)[i]; @@ -231,7 +235,7 @@ K readParquetSchema(K parquet_file) std::shared_ptr schema; PARQUET_THROW_NOT_OK(reader->GetSchema(&schema)); - + // Add each field from the table to the field store // Add each datatype from the table to the datatype store //const auto schema = table->schema(); @@ -590,7 +594,7 @@ K readArrowData(K arrow_file, K options) if (use_mmap) { PARQUET_ASSIGN_OR_THROW( infile, - arrow::io::MemoryMappedFile::Open(kx::arrowkdb::GetKdbString(arrow_file), + arrow::io::MemoryMappedFile::Open(kx::arrowkdb::GetKdbString(arrow_file), arrow::io::FileMode::READ)); } else { PARQUET_ASSIGN_OR_THROW( @@ -805,3 +809,166 @@ K parseArrowData(K char_array, K options) KDB_EXCEPTION_CATCH; } + +K readORCData(K orc_file, K options) +{ + KDB_EXCEPTION_TRY; + +#ifdef _WIN32 + return krr((S)"ORC files are not supported on Windows"); +#else + if (!kx::arrowkdb::IsKdbString(orc_file)) + return krr((S)"orc_file not 11h or 0 of 10h"); + + // Parse the options + auto read_options = kx::arrowkdb::KdbOptions(options, 
kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); + + // Use memmap + int64_t use_mmap = 0; + read_options.GetIntOption(kx::arrowkdb::Options::USE_MMAP, use_mmap); + + // Type mapping overrides + kx::arrowkdb::TypeMappingOverride type_overrides{ read_options }; + + std::shared_ptr infile; + if (use_mmap) { + PARQUET_ASSIGN_OR_THROW( + infile, + arrow::io::MemoryMappedFile::Open(kx::arrowkdb::GetKdbString(orc_file), + arrow::io::FileMode::READ)); + } else { + PARQUET_ASSIGN_OR_THROW( + infile, + arrow::io::ReadableFile::Open(kx::arrowkdb::GetKdbString(orc_file), + arrow::default_memory_pool())); + } + // Open ORC file reader + auto maybe_reader = arrow::adapters::orc::ORCFileReader::Open(infile, arrow::default_memory_pool()); + + std::unique_ptr reader = std::move(maybe_reader.ValueOrDie()); + + // Read entire file as a single Arrow table + auto maybe_table = reader->Read(); + + std::shared_ptr table = maybe_table.ValueOrDie(); + + const auto schema = table->schema(); + SchemaContainsNullable(schema); + const auto col_num = schema->num_fields(); + K data = ktn(0, col_num); + for (auto i = 0; i < col_num; ++i) { + auto chunked_array = table->column(i); + kK(data)[i] = kx::arrowkdb::ReadChunkedArray(chunked_array, type_overrides); + } + + int64_t with_null_bitmap = 0; + read_options.GetIntOption( kx::arrowkdb::Options::WITH_NULL_BITMAP, with_null_bitmap ); + if( with_null_bitmap ){ + K bitmap = ktn( 0, col_num ); + for( auto i = 0; i < col_num; ++i ){ + auto chunked_array = table->column( i ); + kK( bitmap )[i] = kx::arrowkdb::ReadChunkedArrayNullBitmap( chunked_array, type_overrides ); + } + K array = data; + data = ktn( 0, 2 ); + kK( data )[0] = array; + kK( data )[1] = bitmap; + } + + return data; +#endif + + KDB_EXCEPTION_CATCH; +} + +K readORCSchema(K orc_file) +{ + KDB_EXCEPTION_TRY; + +#ifdef _WIN32 + return krr((S)"ORC files are not supported on Windows"); +#else + if (!kx::arrowkdb::IsKdbString(orc_file)) + return krr((S)"orc_file 
not 11h or 0 of 10h"); + + std::shared_ptr infile; + PARQUET_ASSIGN_OR_THROW( + infile, + arrow::io::ReadableFile::Open(kx::arrowkdb::GetKdbString(orc_file), + arrow::default_memory_pool())); + + auto maybe_reader = arrow::adapters::orc::ORCFileReader::Open(infile, arrow::default_memory_pool()); + + std::unique_ptr reader = std::move(maybe_reader.ValueOrDie()); + + auto maybe_schema = reader->ReadSchema(); + + std::shared_ptr schema = maybe_schema.ValueOrDie(); + // Add each field from the table to the field store + // Add each datatype from the table to the datatype store + //const auto schema = table->schema(); + SchemaContainsNullable(schema); + for (auto field : schema->fields()) { + kx::arrowkdb::GetFieldStore()->Add(field); + kx::arrowkdb::GetDatatypeStore()->Add(field->type()); + } + + // Return the new schema_id + return ki(kx::arrowkdb::GetSchemaStore()->Add(schema)); +#endif + + KDB_EXCEPTION_CATCH; +} + + +K writeORC(K orc_file, K schema_id, K array_data, K options) +{ + KDB_EXCEPTION_TRY; + +#ifdef _WIN32 + return krr((S)"ORC files are not supported on Windows"); +#else + if (!kx::arrowkdb::IsKdbString(orc_file)) + return krr((S)"orc_file not 11h or 0 of 10h"); + if (schema_id->t != -KI) + return krr((S)"schema_id not -6h"); + + const auto schema = kx::arrowkdb::GetSchemaStore()->Find(schema_id->i); + if (!schema) + return krr((S)"unknown schema"); + + std::string path = kx::arrowkdb::GetKdbString( orc_file ); + std::shared_ptr outfile; + PARQUET_ASSIGN_OR_THROW( + outfile, + arrow::io::FileOutputStream::Open( path ) ); + + // Parse the options + auto write_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); + + int64_t orc_chunk_size = 1024*1024; + write_options.GetIntOption(kx::arrowkdb::Options::ORC_CHUNK_SIZE, orc_chunk_size); + + auto used_write = arrow::adapters::orc::WriteOptions(); + used_write.batch_size = orc_chunk_size; + + auto maybe_writer = 
arrow::adapters::orc::ORCFileWriter::Open(outfile.get(), used_write); + + std::unique_ptr writer = std::move(maybe_writer.ValueOrDie()); + + + // Type mapping overrides + kx::arrowkdb::TypeMappingOverride type_overrides{ write_options }; + + // Create the arrow table + auto table = MakeTable(schema, array_data, type_overrides); + + PARQUET_THROW_NOT_OK( writer->Write( *table ) ); + + PARQUET_THROW_NOT_OK( writer->Close() ); + + return ( K )0; +#endif + + KDB_EXCEPTION_CATCH; +} diff --git a/src/TableData.h b/src/TableData.h index 35f42b3..c66cd95 100644 --- a/src/TableData.h +++ b/src/TableData.h @@ -44,10 +44,10 @@ extern "C" * number. Each kdb object representing one of the arrays must be structured * according to the field's datatype. This required array data structure is * detailed for each of the datatype constructor functions. - * + * * Developer use only - Only useful for manual testing, do not expose in * release version of arrowkdb.q since it has no practical use - * + * * Supported options: * * DECIMAL128_AS_DOUBLE (long) - Flag indicating whether to override the @@ -82,7 +82,7 @@ extern "C" * MICRO granularity. In such cases the parquet/arrow file writer will return * an error. * - * Supported options: + * Supported options: * * PARQUET_CHUNK_SIZE (long) - Controls the approximate size of encoded data * pages within a column chunk. Default 1MB @@ -212,7 +212,7 @@ extern "C" * * @param arrow_file String name of the arrow file to write * @param schema_id The schema identifier - * @param array_data Mixed list of arrow array data to be written to the + * @param array_data Mixed list of arrow array data to be written to the * file * @options Dictionary of options or generic null (::) to use * defaults. Dictionary key must be a 11h list. 
Values list can be 7h, 11h or @@ -231,19 +231,19 @@ extern "C" EXP K readArrowSchema(K arrow_file); /** - * @brief Reads the arrow array data from the specified arrow IPC record + * @brief Reads the arrow array data from the specified arrow IPC record * batch file * * Supported options: * - * USE_MMAP (long) - Flag indicating whether the parquet file should be memory + * USE_MMAP (long) - Flag indicating whether the IPC file should be memory * mapped in. This can improve performance on systems which support mmap. * Default 0 * * DECIMAL128_AS_DOUBLE (long) - Flag indicating whether to override the * default type mapping for the arrow decimal128 datatype and instead * represent it as a double (9h). Default 0. - * + * * @param arrow_file String name of the arrow file to read * @options Dictionary of options or generic null (::) to use * defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or @@ -303,6 +303,67 @@ extern "C" */ EXP K parseArrowData(K char_array, K options); + /** + * @brief Reads the arrow array data from the specified ORC file + * + * USE_MMAP (long) - Flag indicating whether the ORC file should be memory + * mapped in. This can improve performance on systems which support mmap. + * Default 0 + * + * DECIMAL128_AS_DOUBLE (long) - Flag indicating whether to override the + * default type mapping for the arrow decimal128 datatype and instead + * represent it as a double (9h). Default 0. + * + * @param orc_file String name of the ORC file to read + * @options Dictionary of options or generic null (::) to use + * defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or + * mixed list of -7|-11|4h. 
+ * @return Mixed list of arrow array objects + */ + EXP K readORCData(K orc_file, K options); + + /** + * @brief Reads the arrow schema from the specified ORC file + * + * @param orc_file String name of the ORC file to read + * @return Schema identifier + */ + EXP K readORCSchema(K orc_file); + + /** + * @brief Creates an ORC file with the specified arrow schema and populates + * it from a mixed list of arrow array objects. + * + * The mixed list of arrow array data should be ordered in schema field + * number. Each kdb object representing one of the arrays must be structured + * according to the field's datatype. This required array data structure is + * detailed for each of the datatype constructor functions. + * + * Note that in general ORC only supports a small subset of the arrow + * datatypes with more then limited functionality. Most importantly ORC doesn't + * support unsigned integer types. In such case the ORC writer may fail + * to write the file. + * + * Supported options: + * + * DECIMAL128_AS_DOUBLE (long) - Flag indicating whether to override the + * default type mapping for the arrow decimal128 datatype and instead + * represent it as a double (9h). Default 0. + * + * ORC_CHUNK_SIZE (long) - ORC stripe size, to control the approximate size + * of data within a column stripe. This currently defaults to 1MB. + * + * @param orc_file String name of the ORC file to write + * @param schema_id The schema identifier + * @param array_data Mixed list of arrow array data to be written to the + * file + * @options Dictionary of options or generic null (::) to use + * defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or + * mixed list of -7|-11|4h. 
+ * @return NULL on success, error otherwise + */ + EXP K writeORC(K orc_file, K schema_id, K array_data, K options); + } #endif // __TABLE_DATA_H__ diff --git a/tests/orc_dataloader/orc_compound_nulls.t b/tests/orc_dataloader/orc_compound_nulls.t new file mode 100644 index 0000000..4053aa2 --- /dev/null +++ b/tests/orc_dataloader/orc_compound_nulls.t @@ -0,0 +1,104 @@ +// orc_compound_nulls.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping in Apache ORC ||----------+\n"; +comp_opts:(`bool`int8`int64`float32`float64`date32)!(1b;0x02;5;9.87e;6.54;2012.11.10); + +compound_options:(``NULL_MAPPING)!((::);comp_opts); + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +i8_dt:.arrowkdb.dt.int8[]; + +bool_dt:.arrowkdb.dt.boolean[]; +f32_dt:.arrowkdb.dt.float32[]; +d32_dt:.arrowkdb.dt.date32[]; + +i64_dt:.arrowkdb.dt.int64[]; +f64_dt:.arrowkdb.dt.float64[]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +i8_fd:.arrowkdb.fd.field[`int8;i8_dt]; + +bool_fd:.arrowkdb.fd.field[`bool;bool_dt]; +f32_fd:.arrowkdb.fd.field[`float32;f32_dt]; +d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; + +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; + +-1"\n+----------|| Create a field containing the list datatype ||----------+\n"; +list_dt:.arrowkdb.dt.list[i8_fd]; +list_fd:.arrowkdb.fd.field[`list_field;list_dt]; + +-1"\n+----------|| Create a field containing the struct datatype ||----------+\n"; +struct_dt:.arrowkdb.dt.struct[(bool_fd,f32_fd,d32_fd)]; +struct_fd:.arrowkdb.fd.field[`struct_field;struct_dt]; + +-1"\n+----------|| Create fields containing the map datatype ||----------+\n"; +map_dt:.arrowkdb.dt.map[i64_dt;f64_dt] +map_fd:.arrowkdb.fd.field[`map;map_dt]; + 
+-1"\n+----------|| Create the schema containing the list and struct fields ||----------+\n"; +compound_schema:.arrowkdb.sc.schema[(list_fd,struct_fd,map_fd)]; + +-1"\n+----------|| Number of items in each array ||----------+\n"; +N:3 + +bool_data:N?(0b;1b); +bool_data[0]:1b; +f32_data:N?100e; +f32_data[1]:9.87e; +d32_data:N?(2012.11.10;2010.07.18;2011.07.16;2014.07.15;2016.07.11); +d32_data[2]:2012.11.10; + +-1"\n+----------|| Combine the array data for the list and struct columns ||----------+\n"; +list_array:(enlist 0x00;(0x0102);(0x030405)); +struct_array:(bool_data;f32_data;d32_data); +map_array:((enlist 1)!(enlist 1.23);(2 2)!(4.56 7.89);(3 3 3)!(9.87 6.54 3.21)) +compound_data:(list_array;struct_array;map_array); + +-1"\n+----------|| Write the schema and array data to a ORC file ||----------+\n"; +compound_options[`ORC_CHUNK_SIZE]:1024 + +orc_compound:"compound_bitmap.orc"; +.arrowkdb.orc.writeOrc[orc_compound;compound_schema;compound_data;compound_options] + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +compound_options[`WITH_NULL_BITMAP]:1; + +orc_compound_schema:.arrowkdb.orc.readOrcSchema[orc_compound]; +.arrowkdb.sc.equalSchemas[compound_schema;orc_compound_schema] +compound_schema~orc_compound_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +orc_compound_data:.arrowkdb.orc.readOrcData[orc_compound;compound_options]; +compound_data~first orc_compound_data + +-1"\n+----------|| Compare null bitmaps of arrow data ||----------+\n"; +list_nulls:(enlist 0b;01b;000b); +struct_nulls:(100b;010b;001b); +map_nulls:((enlist 0b)!(enlist 0b);00b!00b;000b!010b) + +orc_list_nulls:last[orc_compound_data][0] +orc_struct_nulls:last[orc_compound_data][1] +orc_map_nulls:last[orc_compound_data][2] + +list_nulls~orc_list_nulls +struct_nulls~struct_nulls & orc_struct_nulls +map_nulls~orc_map_nulls + +rm orc_compound; + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show .arrowkdb.util.buildInfo[] +(type 
.arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/orc_dataloader/orc_contiguous_nulls.t b/tests/orc_dataloader/orc_contiguous_nulls.t new file mode 100644 index 0000000..42e28d4 --- /dev/null +++ b/tests/orc_dataloader/orc_contiguous_nulls.t @@ -0,0 +1,79 @@ +// orc_contiguous_nulls.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping in Apache ORC ||----------+\n"; +cont_opts:(`utf8`binary`decimal)!("start";"x"$"alert";3.21f); + +contiguous_options:(``NULL_MAPPING)!((::);cont_opts); + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +str_dt:.arrowkdb.dt.utf8[]; +bin_dt:.arrowkdb.dt.binary[]; +dec_dt:.arrowkdb.dt.decimal128[38i;2i]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +str_fd:.arrowkdb.fd.field[`string;str_dt]; +bin_fd:.arrowkdb.fd.field[`binary;bin_dt]; +dec_fd:.arrowkdb.fd.field[`decimal;dec_dt]; + +contiguous_schema:.arrowkdb.sc.schema[(str_fd, bin_fd, dec_fd)]; + +-1"\n+----------|| Number of items in each array ||----------+\n"; +N:5 + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +str_data:N?("start";"stop";"alert";"acknowledge";""); +str_data[0]:"start" +bin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +bin_data[1]:"x"$"alert" +dec_data:{"F"$.Q.f[2]x} each N?(10f) +dec_data[2]:3.21f + +-1"\n+----------|| Combine the data for contiguous columns ||----------+\n"; +contiguous_options[`DECIMAL128_AS_DOUBLE]:1 + +contiguous_data:(str_data;bin_data;dec_data); + +-1"\n+----------|| Write the schema and array data to a ORC file ||----------+\n"; 
+contiguous_options[`ORC_CHUNK_SIZE]:1024 + +orc_contiguous:"contiguous_bitmap.orc"; +.arrowkdb.orc.writeOrc[orc_contiguous;contiguous_schema;contiguous_data;contiguous_options] + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +contiguous_options[`WITH_NULL_BITMAP]:1; + +orc_contiguous_schema:.arrowkdb.orc.readOrcSchema[orc_contiguous]; +.arrowkdb.sc.equalSchemas[contiguous_schema;orc_contiguous_schema] +contiguous_schema~orc_contiguous_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +orc_contiguous_data:.arrowkdb.orc.readOrcData[orc_contiguous;contiguous_options]; +contiguous_data~first orc_contiguous_data + +-1"\n+----------|| Compare null bitmaps of arrow data ||----------+\n"; +contiguous_nulls:(10000b;01000b;00100b); +orc_contiguous_nulls:last orc_contiguous_data; +contiguous_nulls~contiguous_nulls & orc_contiguous_nulls + +rm orc_contiguous; + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/orc_dataloader/orc_dataloader.t b/tests/orc_dataloader/orc_dataloader.t new file mode 100644 index 0000000..7b0f381 --- /dev/null +++ b/tests/orc_dataloader/orc_dataloader.t @@ -0,0 +1,65 @@ +// orc_dataloader.t + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +i8_dt:.arrowkdb.dt.int8[]; +i16_dt:.arrowkdb.dt.int16[]; +i32_dt:.arrowkdb.dt.int32[]; +i64_dt:.arrowkdb.dt.int64[]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +i8_fd:.arrowkdb.fd.field[`int8;i8_dt]; 
+i16_fd:.arrowkdb.fd.field[`int16;i16_dt]; +i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; + +-1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; +dataloader_schema:.arrowkdb.sc.schema[(ts_fd,i8_fd,i16_fd,i32_fd,i64_fd)]; + +-1"\n+----------|| Number of items in each array ||----------+\n"; +N:10 + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +i8_data:N?0x64; +i16_data:N?100h; +i32_data:N?100i; +i64_data:N?100; + +-1"\n+----------|| Combine the data for all columns ||----------+\n"; +dataloader_data:(ts_data;i8_data;i16_data;i32_data;i64_data); + +-1"\n+----------|| Write the schema and array data to a ORC file ||----------+\n"; +orc_options:(``ORC_CHUNK_SIZE)!((::);1024); + +orc_dataloader:"orc_dataloader.orc" +.arrowkdb.orc.writeOrc[orc_dataloader;dataloader_schema;dataloader_data;orc_options] + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +orc_dataloader_schema:.arrowkdb.orc.readOrcSchema[orc_dataloader]; +.arrowkdb.sc.equalSchemas[dataloader_schema;orc_dataloader_schema] +dataloader_schema~orc_dataloader_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +orc_dataloader_data:.arrowkdb.orc.readOrcData[orc_dataloader;orc_options]; +dataloader_data~orc_dataloader_data +rm orc_dataloader; + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/orc_dataloader/orc_numeric_nulls.t b/tests/orc_dataloader/orc_numeric_nulls.t new file mode 100644 index 0000000..bb5a4b5 --- /dev/null +++ b/tests/orc_dataloader/orc_numeric_nulls.t @@ -0,0 +1,84 @@ +// orc_numeric_nulls.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; 
+rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping in Apache ORC ||----------+\n"; +num_opts:(`int8`int16`int32`int64`float64)!(0x02;3h;4i;5;6.54); + +numeric_options:(``NULL_MAPPING)!((::);num_opts); + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +i8_dt:.arrowkdb.dt.int8[]; +i16_dt:.arrowkdb.dt.int16[]; +i32_dt:.arrowkdb.dt.int32[]; +i64_dt:.arrowkdb.dt.int64[]; +f64_dt:.arrowkdb.dt.float64[]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +i8_fd:.arrowkdb.fd.field[`int8;i8_dt]; +i16_fd:.arrowkdb.fd.field[`int16;i16_dt]; +i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; + +-1"\n+----------|| Create the schema for the list of fields ||----------+\n"; +numeric_schema:.arrowkdb.sc.schema[(ts_fd, i16_fd, i32_fd, i64_fd, f64_fd)]; + +-1"\n+----------|| Number of items in each array ||----------+\n"; +N:5 + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +i16_data:N?100h; +i16_data[0]:3h; +i32_data:N?100i; +i32_data[1]:4i; +i64_data:N?100; +i64_data[2]:5; +f64_data:N?100f; +f64_data[3]:6.54f; + +-1"\n+----------|| Combine the data for numeric columns ||----------+\n"; +numeric_data:(ts_data;i16_data;i32_data;i64_data;f64_data); + +-1"\n+----------|| Write the schema and array data to a ORC file ||----------+\n"; +numeric_options[`ORC_CHUNK_SIZE]:1024 + +orc_numeric:"numeric_bitmap.orc"; +.arrowkdb.orc.writeOrc[orc_numeric;numeric_schema;numeric_data;numeric_options] + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +numeric_options[`WITH_NULL_BITMAP]:1; + +orc_numeric_schema:.arrowkdb.orc.readOrcSchema[orc_numeric]; +.arrowkdb.sc.equalSchemas[numeric_schema;orc_numeric_schema] +numeric_schema~orc_numeric_schema + 
+-1"\n+----------|| Read the array data back and compare ||----------+\n"; +orc_numeric_data:.arrowkdb.orc.readOrcData[orc_numeric;numeric_options]; +numeric_data~first orc_numeric_data + +-1"\n+----------|| Compare null bitmaps of arrow data ||----------+\n"; +numeric_nulls:(00000b;10000b;01000b;00100b;00010b); +orc_numeric_nulls:last orc_numeric_data; +numeric_nulls~numeric_nulls & orc_numeric_nulls + +rm orc_numeric; + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n";