Merge pull request #3 from KxSystems/multithreaded_9.0
Updates for 1.1.0-rc.1
nmcdonnell-kx authored Nov 1, 2022
2 parents 751d6e6 + 501e497 commit 73ef9fc
Showing 12 changed files with 352 additions and 93 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -4,7 +4,7 @@ jobs:
os: linux
- dist: focal
os: linux
- osx_image: xcode10.2
- osx_image: xcode12.5
os: osx
- os: windows
language: c
3 changes: 2 additions & 1 deletion CMakeLists.txt
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 3.1.3)
project(arrowkdb CXX)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -DKXVER=3")
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

@@ -21,6 +21,7 @@ message(STATUS "C API : ${ARROW_INSTALL}")
include_directories (
${ARROW_INSTALL}/include
${_VCPKG_ROOT_DIR}/installed/${VCPKG_TARGET_TRIPLET}/include # where arrow has been installed using vcpkg
C:/Git/vcpkg/installed/x64-windows-static/include
${CMAKE_BINARY_DIR} # For 'k.h', downloaded below
)

71 changes: 17 additions & 54 deletions README.md
@@ -42,8 +42,8 @@ Conversely, Arrow is an in-memory format meant for direct and efficient use for
### Requirements

- kdb+ ≥ 3.5 64-bit (Linux/MacOS/Windows)
- Apache Arrow ≥ 2.0.0
- C++11 or later
- Apache Arrow = 9.0.0 (or ≥ 6.0.0 if building `arrowkdb` from source)
- C++14 or later
- CMake ≥ 3.1.3


@@ -54,65 +54,33 @@ Conversely, Arrow is an in-memory format meant for direct and efficient use for

Follow the instructions [here](https://arrow.apache.org/install/#c-and-glib-c-packages-for-debian-gnulinux-ubuntu-and-centos) to install `libarrow-dev` and `libparquet-dev` from Apache's APT or Yum repositories.

#### MacOS

Follow the instructions [here](https://arrow.apache.org/install/#c-and-glib-c-packages-on-homebrew) to install `apache-arrow` using Homebrew.

#### Windows (using `vcpkg`)

A `vcpkg` installation of Arrow is available as described [here](https://arrow.apache.org/install/#c-package-on-vcpkg). This requires installing the `x64-windows` triplet for Arrow, then copying the `vcpkg`-installed DLLs (Arrow, Parquet and compression libs) to the `%QHOME%\w64` directory:

```bash
C:\Git> git clone https://github.com/Microsoft/vcpkg.git
C:\Git> cd vcpkg
C:\Git\vcpkg> bootstrap-vcpkg.bat
C:\Git\vcpkg> vcpkg integrate install
C:\Git\vcpkg> vcpkg install arrow:x64-windows
C:\Git\vcpkg> copy C:\Git\vcpkg\installed\x64-windows\bin\*.dll %QHOME%\w64
```

#### Windows (building Arrow from source)

It is also possible to build Arrow from source. Full details are provided [here](https://arrow.apache.org/docs/developers/cpp/windows.html) but the basic steps are as follows:

##### Snappy

First download and build snappy which is required by Parquet. From a Visual Studio command prompt:
Note: If using the packaged version of `arrowkdb`, you should install version 9.0.0 of both:

```bash
C:\Git> git clone https://github.com/google/snappy.git
C:\Git> cd snappy
sudo apt install -y -V libarrow-dev=9.0.0-1
sudo apt install -y -V libparquet-dev=9.0.0-1
```

Create an install directory and set an environment variable to this directory (substituting the correct absolute path as appropriate). This environment variable is used again later when building Arrow:
#### MacOS

```bash
C:\Git\snappy> mkdir install
C:\Git\snappy> set SNAPPY_INSTALL=C:\Git\snappy\install
```
Follow the instructions [here](https://arrow.apache.org/install/#c-and-glib-c-packages-on-homebrew) to install `apache-arrow` using Homebrew.

Create the CMake build directory and generate the build files (this will default to using the Visual Studio CMake generator when run from a VS command prompt):
#### Windows

```bash
C:\Git\snappy> mkdir build
C:\Git\snappy> cd build
C:\Git\snappy\build> cmake -DCMAKE_INSTALL_PREFIX=%SNAPPY_INSTALL% -DSNAPPY_BUILD_BENCHMARKS:BOOL=0 -DSNAPPY_BUILD_TESTS:BOOL=0 ..
```
On Windows it is necessary to build Arrow from source. Full details are provided [here](https://arrow.apache.org/docs/developers/cpp/windows.html) but the basic steps are as follows.

Build and install snappy:
From a Visual Studio command prompt, clone the Arrow source from GitHub:

```bash
C:\Git\snappy\build> cmake --build . --config Release
C:\Git\snappy\build> cmake --build . --config Release --target install
C:\Git> git clone https://github.com/apache/arrow.git
C:\Git> cd arrow
```

##### Arrow

From a Visual Studio command prompt, clone the Arrow source from GitHub:
Switch to the 9.0.0 tag:

```bash
C:\Git> git clone https://github.com/apache/arrow.git
C:\Git> cd arrow\cpp
C:\Git\arrow> git checkout refs/tags/apache-arrow-9.0.0 --
C:\Git> cd cpp
```

Create an install directory and set an environment variable to this directory (substituting the correct absolute path as appropriate). This environment variable is used again later when building `arrowkdb`:
@@ -127,7 +95,7 @@ Create the CMake build directory and generate the build files (this will default
```bash
C:\Git\arrow\cpp> mkdir build
C:\Git\arrow\cpp> cd build
C:\Git\arrow\cpp\build> cmake .. -DARROW_PARQUET=ON -DARROW_WITH_SNAPPY=ON -DARROW_BUILD_STATIC=OFF -DSnappy_LIB=%SNAPPY_INSTALL%\lib\snappy.lib -DSnappy_INCLUDE_DIR=%SNAPPY_INSTALL%\include -DCMAKE_INSTALL_PREFIX=%ARROW_INSTALL%
C:\Git\arrow\cpp\build> cmake .. -DARROW_PARQUET=ON -DARROW_WITH_SNAPPY=ON -DARROW_BUILD_STATIC=OFF -DARROW_COMPUTE=OFF -DARROW_DEPENDENCY_USE_SHARED=OFF -DCMAKE_INSTALL_PREFIX=%ARROW_INSTALL%
```

Build and install Arrow:
@@ -154,7 +122,7 @@ It is recommended that a user install this interface through a release. This is
2. Download a release from [here](https://github.com/KxSystems/arrowkdb/releases) for your system architecture.
3. Install script `arrowkdb.q` to `$QHOME`, and binary file `lib/arrowkdb.(so|dll)` to `$QHOME/[mlw](64)`, by executing the following from the Release directory:

```
```bash
## Linux/MacOS
chmod +x install.sh && ./install.sh

@@ -187,9 +155,6 @@ cd build
## Linux/MacOS
cmake ..

## Windows (using the vcpkg Arrow installation)
cmake .. -DCMAKE_TOOLCHAIN_FILE=C:/Git/vcpkg/scripts/buildsystems/vcpkg.cmake

## Windows (using the Arrow installation which was built from source as above)
cmake .. -DARROW_INSTALL=%ARROW_INSTALL%
```
@@ -216,8 +181,6 @@ Documentation outlining the functionality available for this interface can be fo

## Status

**Warning: This interface is currently a pre-release alpha and subject to non-backwards compatible changes without notice.**

The arrowkdb interface is provided here under an Apache 2.0 license.

If you find issues with the interface or have feature requests, please consider raising an issue [here](https://github.com/KxSystems/arrowkdb/issues).
98 changes: 95 additions & 3 deletions docs/reference.md
@@ -156,6 +156,11 @@ These functions are exposed within the `.arrowkdb` namespace, allowing users to
kdb+ list
[pq.readParquetToTable](#pqreadparquettotable)  Read an Arrow table from a Parquet file and convert to a kdb+ table
[pq.readParquetNumRowGroups](#pqreadparquetnumrowgroups)  Read the number of row groups used by a Parquet file
[pq.readParquetRowGroups](#pqreadparquetrowgroups)  Read a set of row groups from a Parquet file into an Arrow table then convert to a kdb+ mixed list of array data
[pq.readParquetRowGroupsToTable](#pqreadparquetrowgroupstotable)  Read a set of row groups from a Parquet file into an Arrow table then convert to a kdb+ table

[Arrow IPC files](#arrow-ipc-files)
[ipc.writeArrow](#ipcwritearrow) Convert a kdb+ mixed list of array data to an Arrow table
@@ -2061,7 +2066,7 @@ The mixed list of Arrow array data should be ordered in schema field number and
Supported options:

- `PARQUET_CHUNK_SIZE` - Controls the approximate size of encoded data pages within a column chunk. Long, default 1MB.
- `PARQUET_VERSION` - Select the Parquet format version, either `V1.0` or `V2.0`. `V2.0` is more fully featured but may be incompatible with older Parquet implementations. String, default `V1.0`
- `PARQUET_VERSION` - Select the Parquet format version: `V1.0`, `V2.0`, `V2.4`, `V2.6` or `V2.LATEST`. Later versions are more fully featured but may be incompatible with older Parquet implementations. Default `V1.0`.
- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.
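
As an illustrative sketch (not taken from this commit), an options dictionary for these functions can be built with a symbol key list and mixed values:

```q
q)// values here are arbitrary; any subset of the supported options may be given
q)options:`PARQUET_CHUNK_SIZE`PARQUET_VERSION!(2097152;`V2.0)
q)options
PARQUET_CHUNK_SIZE| 2097152
PARQUET_VERSION   | `V2.0
```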

??? warning "The Parquet format is compressed and designed for for maximum space efficiency which may cause a performance overhead compared to Arrow. Parquet is also less fully featured than Arrow which can result in schema limitations"
@@ -2099,7 +2104,7 @@ returns generic null on success
Supported options:

- `PARQUET_CHUNK_SIZE` - Controls the approximate size of encoded data pages within a column chunk. Long, default 1MB.
- `PARQUET_VERSION` - Select the Parquet format version, either `V1.0` or `V2.0`. `V2.0` is more fully featured but may be incompatible with older Parquet implementations. String, default `V1.0`
- `PARQUET_VERSION` - Select the Parquet format version: `V1.0`, `V2.0`, `V2.4`, `V2.6` or `V2.LATEST`. Later versions are more fully featured but may be incompatible with older Parquet implementations. Default `V1.0`.

??? warning "Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors"

@@ -2232,7 +2237,94 @@ q)read_table~table
1b
```

## Arrow IPC files
### `pq.readParquetNumRowGroups`

*Read the number of row groups used by a Parquet file*

```syntax
.arrowkdb.pq.readParquetNumRowGroups[parquet_file]
```

Where `parquet_file` is a string containing the Parquet file name

returns the number of row groups

```q
q)table:([]a:10000000#0;b:10000000#1)
q).arrowkdb.pq.writeParquetFromTable["file.parquet";table;::]
q).arrowkdb.pq.readParquetNumRowGroups["file.parquet"]
10i
```
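
As a sketch of how this count composes with the row-group readers described below (continuing the session above), the per-group row counts sum back to the table's total:

```q
q)groups:"i"$til .arrowkdb.pq.readParquetNumRowGroups["file.parquet"]
q)// read column 0 one row group at a time and total the rows
q)sum {count first .arrowkdb.pq.readParquetRowGroups["file.parquet";enlist x;enlist 0i;::]} each groups
10000000
```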

### `pq.readParquetRowGroups`

*Read a set of row groups from a Parquet file into an Arrow table then convert to a kdb+ mixed list of array data*

```syntax
.arrowkdb.pq.readParquetRowGroups[parquet_file;row_groups;columns;options]
```

Where:

- `parquet_file` is a string containing the Parquet file name
- `row_groups` is an integer list (6h) of row group indices to read, or generic null (::) to read all row groups
- `columns` is an integer list (6h) of column indices to read, or generic null (::) to read all columns
- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be an 11h list. Values list can be 7h, 11h or a mixed list of -7|-11|4h.

returns the array data

Supported options:

- `PARQUET_MULTITHREADED_READ` - Flag indicating whether the Parquet reader should run in multithreaded mode. This can improve performance by processing multiple columns in parallel. Long, default 0.
- `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped. This can improve performance on systems which support mmap. Long, default 0.
- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.

```q
q)table:([]a:10000000#0;b:10000000#1)
q).arrowkdb.pq.writeParquetFromTable["file.parquet";table;::]
q).arrowkdb.pq.readParquetRowGroups["file.parquet";1 2i;enlist 0i;::]
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0..
q)count .arrowkdb.pq.readParquetRowGroups["file.parquet";1 2i;enlist 0i;::]
1
q)count first .arrowkdb.pq.readParquetRowGroups["file.parquet";1 2i;enlist 0i;::]
2097152
```
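
Passing generic null for `row_groups` reads every group, so the full column comes back in one call (a sketch continuing the session above):

```q
q)count first .arrowkdb.pq.readParquetRowGroups["file.parquet";::;enlist 0i;::]
10000000
```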

### `pq.readParquetRowGroupsToTable`

*Read a set of row groups from a Parquet file into an Arrow table then convert to a kdb+ table*

```syntax
.arrowkdb.pq.readParquetRowGroupsToTable[parquet_file;row_groups;columns;options]
```

Where:

- `parquet_file` is a string containing the Parquet file name
- `row_groups` is an integer list (6h) of row group indices to read, or generic null (::) to read all row groups
- `columns` is an integer list (6h) of column indices to read, or generic null (::) to read all columns
- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be an 11h list. Values list can be 7h, 11h or a mixed list of -7|-11|4h.

returns the kdb+ table

Supported options:

- `PARQUET_MULTITHREADED_READ` - Flag indicating whether the Parquet reader should run in multithreaded mode. This can improve performance by processing multiple columns in parallel. Long, default 0.
- `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped. This can improve performance on systems which support mmap. Long, default 0.
- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.

```q
q)table:([]a:10000000#0;b:10000000#1)
q).arrowkdb.pq.writeParquetFromTable["file.parquet";table;::]
q)meta .arrowkdb.pq.readParquetRowGroupsToTable["file.parquet";1 2i;enlist 0i;::]
c| t f a
-| -----
a| j
q)count .arrowkdb.pq.readParquetRowGroupsToTable["file.parquet";1 2i;enlist 0i;::]
2097152
```
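
With generic nulls throughout, the whole file round-trips (a sketch mirroring the `read_table~table` check shown for `pq.readParquetToTable` above):

```q
q)table~.arrowkdb.pq.readParquetRowGroupsToTable["file.parquet";::;::;::]
1b
```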

## Arrow IPC files

### `ipc.writeArrow`

10 changes: 9 additions & 1 deletion q/arrowkdb.q
@@ -114,6 +114,9 @@ pq.readParquetSchema:`arrowkdb 2:(`readParquetSchema;1);
pq.readParquetData:`arrowkdb 2:(`readParquetData;2);
pq.readParquetToTable:{[filename;options] flip (fd.fieldName each sc.schemaFields[pq.readParquetSchema[filename]])!(pq.readParquetData[filename;options])};
pq.readParquetColumn:`arrowkdb 2:(`readParquetColumn;3);
pq.readParquetNumRowGroups:`arrowkdb 2:(`readParquetNumRowGroups;1);
pq.readParquetRowGroups:`arrowkdb 2:(`readParquetRowGroups;4);
pq.readParquetRowGroupsToTable:{[filename;row_groups;columns;options] flip (fd.fieldName each sc.schemaFields[pq.readParquetSchema[filename]](columns))!(pq.readParquetRowGroups[filename;row_groups;columns;options])};


// arrow files
@@ -134,8 +137,13 @@ ipc.parseArrowToTable:{[serialized;options] flip (fd.fieldName each sc.schemaFie

// utils
util.buildInfo:`arrowkdb 2:(`buildInfo;1);
util.init:`arrowkdb 2:(`init;1);


// testing
ts.writeReadArray:`arrowkdb 2:(`writeReadArray;3);
ts.writeReadTable:`arrowkdb 2:(`writeReadTable;3);
ts.writeReadTable:`arrowkdb 2:(`writeReadTable;3);


// initialise
util.init[];
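
Since the script now ends by calling `util.init[]`, simply loading it performs the initialisation implemented in `src/ArrowKdb.cpp` below (symbol locking plus construction of the datatype/field/schema singletons); a brief sketch, with the load path assumed:

```q
q)\l arrowkdb.q         / runs util.init[] as a side effect of loading
q).arrowkdb.util.init[] / the binding is also exposed for manual use
```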
16 changes: 16 additions & 0 deletions src/ArrowKdb.cpp
@@ -8,6 +8,9 @@
#include "HelperFunctions.h"

#include "ArrowKdb.h"
#include "DatatypeStore.h"
#include "FieldStore.h"
#include "SchemaStore.h"


// Main is only used for profiling on windows with arrowkdb.exe
@@ -77,3 +80,16 @@ EXP K buildInfo(K unused)

return xD(keys, values);
}

EXP K init(K unused)
{
// Turn on symbol locking
setm(1);

// Create the singletons
kx::arrowkdb::GetDatatypeStore();
kx::arrowkdb::GetFieldStore();
kx::arrowkdb::GetSchemaStore();

return (K)0;
}
7 changes: 7 additions & 0 deletions src/ArrowKdb.h
@@ -20,6 +20,13 @@ extern "C"
* version, shared object version, git description and compiler used.
*/
EXP K buildInfo(K unused);

/**
* @brief Initialise the library
* @param unused
* @return null
*/
EXP K init(K unused);
}

#endif // __ARROW_KDB_H__