From 367b970f27505db687ef7484d4ff8c95fc35521b Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 16 Mar 2023 12:43:20 +0300 Subject: [PATCH 01/71] Stripes processing for ORC files --- src/ArrayReader.cpp | 113 +++++++++++++++++++++++++++++--------------- 1 file changed, 75 insertions(+), 38 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 353d709..1d78225 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -144,176 +144,199 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint8_array = static_pointer_cast(array_data); + auto length = uint8_array->length(); if( type_overrides.null_mapping.have_uint8 && uint8_array->null_count() ){ - for( auto i = 0ll; i < uint8_array->length(); ++i ){ + for( auto i = 0ll; i < length; ++i ){ kG( k_array )[i] = ( uint8_array->IsNull( i ) * type_overrides.null_mapping.uint8_null ) + ( !uint8_array->IsNull( i ) * uint8_array->Value( i ) ); } } else { - memcpy(kG(k_array), uint8_array->raw_values(), uint8_array->length() * sizeof(arrow::UInt8Array::value_type)); + memcpy( &kG( k_array )[index], uint8_array->raw_values(), length * sizeof( arrow::UInt8Array::value_type ) ); } + index += length; } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int8_array = static_pointer_cast(array_data); + auto length = int8_array->length(); if( type_overrides.null_mapping.have_int8 && int8_array->null_count() ){ - for( auto i = 0ll; i < int8_array->length(); ++i ){ + for( auto i = 0ll; i < length; ++i ){ kG( k_array )[i] = ( int8_array->IsNull( i ) * type_overrides.null_mapping.int8_null ) + ( !int8_array->IsNull( i ) * int8_array->Value( i ) ); } } else { - memcpy(kG(k_array), int8_array->raw_values(), int8_array->length() * sizeof(arrow::Int8Array::value_type)); + memcpy( &kG( k_array )[index], int8_array->raw_values(), length * sizeof( 
arrow::Int8Array::value_type ) ); } + index += length; } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint16_array = static_pointer_cast(array_data); + auto length = uint16_array->length(); if( type_overrides.null_mapping.have_uint16 && uint16_array->null_count() ){ - for( auto i = 0ll; i < uint16_array->length(); ++i ){ + for( auto i = 0ll; i < length; ++i ){ kH( k_array )[i] = ( uint16_array->IsNull( i ) * type_overrides.null_mapping.uint16_null ) + ( !uint16_array->IsNull( i ) * uint16_array->Value( i ) ); } } else { - memcpy(kH(k_array), uint16_array->raw_values(), uint16_array->length() * sizeof(arrow::UInt16Array::value_type)); + memcpy( &kH( k_array )[index], uint16_array->raw_values(), length * sizeof( arrow::UInt16Array::value_type ) ); } + index += length; } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int16_array = static_pointer_cast(array_data); + auto length = int16_array->length(); if( type_overrides.null_mapping.have_int16 && int16_array->null_count() ){ - for( auto i = 0ll; i < int16_array->length(); ++i ){ + for( auto i = 0ll; i < length; ++i ){ kH( k_array )[i] = ( int16_array->IsNull( i ) * type_overrides.null_mapping.int16_null ) + ( !int16_array->IsNull( i ) * int16_array->Value( i ) ); } } else { - memcpy(kH(k_array), int16_array->raw_values(), int16_array->length() * sizeof(arrow::Int16Array::value_type)); + memcpy( &kH( k_array )[index], int16_array->raw_values(), length * sizeof( arrow::Int16Array::value_type ) ); } + index += length; } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint32_array = static_pointer_cast(array_data); + auto length = uint32_array->length(); if( type_overrides.null_mapping.have_uint32 && uint32_array->null_count() ){ - for( auto i = 0ll; i < uint32_array->length(); ++i ){ + for( auto i = 
0ll; i < length; ++i ){ kI( k_array )[i] = ( uint32_array->IsNull( i ) * type_overrides.null_mapping.uint32_null ) + ( !uint32_array->IsNull( i ) * uint32_array->Value( i ) ); } } else { - memcpy(kI(k_array), uint32_array->raw_values(), uint32_array->length() * sizeof(arrow::UInt32Array::value_type)); + memcpy( &kI( k_array )[index], uint32_array->raw_values(), length * sizeof( arrow::UInt32Array::value_type ) ); } + index += length; } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int32_array = static_pointer_cast(array_data); + auto length = int32_array->length(); if( type_overrides.null_mapping.have_int32 && int32_array->null_count() ){ - for( auto i = 0ll; i < int32_array->length(); ++i ){ + for( auto i = 0ll; i < length; ++i ){ kI( k_array )[i] = ( int32_array->IsNull( i ) * type_overrides.null_mapping.int32_null ) + (!int32_array->IsNull( i ) * int32_array->Value( i ) ); } } else { - memcpy(kI(k_array), int32_array->raw_values(), int32_array->length() * sizeof(arrow::Int32Array::value_type)); + memcpy( &kI( k_array )[index], int32_array->raw_values(), length * sizeof( arrow::Int32Array::value_type ) ); } + index += length; } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint64_array = static_pointer_cast(array_data); + auto length = uint64_array->length(); if( type_overrides.null_mapping.have_uint64 && uint64_array->null_count() ){ - for( auto i = 0ll; i < uint64_array->length(); ++i ){ + for( auto i = 0ll; i < length; ++i ){ kJ( k_array )[i] = ( uint64_array->IsNull( i ) * type_overrides.null_mapping.uint64_null ) + ( !uint64_array->IsNull( i ) * uint64_array->Value( i ) ); } } else { - memcpy(kJ(k_array), uint64_array->raw_values(), uint64_array->length() * sizeof(arrow::UInt64Array::value_type)); + memcpy( &kJ( k_array )[index], uint64_array->raw_values(), length * sizeof( arrow::UInt64Array::value_type ) ); } 
+ index += length; } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int64_array = static_pointer_cast(array_data); + auto length = int64_array->length(); if( type_overrides.null_mapping.have_int64 && int64_array->null_count() ){ - for( auto i = 0ll; i < int64_array->length(); ++i ){ + for( auto i = 0ll; i < length; ++i ){ kJ( k_array )[i] = ( int64_array->IsNull( i ) * type_overrides.null_mapping.int64_null ) + (!int64_array->IsNull( i ) * int64_array->Value( i ) ); } } else { - memcpy(kJ(k_array), int64_array->raw_values(), int64_array->length() * sizeof(arrow::Int64Array::value_type)); + memcpy( &kJ( k_array )[index], int64_array->raw_values(), length * sizeof( arrow::Int64Array::value_type ) ); } + index += length; } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto hfl_array = static_pointer_cast(array_data); + auto length = hfl_array->length(); if( type_overrides.null_mapping.have_float16 && hfl_array->null_count() ){ - for( auto i = 0ll; i < hfl_array->length(); ++i ){ + for( auto i = 0ll; i < length; ++i ){ kH( k_array )[i] = ( hfl_array->IsNull( i ) * type_overrides.null_mapping.float16_null ) + ( !hfl_array->IsNull( i ) * hfl_array->Value( i ) ); } } else { - memcpy(kH(k_array), hfl_array->raw_values(), hfl_array->length() * sizeof(arrow::HalfFloatArray::value_type)); + memcpy( &kH( k_array )[index], hfl_array->raw_values(), length * sizeof( arrow::HalfFloatArray::value_type ) ); } + index += length; } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto fl_array = static_pointer_cast(array_data); + auto length = fl_array->length(); if( type_overrides.null_mapping.have_float32 && fl_array->null_count() ){ - for( auto i = 0ll; i < fl_array->length(); ++i ){ + for( auto i = 0ll; i < length; ++i ){ kE( k_array )[i] = ( fl_array->IsNull( i ) * 
type_overrides.null_mapping.float32_null ) + ( !fl_array->IsNull( i ) * fl_array->Value( i ) ); } } else { - memcpy(kE(k_array), fl_array->raw_values(), fl_array->length() * sizeof(arrow::FloatArray::value_type)); + memcpy( &kE( k_array )[index], fl_array->raw_values(), length * sizeof( arrow::FloatArray::value_type ) ); } + index += length; } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto dbl_array = static_pointer_cast(array_data); + auto length = dbl_array->length(); if( type_overrides.null_mapping.have_float64 && dbl_array->null_count() ){ - for( auto i = 0ll; i < dbl_array->length(); ++i ){ + for( auto i = 0ll; i < length; ++i ){ kF( k_array )[i] = ( dbl_array->IsNull( i ) * type_overrides.null_mapping.float64_null ) + ( !dbl_array->IsNull( i ) * dbl_array->Value( i ) ); } } else { - memcpy(kF(k_array), dbl_array->raw_values(), dbl_array->length() * sizeof(arrow::DoubleArray::value_type)); + memcpy( &kF( k_array )[index], dbl_array->raw_values(), length * sizeof( arrow::DoubleArray::value_type ) ); } + index += length; } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto str_array = static_pointer_cast(array_data); - for (auto i = 0; i < str_array->length(); ++i) { + auto length = str_array->length(); + for( auto i = 0; i < length; ++i ){ K k_str = nullptr; if( type_overrides.null_mapping.have_string && str_array->IsNull( i ) ){ k_str = ktn( KC, type_overrides.null_mapping.string_null.length() ); - memcpy( kG(k_str), type_overrides.null_mapping.string_null.data(), type_overrides.null_mapping.string_null.length() ); + memcpy( kG( k_str ), type_overrides.null_mapping.string_null.data(), type_overrides.null_mapping.string_null.length() ); } else{ auto str_data = str_array->GetString(i); @@ -328,7 +351,8 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { 
auto str_array = static_pointer_cast(array_data); - for (auto i = 0; i < str_array->length(); ++i) { + auto length = str_array->length(); + for( auto i = 0; i < length; ++i ){ K k_str = nullptr; if( type_overrides.null_mapping.have_large_string && str_array->IsNull( i ) ){ k_str = ktn( KC, type_overrides.null_mapping.large_string_null.length() ); @@ -347,7 +371,8 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto bin_array = static_pointer_cast(array_data); - for (auto i = 0; i < bin_array->length(); ++i) { + auto length = bin_array->length(); + for( auto i = 0; i < length; ++i ){ K k_bin = nullptr; if( type_overrides.null_mapping.have_binary && bin_array->IsNull( i ) ){ k_bin = ktn( KG, type_overrides.null_mapping.binary_null.length() ); @@ -366,7 +391,8 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto bin_array = static_pointer_cast(array_data); - for (auto i = 0; i < bin_array->length(); ++i) { + auto length = bin_array->length(); + for( auto i = 0; i < length; ++i ){ K k_bin = nullptr; if( type_overrides.null_mapping.have_large_binary && bin_array->IsNull( i ) ){ k_bin = ktn( KG, type_overrides.null_mapping.large_binary_null.length() ); @@ -385,7 +411,8 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto fixed_bin_array = static_pointer_cast(array_data); - for (auto i = 0; i < fixed_bin_array->length(); ++i) { + auto length = fixed_bin_array->length(); + for( auto i = 0; i < length; ++i ){ K k_bin = nullptr; if( type_overrides.null_mapping.have_fixed_binary && fixed_bin_array->IsNull( i ) ){ k_bin = ktn( KG, type_overrides.null_mapping.fixed_binary_null.length() ); @@ -405,7 +432,8 @@ void AppendArray(shared_ptr array_data, K k_a { TemporalConversion tc(array_data->type()); auto d32_array = static_pointer_cast(array_data); - for (auto i = 0; i < 
d32_array->length(); ++i){ + auto length = d32_array->length(); + for( auto i = 0; i < length; ++i ){ kI( k_array )[index++] = ( ( type_overrides.null_mapping.have_date32 && d32_array->IsNull( i ) ) * type_overrides.null_mapping.date32_null ) + ( !( type_overrides.null_mapping.have_date32 && d32_array->IsNull( i ) ) * tc.ArrowToKdb( d32_array->Value( i ) ) ); @@ -417,7 +445,8 @@ void AppendArray(shared_ptr array_data, K k_a { TemporalConversion tc(array_data->type()); auto d64_array = static_pointer_cast(array_data); - for (auto i = 0; i < d64_array->length(); ++i){ + auto length = d64_array->length(); + for( auto i = 0; i < length; ++i ){ kJ( k_array )[index++] = ( ( type_overrides.null_mapping.have_date64 && d64_array->IsNull( i ) ) * type_overrides.null_mapping.date64_null ) + ( !( type_overrides.null_mapping.have_date64 && d64_array->IsNull( i ) ) * tc.ArrowToKdb( d64_array->Value( i ) ) ); @@ -430,7 +459,8 @@ void AppendArray(shared_ptr array_data, K TemporalConversion tc(array_data->type()); auto ts_array = static_pointer_cast(array_data); auto timestamp_type = static_pointer_cast(ts_array->type()); - for (auto i = 0; i < ts_array->length(); ++i){ + auto length = ts_array->length(); + for( auto i = 0; i < length; ++i ){ kJ( k_array )[index++] = ( ( type_overrides.null_mapping.have_timestamp && ts_array->IsNull( i ) ) * type_overrides.null_mapping.timestamp_null ) + ( !( type_overrides.null_mapping.have_timestamp && ts_array->IsNull( i ) ) * tc.ArrowToKdb( ts_array->Value( i ) ) ); @@ -443,7 +473,8 @@ void AppendArray(shared_ptr array_data, K k_a TemporalConversion tc(array_data->type()); auto t32_array = static_pointer_cast(array_data); auto time32_type = static_pointer_cast(t32_array->type()); - for (auto i = 0; i < t32_array->length(); ++i){ + auto length = t32_array->length(); + for( auto i = 0; i < length; ++i ){ kI( k_array )[index++] = ( ( type_overrides.null_mapping.have_time32 && t32_array->IsNull( i ) ) * type_overrides.null_mapping.time32_null ) + ( 
!( type_overrides.null_mapping.have_time32 && t32_array->IsNull( i ) ) * tc.ArrowToKdb( t32_array->Value( i ) ) ); @@ -456,7 +487,8 @@ void AppendArray(shared_ptr array_data, K k_a TemporalConversion tc(array_data->type()); auto t64_array = static_pointer_cast(array_data); auto time64_type = static_pointer_cast(t64_array->type()); - for (auto i = 0; i < t64_array->length(); ++i){ + auto length = t64_array->length(); + for( auto i = 0; i < length; ++i ){ kJ( k_array )[index++] = ( ( type_overrides.null_mapping.have_time64 && t64_array->IsNull( i ) ) * type_overrides.null_mapping.time64_null ) + ( !( type_overrides.null_mapping.have_time64 && t64_array->IsNull( i ) ) * tc.ArrowToKdb( t64_array->Value( i ) ) ); @@ -468,7 +500,8 @@ void AppendArray(shared_ptr array_data, K k_ { auto dec_array = static_pointer_cast(array_data); auto dec_type = static_pointer_cast(dec_array->type()); - for (auto i = 0; i < dec_array->length(); ++i) { + auto length = dec_array->length(); + for( auto i = 0; i < length; ++i ){ auto decimal = arrow::Decimal128(dec_array->Value(i)); if (type_overrides.decimal128_as_double) { // Convert the decimal to a double @@ -492,7 +525,8 @@ void AppendArray(shared_ptr array_data, K k TemporalConversion tc(array_data->type()); auto dur_array = static_pointer_cast(array_data); auto duration_type = static_pointer_cast(dur_array->type()); - for (auto i = 0; i < dur_array->length(); ++i){ + auto length = dur_array->length(); + for( auto i = 0; i < length; ++i ){ kJ( k_array )[index++] = ( ( type_overrides.null_mapping.have_duration && dur_array->IsNull( i ) ) * type_overrides.null_mapping.duration_null ) + ( !( type_overrides.null_mapping.have_duration && dur_array->IsNull( i ) ) * tc.ArrowToKdb( dur_array->Value( i ) ) ); @@ -503,22 +537,25 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto month_array = static_pointer_cast(array_data); + auto length = month_array->length(); if( 
type_overrides.null_mapping.have_month_interval && month_array->null_count() ){ - for( auto i = 0ll; i < month_array->length(); ++i ){ + for( auto i = 0ll; i < length; ++i ){ kI( k_array )[i] = ( month_array->IsNull( i ) * type_overrides.null_mapping.month_interval_null ) + ( !month_array->IsNull( i ) * month_array->Value( i ) ); } } else { - memcpy(kI(k_array), month_array->raw_values(), month_array->length() * sizeof(arrow::MonthIntervalArray::value_type)); + memcpy( &kI( k_array )[index], month_array->raw_values(), length * sizeof( arrow::MonthIntervalArray::value_type ) ); } + index += length; } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto dt_array = static_pointer_cast(array_data); - for (auto i = 0; i < dt_array->length(); ++i){ + auto length = dt_array->length(); + for( auto i = 0; i < length; ++i ){ kJ( k_array )[index++] = ( ( type_overrides.null_mapping.have_day_time_interval && dt_array->IsNull( i ) ) * type_overrides.null_mapping.day_time_interval_null ) + ( !( type_overrides.null_mapping.have_day_time_interval && dt_array->IsNull( i ) ) * DayTimeInterval_KTimespan( dt_array->Value( i ) ) ); From 33ab957fdff9e4eb20af05de628a17ebf9707eae Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Tue, 20 Sep 2022 09:54:10 +1000 Subject: [PATCH 02/71] Initial Commit of ORC Functionality --- q/arrowkdb.q | 7 +++ src/TableData.cpp | 143 ++++++++++++++++++++++++++++++++++++++++++++-- src/TableData.h | 80 +++++++++++++++++++++++--- 3 files changed, 219 insertions(+), 11 deletions(-) diff --git a/q/arrowkdb.q b/q/arrowkdb.q index b3d56cd..cfff1c2 100644 --- a/q/arrowkdb.q +++ b/q/arrowkdb.q @@ -132,6 +132,13 @@ pq.readParquetRowGroupsToTable:{[filename;row_groups;columns;options] ] }; +// ORC files +orc.writeOrc:`arrowkdb 2:(`writeORC;4); +orc.writeOrcFromTable:{[filename;table;options] orc.writeOrc[filename;sc.inferSchema[table];value flip table;options]}; +orc.readOrcSchema:`arrowkdb 
2:(`readORCSchema;1); +orc.readOrcData:`arrowkdb 2:(`readORCData;2); +orc.readOrcToTable:{[filename;options] flip (fd.fieldName each sc.schemaFields[orc.readOrcSchema[filename]])!(orc.readOrcData[filename;options])}; +// orc.readColumn (Functionality is different since dealing with stripes) // arrow files ipc.writeArrow:`arrowkdb 2:(`writeArrow;4); diff --git a/src/TableData.cpp b/src/TableData.cpp index b9ba2e9..8d4bbd6 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -24,7 +25,7 @@ #include "KdbOptions.h" -// @@@ +// @@@ // It is possible to check a loaded schema (from parquet file/arrow file/arrow // stream) to see if any of the fields have been defined as nullable. But what do you do // with nullable fields in externlly loaded schemas: nothing, warning, error? @@ -52,7 +53,7 @@ std::vector> MakeArrays(std::shared_ptrt == 0 && array_data->n == 0) { // Empty table } else { - // Only count up to the number of schema fields. Additional trailing data + // Only count up to the number of schema fields. 
Additional trailing data // in the kdb mixed list is ignored (to allow for ::) for (auto i = 0; i < schema->num_fields(); ++i) { auto k_array = kK(array_data)[i]; @@ -231,7 +232,7 @@ K readParquetSchema(K parquet_file) std::shared_ptr schema; PARQUET_THROW_NOT_OK(reader->GetSchema(&schema)); - + // Add each field from the table to the field store // Add each datatype from the table to the datatype store //const auto schema = table->schema(); @@ -590,7 +591,7 @@ K readArrowData(K arrow_file, K options) if (use_mmap) { PARQUET_ASSIGN_OR_THROW( infile, - arrow::io::MemoryMappedFile::Open(kx::arrowkdb::GetKdbString(arrow_file), + arrow::io::MemoryMappedFile::Open(kx::arrowkdb::GetKdbString(arrow_file), arrow::io::FileMode::READ)); } else { PARQUET_ASSIGN_OR_THROW( @@ -805,3 +806,137 @@ K parseArrowData(K char_array, K options) KDB_EXCEPTION_CATCH; } + +K readORCData(K orc_file, K options) +{ + KDB_EXCEPTION_TRY; + + if (!kx::arrowkdb::IsKdbString(orc_file)) + return krr((S)"orc_file not 11h or 0 of 10h"); + + // Parse the options + auto read_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); + + // Use multi threading + int64_t parquet_multithreaded_read = 0; + read_options.GetIntOption(kx::arrowkdb::Options::PARQUET_MULTITHREADED_READ, parquet_multithreaded_read); + + // Use memmap + int64_t use_mmap = 0; + read_options.GetIntOption(kx::arrowkdb::Options::USE_MMAP, use_mmap); + + // Type mapping overrides + kx::arrowkdb::TypeMappingOverride type_overrides{ read_options }; + + std::shared_ptr infile; + if (use_mmap) { + PARQUET_ASSIGN_OR_THROW( + infile, + arrow::io::MemoryMappedFile::Open(kx::arrowkdb::GetKdbString(orc_file), + arrow::io::FileMode::READ)); + } else { + PARQUET_ASSIGN_OR_THROW( + infile, + arrow::io::ReadableFile::Open(kx::arrowkdb::GetKdbString(orc_file), + arrow::default_memory_pool())); + } + // Open ORC file reader + auto maybe_reader = 
arrow::adapters::orc::ORCFileReader::Open(infile, arrow::default_memory_pool()); + + std::unique_ptr reader = std::move(maybe_reader.ValueOrDie()); + + // Read entire file as a single Arrow table + auto maybe_table = reader->Read(); + + std::shared_ptr table = maybe_table.ValueOrDie(); + + const auto schema = table->schema(); + SchemaContainsNullable(schema); + const auto col_num = schema->num_fields(); + K data = ktn(0, col_num); + for (auto i = 0; i < col_num; ++i) { + auto chunked_array = table->column(i); + kK(data)[i] = kx::arrowkdb::ReadChunkedArray(chunked_array, type_overrides); + } + + return data; + + KDB_EXCEPTION_CATCH; +} + +K readORCSchema(K orc_file) +{ + KDB_EXCEPTION_TRY; + + if (!kx::arrowkdb::IsKdbString(orc_file)) + return krr((S)"orc_file not 11h or 0 of 10h"); + + std::shared_ptr infile; + PARQUET_ASSIGN_OR_THROW( + infile, + arrow::io::ReadableFile::Open(kx::arrowkdb::GetKdbString(orc_file), + arrow::default_memory_pool())); + + auto maybe_reader = arrow::adapters::orc::ORCFileReader::Open(infile, arrow::default_memory_pool()); + + std::unique_ptr reader = std::move(maybe_reader.ValueOrDie()); + + auto maybe_schema = reader->ReadSchema(); + + std::shared_ptr schema = maybe_schema.ValueOrDie(); + // Add each field from the table to the field store + // Add each datatype from the table to the datatype store + //const auto schema = table->schema(); + SchemaContainsNullable(schema); + for (auto field : schema->fields()) { + kx::arrowkdb::GetFieldStore()->Add(field); + kx::arrowkdb::GetDatatypeStore()->Add(field->type()); + } + + // Return the new schema_id + return ki(kx::arrowkdb::GetSchemaStore()->Add(schema)); + + KDB_EXCEPTION_CATCH; +} + + +K writeORC(K orc_file, K schema_id, K array_data, K options) +{ + KDB_EXCEPTION_TRY; + + if (!kx::arrowkdb::IsKdbString(orc_file)) + return krr((S)"orc_file not 11h or 0 of 10h"); + if (schema_id->t != -KI) + return krr((S)"schema_id not -6h"); + + const auto schema = 
kx::arrowkdb::GetSchemaStore()->Find(schema_id->i); + if (!schema) + return krr((S)"unknown schema"); + + std::shared_ptr outfile; + PARQUET_ASSIGN_OR_THROW( + outfile, + arrow::io::FileOutputStream::Open(kx::arrowkdb::GetKdbString(orc_file))); + + // Parse the options + auto write_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); + + auto maybe_writer = arrow::adapters::orc::ORCFileWriter::Open(outfile.get(), arrow::adapters::orc::WriteOptions()); + + + std::unique_ptr writer = std::move(maybe_writer.ValueOrDie()); + + + // Type mapping overrides + kx::arrowkdb::TypeMappingOverride type_overrides{ write_options }; + + // Create the arrow table + auto table = MakeTable(schema, array_data, type_overrides); + + writer->Write(*table); + writer->Close(); + + return (K)0; + + KDB_EXCEPTION_CATCH; +} \ No newline at end of file diff --git a/src/TableData.h b/src/TableData.h index 35f42b3..d7b7463 100644 --- a/src/TableData.h +++ b/src/TableData.h @@ -44,10 +44,10 @@ extern "C" * number. Each kdb object representing one of the arrays must be structured * according to the field's datatype. This required array data structure is * detailed for each of the datatype constructor functions. - * + * * Developer use only - Only useful for manual testing, do not expose in * release version of arrowkdb.q since it has no practical use - * + * * Supported options: * * DECIMAL128_AS_DOUBLE (long) - Flag indicating whether to override the @@ -82,7 +82,7 @@ extern "C" * MICRO granularity. In such cases the parquet/arrow file writer will return * an error. * - * Supported options: + * Supported options: * * PARQUET_CHUNK_SIZE (long) - Controls the approximate size of encoded data * pages within a column chunk. 
Default 1MB @@ -212,7 +212,7 @@ extern "C" * * @param arrow_file String name of the arrow file to write * @param schema_id The schema identifier - * @param array_data Mixed list of arrow array data to be written to the + * @param array_data Mixed list of arrow array data to be written to the * file * @options Dictionary of options or generic null (::) to use * defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or @@ -231,7 +231,7 @@ extern "C" EXP K readArrowSchema(K arrow_file); /** - * @brief Reads the arrow array data from the specified arrow IPC record + * @brief Reads the arrow array data from the specified arrow IPC record * batch file * * Supported options: @@ -243,7 +243,7 @@ extern "C" * DECIMAL128_AS_DOUBLE (long) - Flag indicating whether to override the * default type mapping for the arrow decimal128 datatype and instead * represent it as a double (9h). Default 0. - * + * * @param arrow_file String name of the arrow file to read * @options Dictionary of options or generic null (::) to use * defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or @@ -303,6 +303,72 @@ extern "C" */ EXP K parseArrowData(K char_array, K options); + /** + * @brief Reads the arrow array data from the specified parquet file + * + * Supported options: + * + * PARQUET_MULTITHREADED_READ (long) - Flag indicating whether the parquet + * reader should run in multithreaded mode. This can improve performance by + * processing multiple columns in parallel. Default 0 + * + * USE_MMAP (long) - Flag indicating whether the parquet file should be memory + * mapped in. This can improve performance on systems which support mmap. + * Default 0 + * + * DECIMAL128_AS_DOUBLE (long) - Flag indicating whether to override the + * default type mapping for the arrow decimal128 datatype and instead + * represent it as a double (9h). Default 0. 
+ * + * @param orc_file String name of the parquet file to read + * @options Dictionary of options or generic null (::) to use + * defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or + * mixed list of -7|-11|4h. + * @return Mixed list of arrow array objects + */ + EXP K readORCData(K orc_file, K options); + + /** + * @brief Reads the arrow schema from the specified parquet file + * + * @param orc_file String name of the parquet file to read + * @return Schema identifier + */ + EXP K readORCSchema(K orc_file); + + /** + * @brief Creates a parquet file with the specified arrow schema and populates + * it from a mixed list of arrow array objects. + * + * The mixed list of arrow array data should be ordered in schema field + * number. Each kdb object representing one of the arrays must be structured + * according to the field's datatype. This required array data structure is + * detailed for each of the datatype constructor functions. + * + * Note that in general parquet only supports a subset of the the arrow + * datatypes with more limited functionality. For example the only supported + * nested datatypes are top level lists and structs (without further nesting). + * Similarly temporal datatypes with TimeUnit parameters only support MILLI or + * MICRO granularity. In such cases the parquet/arrow file writer will return + * an error. + * + * Supported options: + * + * DECIMAL128_AS_DOUBLE (long) - Flag indicating whether to override the + * default type mapping for the arrow decimal128 datatype and instead + * represent it as a double (9h). Default 0. + * + * @param parquet_file String name of the parquet file to write + * @param schema_id The schema identifier + * @param array_data Mixed list of arrow array data to be written to the + * file + * @options Dictionary of options or generic null (::) to use + * defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or + * mixed list of -7|-11|4h. 
+ * @return NULL on success, error otherwise + */ + EXP K writeORC(K parquet_file, K schema_id, K array_data, K options); + } -#endif // __TABLE_DATA_H__ +#endif // __TABLE_DATA_H__ \ No newline at end of file From 0a43aa03a6c34daeff026b9c8e0d49aae64ba9f6 Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Tue, 20 Sep 2022 11:01:55 +1000 Subject: [PATCH 03/71] Testing Write Options --- src/TableData.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TableData.cpp b/src/TableData.cpp index 8d4bbd6..5e39660 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -920,8 +920,8 @@ K writeORC(K orc_file, K schema_id, K array_data, K options) // Parse the options auto write_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); - - auto maybe_writer = arrow::adapters::orc::ORCFileWriter::Open(outfile.get(), arrow::adapters::orc::WriteOptions()); + + auto maybe_writer = arrow::adapters::orc::ORCFileWriter::Open(outfile.get(), arrow::adapters::orc::WriteOptions(batch_size = kx::arrowkdb::Options::PARQUET_CHUNK_SIZE, 1024)); std::unique_ptr writer = std::move(maybe_writer.ValueOrDie()); From a87637bc9e4f5faa98a678cdbb34208d1c8fd1ac Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Tue, 20 Sep 2022 11:30:50 +1000 Subject: [PATCH 04/71] Testing Write Options --- src/TableData.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/TableData.cpp b/src/TableData.cpp index 5e39660..ca58bfa 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -921,7 +921,14 @@ K writeORC(K orc_file, K schema_id, K array_data, K options) // Parse the options auto write_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); - auto maybe_writer = arrow::adapters::orc::ORCFileWriter::Open(outfile.get(), arrow::adapters::orc::WriteOptions(batch_size = kx::arrowkdb::Options::PARQUET_CHUNK_SIZE, 1024)); + int64_t 
parquet_chunk_size = 1024*1024; + write_options.GetIntOption(kx::arrowkdb::Options::PARQUET_CHUNK_SIZE, parquet_chunk_size); + + auto used_write = arrow::adapters::orc::WriteOptions(); + used_write.batch_size = parquet_chunk_size; + + + auto maybe_writer = arrow::adapters::orc::ORCFileWriter::Open(outfile.get(), used_write); std::unique_ptr writer = std::move(maybe_writer.ValueOrDie()); From c3a7ed69d296755a0ab73fd3ae2b83662553bdc6 Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Tue, 20 Sep 2022 11:44:03 +1000 Subject: [PATCH 05/71] Working ORC Write Functionality --- src/TableData.cpp | 8 +++----- src/TableData.h | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/TableData.cpp b/src/TableData.cpp index ca58bfa..738e8bd 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -921,16 +921,14 @@ K writeORC(K orc_file, K schema_id, K array_data, K options) // Parse the options auto write_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); - int64_t parquet_chunk_size = 1024*1024; - write_options.GetIntOption(kx::arrowkdb::Options::PARQUET_CHUNK_SIZE, parquet_chunk_size); + int64_t orc_chunk_size = 1024*1024; + write_options.GetIntOption(kx::arrowkdb::Options::ORC_CHUNK_SIZE, orc_chunk_size); auto used_write = arrow::adapters::orc::WriteOptions(); - used_write.batch_size = parquet_chunk_size; - + used_write.batch_size = orc_chunk_size; auto maybe_writer = arrow::adapters::orc::ORCFileWriter::Open(outfile.get(), used_write); - std::unique_ptr writer = std::move(maybe_writer.ValueOrDie()); diff --git a/src/TableData.h b/src/TableData.h index d7b7463..acfaa1a 100644 --- a/src/TableData.h +++ b/src/TableData.h @@ -358,7 +358,7 @@ extern "C" * default type mapping for the arrow decimal128 datatype and instead * represent it as a double (9h). Default 0. 
* - * @param parquet_file String name of the parquet file to write + * @param orc_file String name of the parquet file to write * @param schema_id The schema identifier * @param array_data Mixed list of arrow array data to be written to the * file @@ -367,7 +367,7 @@ extern "C" * mixed list of -7|-11|4h. * @return NULL on success, error otherwise */ - EXP K writeORC(K parquet_file, K schema_id, K array_data, K options); + EXP K writeORC(K orc_file, K schema_id, K array_data, K options); } From 6195db2321ea0358f66cb82747e1f72a8aa7d2db Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Tue, 20 Sep 2022 11:47:18 +1000 Subject: [PATCH 06/71] Updating Options for ORC Write --- src/KdbOptions.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 80254b6..1801d71 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -21,6 +21,7 @@ namespace Options // Int options const std::string ARROW_CHUNK_ROWS = "ARROW_CHUNK_ROWS"; const std::string PARQUET_CHUNK_SIZE = "PARQUET_CHUNK_SIZE"; + const std::string ORC_CHUNK_SIZE = "ORC_CHUNK_SIZE"; const std::string PARQUET_MULTITHREADED_READ = "PARQUET_MULTITHREADED_READ"; const std::string USE_MMAP = "USE_MMAP"; const std::string DECIMAL128_AS_DOUBLE = "DECIMAL128_AS_DOUBLE"; From 762c0072f00b8cf5f51d543b7955e9dbaaa4850a Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Tue, 20 Sep 2022 11:53:31 +1000 Subject: [PATCH 07/71] Updating Options for ORC Write --- src/KdbOptions.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 1801d71..4700fc5 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -64,6 +64,7 @@ namespace Options const static std::set int_options = { ARROW_CHUNK_ROWS, PARQUET_CHUNK_SIZE, + ORC_CHUNK_SIZE, PARQUET_MULTITHREADED_READ, USE_MMAP, DECIMAL128_AS_DOUBLE, From 30688158b29774a74f25d1f073a086117c36301c Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 23 Sep 2022 11:34:40 +1000 Subject: [PATCH 08/71] Adding ORC to unit tests 
--- tests/basic.t | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/basic.t b/tests/basic.t index f42ae75..aa2a0b5 100644 --- a/tests/basic.t +++ b/tests/basic.t @@ -2,7 +2,7 @@ -1 "\n+----------|| Load arrowkdb library ||----------+\n"; -\l q/arrowkdb.q +\l arrowkdb.q // Move to arrowkdb namespace \d .arrowkdb @@ -176,6 +176,14 @@ pq.readParquetSchema[filename]~schema pq.readParquetData[filename;::]~array_data rm filename; +-1 "<--- Read/write ORC --->"; + +filename:"ints.orc" +pq.writeOrc[filename;schema;array_data;(::)] +pq.readOrcSchema[filename]~schema +pq.readOrcData[filename;::]~array_data +rm filename; + -1 "<--- Read/write arrow file --->"; filename:"ints.arrow" From 3a8a5eaf2cc0801c70d1d8c8405729fd174631b5 Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 23 Sep 2022 11:36:16 +1000 Subject: [PATCH 09/71] Adding ORC to unit tests --- tests/basic.t | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/basic.t b/tests/basic.t index aa2a0b5..dfa924d 100644 --- a/tests/basic.t +++ b/tests/basic.t @@ -179,9 +179,9 @@ rm filename; -1 "<--- Read/write ORC --->"; filename:"ints.orc" -pq.writeOrc[filename;schema;array_data;(::)] -pq.readOrcSchema[filename]~schema -pq.readOrcData[filename;::]~array_data +orc.writeOrc[filename;schema;array_data;(::)] +orc.readOrcSchema[filename]~schema +orc.readOrcData[filename;::]~array_data rm filename; -1 "<--- Read/write arrow file --->"; From 3da7990327c59b71b582c1298b2325c038b0338d Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 23 Sep 2022 11:55:42 +1000 Subject: [PATCH 10/71] Adding ORC to unit tests --- tests/basic.t | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/basic.t b/tests/basic.t index dfa924d..c3f10e0 100644 --- a/tests/basic.t +++ b/tests/basic.t @@ -177,9 +177,12 @@ pq.readParquetData[filename;::]~array_data rm filename; -1 "<--- Read/write ORC --->"; +// This is required for writedown of small files 
+orc_write_options:enlist[`ORC_CHUNK_SIZE]!enlist[1024] + filename:"ints.orc" -orc.writeOrc[filename;schema;array_data;(::)] +orc.writeOrc[filename;schema;array_data;orc_write_options] orc.readOrcSchema[filename]~schema orc.readOrcData[filename;::]~array_data rm filename; From d1bf47e55df97130e819db926a17309fcda0b21d Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 23 Sep 2022 11:56:58 +1000 Subject: [PATCH 11/71] Adding ORC to unit tests --- tests/basic.t | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/basic.t b/tests/basic.t index c3f10e0..8519ae3 100644 --- a/tests/basic.t +++ b/tests/basic.t @@ -179,6 +179,7 @@ rm filename; -1 "<--- Read/write ORC --->"; // This is required for writedown of small files orc_write_options:enlist[`ORC_CHUNK_SIZE]!enlist[1024] +orc_read_options:enlist[`ORC_CHUNK_SIZE]!enlist[1024] filename:"ints.orc" From 4cfeb7f5670dc1195d9f29e6d4dddc918eb2bafe Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 11:08:27 +1100 Subject: [PATCH 12/71] Allocating Size as if 64 bit --- src/ArrayReader.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 1d78225..36693b6 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -237,7 +237,9 @@ void AppendArray(shared_ptr array_data, K k_ar } } else { - memcpy( &kI( k_array )[index], int32_array->raw_values(), length * sizeof( arrow::Int32Array::value_type ) ); +// SAM TEST WOODSIDE +// memcpy( &kI( k_array )[index], int32_array->raw_values(), length * sizeof( arrow::Int32Array::value_type ) ); + memcpy( &kI( k_array )[index], int32_array->raw_values(), length * sizeof( arrow::Int64Array::value_type ) ); } index += length; } From 7ad23154481e838bcfda59a16e4f3513381ef4ac Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 11:15:00 +1100 Subject: [PATCH 13/71] Adjusting Array Reader --- src/ArrayReader.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git 
a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 36693b6..be67294 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -228,18 +228,18 @@ void AppendArray(shared_ptr array_data, K k_a template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto int32_array = static_pointer_cast(array_data); - auto length = int32_array->length(); - if( type_overrides.null_mapping.have_int32 && int32_array->null_count() ){ +//auto int32_array = static_pointer_cast(array_data); + auto int64_array = static_pointer_cast(array_data); + auto length = int64_array->length(); + if( type_overrides.null_mapping.have_int64 && int64_array->null_count() ){ for( auto i = 0ll; i < length; ++i ){ - kI( k_array )[i] = ( int32_array->IsNull( i ) * type_overrides.null_mapping.int32_null ) - + (!int32_array->IsNull( i ) * int32_array->Value( i ) ); + kI( k_array )[i] = ( int64_array->IsNull( i ) * type_overrides.null_mapping.int64_null ) + + (!int64_array->IsNull( i ) * int64_array->Value( i ) ); } } else { -// SAM TEST WOODSIDE // memcpy( &kI( k_array )[index], int32_array->raw_values(), length * sizeof( arrow::Int32Array::value_type ) ); - memcpy( &kI( k_array )[index], int32_array->raw_values(), length * sizeof( arrow::Int64Array::value_type ) ); + memcpy( &kJ( k_array )[index], int64_array->raw_values(), length * sizeof( arrow::Int64Array::value_type ) ); } index += length; } From 9664b4c2eb1679459690ac4674e9fd626d86c599 Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 11:19:19 +1100 Subject: [PATCH 14/71] Adjusting Array Reader --- src/ArrayReader.cpp | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index be67294..618fb28 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -211,16 +211,18 @@ void AppendArray(shared_ptr array_data, K k_ar template<> void AppendArray(shared_ptr array_data, K k_array, size_t& 
index, TypeMappingOverride& type_overrides) { - auto uint32_array = static_pointer_cast(array_data); - auto length = uint32_array->length(); - if( type_overrides.null_mapping.have_uint32 && uint32_array->null_count() ){ +//auto uint32_array = static_pointer_cast(array_data); + auto uint64_array = static_pointer_cast(array_data); + auto length = uint64_array->length(); + if( type_overrides.null_mapping.have_uint64 && uint64_array->null_count() ){ for( auto i = 0ll; i < length; ++i ){ - kI( k_array )[i] = ( uint32_array->IsNull( i ) * type_overrides.null_mapping.uint32_null ) - + ( !uint32_array->IsNull( i ) * uint32_array->Value( i ) ); + kI( k_array )[i] = ( uint64_array->IsNull( i ) * type_overrides.null_mapping.uint64_null ) + + ( !uint64_array->IsNull( i ) * uint64_array->Value( i ) ); } } else { - memcpy( &kI( k_array )[index], uint32_array->raw_values(), length * sizeof( arrow::UInt32Array::value_type ) ); +// memcpy( &kI( k_array )[index], uint32_array->raw_values(), length * sizeof( arrow::UInt32Array::value_type ) ); + memcpy( &kI( k_array )[index], uint64_array->raw_values(), length * sizeof( arrow::UInt64Array::value_type ) ); } index += length; } @@ -228,18 +230,16 @@ void AppendArray(shared_ptr array_data, K k_a template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { -//auto int32_array = static_pointer_cast(array_data); - auto int64_array = static_pointer_cast(array_data); - auto length = int64_array->length(); - if( type_overrides.null_mapping.have_int64 && int64_array->null_count() ){ + auto int32_array = static_pointer_cast(array_data); + auto length = int32_array->length(); + if( type_overrides.null_mapping.have_int32 && int32_array->null_count() ){ for( auto i = 0ll; i < length; ++i ){ - kI( k_array )[i] = ( int64_array->IsNull( i ) * type_overrides.null_mapping.int64_null ) - + (!int64_array->IsNull( i ) * int64_array->Value( i ) ); + kI( k_array )[i] = ( int32_array->IsNull( i ) * 
type_overrides.null_mapping.int32_null ) + + (!int32_array->IsNull( i ) * int32_array->Value( i ) ); } } else { -// memcpy( &kI( k_array )[index], int32_array->raw_values(), length * sizeof( arrow::Int32Array::value_type ) ); - memcpy( &kJ( k_array )[index], int64_array->raw_values(), length * sizeof( arrow::Int64Array::value_type ) ); + memcpy( &kI( k_array )[index], int32_array->raw_values(), length * sizeof( arrow::Int32Array::value_type ) ); } index += length; } From ad94bda6436caca4bc28cc2d9fdbd8fd959cf176 Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 11:21:58 +1100 Subject: [PATCH 15/71] Reverting --- src/ArrayReader.cpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 618fb28..1d78225 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -211,18 +211,16 @@ void AppendArray(shared_ptr array_data, K k_ar template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { -//auto uint32_array = static_pointer_cast(array_data); - auto uint64_array = static_pointer_cast(array_data); - auto length = uint64_array->length(); - if( type_overrides.null_mapping.have_uint64 && uint64_array->null_count() ){ + auto uint32_array = static_pointer_cast(array_data); + auto length = uint32_array->length(); + if( type_overrides.null_mapping.have_uint32 && uint32_array->null_count() ){ for( auto i = 0ll; i < length; ++i ){ - kI( k_array )[i] = ( uint64_array->IsNull( i ) * type_overrides.null_mapping.uint64_null ) - + ( !uint64_array->IsNull( i ) * uint64_array->Value( i ) ); + kI( k_array )[i] = ( uint32_array->IsNull( i ) * type_overrides.null_mapping.uint32_null ) + + ( !uint32_array->IsNull( i ) * uint32_array->Value( i ) ); } } else { -// memcpy( &kI( k_array )[index], uint32_array->raw_values(), length * sizeof( arrow::UInt32Array::value_type ) ); - memcpy( &kI( k_array )[index], uint64_array->raw_values(), length * 
sizeof( arrow::UInt64Array::value_type ) ); + memcpy( &kI( k_array )[index], uint32_array->raw_values(), length * sizeof( arrow::UInt32Array::value_type ) ); } index += length; } From 8be97e39287fdeff4b365eba2b6a8ff728c4b809 Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 11:31:00 +1100 Subject: [PATCH 16/71] Adding Logging --- src/ArrayReader.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 1d78225..098b53f 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -470,6 +470,7 @@ void AppendArray(shared_ptr array_data, K template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { + std::cout << "TIME32"; TemporalConversion tc(array_data->type()); auto t32_array = static_pointer_cast(array_data); auto time32_type = static_pointer_cast(t32_array->type()); @@ -484,6 +485,7 @@ void AppendArray(shared_ptr array_data, K k_a template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { + std::cout << "TIME64"; TemporalConversion tc(array_data->type()); auto t64_array = static_pointer_cast(array_data); auto time64_type = static_pointer_cast(t64_array->type()); From 1a573be0908a9ad7057da1b44be334ba32070ecc Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 11:53:52 +1100 Subject: [PATCH 17/71] Debugging --- src/TableData.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/TableData.cpp b/src/TableData.cpp index 738e8bd..240238b 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -814,6 +814,7 @@ K readORCData(K orc_file, K options) if (!kx::arrowkdb::IsKdbString(orc_file)) return krr((S)"orc_file not 11h or 0 of 10h"); + return krr((S)"This Is readORCData"); // Parse the options auto read_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); From 5586948300e9f2cfe51e194cea9bb04ce3d3161e Mon Sep 
17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 11:57:31 +1100 Subject: [PATCH 18/71] Debugging --- src/TableData.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TableData.cpp b/src/TableData.cpp index 240238b..1b3f63d 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -814,7 +814,7 @@ K readORCData(K orc_file, K options) if (!kx::arrowkdb::IsKdbString(orc_file)) return krr((S)"orc_file not 11h or 0 of 10h"); - return krr((S)"This Is readORCData"); + krr((S)"This Is readORCData"); // Parse the options auto read_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); From 375e8f8281886b56a411b5c528d0a64d079981b9 Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 11:57:51 +1100 Subject: [PATCH 19/71] Debugging --- src/TableData.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TableData.cpp b/src/TableData.cpp index 1b3f63d..9285b55 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -814,7 +814,7 @@ K readORCData(K orc_file, K options) if (!kx::arrowkdb::IsKdbString(orc_file)) return krr((S)"orc_file not 11h or 0 of 10h"); - krr((S)"This Is readORCData"); + krr((S)"Debug Line"); // Parse the options auto read_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); From 1758b41b5ba3a76b94477c6b4922578a69c678a4 Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 11:59:08 +1100 Subject: [PATCH 20/71] Adding Debugging --- src/TableData.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/TableData.cpp b/src/TableData.cpp index 9285b55..3ccf1c8 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -815,6 +815,7 @@ K readORCData(K orc_file, K options) return krr((S)"orc_file not 11h or 0 of 10h"); krr((S)"Debug Line"); + std::cout << "TIME32"; // Parse the options auto read_options = kx::arrowkdb::KdbOptions(options, 
kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); From cf987e495ea55091858474b2f2307aff8eaa9da0 Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 12:02:52 +1100 Subject: [PATCH 21/71] Adding Debugging --- src/TableData.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TableData.cpp b/src/TableData.cpp index 3ccf1c8..8494761 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -814,8 +814,8 @@ K readORCData(K orc_file, K options) if (!kx::arrowkdb::IsKdbString(orc_file)) return krr((S)"orc_file not 11h or 0 of 10h"); - krr((S)"Debug Line"); std::cout << "TIME32"; + krr((S)"Debug Line"); // Parse the options auto read_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); From 407f6d2140890997f937f6fe42ad1cc8ee0283bb Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 12:13:35 +1100 Subject: [PATCH 22/71] Adding Debugging --- src/ArrayReader.cpp | 12 ++++++++++-- src/TableData.cpp | 2 -- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 098b53f..1baf8e9 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -143,6 +143,7 @@ void AppendArray(shared_ptr array_data, K k_arr template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { + krr((S)"UINT8"); auto uint8_array = static_pointer_cast(array_data); auto length = uint8_array->length(); if( type_overrides.null_mapping.have_uint8 && uint8_array->null_count() ){ @@ -160,6 +161,7 @@ void AppendArray(shared_ptr array_data, K k_ar template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { + krr((S)"INT8"); auto int8_array = static_pointer_cast(array_data); auto length = int8_array->length(); if( type_overrides.null_mapping.have_int8 && int8_array->null_count() ){ @@ -177,6 +179,7 @@ void 
AppendArray(shared_ptr array_data, K k_arr template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { + krr((S)"UINT16"); auto uint16_array = static_pointer_cast(array_data); auto length = uint16_array->length(); if( type_overrides.null_mapping.have_uint16 && uint16_array->null_count() ){ @@ -194,6 +197,7 @@ void AppendArray(shared_ptr array_data, K k_a template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { + krr((S)"INT16"); auto int16_array = static_pointer_cast(array_data); auto length = int16_array->length(); if( type_overrides.null_mapping.have_int16 && int16_array->null_count() ){ @@ -211,6 +215,7 @@ void AppendArray(shared_ptr array_data, K k_ar template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { + krr((S)"UINT32"); auto uint32_array = static_pointer_cast(array_data); auto length = uint32_array->length(); if( type_overrides.null_mapping.have_uint32 && uint32_array->null_count() ){ @@ -228,6 +233,7 @@ void AppendArray(shared_ptr array_data, K k_a template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { + krr((S)"INT32"); auto int32_array = static_pointer_cast(array_data); auto length = int32_array->length(); if( type_overrides.null_mapping.have_int32 && int32_array->null_count() ){ @@ -245,6 +251,7 @@ void AppendArray(shared_ptr array_data, K k_ar template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { + krr((S)"UINT64"); auto uint64_array = static_pointer_cast(array_data); auto length = uint64_array->length(); if( type_overrides.null_mapping.have_uint64 && uint64_array->null_count() ){ @@ -262,6 +269,7 @@ void AppendArray(shared_ptr array_data, K k_a template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) 
{ + krr((S)"INT64"); auto int64_array = static_pointer_cast(array_data); auto length = int64_array->length(); if( type_overrides.null_mapping.have_int64 && int64_array->null_count() ){ @@ -470,7 +478,7 @@ void AppendArray(shared_ptr array_data, K template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - std::cout << "TIME32"; + krr((S)"TIME32"); TemporalConversion tc(array_data->type()); auto t32_array = static_pointer_cast(array_data); auto time32_type = static_pointer_cast(t32_array->type()); @@ -485,7 +493,7 @@ void AppendArray(shared_ptr array_data, K k_a template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - std::cout << "TIME64"; + krr((S)"TIME64"); TemporalConversion tc(array_data->type()); auto t64_array = static_pointer_cast(array_data); auto time64_type = static_pointer_cast(t64_array->type()); diff --git a/src/TableData.cpp b/src/TableData.cpp index 8494761..738e8bd 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -814,8 +814,6 @@ K readORCData(K orc_file, K options) if (!kx::arrowkdb::IsKdbString(orc_file)) return krr((S)"orc_file not 11h or 0 of 10h"); - std::cout << "TIME32"; - krr((S)"Debug Line"); // Parse the options auto read_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); From a728fc8a96836c823af14fc5b959b59cdc88d5fc Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 12:19:43 +1100 Subject: [PATCH 23/71] Checking if just int32 or from time --- src/ArrayReader.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 1baf8e9..4bcb48f 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -478,7 +478,6 @@ void AppendArray(shared_ptr array_data, K template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - krr((S)"TIME32"); 
TemporalConversion tc(array_data->type()); auto t32_array = static_pointer_cast(array_data); auto time32_type = static_pointer_cast(t32_array->type()); @@ -493,7 +492,6 @@ void AppendArray(shared_ptr array_data, K k_a template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - krr((S)"TIME64"); TemporalConversion tc(array_data->type()); auto t64_array = static_pointer_cast(array_data); auto time64_type = static_pointer_cast(t64_array->type()); From 497635f536c1057ba9884389c05743f797f0abcf Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 12:22:29 +1100 Subject: [PATCH 24/71] Replacing 32 bit logic with 64 --- src/ArrayReader.cpp | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 4bcb48f..30ee133 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -143,7 +143,7 @@ void AppendArray(shared_ptr array_data, K k_arr template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - krr((S)"UINT8"); + // krr((S)"UINT8"); auto uint8_array = static_pointer_cast(array_data); auto length = uint8_array->length(); if( type_overrides.null_mapping.have_uint8 && uint8_array->null_count() ){ @@ -161,7 +161,7 @@ void AppendArray(shared_ptr array_data, K k_ar template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - krr((S)"INT8"); + // krr((S)"INT8"); auto int8_array = static_pointer_cast(array_data); auto length = int8_array->length(); if( type_overrides.null_mapping.have_int8 && int8_array->null_count() ){ @@ -179,7 +179,7 @@ void AppendArray(shared_ptr array_data, K k_arr template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - krr((S)"UINT16"); + // krr((S)"UINT16"); auto uint16_array = static_pointer_cast(array_data); auto length = 
uint16_array->length(); if( type_overrides.null_mapping.have_uint16 && uint16_array->null_count() ){ @@ -197,7 +197,7 @@ void AppendArray(shared_ptr array_data, K k_a template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - krr((S)"INT16"); + // krr((S)"INT16"); auto int16_array = static_pointer_cast(array_data); auto length = int16_array->length(); if( type_overrides.null_mapping.have_int16 && int16_array->null_count() ){ @@ -215,7 +215,7 @@ void AppendArray(shared_ptr array_data, K k_ar template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - krr((S)"UINT32"); + // krr((S)"UINT32"); auto uint32_array = static_pointer_cast(array_data); auto length = uint32_array->length(); if( type_overrides.null_mapping.have_uint32 && uint32_array->null_count() ){ @@ -233,17 +233,19 @@ void AppendArray(shared_ptr array_data, K k_a template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - krr((S)"INT32"); - auto int32_array = static_pointer_cast(array_data); - auto length = int32_array->length(); - if( type_overrides.null_mapping.have_int32 && int32_array->null_count() ){ + // krr((S)"INT32"); + // auto int32_array = static_pointer_cast(array_data); + auto int64_array = static_pointer_cast(array_data); + auto length = int64_array->length(); + if( type_overrides.null_mapping.have_int64 && int64_array->null_count() ){ for( auto i = 0ll; i < length; ++i ){ - kI( k_array )[i] = ( int32_array->IsNull( i ) * type_overrides.null_mapping.int32_null ) - + (!int32_array->IsNull( i ) * int32_array->Value( i ) ); + kI( k_array )[i] = ( int64_array->IsNull( i ) * type_overrides.null_mapping.int64_null ) + + (!int64_array->IsNull( i ) * int64_array->Value( i ) ); } } else { - memcpy( &kI( k_array )[index], int32_array->raw_values(), length * sizeof( arrow::Int32Array::value_type ) ); + // memcpy( &kI( k_array 
)[index], int32_array->raw_values(), length * sizeof( arrow::Int32Array::value_type ) ); + memcpy( &kJ( k_array )[index], int64_array->raw_values(), length * sizeof( arrow::Int64Array::value_type ) ); } index += length; } @@ -251,7 +253,7 @@ void AppendArray(shared_ptr array_data, K k_ar template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - krr((S)"UINT64"); + // krr((S)"UINT64"); auto uint64_array = static_pointer_cast(array_data); auto length = uint64_array->length(); if( type_overrides.null_mapping.have_uint64 && uint64_array->null_count() ){ @@ -269,7 +271,7 @@ void AppendArray(shared_ptr array_data, K k_a template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - krr((S)"INT64"); + // krr((S)"INT64"); auto int64_array = static_pointer_cast(array_data); auto length = int64_array->length(); if( type_overrides.null_mapping.have_int64 && int64_array->null_count() ){ From 5a44f8fb8fd95ff1627d7b658a54e3838208f568 Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 12:47:13 +1100 Subject: [PATCH 25/71] Using mixed list --- src/ArrayReader.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 30ee133..daab9e5 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -234,18 +234,22 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { // krr((S)"INT32"); - // auto int32_array = static_pointer_cast(array_data); - auto int64_array = static_pointer_cast(array_data); - auto length = int64_array->length(); - if( type_overrides.null_mapping.have_int64 && int64_array->null_count() ){ + auto int32_array = static_pointer_cast(array_data); + auto length = int32_array->length(); + if( type_overrides.null_mapping.have_int32 && int32_array->null_count() ){ for( auto i = 0ll; i < length; ++i ){ - 
kI( k_array )[i] = ( int64_array->IsNull( i ) * type_overrides.null_mapping.int64_null ) - + (!int64_array->IsNull( i ) * int64_array->Value( i ) ); + kI( k_array )[i] = ( int32_array->IsNull( i ) * type_overrides.null_mapping.int32_null ) + + (!int32_array->IsNull( i ) * int32_array->Value( i ) ); } } else { + for (auto i = 0; i < int32_array->length(); ++i) { + auto integer32 = arrow::Int32(int32_array->Value(i)); + K k_int = ktn(KG, 16); // Can change the second argument to 16 or 32 or 64 to test + integer32.ToBytes(kG(k_int)); + kK(k_array)[index++] = k_int; // Might have to change kI? - Check back on Decimal way + } // memcpy( &kI( k_array )[index], int32_array->raw_values(), length * sizeof( arrow::Int32Array::value_type ) ); - memcpy( &kJ( k_array )[index], int64_array->raw_values(), length * sizeof( arrow::Int64Array::value_type ) ); } index += length; } From a73e6b916540d93c49a416a0b37bde2b6c1c58a2 Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 12:50:52 +1100 Subject: [PATCH 26/71] Using mixed list --- src/ArrayReader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index daab9e5..38ca3a8 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -244,7 +244,7 @@ void AppendArray(shared_ptr array_data, K k_ar } else { for (auto i = 0; i < int32_array->length(); ++i) { - auto integer32 = arrow::Int32(int32_array->Value(i)); + auto integer32 = arrow::int32(int32_array->Value(i)); K k_int = ktn(KG, 16); // Can change the second argument to 16 or 32 or 64 to test integer32.ToBytes(kG(k_int)); kK(k_array)[index++] = k_int; // Might have to change kI? 
- Check back on Decimal way From 55c4806ccf173c3ae52a6f4c551f26b107d1012e Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 12:57:13 +1100 Subject: [PATCH 27/71] kI without scaling --- src/ArrayReader.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 38ca3a8..94d2861 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -235,6 +235,7 @@ void AppendArray(shared_ptr array_data, K k_ar { // krr((S)"INT32"); auto int32_array = static_pointer_cast(array_data); + auto int32_type = std::static_pointer_cast(int32_array->type()); auto length = int32_array->length(); if( type_overrides.null_mapping.have_int32 && int32_array->null_count() ){ for( auto i = 0ll; i < length; ++i ){ @@ -245,9 +246,9 @@ void AppendArray(shared_ptr array_data, K k_ar else { for (auto i = 0; i < int32_array->length(); ++i) { auto integer32 = arrow::int32(int32_array->Value(i)); - K k_int = ktn(KG, 16); // Can change the second argument to 16 or 32 or 64 to test - integer32.ToBytes(kG(k_int)); - kK(k_array)[index++] = k_int; // Might have to change kI? - Check back on Decimal way + // K k_int = ktn(KG, 16); // Can change the second argument to 16 or 32 or 64 to test + // integer32.ToBytes(kG(k_int)); + kI(k_array)[index++] = integer32; // Might have to change kI? 
- Check back on Decimal way } // memcpy( &kI( k_array )[index], int32_array->raw_values(), length * sizeof( arrow::Int32Array::value_type ) ); } From e25160c2fae0054ca9ef84cc651388c2a9f784a7 Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 13:52:05 +1100 Subject: [PATCH 28/71] Removing argument --- src/ArrayReader.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 94d2861..d34de71 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -245,7 +245,8 @@ void AppendArray(shared_ptr array_data, K k_ar } else { for (auto i = 0; i < int32_array->length(); ++i) { - auto integer32 = arrow::int32(int32_array->Value(i)); + // auto integer32 = arrow::int32(int32_array->Value(i)); // Doesn't work because it can't take arguments + auto integer32 = int32_array->Value(i); // K k_int = ktn(KG, 16); // Can change the second argument to 16 or 32 or 64 to test // integer32.ToBytes(kG(k_int)); kI(k_array)[index++] = integer32; // Might have to change kI? - Check back on Decimal way From fb0cfe5572a2b083c5343d78dedc967251c2f4bc Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 14:05:41 +1100 Subject: [PATCH 29/71] Once removed --- src/ArrayReader.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index d34de71..f9be659 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -246,10 +246,9 @@ void AppendArray(shared_ptr array_data, K k_ar else { for (auto i = 0; i < int32_array->length(); ++i) { // auto integer32 = arrow::int32(int32_array->Value(i)); // Doesn't work because it can't take arguments - auto integer32 = int32_array->Value(i); // K k_int = ktn(KG, 16); // Can change the second argument to 16 or 32 or 64 to test // integer32.ToBytes(kG(k_int)); - kI(k_array)[index++] = integer32; // Might have to change kI? 
- Check back on Decimal way + kI(k_array)[index++] = int32_array->Value(i); // Might have to change kI? - Check back on Decimal way } // memcpy( &kI( k_array )[index], int32_array->raw_values(), length * sizeof( arrow::Int32Array::value_type ) ); } From 74f680cb490b37111b2f8b7a08a3ab0bffe44a34 Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 14:20:59 +1100 Subject: [PATCH 30/71] Test --- src/ArrayReader.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index f9be659..6d3cb64 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -246,9 +246,12 @@ void AppendArray(shared_ptr array_data, K k_ar else { for (auto i = 0; i < int32_array->length(); ++i) { // auto integer32 = arrow::int32(int32_array->Value(i)); // Doesn't work because it can't take arguments + auto integer32 = int32_array->Value(i); // K k_int = ktn(KG, 16); // Can change the second argument to 16 or 32 or 64 to test + K k_int = ktn(KI, 32); + memcpy(kI(k_int), integer32, 32); // integer32.ToBytes(kG(k_int)); - kI(k_array)[index++] = int32_array->Value(i); // Might have to change kI? - Check back on Decimal way + kI(k_array)[index++] = ; // Might have to change kI? 
- Check back on Decimal way } // memcpy( &kI( k_array )[index], int32_array->raw_values(), length * sizeof( arrow::Int32Array::value_type ) ); } From 6cbf8761f769e73485f913f37876d34728ebd4c8 Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 14:24:03 +1100 Subject: [PATCH 31/71] Test --- src/ArrayReader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 6d3cb64..d17133c 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -249,7 +249,7 @@ void AppendArray(shared_ptr array_data, K k_ar auto integer32 = int32_array->Value(i); // K k_int = ktn(KG, 16); // Can change the second argument to 16 or 32 or 64 to test K k_int = ktn(KI, 32); - memcpy(kI(k_int), integer32, 32); + memcpy(kI(k_int), integer32.data(), 32); // integer32.ToBytes(kG(k_int)); kI(k_array)[index++] = ; // Might have to change kI? - Check back on Decimal way } From afe713ba270b51c3f29e3a80b1e685baeffbd16a Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 14:27:05 +1100 Subject: [PATCH 32/71] Test --- src/ArrayReader.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index d17133c..d34de71 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -248,10 +248,8 @@ void AppendArray(shared_ptr array_data, K k_ar // auto integer32 = arrow::int32(int32_array->Value(i)); // Doesn't work because it can't take arguments auto integer32 = int32_array->Value(i); // K k_int = ktn(KG, 16); // Can change the second argument to 16 or 32 or 64 to test - K k_int = ktn(KI, 32); - memcpy(kI(k_int), integer32.data(), 32); // integer32.ToBytes(kG(k_int)); - kI(k_array)[index++] = ; // Might have to change kI? - Check back on Decimal way + kI(k_array)[index++] = integer32; // Might have to change kI? 
- Check back on Decimal way } // memcpy( &kI( k_array )[index], int32_array->raw_values(), length * sizeof( arrow::Int32Array::value_type ) ); } From dcf15dce3ea259f2101fd0c355024c2579969061 Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 16:43:49 +1100 Subject: [PATCH 33/71] String Int Test --- src/ArrayReader.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index d34de71..49d8446 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -246,10 +246,11 @@ void AppendArray(shared_ptr array_data, K k_ar else { for (auto i = 0; i < int32_array->length(); ++i) { // auto integer32 = arrow::int32(int32_array->Value(i)); // Doesn't work because it can't take arguments - auto integer32 = int32_array->Value(i); - // K k_int = ktn(KG, 16); // Can change the second argument to 16 or 32 or 64 to test - // integer32.ToBytes(kG(k_int)); - kI(k_array)[index++] = integer32; // Might have to change kI? - Check back on Decimal way + auto str_data = int32_array->Value(i); + str_data.ToString(kG(k_int)); + K k_str = ktn(KC, str_data.length()); + memcpy(kG(k_str), str_data.data(), str_data.length()); + kK(k_array)[index++] = k_str; } // memcpy( &kI( k_array )[index], int32_array->raw_values(), length * sizeof( arrow::Int32Array::value_type ) ); } From 8e0c0331a8e30305d2cd3990a24e176369399b2d Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 16:52:08 +1100 Subject: [PATCH 34/71] String Int Test --- src/ArrayReader.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 49d8446..d873a97 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -246,8 +246,8 @@ void AppendArray(shared_ptr array_data, K k_ar else { for (auto i = 0; i < int32_array->length(); ++i) { // auto integer32 = arrow::int32(int32_array->Value(i)); // Doesn't work because it can't take arguments - auto str_data = int32_array->Value(i); 
- str_data.ToString(kG(k_int)); + auto integer32 = int32_array->Value(i); + string str_data = to_string(integer32); K k_str = ktn(KC, str_data.length()); memcpy(kG(k_str), str_data.data(), str_data.length()); kK(k_array)[index++] = k_str; From 4f0a45dcc4016c091cdd98d2ebe02baf193228b2 Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 16:53:11 +1100 Subject: [PATCH 35/71] String Int Test --- src/ArrayReader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index d873a97..4689256 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -247,7 +247,7 @@ void AppendArray(shared_ptr array_data, K k_ar for (auto i = 0; i < int32_array->length(); ++i) { // auto integer32 = arrow::int32(int32_array->Value(i)); // Doesn't work because it can't take arguments auto integer32 = int32_array->Value(i); - string str_data = to_string(integer32); + std::string str_data = to_string(integer32); K k_str = ktn(KC, str_data.length()); memcpy(kG(k_str), str_data.data(), str_data.length()); kK(k_array)[index++] = k_str; From 4ea91fcf6ab417873404cc6c9b5fab0e74d13ee6 Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 16:55:19 +1100 Subject: [PATCH 36/71] String Int Test --- src/ArrayReader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 4689256..a10d6a1 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -247,7 +247,7 @@ void AppendArray(shared_ptr array_data, K k_ar for (auto i = 0; i < int32_array->length(); ++i) { // auto integer32 = arrow::int32(int32_array->Value(i)); // Doesn't work because it can't take arguments auto integer32 = int32_array->Value(i); - std::string str_data = to_string(integer32); + std::string str_data = std::to_string(integer32); K k_str = ktn(KC, str_data.length()); memcpy(kG(k_str), str_data.data(), str_data.length()); kK(k_array)[index++] = k_str; From 
2659f5458cef8386da12ca39c279f5f4a9d15bb3 Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 16:56:53 +1100 Subject: [PATCH 37/71] String Int Test --- src/ArrayReader.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index a10d6a1..de074b3 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -235,7 +235,6 @@ void AppendArray(shared_ptr array_data, K k_ar { // krr((S)"INT32"); auto int32_array = static_pointer_cast(array_data); - auto int32_type = std::static_pointer_cast(int32_array->type()); auto length = int32_array->length(); if( type_overrides.null_mapping.have_int32 && int32_array->null_count() ){ for( auto i = 0ll; i < length; ++i ){ From 60945fe1419d1bc5c38c329b216462b196502d4f Mon Sep 17 00:00:00 2001 From: Samuel Bruce Date: Fri, 21 Oct 2022 17:07:29 +1100 Subject: [PATCH 38/71] Trying without defining type --- src/ArrayReader.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index de074b3..e9bb391 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -233,7 +233,6 @@ void AppendArray(shared_ptr array_data, K k_a template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - // krr((S)"INT32"); auto int32_array = static_pointer_cast(array_data); auto length = int32_array->length(); if( type_overrides.null_mapping.have_int32 && int32_array->null_count() ){ @@ -246,10 +245,9 @@ void AppendArray(shared_ptr array_data, K k_ar for (auto i = 0; i < int32_array->length(); ++i) { // auto integer32 = arrow::int32(int32_array->Value(i)); // Doesn't work because it can't take arguments auto integer32 = int32_array->Value(i); - std::string str_data = std::to_string(integer32); - K k_str = ktn(KC, str_data.length()); - memcpy(kG(k_str), str_data.data(), str_data.length()); - kK(k_array)[index++] = k_str; + // K k_int = ktn(KG, 16); // Can change the second argument to 16 
or 32 or 64 to test + // integer32.ToBytes(kG(k_int)); + kI(k_array)[index++] = integer32; // Might have to change kI? - Check back on Decimal way } // memcpy( &kI( k_array )[index], int32_array->raw_values(), length * sizeof( arrow::Int32Array::value_type ) ); } From ced83257e54437a28b76f95ea7345540377da1a8 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 16 Mar 2023 14:10:27 +0300 Subject: [PATCH 39/71] Cleanup logging and debugging --- src/ArrayReader.cpp | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index e9bb391..1d78225 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -143,7 +143,6 @@ void AppendArray(shared_ptr array_data, K k_arr template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - // krr((S)"UINT8"); auto uint8_array = static_pointer_cast(array_data); auto length = uint8_array->length(); if( type_overrides.null_mapping.have_uint8 && uint8_array->null_count() ){ @@ -161,7 +160,6 @@ void AppendArray(shared_ptr array_data, K k_ar template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - // krr((S)"INT8"); auto int8_array = static_pointer_cast(array_data); auto length = int8_array->length(); if( type_overrides.null_mapping.have_int8 && int8_array->null_count() ){ @@ -179,7 +177,6 @@ void AppendArray(shared_ptr array_data, K k_arr template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - // krr((S)"UINT16"); auto uint16_array = static_pointer_cast(array_data); auto length = uint16_array->length(); if( type_overrides.null_mapping.have_uint16 && uint16_array->null_count() ){ @@ -197,7 +194,6 @@ void AppendArray(shared_ptr array_data, K k_a template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - // krr((S)"INT16"); auto 
int16_array = static_pointer_cast(array_data); auto length = int16_array->length(); if( type_overrides.null_mapping.have_int16 && int16_array->null_count() ){ @@ -215,7 +211,6 @@ void AppendArray(shared_ptr array_data, K k_ar template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - // krr((S)"UINT32"); auto uint32_array = static_pointer_cast(array_data); auto length = uint32_array->length(); if( type_overrides.null_mapping.have_uint32 && uint32_array->null_count() ){ @@ -242,14 +237,7 @@ void AppendArray(shared_ptr array_data, K k_ar } } else { - for (auto i = 0; i < int32_array->length(); ++i) { - // auto integer32 = arrow::int32(int32_array->Value(i)); // Doesn't work because it can't take arguments - auto integer32 = int32_array->Value(i); - // K k_int = ktn(KG, 16); // Can change the second argument to 16 or 32 or 64 to test - // integer32.ToBytes(kG(k_int)); - kI(k_array)[index++] = integer32; // Might have to change kI? - Check back on Decimal way - } - // memcpy( &kI( k_array )[index], int32_array->raw_values(), length * sizeof( arrow::Int32Array::value_type ) ); + memcpy( &kI( k_array )[index], int32_array->raw_values(), length * sizeof( arrow::Int32Array::value_type ) ); } index += length; } @@ -257,7 +245,6 @@ void AppendArray(shared_ptr array_data, K k_ar template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - // krr((S)"UINT64"); auto uint64_array = static_pointer_cast(array_data); auto length = uint64_array->length(); if( type_overrides.null_mapping.have_uint64 && uint64_array->null_count() ){ @@ -275,7 +262,6 @@ void AppendArray(shared_ptr array_data, K k_a template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - // krr((S)"INT64"); auto int64_array = static_pointer_cast(array_data); auto length = int64_array->length(); if( type_overrides.null_mapping.have_int64 && 
int64_array->null_count() ){ From fa9fd38bca59bd39262f4573372d89f7860d7ad2 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 16 Mar 2023 14:18:21 +0300 Subject: [PATCH 40/71] Fixing ORC Unit-tests --- tests/basic.t | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/basic.t b/tests/basic.t index 8519ae3..e4cbf1b 100644 --- a/tests/basic.t +++ b/tests/basic.t @@ -158,10 +158,10 @@ sparse_union_data:dense_union_data:(0 1 0h;1 2 3;4 5 6f) -1 "\n+----------|| Test integer types schema ||----------+\n"; -fields:(uint8_fd,int8_fd,uint16_fd,int16_fd,uint32_fd,int32_fd,uint64_fd,int64_fd) +fields:(int8_fd,int16_fd,int32_fd,int64_fd) schema:sc.schema[fields] sc.schemaFields[schema]~fields -array_data:(uint8_data;int8_data;uint16_data;int16_data;uint32_data;int32_data;uint64_data;int64_data) +array_data:(int8_data;int16_data;int32_data;int64_data) rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]} -1 "<--- Read/write parquet --->"; @@ -174,6 +174,7 @@ filename:"ints.parquet" pq.writeParquet[filename;schema;array_data;parquet_write_options] pq.readParquetSchema[filename]~schema pq.readParquetData[filename;::]~array_data +show filename; rm filename; -1 "<--- Read/write ORC --->"; @@ -186,6 +187,7 @@ filename:"ints.orc" orc.writeOrc[filename;schema;array_data;orc_write_options] orc.readOrcSchema[filename]~schema orc.readOrcData[filename;::]~array_data +show filename; rm filename; -1 "<--- Read/write arrow file --->"; From 29dc4978d6963ac02d2cb267c5c164462d280faf Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 16 Mar 2023 19:36:15 +0300 Subject: [PATCH 41/71] ORC pull-request dicsussion changes https://github.com/KxSystems/arrowkdb/pull/26 --- src/TableData.cpp | 6 +----- src/TableData.h | 43 +++++++++++++++++++------------------------ 2 files changed, 20 insertions(+), 29 deletions(-) diff --git a/src/TableData.cpp b/src/TableData.cpp index 738e8bd..06a4cc7 100644 --- a/src/TableData.cpp +++ 
b/src/TableData.cpp @@ -817,10 +817,6 @@ K readORCData(K orc_file, K options) // Parse the options auto read_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); - // Use multi threading - int64_t parquet_multithreaded_read = 0; - read_options.GetIntOption(kx::arrowkdb::Options::PARQUET_MULTITHREADED_READ, parquet_multithreaded_read); - // Use memmap int64_t use_mmap = 0; read_options.GetIntOption(kx::arrowkdb::Options::USE_MMAP, use_mmap); @@ -944,4 +940,4 @@ K writeORC(K orc_file, K schema_id, K array_data, K options) return (K)0; KDB_EXCEPTION_CATCH; -} \ No newline at end of file +} diff --git a/src/TableData.h b/src/TableData.h index acfaa1a..c66cd95 100644 --- a/src/TableData.h +++ b/src/TableData.h @@ -236,7 +236,7 @@ extern "C" * * Supported options: * - * USE_MMAP (long) - Flag indicating whether the parquet file should be memory + * USE_MMAP (long) - Flag indicating whether the IPC file should be memory * mapped in. This can improve performance on systems which support mmap. * Default 0 * @@ -303,16 +303,10 @@ extern "C" */ EXP K parseArrowData(K char_array, K options); - /** - * @brief Reads the arrow array data from the specified parquet file - * - * Supported options: - * - * PARQUET_MULTITHREADED_READ (long) - Flag indicating whether the parquet - * reader should run in multithreaded mode. This can improve performance by - * processing multiple columns in parallel. Default 0 + /** + * @brief Reads the arrow array data from the specified ORC file * - * USE_MMAP (long) - Flag indicating whether the parquet file should be memory + * USE_MMAP (long) - Flag indicating whether the ORC file should be memory * mapped in. This can improve performance on systems which support mmap. * Default 0 * @@ -320,7 +314,7 @@ extern "C" * default type mapping for the arrow decimal128 datatype and instead * represent it as a double (9h). Default 0. 
* - * @param orc_file String name of the parquet file to read + * @param orc_file String name of the ORC file to read * @options Dictionary of options or generic null (::) to use * defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or * mixed list of -7|-11|4h. @@ -328,16 +322,16 @@ extern "C" */ EXP K readORCData(K orc_file, K options); - /** - * @brief Reads the arrow schema from the specified parquet file + /** + * @brief Reads the arrow schema from the specified ORC file * - * @param orc_file String name of the parquet file to read + * @param orc_file String name of the ORC file to read * @return Schema identifier */ EXP K readORCSchema(K orc_file); - /** - * @brief Creates a parquet file with the specified arrow schema and populates + /** + * @brief Creates an ORC file with the specified arrow schema and populates * it from a mixed list of arrow array objects. * * The mixed list of arrow array data should be ordered in schema field @@ -345,12 +339,10 @@ extern "C" * according to the field's datatype. This required array data structure is * detailed for each of the datatype constructor functions. * - * Note that in general parquet only supports a subset of the the arrow - * datatypes with more limited functionality. For example the only supported - * nested datatypes are top level lists and structs (without further nesting). - * Similarly temporal datatypes with TimeUnit parameters only support MILLI or - * MICRO granularity. In such cases the parquet/arrow file writer will return - * an error. + * Note that in general ORC only supports a small subset of the arrow + * datatypes with more then limited functionality. Most importantly ORC doesn't + * support unsigned integer types. In such case the ORC writer may fail + * to write the file. * * Supported options: * @@ -358,7 +350,10 @@ extern "C" * default type mapping for the arrow decimal128 datatype and instead * represent it as a double (9h). Default 0. 
* - * @param orc_file String name of the parquet file to write + * ORC_CHUNK_SIZE (long) - ORC stripe size, to control the approximate size + * of data within a column stripe. This currently defaults to 1MB. + * + * @param orc_file String name of the ORC file to write * @param schema_id The schema identifier * @param array_data Mixed list of arrow array data to be written to the * file @@ -371,4 +366,4 @@ extern "C" } -#endif // __TABLE_DATA_H__ \ No newline at end of file +#endif // __TABLE_DATA_H__ From 6d8851384cd8c324277eafad2e75aab04a819dd6 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 16 Mar 2023 19:37:16 +0300 Subject: [PATCH 42/71] Reverted folder prefix to restore Windows build --- tests/basic.t | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/basic.t b/tests/basic.t index e4cbf1b..e61fcf5 100644 --- a/tests/basic.t +++ b/tests/basic.t @@ -2,7 +2,7 @@ -1 "\n+----------|| Load arrowkdb library ||----------+\n"; -\l arrowkdb.q +\l q/arrowkdb.q // Move to arrowkdb namespace \d .arrowkdb From 25895f29d1ee4058aed1da64ea28ca4e41636e57 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 16 Mar 2023 20:04:34 +0300 Subject: [PATCH 43/71] Handling write and close error statuses --- src/TableData.cpp | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/src/TableData.cpp b/src/TableData.cpp index 06a4cc7..151060a 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -909,10 +909,11 @@ K writeORC(K orc_file, K schema_id, K array_data, K options) if (!schema) return krr((S)"unknown schema"); + std::string path = kx::arrowkdb::GetKdbString( orc_file ); std::shared_ptr outfile; PARQUET_ASSIGN_OR_THROW( outfile, - arrow::io::FileOutputStream::Open(kx::arrowkdb::GetKdbString(orc_file))); + arrow::io::FileOutputStream::Open( path ) ); // Parse the options auto write_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); 
@@ -934,10 +935,28 @@ K writeORC(K orc_file, K schema_id, K array_data, K options) // Create the arrow table auto table = MakeTable(schema, array_data, type_overrides); - writer->Write(*table); - writer->Close(); + std::string reason; + auto writeStatus = writer->Write( *table ); + if( writeStatus != writeStatus.OK() ){ + reason = std::string( "Failed to write ORC file, name: " ) + .append( path ) + .append( ", reason: " ) + .append( writeStatus.ToString() ); + } - return (K)0; + auto closeStatus = writer->Close(); + if( closeStatus != closeStatus.OK() ){ + reason = std::string( "Failed to close ORC file, name: " ) + .append( path ) + .append( ", reason: " ) + .append( closeStatus.ToString() ); + } + + K result = reason.empty() + ? ( K )0 + : knk( 2, ks( S( "error" ) ), ks( S( reason.c_str() ) ) ); + + return result; KDB_EXCEPTION_CATCH; } From 375397b8b3d4d81f514cd4963c374b148c0dd6d6 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 16 Mar 2023 21:25:32 +0300 Subject: [PATCH 44/71] ORC Dataloader example --- examples/null_bitmap.q | 2 +- examples/orc_dataloader.q | 142 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+), 1 deletion(-) create mode 100644 examples/orc_dataloader.q diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index 9dbb7b4..99e75d2 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -359,4 +359,4 @@ show nested_union_nulls~stream_union_nulls[1] -1 "\n+----------------------------------------+\n"; // Process off -//exit 0; +exit 0; diff --git a/examples/orc_dataloader.q b/examples/orc_dataloader.q new file mode 100644 index 0000000..97cdffb --- /dev/null +++ b/examples/orc_dataloader.q @@ -0,0 +1,142 @@ +// orc_dataloader.q +// Examples of read/write ORC file + +-1"\n+----------|| orc_dataloader.q ||----------+\n"; + +// import the arrowkdb library +\l q/arrowkdb.q + +// Filesystem functions for Linux/MacOS/Windows +ls:{[filename] $[.z.o like "w*";system "dir /b ",filename;system "ls 
",filename]}; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +///////////////////////// +// CONSTRUCTED SCHEMAS // +///////////////////////// + +//-------------------// +// Create the schema // +//-------------------// + +// Create the datatype identifiers +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +i8_dt:.arrowkdb.dt.int8[]; +i16_dt:.arrowkdb.dt.int16[]; +i32_dt:.arrowkdb.dt.int32[]; +i64_dt:.arrowkdb.dt.int64[]; + +// Create the field identifiers +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +i8_fd:.arrowkdb.fd.field[`int8;i8_dt]; +i16_fd:.arrowkdb.fd.field[`int16;i16_dt]; +i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; + +// Create the schemas for the list of fields +dataloader_schema:.arrowkdb.sc.schema[(ts_fd,i8_fd,i16_fd,i32_fd,i64_fd)]; + +// Print the schemas +.arrowkdb.sc.printSchema[dataloader_schema]; + +// Number of items in each array +N:10 + +// Create data for each column in the table +ts_data:asc N?0p; + +i8_data:N?0x64; +i16_data:N?100h; +i32_data:N?100i; +i64_data:N?100; + +// Combine the data for all columns +dataloader_data:(ts_data;i8_data;i16_data;i32_data;i64_data); + +// Pretty print the Arrow table populated from the array data +.arrowkdb.tb.prettyPrintTable[dataloader_schema;dataloader_data;::]; + +//-------------------------// +// Example-1. 
Parquet file // +//-------------------------// + +// Write the schema and array data to a parquet file +orc_options:(``PARQUET_VERSION)!((::);`V2.0); + +parquet_dataloader:"orc_dataloader.parquet"; +.arrowkdb.pq.writeParquet[parquet_dataloader;dataloader_schema;dataloader_data;orc_options]; +show ls parquet_dataloader + +// Read the schema back and compare +parquet_dataloader_schema:.arrowkdb.pq.readParquetSchema[parquet_dataloader]; +show .arrowkdb.sc.equalSchemas[dataloader_schema;parquet_dataloader_schema] +show dataloader_schema~parquet_dataloader_schema + +// Read the array data back and compare +parquet_dataloader_data:.arrowkdb.pq.readParquetData[parquet_dataloader;orc_options]; +show dataloader_data~parquet_dataloader_data +rm parquet_dataloader; + +//---------------------------// +// Example-2. Apache ORC file// +//---------------------------// + +// Write the schema and array data to a ORC file +orc_options[`ORC_CHUNK_SIZE]:1024 + +orc_dataloader:"orc_dataloader.orc" +.arrowkdb.orc.writeOrc[orc_dataloader;dataloader_schema;dataloader_data;orc_options] +show orc_dataloader; + +// Read the schema back and compare +orc_dataloader_schema:.arrowkdb.orc.readOrcSchema[orc_dataloader]; +show .arrowkdb.sc.equalSchemas[dataloader_schema;orc_dataloader_schema] +show dataloader_schema~orc_dataloader_schema + +// Read the array data back and compare +orc_dataloader_data:.arrowkdb.orc.readOrcData[orc_dataloader;orc_options]; +show dataloader_data~orc_dataloader_data +rm orc_dataloader; + +//---------------------------// +// Example-3. 
Arrow IPC file // +//---------------------------// + +// Write the schema and array data to an arrow file +arrow_dataloader:"orc_dataloader.arrow"; +.arrowkdb.ipc.writeArrow[arrow_dataloader;dataloader_schema;dataloader_data;orc_options]; +show ls arrow_dataloader + +// Read the schema back and compare +arrow_dataloader_schema:.arrowkdb.ipc.readArrowSchema[arrow_dataloader]; +show .arrowkdb.sc.equalSchemas[dataloader_schema;arrow_dataloader_schema] +show dataloader_schema~arrow_dataloader_schema + +// Read the array data back and compare +arrow_dataloader_data:.arrowkdb.ipc.readArrowData[arrow_dataloader;orc_options]; +show dataloader_data~arrow_dataloader_data +rm arrow_dataloader; + +//-----------------------------// +// Example-4. Arrow IPC stream // +//-----------------------------// + +// Serialize the schema and array data to an arrow stream +serialized_dataloader:.arrowkdb.ipc.serializeArrow[dataloader_schema;dataloader_data;orc_options]; +show serialized_dataloader + +// Parse the schema back abd compare +stream_dataloader_schema:.arrowkdb.ipc.parseArrowSchema[serialized_dataloader]; +show .arrowkdb.sc.equalSchemas[dataloader_schema;stream_dataloader_schema] +show dataloader_schema~stream_dataloader_schema + +// Parse the array data back and compare +stream_dataloader_data:.arrowkdb.ipc.parseArrowData[serialized_dataloader;orc_options]; +show dataloader_data~stream_dataloader_data + + +-1 "\n+----------------------------------------+\n"; + +// Process off +exit 0; From 728f1bed45b19dd602ab8da51ec59e86a03beb69 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 16 Mar 2023 21:26:17 +0300 Subject: [PATCH 45/71] Reduced schema for ORC Dataloader only --- tests/basic.t | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/basic.t b/tests/basic.t index e61fcf5..e0c13c6 100644 --- a/tests/basic.t +++ b/tests/basic.t @@ -158,10 +158,14 @@ sparse_union_data:dense_union_data:(0 1 0h;1 2 3;4 5 6f) -1 "\n+----------|| Test 
integer types schema ||----------+\n"; -fields:(int8_fd,int16_fd,int32_fd,int64_fd) +fields:(uint8_fd,int8_fd,uint16_fd,int16_fd,uint32_fd,int32_fd,uint64_fd,int64_fd) +orc_fields:(int8_fd,int16_fd,int32_fd,int64_fd) schema:sc.schema[fields] +orc_schema:sc.schema[orc_fields] sc.schemaFields[schema]~fields -array_data:(int8_data;int16_data;int32_data;int64_data) +sc.schemaFields[orc_schema]~orc_fields +array_data:(uint8_data;int8_data;uint16_data;int16_data;uint32_data;int32_data;uint64_data;int64_data) +orc_array_data:(int8_data;int16_data;int32_data;int64_data) rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]} -1 "<--- Read/write parquet --->"; @@ -174,7 +178,6 @@ filename:"ints.parquet" pq.writeParquet[filename;schema;array_data;parquet_write_options] pq.readParquetSchema[filename]~schema pq.readParquetData[filename;::]~array_data -show filename; rm filename; -1 "<--- Read/write ORC --->"; @@ -184,10 +187,9 @@ orc_read_options:enlist[`ORC_CHUNK_SIZE]!enlist[1024] filename:"ints.orc" -orc.writeOrc[filename;schema;array_data;orc_write_options] -orc.readOrcSchema[filename]~schema -orc.readOrcData[filename;::]~array_data -show filename; +orc.writeOrc[filename;orc_schema;orc_array_data;orc_write_options] +orc.readOrcSchema[filename]~orc_schema +orc.readOrcData[filename;::]~orc_array_data rm filename; -1 "<--- Read/write arrow file --->"; @@ -436,6 +438,7 @@ ipc.parseArrowSchema[serialized]~schema ipc.parseArrowData[serialized;::]~array_data sc.removeSchema[schema] +sc.removeSchema[orc_schema] -1 "\n+----------|| Clean up the constructed fields and datatypes ||----------+\n"; From 40a56d048ab99c545a771d1f943ca8b655fb7f99 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 20 Mar 2023 19:57:50 +0300 Subject: [PATCH 46/71] Cut off ORC support on Windows --- src/TableData.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/TableData.cpp b/src/TableData.cpp index 151060a..333f7df 100644 --- a/src/TableData.cpp 
+++ b/src/TableData.cpp @@ -2,7 +2,10 @@ #include #include +#ifndef _WIN32 #include +#endif + #include #include #include @@ -811,6 +814,9 @@ K readORCData(K orc_file, K options) { KDB_EXCEPTION_TRY; +#ifdef _WIN32 + return krr((S)"ORC files are not supported on Windows"); +#else if (!kx::arrowkdb::IsKdbString(orc_file)) return krr((S)"orc_file not 11h or 0 of 10h"); @@ -856,6 +862,7 @@ K readORCData(K orc_file, K options) } return data; +#endif KDB_EXCEPTION_CATCH; } @@ -864,6 +871,9 @@ K readORCSchema(K orc_file) { KDB_EXCEPTION_TRY; +#ifdef _WIN32 + return krr((S)"ORC files are not supported on Windows"); +#else if (!kx::arrowkdb::IsKdbString(orc_file)) return krr((S)"orc_file not 11h or 0 of 10h"); @@ -891,6 +901,7 @@ K readORCSchema(K orc_file) // Return the new schema_id return ki(kx::arrowkdb::GetSchemaStore()->Add(schema)); +#endif KDB_EXCEPTION_CATCH; } @@ -900,6 +911,9 @@ K writeORC(K orc_file, K schema_id, K array_data, K options) { KDB_EXCEPTION_TRY; +#ifdef _WIN32 + return krr((S)"ORC files are not supported on Windows"); +#else if (!kx::arrowkdb::IsKdbString(orc_file)) return krr((S)"orc_file not 11h or 0 of 10h"); if (schema_id->t != -KI) @@ -957,6 +971,7 @@ K writeORC(K orc_file, K schema_id, K array_data, K options) : knk( 2, ks( S( "error" ) ), ks( S( reason.c_str() ) ) ); return result; +#endif KDB_EXCEPTION_CATCH; } From 33f6aa82f7a65e7f9e58d6594fbf43e456edd3e7 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 20 Mar 2023 20:28:36 +0300 Subject: [PATCH 47/71] Segregation of ORC dataloader test --- tests/basic.t | 17 ---- tests/orc_dataloader/orc_dataloader.t | 107 ++++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 17 deletions(-) create mode 100644 tests/orc_dataloader/orc_dataloader.t diff --git a/tests/basic.t b/tests/basic.t index e0c13c6..f42ae75 100644 --- a/tests/basic.t +++ b/tests/basic.t @@ -159,13 +159,9 @@ sparse_union_data:dense_union_data:(0 1 0h;1 2 3;4 5 6f) -1 "\n+----------|| Test integer types schema 
||----------+\n"; fields:(uint8_fd,int8_fd,uint16_fd,int16_fd,uint32_fd,int32_fd,uint64_fd,int64_fd) -orc_fields:(int8_fd,int16_fd,int32_fd,int64_fd) schema:sc.schema[fields] -orc_schema:sc.schema[orc_fields] sc.schemaFields[schema]~fields -sc.schemaFields[orc_schema]~orc_fields array_data:(uint8_data;int8_data;uint16_data;int16_data;uint32_data;int32_data;uint64_data;int64_data) -orc_array_data:(int8_data;int16_data;int32_data;int64_data) rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]} -1 "<--- Read/write parquet --->"; @@ -180,18 +176,6 @@ pq.readParquetSchema[filename]~schema pq.readParquetData[filename;::]~array_data rm filename; --1 "<--- Read/write ORC --->"; -// This is required for writedown of small files -orc_write_options:enlist[`ORC_CHUNK_SIZE]!enlist[1024] -orc_read_options:enlist[`ORC_CHUNK_SIZE]!enlist[1024] - - -filename:"ints.orc" -orc.writeOrc[filename;orc_schema;orc_array_data;orc_write_options] -orc.readOrcSchema[filename]~orc_schema -orc.readOrcData[filename;::]~orc_array_data -rm filename; - -1 "<--- Read/write arrow file --->"; filename:"ints.arrow" @@ -438,7 +422,6 @@ ipc.parseArrowSchema[serialized]~schema ipc.parseArrowData[serialized;::]~array_data sc.removeSchema[schema] -sc.removeSchema[orc_schema] -1 "\n+----------|| Clean up the constructed fields and datatypes ||----------+\n"; diff --git a/tests/orc_dataloader/orc_dataloader.t b/tests/orc_dataloader/orc_dataloader.t new file mode 100644 index 0000000..5ea03f4 --- /dev/null +++ b/tests/orc_dataloader/orc_dataloader.t @@ -0,0 +1,107 @@ +// orc_dataloader.t + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +i8_dt:.arrowkdb.dt.int8[]; 
+i16_dt:.arrowkdb.dt.int16[]; +i32_dt:.arrowkdb.dt.int32[]; +i64_dt:.arrowkdb.dt.int64[]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +i8_fd:.arrowkdb.fd.field[`int8;i8_dt]; +i16_fd:.arrowkdb.fd.field[`int16;i16_dt]; +i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; + +-1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; +dataloader_schema:.arrowkdb.sc.schema[(ts_fd,i8_fd,i16_fd,i32_fd,i64_fd)]; + +-1"\n+----------|| Number of items in each array ||----------+\n"; +N:10 + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +i8_data:N?0x64; +i16_data:N?100h; +i32_data:N?100i; +i64_data:N?100; + +-1"\n+----------|| Combine the data for all columns ||----------+\n"; +dataloader_data:(ts_data;i8_data;i16_data;i32_data;i64_data); + +-1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; +orc_options:(``PARQUET_VERSION)!((::);`V2.0); + +parquet_dataloader:"orc_dataloader.parquet"; +.arrowkdb.pq.writeParquet[parquet_dataloader;dataloader_schema;dataloader_data;orc_options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +parquet_dataloader_schema:.arrowkdb.pq.readParquetSchema[parquet_dataloader]; +.arrowkdb.sc.equalSchemas[dataloader_schema;parquet_dataloader_schema] +dataloader_schema~parquet_dataloader_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +parquet_dataloader_data:.arrowkdb.pq.readParquetData[parquet_dataloader;orc_options]; +dataloader_data~parquet_dataloader_data +rm parquet_dataloader; + +-1"\n+----------|| Write the schema and array data to a ORC file ||----------+\n"; +orc_options[`ORC_CHUNK_SIZE]:1024 + +orc_dataloader:"orc_dataloader.orc" +.arrowkdb.orc.writeOrc[orc_dataloader;dataloader_schema;dataloader_data;orc_options] + +-1"\n+----------|| Read the schema back and compare 
||----------+\n";
+orc_dataloader_schema:.arrowkdb.orc.readOrcSchema[orc_dataloader];
+.arrowkdb.sc.equalSchemas[dataloader_schema;orc_dataloader_schema]
+dataloader_schema~orc_dataloader_schema
+
+-1"\n+----------|| Read the array data back and compare ||----------+\n";
+orc_dataloader_data:.arrowkdb.orc.readOrcData[orc_dataloader;orc_options];
+dataloader_data~orc_dataloader_data
+rm orc_dataloader;
+
+-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n";
+arrow_dataloader:"orc_dataloader.arrow";
+.arrowkdb.ipc.writeArrow[arrow_dataloader;dataloader_schema;dataloader_data;orc_options];
+
+-1"\n+----------|| Read the schema back and compare ||----------+\n";
+arrow_dataloader_schema:.arrowkdb.ipc.readArrowSchema[arrow_dataloader];
+.arrowkdb.sc.equalSchemas[dataloader_schema;arrow_dataloader_schema]
+dataloader_schema~arrow_dataloader_schema
+
+-1"\n+----------|| Read the array data back and compare ||----------+\n";
+arrow_dataloader_data:.arrowkdb.ipc.readArrowData[arrow_dataloader;orc_options];
+dataloader_data~arrow_dataloader_data
+rm arrow_dataloader;
+
+-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n";
+serialized_dataloader:.arrowkdb.ipc.serializeArrow[dataloader_schema;dataloader_data;orc_options];
+
+-1"\n+----------|| Parse the schema back and compare ||----------+\n";
+stream_dataloader_schema:.arrowkdb.ipc.parseArrowSchema[serialized_dataloader];
+.arrowkdb.sc.equalSchemas[dataloader_schema;stream_dataloader_schema]
+dataloader_schema~stream_dataloader_schema
+
+-1"\n+----------|| Parse the array data back and compare ||----------+\n";
+stream_dataloader_data:.arrowkdb.ipc.parseArrowData[serialized_dataloader;orc_options];
+dataloader_data~stream_dataloader_data
+
+
+-1 "\n+----------|| Test utils ||----------+\n";
+
+show .arrowkdb.util.buildInfo[]
+(type .arrowkdb.util.buildInfo[])~99h
+
+
+-1 "\n+----------|| Finished testing ||----------+\n";
From 
82ec1361250800604ee1fad58bc528568bd51380 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 20 Mar 2023 20:38:25 +0300 Subject: [PATCH 48/71] Disable ORC Dataloader tests on Windows --- .travis.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 0606daf..357f415 100644 --- a/.travis.yml +++ b/.travis.yml @@ -82,7 +82,11 @@ before_install: script: - if [[ $TESTS == "True" && "x$OD" != "x" && "x$QLIC_KC" != "x" ]]; then curl -o test.q -L https://github.com/KxSystems/hdf5/raw/master/test.q; - q test.q tests -q && q test.q tests/null_mapping -q && q test.q tests/null_bitmap -q; + if [[ $TRAVIS_OS_NAME == "windows" ]]; then + q test.q tests -q && q test.q tests/null_mapping -q && q test.q tests/null_bitmap -q; + else + q test.q tests -q && q test.q tests/null_mapping -q && q test.q tests/null_bitmap -q && q test.q tests/orc_dataloader -q; + fi fi - if [[ $TRAVIS_OS_NAME == "windows" && $BUILD == "True" ]]; then 7z a -tzip -r $FILE_NAME ./cmake/$FILE_ROOT/*; From 0c63eb3174b19fc76b86c5949192d8877199cf49 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 21 Mar 2023 09:34:36 +0000 Subject: [PATCH 49/71] ORC dataloader reference manual --- docs/reference.md | 159 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 159 insertions(+) diff --git a/docs/reference.md b/docs/reference.md index 98efb53..6fbc5dd 100755 --- a/docs/reference.md +++ b/docs/reference.md @@ -1971,6 +1971,165 @@ str_field: ] ``` +## Apache ORC files + +### `orc.writeOrc` + +*Convert a kdb+ mixed list of array data to an Arrow table and write to an Apache ORC file* +```txt +.arrowkdb.orc.writeOrc[orc_file;schema_id;array_data;options] +``` + +Where: + +- `orc_file` is a string containing the ORC file name +- `schema_id` is the schema identifier to use for the table +- `array_data` is a mixed list of array data +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. 
Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. + +returns generic null on success + +> :warning: The Apache ORC file format is not supported on Windows platform and will return with the message **ORC files are not supported on Windows**. + +The mixed list of Arrow array data should be ordered in schema field number and each list item representing one of the arrays must be structured according to the field’s datatype. + +Supported options: + +- `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB. + +> :warning: The Apache ORC file format is [less](https://arrow.apache.org/docs/cpp/orc.html) fully featured compared to Parquet and consequently the ORC dataloader currently **does not support unsigned datatypes**. + +```q +q)i8_fd:.arrowkdb.fd.field[`int8;.arrowkdb.dt.int8[]]; +q)i16_fd:.arrowkdb.fd.field[`int16;.arrowkdb.dt.int16[]]; +q)i32_fd:.arrowkdb.fd.field[`int32;.arrowkdb.dt.int32[]]; +q)orc_schema:.arrowkdb.sc.schema[(i8_fd,i16_fd,i32_fd)]; +q)orc_data:(5?0x64;5?100h;5?100i); +q).arrowkdb.orc.writeOrc["dataloader.orc";orc_schema;orc_data;options] +``` + +## `orc.writeOrcFromTable` + +*Convert a kdb+ table to an Arrow table and write to an Apache ORC file, inferring the schema from the kdb+ table structure* + +```txt +.arrowkdb.orc.writeOrcFromTable[orc_file;table;options] +``` + +Where: + +- `orc_file` is a string containing the ORC file name +- `table` is a kdb+ table +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. + +returns generic null on success + +> :warning: The Apache ORC file format is not supported on Windows platform and will return with the message **ORC files are not supported on Windows**. + +Supported options: + +- `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. 
Long, default 1MB. + +> :warning: The Apache ORC file format is [less](https://arrow.apache.org/docs/cpp/orc.html) fully featured compared to Parquet and consequently the ORC dataloader currently **does not support unsigned datatypes**. + +> :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors** +> +> Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as as described [here](#inferred-datatypes). + +```q +q)table:([] i8_fd:5?0x64; i16_fd:5?100h; i32_fd:5?100i) +q).arrowkdb.orc.writeOrcFromTable["dataloader.orc";table;::] +``` + +### `orc.readOrcSchema` + +*Read the schema from an Apache ORC file* + +```txt +.arrowkdb.orc.readOrcSchema[orc_file] +``` + +Where `orc_file` is a string containing the ORC file name + +returns the schema identifier + +> :warning: The Apache ORC file format is not supported on Windows platform and will return with the message **ORC files are not supported on Windows**. + +```q +q)i8_fd:.arrowkdb.fd.field[`int8;.arrowkdb.dt.int8[]]; +q)i16_fd:.arrowkdb.fd.field[`int16;.arrowkdb.dt.int16[]]; +q)i32_fd:.arrowkdb.fd.field[`int32;.arrowkdb.dt.int32[]]; +q)orc_schema:.arrowkdb.sc.schema[(i8_fd,i16_fd,i32_fd)]; +q)orc_data:(5?0x64;5?100h;5?100i); +q).arrowkdb.orc.writeOrc["dataloader.orc";orc_schema;orc_data;options] +q).arrowkdb.sc.equalSchemas[schema;.arrowkdb.orc.readOrcSchema["dataloader.orc"]] +1b +``` + +### `orc.readOrcData` + +*Read an Arrow table from an Apache ORC file and convert to a kdb+ mixed list of array data* + +```txt +.arrowkdb.orc.readOrcData[orc_file;options] +``` + +Where: + +- `orc_file` is a string containing the ORC file name +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. 
Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. + +returns the array data + +> :warning: The Apache ORC file format is not supported on Windows platform and will return with the message **ORC files are not supported on Windows**. + +Supported options: + +- `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB. + +```q +q)i8_fd:.arrowkdb.fd.field[`int8;.arrowkdb.dt.int8[]]; +q)i16_fd:.arrowkdb.fd.field[`int16;.arrowkdb.dt.int16[]]; +q)i32_fd:.arrowkdb.fd.field[`int32;.arrowkdb.dt.int32[]]; +q)orc_schema:.arrowkdb.sc.schema[(i8_fd,i16_fd,i32_fd)]; +q)orc_data:(5?0x64;5?100h;5?100i); +q).arrowkdb.orc.writeOrc["dataloader.orc";orc_schema;orc_data;options] +q)read_data:.arrowkdb.orc.readOrcData["dataloader.orc";::] +q)orc_data~read_data +1b +``` + +### `orc.readOrcToTable` + +*Read an Arrow table from an Apache ORC file and convert to a kdb+ table* + +```txt +.arrowkdb.orc.readOrcToTable[orc_file;options] +``` + +Where: + +- `orc_file` is a string containing the ORC file name +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. + +returns the kdb+ table + +> :warning: The Apache ORC file format is not supported on Windows platform and will return with the message **ORC files are not supported on Windows**. + +Each schema field name is used as the column name and the Arrow array data is used as the column data. + +Supported options: + +- `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB. 
+ +```q +q)table:([] i8_fd:5?0x64; i16_fd:5?100h; i32_fd:5?100i) +q).arrowkdb.orc.writeOrcFromTable["dataloader.orc";table;::] +q)read_table:.arrowkdb.orc.readOrcToTable["dataloader.orc";::] +q)read_table~table +1b +``` + ## Parquet files ### `pq.writeParquet` From 0651f14570cd690072fa6ef91580eb9f6607b1bb Mon Sep 17 00:00:00 2001 From: Slava Grechin <122450037+vgrechin-kx@users.noreply.github.com> Date: Thu, 23 Mar 2023 10:26:36 +0300 Subject: [PATCH 50/71] Update docs/reference.md Co-authored-by: nmcdonnell-kx <63713601+nmcdonnell-kx@users.noreply.github.com> --- docs/reference.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/reference.md b/docs/reference.md index 6fbc5dd..ec95b7d 100755 --- a/docs/reference.md +++ b/docs/reference.md @@ -2029,6 +2029,8 @@ returns generic null on success Supported options: - `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB. +- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. > :warning: The Apache ORC file format is [less](https://arrow.apache.org/docs/cpp/orc.html) fully featured compared to Parquet and consequently the ORC dataloader currently **does not support unsigned datatypes**. 
From 59e7becaec61dd658ed90100a9b3f2e10b83784b Mon Sep 17 00:00:00 2001 From: Slava Grechin <122450037+vgrechin-kx@users.noreply.github.com> Date: Thu, 23 Mar 2023 10:26:56 +0300 Subject: [PATCH 51/71] Update docs/reference.md Co-authored-by: nmcdonnell-kx <63713601+nmcdonnell-kx@users.noreply.github.com> --- docs/reference.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/reference.md b/docs/reference.md index ec95b7d..f58f5d2 100755 --- a/docs/reference.md +++ b/docs/reference.md @@ -2088,6 +2088,10 @@ returns the array data Supported options: - `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB. +- `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. +- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0. 
```q q)i8_fd:.arrowkdb.fd.field[`int8;.arrowkdb.dt.int8[]]; From 13514046799987748d131858366320dafa6d79d2 Mon Sep 17 00:00:00 2001 From: Slava Grechin <122450037+vgrechin-kx@users.noreply.github.com> Date: Thu, 23 Mar 2023 10:27:08 +0300 Subject: [PATCH 52/71] Update docs/reference.md Co-authored-by: nmcdonnell-kx <63713601+nmcdonnell-kx@users.noreply.github.com> --- docs/reference.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/reference.md b/docs/reference.md index f58f5d2..007d074 100755 --- a/docs/reference.md +++ b/docs/reference.md @@ -1996,6 +1996,8 @@ The mixed list of Arrow array data should be ordered in schema field number and Supported options: - `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB. +- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. > :warning: The Apache ORC file format is [less](https://arrow.apache.org/docs/cpp/orc.html) fully featured compared to Parquet and consequently the ORC dataloader currently **does not support unsigned datatypes**. 
From 680cf2d1b0fe21e2f7cb9daa2ed0c88d94abcbd8 Mon Sep 17 00:00:00 2001 From: Slava Grechin <122450037+vgrechin-kx@users.noreply.github.com> Date: Thu, 23 Mar 2023 10:27:24 +0300 Subject: [PATCH 53/71] Update docs/reference.md Co-authored-by: nmcdonnell-kx <63713601+nmcdonnell-kx@users.noreply.github.com> --- docs/reference.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/reference.md b/docs/reference.md index 007d074..1134232 100755 --- a/docs/reference.md +++ b/docs/reference.md @@ -2129,6 +2129,10 @@ Each schema field name is used as the column name and the Arrow array data is us Supported options: - `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB. +- `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. +- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0. 
```q q)table:([] i8_fd:5?0x64; i16_fd:5?100h; i32_fd:5?100i) From c1d4f54574af7e9e670dc5e4e9d6211ab98245ec Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 23 Mar 2023 07:54:19 +0000 Subject: [PATCH 54/71] Rearranging ORC chapter to the end of manual [ci skip] --- docs/reference.md | 342 +++++++++++++++++++++++----------------------- 1 file changed, 171 insertions(+), 171 deletions(-) diff --git a/docs/reference.md b/docs/reference.md index 1134232..5f78895 100755 --- a/docs/reference.md +++ b/docs/reference.md @@ -1971,177 +1971,6 @@ str_field: ] ``` -## Apache ORC files - -### `orc.writeOrc` - -*Convert a kdb+ mixed list of array data to an Arrow table and write to an Apache ORC file* -```txt -.arrowkdb.orc.writeOrc[orc_file;schema_id;array_data;options] -``` - -Where: - -- `orc_file` is a string containing the ORC file name -- `schema_id` is the schema identifier to use for the table -- `array_data` is a mixed list of array data -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. - -returns generic null on success - -> :warning: The Apache ORC file format is not supported on Windows platform and will return with the message **ORC files are not supported on Windows**. - -The mixed list of Arrow array data should be ordered in schema field number and each list item representing one of the arrays must be structured according to the field’s datatype. - -Supported options: - -- `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB. -- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. 
- -> :warning: The Apache ORC file format is [less](https://arrow.apache.org/docs/cpp/orc.html) fully featured compared to Parquet and consequently the ORC dataloader currently **does not support unsigned datatypes**. - -```q -q)i8_fd:.arrowkdb.fd.field[`int8;.arrowkdb.dt.int8[]]; -q)i16_fd:.arrowkdb.fd.field[`int16;.arrowkdb.dt.int16[]]; -q)i32_fd:.arrowkdb.fd.field[`int32;.arrowkdb.dt.int32[]]; -q)orc_schema:.arrowkdb.sc.schema[(i8_fd,i16_fd,i32_fd)]; -q)orc_data:(5?0x64;5?100h;5?100i); -q).arrowkdb.orc.writeOrc["dataloader.orc";orc_schema;orc_data;options] -``` - -## `orc.writeOrcFromTable` - -*Convert a kdb+ table to an Arrow table and write to an Apache ORC file, inferring the schema from the kdb+ table structure* - -```txt -.arrowkdb.orc.writeOrcFromTable[orc_file;table;options] -``` - -Where: - -- `orc_file` is a string containing the ORC file name -- `table` is a kdb+ table -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. - -returns generic null on success - -> :warning: The Apache ORC file format is not supported on Windows platform and will return with the message **ORC files are not supported on Windows**. - -Supported options: - -- `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB. -- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. - -> :warning: The Apache ORC file format is [less](https://arrow.apache.org/docs/cpp/orc.html) fully featured compared to Parquet and consequently the ORC dataloader currently **does not support unsigned datatypes**. 
- -> :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors** -> -> Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as as described [here](#inferred-datatypes). - -```q -q)table:([] i8_fd:5?0x64; i16_fd:5?100h; i32_fd:5?100i) -q).arrowkdb.orc.writeOrcFromTable["dataloader.orc";table;::] -``` - -### `orc.readOrcSchema` - -*Read the schema from an Apache ORC file* - -```txt -.arrowkdb.orc.readOrcSchema[orc_file] -``` - -Where `orc_file` is a string containing the ORC file name - -returns the schema identifier - -> :warning: The Apache ORC file format is not supported on Windows platform and will return with the message **ORC files are not supported on Windows**. - -```q -q)i8_fd:.arrowkdb.fd.field[`int8;.arrowkdb.dt.int8[]]; -q)i16_fd:.arrowkdb.fd.field[`int16;.arrowkdb.dt.int16[]]; -q)i32_fd:.arrowkdb.fd.field[`int32;.arrowkdb.dt.int32[]]; -q)orc_schema:.arrowkdb.sc.schema[(i8_fd,i16_fd,i32_fd)]; -q)orc_data:(5?0x64;5?100h;5?100i); -q).arrowkdb.orc.writeOrc["dataloader.orc";orc_schema;orc_data;options] -q).arrowkdb.sc.equalSchemas[schema;.arrowkdb.orc.readOrcSchema["dataloader.orc"]] -1b -``` - -### `orc.readOrcData` - -*Read an Arrow table from an Apache ORC file and convert to a kdb+ mixed list of array data* - -```txt -.arrowkdb.orc.readOrcData[orc_file;options] -``` - -Where: - -- `orc_file` is a string containing the ORC file name -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. - -returns the array data - -> :warning: The Apache ORC file format is not supported on Windows platform and will return with the message **ORC files are not supported on Windows**. 
- -Supported options: - -- `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB. -- `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. -- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0. - -```q -q)i8_fd:.arrowkdb.fd.field[`int8;.arrowkdb.dt.int8[]]; -q)i16_fd:.arrowkdb.fd.field[`int16;.arrowkdb.dt.int16[]]; -q)i32_fd:.arrowkdb.fd.field[`int32;.arrowkdb.dt.int32[]]; -q)orc_schema:.arrowkdb.sc.schema[(i8_fd,i16_fd,i32_fd)]; -q)orc_data:(5?0x64;5?100h;5?100i); -q).arrowkdb.orc.writeOrc["dataloader.orc";orc_schema;orc_data;options] -q)read_data:.arrowkdb.orc.readOrcData["dataloader.orc";::] -q)orc_data~read_data -1b -``` - -### `orc.readOrcToTable` - -*Read an Arrow table from an Apache ORC file and convert to a kdb+ table* - -```txt -.arrowkdb.orc.readOrcToTable[orc_file;options] -``` - -Where: - -- `orc_file` is a string containing the ORC file name -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. - -returns the kdb+ table - -> :warning: The Apache ORC file format is not supported on Windows platform and will return with the message **ORC files are not supported on Windows**. - -Each schema field name is used as the column name and the Arrow array data is used as the column data. 
- -Supported options: - -- `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB. -- `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. -- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0. - -```q -q)table:([] i8_fd:5?0x64; i16_fd:5?100h; i32_fd:5?100i) -q).arrowkdb.orc.writeOrcFromTable["dataloader.orc";table;::] -q)read_table:.arrowkdb.orc.readOrcToTable["dataloader.orc";::] -q)read_table~table -1b -``` - ## Parquet files ### `pq.writeParquet` @@ -2781,6 +2610,177 @@ q)new_table~table 1b ``` +## Apache ORC files + +### `orc.writeOrc` + +*Convert a kdb+ mixed list of array data to an Arrow table and write to an Apache ORC file* +```txt +.arrowkdb.orc.writeOrc[orc_file;schema_id;array_data;options] +``` + +Where: + +- `orc_file` is a string containing the ORC file name +- `schema_id` is the schema identifier to use for the table +- `array_data` is a mixed list of array data +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. + +returns generic null on success + +> :warning: The Apache ORC file format is not supported on Windows platform and will return with the message **ORC files are not supported on Windows**. 
+ +The mixed list of Arrow array data should be ordered in schema field number and each list item representing one of the arrays must be structured according to the field’s datatype. + +Supported options: + +- `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB. +- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. + +> :warning: The Apache ORC file format is [less](https://arrow.apache.org/docs/cpp/orc.html) fully featured compared to Parquet and consequently the ORC dataloader currently **does not support unsigned datatypes**. + +```q +q)i8_fd:.arrowkdb.fd.field[`int8;.arrowkdb.dt.int8[]]; +q)i16_fd:.arrowkdb.fd.field[`int16;.arrowkdb.dt.int16[]]; +q)i32_fd:.arrowkdb.fd.field[`int32;.arrowkdb.dt.int32[]]; +q)orc_schema:.arrowkdb.sc.schema[(i8_fd,i16_fd,i32_fd)]; +q)orc_data:(5?0x64;5?100h;5?100i); +q).arrowkdb.orc.writeOrc["dataloader.orc";orc_schema;orc_data;options] +``` + +## `orc.writeOrcFromTable` + +*Convert a kdb+ table to an Arrow table and write to an Apache ORC file, inferring the schema from the kdb+ table structure* + +```txt +.arrowkdb.orc.writeOrcFromTable[orc_file;table;options] +``` + +Where: + +- `orc_file` is a string containing the ORC file name +- `table` is a kdb+ table +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. + +returns generic null on success + +> :warning: The Apache ORC file format is not supported on Windows platform and will return with the message **ORC files are not supported on Windows**. 
+
+Supported options:
+
+- `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB.
+- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.
+- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details.
+
+> :warning: The Apache ORC file format is [less](https://arrow.apache.org/docs/cpp/orc.html) fully featured compared to Parquet and consequently the ORC dataloader currently **does not support unsigned datatypes**.
+
+> :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors**
+>
+> Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as described [here](#inferred-datatypes).
+
+```q
+q)table:([] i8_fd:5?0x64; i16_fd:5?100h; i32_fd:5?100i)
+q).arrowkdb.orc.writeOrcFromTable["dataloader.orc";table;::]
+```
+
+### `orc.readOrcSchema`
+
+*Read the schema from an Apache ORC file*
+
+```txt
+.arrowkdb.orc.readOrcSchema[orc_file]
+```
+
+Where `orc_file` is a string containing the ORC file name
+
+returns the schema identifier
+
+> :warning: The Apache ORC file format is not supported on Windows platform and will return with the message **ORC files are not supported on Windows**.
+
+```q
+q)i8_fd:.arrowkdb.fd.field[`int8;.arrowkdb.dt.int8[]];
+q)i16_fd:.arrowkdb.fd.field[`int16;.arrowkdb.dt.int16[]];
+q)i32_fd:.arrowkdb.fd.field[`int32;.arrowkdb.dt.int32[]];
+q)orc_schema:.arrowkdb.sc.schema[(i8_fd,i16_fd,i32_fd)];
+q)orc_data:(5?0x64;5?100h;5?100i);
+q).arrowkdb.orc.writeOrc["dataloader.orc";orc_schema;orc_data;options]
+q).arrowkdb.sc.equalSchemas[schema;.arrowkdb.orc.readOrcSchema["dataloader.orc"]]
+1b
+```
+
+### `orc.readOrcData`
+
+*Read an Arrow table from an Apache ORC file and convert to a kdb+ mixed list of array data*
+
+```txt
+.arrowkdb.orc.readOrcData[orc_file;options]
+```
+
+Where:
+
+- `orc_file` is a string containing the ORC file name
+- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`.
+
+returns the array data
+
+> :warning: The Apache ORC file format is not supported on Windows platform and will return with the message **ORC files are not supported on Windows**.
+
+Supported options:
+
+- `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB.
+- `USE_MMAP` - Flag indicating whether the ORC file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0.
+- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.
+- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details.
+- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0.
+
+```q
+q)i8_fd:.arrowkdb.fd.field[`int8;.arrowkdb.dt.int8[]];
+q)i16_fd:.arrowkdb.fd.field[`int16;.arrowkdb.dt.int16[]];
+q)i32_fd:.arrowkdb.fd.field[`int32;.arrowkdb.dt.int32[]];
+q)orc_schema:.arrowkdb.sc.schema[(i8_fd,i16_fd,i32_fd)];
+q)orc_data:(5?0x64;5?100h;5?100i);
+q).arrowkdb.orc.writeOrc["dataloader.orc";orc_schema;orc_data;options]
+q)read_data:.arrowkdb.orc.readOrcData["dataloader.orc";::]
+q)orc_data~read_data
+1b
+```
+
+### `orc.readOrcToTable`
+
+*Read an Arrow table from an Apache ORC file and convert to a kdb+ table*
+
+```txt
+.arrowkdb.orc.readOrcToTable[orc_file;options]
+```
+
+Where:
+
+- `orc_file` is a string containing the ORC file name
+- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`.
+
+returns the kdb+ table
+
+> :warning: The Apache ORC file format is not supported on Windows platform and will return with the message **ORC files are not supported on Windows**.
+
+Each schema field name is used as the column name and the Arrow array data is used as the column data.
+
+Supported options:
+
+- `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB.
+- `USE_MMAP` - Flag indicating whether the ORC file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0.
+- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.
+- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details.
+- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0.
+ +```q +q)table:([] i8_fd:5?0x64; i16_fd:5?100h; i32_fd:5?100i) +q).arrowkdb.orc.writeOrcFromTable["dataloader.orc";table;::] +q)read_table:.arrowkdb.orc.readOrcToTable["dataloader.orc";::] +q)read_table~table +1b +``` + ## Utilities ### `util.buildInfo` From 318af00dceb0d5bd2b363a328fbef573b03c1059 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 23 Mar 2023 08:10:06 +0000 Subject: [PATCH 55/71] Include ORC files into contents [ci skip] --- docs/reference.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/reference.md b/docs/reference.md index 5f78895..6b6b8e4 100755 --- a/docs/reference.md +++ b/docs/reference.md @@ -108,6 +108,12 @@ object | use [`ipc.parseArrowSchema`](#ipcparsearrowschema) | Parse the schema from an Arrow stream [`ipc.parseArrowData`](#ipcparsearrowdata) | Parse an Arrow table from an Arrow stream and convert to a kdb+ mixed list of array data [`ipc.parseArrowToTable`](#ipcparsearrowtotable) | Parse an Arrow table from an Arrow file and convert to a kdb+ table +
**[Apache ORC files](#apache-orc-files)** +[`orc.writeOrc`](#orcwriteorc) | Convert a kdb+ mixed list of array data to an Arrow table and write to an Apache ORC file +[`orc.writeOrcFromTable`](#orcwriteorcfromtable) | Convert a kdb+ table to an Arrow table and write to an Apache ORC file, inferring the schema from the kdb+ table structure +[`orc.readOrcSchema`](#orcreadorcschema) | Read the schema from an Apache ORC file +[`orc.readOrcData`](#orcreadorcdata) | Read an Arrow table from an Apache ORC file and convert to a kdb+ mixed list of array data +[`orc.readOrcToTable`](#orcreadorctotable) | Read an Arrow table from an Apache ORC file and convert to a kdb+ table
**[Utilities](#utilities)** [`util.buildInfo`](#utilbuildinfo) | Return build information regarding the in use Arrow library From e73285805d70f21abf96ba35e7ba09b092c5d7b4 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 23 Mar 2023 11:37:12 +0300 Subject: [PATCH 56/71] Rearranging example, moving ORC next to EOF [ci skip] --- examples/orc_dataloader.q | 46 +++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/examples/orc_dataloader.q b/examples/orc_dataloader.q index 97cdffb..8a58707 100644 --- a/examples/orc_dataloader.q +++ b/examples/orc_dataloader.q @@ -79,28 +79,7 @@ show dataloader_data~parquet_dataloader_data rm parquet_dataloader; //---------------------------// -// Example-2. Apache ORC file// -//---------------------------// - -// Write the schema and array data to a ORC file -orc_options[`ORC_CHUNK_SIZE]:1024 - -orc_dataloader:"orc_dataloader.orc" -.arrowkdb.orc.writeOrc[orc_dataloader;dataloader_schema;dataloader_data;orc_options] -show orc_dataloader; - -// Read the schema back and compare -orc_dataloader_schema:.arrowkdb.orc.readOrcSchema[orc_dataloader]; -show .arrowkdb.sc.equalSchemas[dataloader_schema;orc_dataloader_schema] -show dataloader_schema~orc_dataloader_schema - -// Read the array data back and compare -orc_dataloader_data:.arrowkdb.orc.readOrcData[orc_dataloader;orc_options]; -show dataloader_data~orc_dataloader_data -rm orc_dataloader; - -//---------------------------// -// Example-3. Arrow IPC file // +// Example-2. Arrow IPC file // //---------------------------// // Write the schema and array data to an arrow file @@ -119,7 +98,7 @@ show dataloader_data~arrow_dataloader_data rm arrow_dataloader; //-----------------------------// -// Example-4. Arrow IPC stream // +// Example-3. 
Arrow IPC stream // //-----------------------------// // Serialize the schema and array data to an arrow stream @@ -135,6 +114,27 @@ show dataloader_schema~stream_dataloader_schema stream_dataloader_data:.arrowkdb.ipc.parseArrowData[serialized_dataloader;orc_options]; show dataloader_data~stream_dataloader_data +//---------------------------// +// Example-4. Apache ORC file// +//---------------------------// + +// Write the schema and array data to a ORC file +orc_options[`ORC_CHUNK_SIZE]:1024 + +orc_dataloader:"orc_dataloader.orc" +.arrowkdb.orc.writeOrc[orc_dataloader;dataloader_schema;dataloader_data;orc_options] +show orc_dataloader; + +// Read the schema back and compare +orc_dataloader_schema:.arrowkdb.orc.readOrcSchema[orc_dataloader]; +show .arrowkdb.sc.equalSchemas[dataloader_schema;orc_dataloader_schema] +show dataloader_schema~orc_dataloader_schema + +// Read the array data back and compare +orc_dataloader_data:.arrowkdb.orc.readOrcData[orc_dataloader;orc_options]; +show dataloader_data~orc_dataloader_data +rm orc_dataloader; + -1 "\n+----------------------------------------+\n"; From a3382199d0bd22988410afb549e6efbf31faab22 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 23 Mar 2023 11:35:30 +0300 Subject: [PATCH 57/71] Simplifying ORC test - eliminating Arrow files writing --- tests/orc_dataloader/orc_dataloader.t | 44 +-------------------------- 1 file changed, 1 insertion(+), 43 deletions(-) diff --git a/tests/orc_dataloader/orc_dataloader.t b/tests/orc_dataloader/orc_dataloader.t index 5ea03f4..7b0f381 100644 --- a/tests/orc_dataloader/orc_dataloader.t +++ b/tests/orc_dataloader/orc_dataloader.t @@ -39,24 +39,8 @@ i64_data:N?100; -1"\n+----------|| Combine the data for all columns ||----------+\n"; dataloader_data:(ts_data;i8_data;i16_data;i32_data;i64_data); --1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; -orc_options:(``PARQUET_VERSION)!((::);`V2.0); - 
-parquet_dataloader:"orc_dataloader.parquet"; -.arrowkdb.pq.writeParquet[parquet_dataloader;dataloader_schema;dataloader_data;orc_options]; - --1"\n+----------|| Read the schema back and compare ||----------+\n"; -parquet_dataloader_schema:.arrowkdb.pq.readParquetSchema[parquet_dataloader]; -.arrowkdb.sc.equalSchemas[dataloader_schema;parquet_dataloader_schema] -dataloader_schema~parquet_dataloader_schema - --1"\n+----------|| Read the array data back and compare ||----------+\n"; -parquet_dataloader_data:.arrowkdb.pq.readParquetData[parquet_dataloader;orc_options]; -dataloader_data~parquet_dataloader_data -rm parquet_dataloader; - -1"\n+----------|| Write the schema and array data to a ORC file ||----------+\n"; -orc_options[`ORC_CHUNK_SIZE]:1024 +orc_options:(``ORC_CHUNK_SIZE)!((::);1024); orc_dataloader:"orc_dataloader.orc" .arrowkdb.orc.writeOrc[orc_dataloader;dataloader_schema;dataloader_data;orc_options] @@ -71,32 +55,6 @@ orc_dataloader_data:.arrowkdb.orc.readOrcData[orc_dataloader;orc_options]; dataloader_data~orc_dataloader_data rm orc_dataloader; --1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; -arrow_dataloader:"orc_dataloader.arrow"; -.arrowkdb.ipc.writeArrow[arrow_dataloader;dataloader_schema;dataloader_data;orc_options]; - --1"\n+----------|| Read the schema back and compare ||----------+\n"; -arrow_dataloader_schema:.arrowkdb.ipc.readArrowSchema[arrow_dataloader]; -.arrowkdb.sc.equalSchemas[dataloader_schema;arrow_dataloader_schema] -dataloader_schema~arrow_dataloader_schema - --1"\n+----------|| Read the array data back and compare ||----------+\n"; -arrow_dataloader_data:.arrowkdb.ipc.readArrowData[arrow_dataloader;orc_options]; -dataloader_data~arrow_dataloader_data -rm arrow_dataloader; - --1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; -serialized_dataloader:.arrowkdb.ipc.serializeArrow[dataloader_schema;dataloader_data;orc_options]; - --1"\n+----------|| Parse the 
schema back abd compare ||----------+\n"; -stream_dataloader_schema:.arrowkdb.ipc.parseArrowSchema[serialized_dataloader]; -.arrowkdb.sc.equalSchemas[dataloader_schema;stream_dataloader_schema] -dataloader_schema~stream_dataloader_schema - --1"\n+----------|| Parse the array data back and compare ||----------+\n"; -stream_dataloader_data:.arrowkdb.ipc.parseArrowData[serialized_dataloader;orc_options]; -dataloader_data~stream_dataloader_data - -1 "\n+----------|| Test utils ||----------+\n"; From e0c2a307e30ce9b0514b2b8584a3e1e984e03ef4 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 29 Mar 2023 12:16:59 +0000 Subject: [PATCH 58/71] Cleaning up ORC example out of other formats writing --- examples/orc_dataloader.q | 61 ++------------------------------------- 1 file changed, 2 insertions(+), 59 deletions(-) diff --git a/examples/orc_dataloader.q b/examples/orc_dataloader.q index 8a58707..8ed9054 100644 --- a/examples/orc_dataloader.q +++ b/examples/orc_dataloader.q @@ -57,69 +57,12 @@ dataloader_data:(ts_data;i8_data;i16_data;i32_data;i64_data); // Pretty print the Arrow table populated from the array data .arrowkdb.tb.prettyPrintTable[dataloader_schema;dataloader_data;::]; -//-------------------------// -// Example-1. 
Parquet file // -//-------------------------// - -// Write the schema and array data to a parquet file -orc_options:(``PARQUET_VERSION)!((::);`V2.0); - -parquet_dataloader:"orc_dataloader.parquet"; -.arrowkdb.pq.writeParquet[parquet_dataloader;dataloader_schema;dataloader_data;orc_options]; -show ls parquet_dataloader - -// Read the schema back and compare -parquet_dataloader_schema:.arrowkdb.pq.readParquetSchema[parquet_dataloader]; -show .arrowkdb.sc.equalSchemas[dataloader_schema;parquet_dataloader_schema] -show dataloader_schema~parquet_dataloader_schema - -// Read the array data back and compare -parquet_dataloader_data:.arrowkdb.pq.readParquetData[parquet_dataloader;orc_options]; -show dataloader_data~parquet_dataloader_data -rm parquet_dataloader; - -//---------------------------// -// Example-2. Arrow IPC file // -//---------------------------// - -// Write the schema and array data to an arrow file -arrow_dataloader:"orc_dataloader.arrow"; -.arrowkdb.ipc.writeArrow[arrow_dataloader;dataloader_schema;dataloader_data;orc_options]; -show ls arrow_dataloader - -// Read the schema back and compare -arrow_dataloader_schema:.arrowkdb.ipc.readArrowSchema[arrow_dataloader]; -show .arrowkdb.sc.equalSchemas[dataloader_schema;arrow_dataloader_schema] -show dataloader_schema~arrow_dataloader_schema - -// Read the array data back and compare -arrow_dataloader_data:.arrowkdb.ipc.readArrowData[arrow_dataloader;orc_options]; -show dataloader_data~arrow_dataloader_data -rm arrow_dataloader; - -//-----------------------------// -// Example-3. 
Arrow IPC stream // -//-----------------------------// - -// Serialize the schema and array data to an arrow stream -serialized_dataloader:.arrowkdb.ipc.serializeArrow[dataloader_schema;dataloader_data;orc_options]; -show serialized_dataloader - -// Parse the schema back abd compare -stream_dataloader_schema:.arrowkdb.ipc.parseArrowSchema[serialized_dataloader]; -show .arrowkdb.sc.equalSchemas[dataloader_schema;stream_dataloader_schema] -show dataloader_schema~stream_dataloader_schema - -// Parse the array data back and compare -stream_dataloader_data:.arrowkdb.ipc.parseArrowData[serialized_dataloader;orc_options]; -show dataloader_data~stream_dataloader_data - //---------------------------// -// Example-4. Apache ORC file// +// Example-1. Apache ORC file// //---------------------------// // Write the schema and array data to a ORC file -orc_options[`ORC_CHUNK_SIZE]:1024 +orc_options:(``ORC_CHUNK_SIZE)!((::);1024); orc_dataloader:"orc_dataloader.orc" .arrowkdb.orc.writeOrc[orc_dataloader;dataloader_schema;dataloader_data;orc_options] From 8c83feac1d3bcbf8e8143d1b7b6b7dae05bf4c08 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 29 Mar 2023 12:37:05 +0000 Subject: [PATCH 59/71] Review changes, error handling --- src/TableData.cpp | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/src/TableData.cpp b/src/TableData.cpp index 333f7df..0e34b97 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -949,28 +949,11 @@ K writeORC(K orc_file, K schema_id, K array_data, K options) // Create the arrow table auto table = MakeTable(schema, array_data, type_overrides); - std::string reason; - auto writeStatus = writer->Write( *table ); - if( writeStatus != writeStatus.OK() ){ - reason = std::string( "Failed to write ORC file, name: " ) - .append( path ) - .append( ", reason: " ) - .append( writeStatus.ToString() ); - } - - auto closeStatus = writer->Close(); - if( closeStatus != closeStatus.OK() ){ - reason = std::string( 
"Failed to close ORC file, name: " ) - .append( path ) - .append( ", reason: " ) - .append( closeStatus.ToString() ); - } + PARQUET_THROW_NOT_OK( writer->Write( *table ) ); - K result = reason.empty() - ? ( K )0 - : knk( 2, ks( S( "error" ) ), ks( S( reason.c_str() ) ) ); + PARQUET_THROW_NOT_OK( writer->Close() ); - return result; + return ( K )0; #endif KDB_EXCEPTION_CATCH; From f9a706c6d66f8b142b31b28a1ae3b7a800a55bc8 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 29 Mar 2023 12:39:14 +0000 Subject: [PATCH 60/71] Review changes, docs update --- docs/reference.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/reference.md b/docs/reference.md index 6b6b8e4..66869f6 100755 --- a/docs/reference.md +++ b/docs/reference.md @@ -2735,7 +2735,7 @@ returns the array data Supported options: - `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB. -- `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. +- `USE_MMAP` - Flag indicating whether the ORC file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. - `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. - `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0. @@ -2774,7 +2774,7 @@ Each schema field name is used as the column name and the Arrow array data is us Supported options: - `ORC_CHUNK_SIZE` - Controls the approximate size of ORC data stripes within a column. Long, default 1MB. 
-- `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. +- `USE_MMAP` - Flag indicating whether the ORC file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. - `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. - `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0. From 064e397f26ea6194cd03d6cfa41d8ac21d72227a Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 29 Mar 2023 12:39:22 +0000 Subject: [PATCH 61/71] Setting Up arrowkdb on RHEL 7: C++ standard depends on version of Arrow --- CMakeLists.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 33bfada..5679b99 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,8 +6,9 @@ cmake_minimum_required(VERSION 3.1.3) project(arrowkdb CXX) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -DKXVER=3") -set(CMAKE_CXX_STANDARD 14) -IF(APPLE) +if(ARROW_SO_VERSION LESS "10.0") + set(CMAKE_CXX_STANDARD 14) +else() set(CMAKE_CXX_STANDARD 17) endif() set(CMAKE_CXX_STANDARD_REQUIRED ON) From ecf262a455d7936ff07a3f30f2513a4032bca112 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 23 Mar 2023 20:28:39 +0300 Subject: [PATCH 62/71] Unit-test of supporting nulls in ORC files --- examples/orc_null_support.q | 273 ++++++++++++++++++++++++++++++++++++ 1 file changed, 273 insertions(+) create mode 100644 examples/orc_null_support.q diff --git a/examples/orc_null_support.q b/examples/orc_null_support.q new file mode 100644 index 
0000000..ff7000b --- /dev/null +++ b/examples/orc_null_support.q @@ -0,0 +1,273 @@ +// orc_null_support.q +// Examples of creating a schema supporting null mapping and using it to read/write +// Apache ORC file with exposing null bitmap as a separate structure to kdb + +-1"\n+----------|| orc_null_support.q ||----------+\n"; + +// import the arrowkdb library +\l q/arrowkdb.q + +// Filesystem functions for Linux/MacOS/Windows +ls:{[filename] $[.z.o like "w*";system "dir /b ",filename;system "ls ",filename]}; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +///////////////////////// +// CONSTRUCTED SCHEMAS // +///////////////////////// + +//-------------------// +// Create the schema // +//-------------------// + +// Support null mapping in Apache ORC +int_opts:(`bool`int8`int16`int32`int64)!(1b;0x02;3h;4i;5); +float_opts:(`float32`float64`decimal)!(9.87e;6.54;3.21f); +cont_opts:(`utf8`binary)!("start";"x"$"alert"); +time_opts:(`date32`timestamp)!(2012.11.10;2011.01.01D00:00:00.000000000); + +compound_options:(``NULL_MAPPING)!((::);int_opts,float_opts,cont_opts,time_opts); + +// Create the datatype identifiers +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +i8_dt:.arrowkdb.dt.int8[]; +i16_dt:.arrowkdb.dt.int16[]; +i32_dt:.arrowkdb.dt.int32[]; +i64_dt:.arrowkdb.dt.int64[]; +f64_dt:.arrowkdb.dt.float64[]; + +str_dt:.arrowkdb.dt.utf8[]; +bin_dt:.arrowkdb.dt.binary[]; +dec_dt:.arrowkdb.dt.decimal128[38i;2i]; + +bool_dt:.arrowkdb.dt.boolean[]; +f32_dt:.arrowkdb.dt.float32[]; +d32_dt:.arrowkdb.dt.date32[]; + +// Create the field identifiers +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +i8_fd:.arrowkdb.fd.field[`int8;i8_dt]; +i16_fd:.arrowkdb.fd.field[`int16;i16_dt]; +i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; + +str_fd:.arrowkdb.fd.field[`string;str_dt]; +bin_fd:.arrowkdb.fd.field[`binary;bin_dt]; +dec_fd:.arrowkdb.fd.field[`decimal;dec_dt]; + 
+bool_fd:.arrowkdb.fd.field[`bool;bool_dt]; +f32_fd:.arrowkdb.fd.field[`float32;f32_dt]; +d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; + +numeric_schema:.arrowkdb.sc.schema[(ts_fd, i16_fd, i32_fd, i64_fd, f64_fd)]; +contiguous_schema:.arrowkdb.sc.schema[(str_fd, bin_fd, dec_fd)]; + +// Create a field containing the list datatype +list_dt:.arrowkdb.dt.list[i8_fd]; +list_fd:.arrowkdb.fd.field[`list_field;list_dt]; + +// Create a field containing the struct datatype +struct_dt:.arrowkdb.dt.struct[(bool_fd,f32_fd,d32_fd)]; +struct_fd:.arrowkdb.fd.field[`struct_field;struct_dt]; + +// Create fields containing the map datatype +map_dt:.arrowkdb.dt.map[i64_dt;f64_dt] +map_fd:.arrowkdb.fd.field[`map;map_dt]; + +// Create the schema containing the list and struct fields +compound_schema:.arrowkdb.sc.schema[(list_fd,struct_fd,map_fd)]; + +// Print the schema +-1"\nNumeric schema:"; +.arrowkdb.sc.printSchema[numeric_schema]; + +-1"\nContiguous schema:"; +.arrowkdb.sc.printSchema[contiguous_schema]; + +-1"\nCompound schema:"; +.arrowkdb.sc.printSchema[compound_schema]; + +// Number of items in each array +N:5 + +// Create data for each column in the table +ts_data:asc N?0p; + +i16_data:N?100h; +i16_data[0]:3h; +i32_data:N?100i; +i32_data[1]:4i; +i64_data:N?100; +i64_data[2]:5; +f64_data:N?100f; +f64_data[3]:6.54f; + +str_data:N?("start";"stop";"alert";"acknowledge";""); +str_data[0]:"start" +bin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +bin_data[1]:"x"$"alert" +dec_data:{"F"$.Q.f[2]x} each N?(10f) +dec_data[2]:3.21f + +N:3 +bool_data:N?(0b;1b); +bool_data[0]:1b; +f32_data:N?100e; +f32_data[1]:9.87e; +d32_data:N?(2012.11.10;2010.07.18;2011.07.16;2014.07.15;2016.07.11); +d32_data[2]:2012.11.10; + +// Combine the data for numeric columns +numeric_data:(ts_data;i16_data;i32_data;i64_data;f64_data); +// Combine the data for contiguous columns +contiguous_data:(str_data;bin_data;dec_data); + +// Combine the array data for the list and struct columns 
+list_array:(enlist 0x00;(0x0102);(0x030405)); +struct_array:(bool_data;f32_data;d32_data); +map_array:((enlist 1)!(enlist 1.23);(2 2)!(4.56 7.89);(3 3 3)!(9.87 6.54 3.21)) +compound_data:(list_array;struct_array;map_array); + +// Pretty print the Arrow table populated from the numeric data +compound_options[`DECIMAL128_AS_DOUBLE]:1 + +-1"\nNumeric table:"; +.arrowkdb.tb.prettyPrintTable[numeric_schema;numeric_data;compound_options]; + +// Show the string data as an arrow table +-1"\nContiguous table:"; +.arrowkdb.tb.prettyPrintTable[contiguous_schema;contiguous_data;compound_options] + +// Show the list data as an arrow table +-1"\nCompound table:"; +.arrowkdb.tb.prettyPrintTable[compound_schema;compound_data;compound_options] + +//-------------------------// +// Example-1. Arrow IPC file // +//-------------------------// + +// Write the schema and array data to a arrow file +arrow_numeric:"numeric_bitmap.arrow"; +arrow_contiguous:"contiguous_bitmap.arrow"; +arrow_compound:"compound_bitmap.arrow"; + +.arrowkdb.ipc.writeArrow[arrow_numeric;numeric_schema;numeric_data;compound_options]; +.arrowkdb.ipc.writeArrow[arrow_contiguous;contiguous_schema;contiguous_data;compound_options]; +.arrowkdb.ipc.writeArrow[arrow_compound;compound_schema;compound_data;compound_options]; + +show ls arrow_numeric +show ls arrow_contiguous +show ls arrow_compound + +// Read the schema back and compare +compound_options[`WITH_NULL_BITMAP]:1; + +arrow_numeric_schema:.arrowkdb.ipc.readArrowSchema[arrow_numeric]; +arrow_contiguous_schema:.arrowkdb.ipc.readArrowSchema[arrow_contiguous]; +arrow_compound_schema:.arrowkdb.ipc.readArrowSchema[arrow_compound]; + +show .arrowkdb.sc.equalSchemas[numeric_schema;arrow_numeric_schema] +show .arrowkdb.sc.equalSchemas[contiguous_schema;arrow_contiguous_schema] +show .arrowkdb.sc.equalSchemas[compound_schema;arrow_compound_schema] + +show numeric_schema~arrow_numeric_schema +show contiguous_schema~arrow_contiguous_schema +show 
compound_schema~arrow_compound_schema + +// Read the array data back and compare +arrow_numeric_data:.arrowkdb.ipc.readArrowData[arrow_numeric;compound_options]; +arrow_contiguous_data:.arrowkdb.ipc.readArrowData[arrow_contiguous;compound_options]; +arrow_compound_data:.arrowkdb.ipc.readArrowData[arrow_compound;compound_options]; + +show numeric_data~first arrow_numeric_data +show contiguous_data~first arrow_contiguous_data +show compound_data~first arrow_compound_data + +// Compare null bitmaps of arrow data +compound_numeric_nulls:(00000b;10000b;01000b;00100b;00010b); +compound_contiguous_nulls:(10000b;01000b;00100b); +compound_list_nulls:(enlist 0b;01b;000b); +compound_struct_nulls:(100b;010b;001b); +compound_map_nulls:((enlist 0b)!(enlist 0b);00b!00b;000b!010b) + +arrow_numeric_nulls:last arrow_numeric_data; +arrow_contiguous_nulls:last arrow_contiguous_data; +arrow_list_nulls:last[arrow_compound_data][0] +arrow_struct_nulls:last[arrow_compound_data][1] +arrow_map_nulls:last[arrow_compound_data][2] + +show compound_numeric_nulls~compound_numeric_nulls & arrow_numeric_nulls +show compound_contiguous_nulls~compound_contiguous_nulls & arrow_contiguous_nulls +show compound_list_nulls~arrow_list_nulls +show compound_struct_nulls~compound_struct_nulls & arrow_struct_nulls +show compound_map_nulls~arrow_map_nulls + +rm arrow_numeric; +rm arrow_contiguous; +rm arrow_compound; + +//---------------------------// +// Example-2. 
Apache ORC file// +//---------------------------// + +// Write the schema and array data to a ORC file +compound_options[`ORC_CHUNK_SIZE]:1024 + +orc_numeric:"numeric_bitmap.orc"; +orc_contiguous:"contiguous_bitmap.orc"; +orc_compound:"compound_bitmap.orc"; + +.arrowkdb.orc.writeOrc[orc_numeric;numeric_schema;numeric_data;compound_options] +.arrowkdb.orc.writeOrc[orc_contiguous;contiguous_schema;contiguous_data;compound_options] +.arrowkdb.orc.writeOrc[orc_compound;compound_schema;compound_data;compound_options] + +show ls orc_numeric +show ls orc_contiguous +show ls orc_compound + +// Read the schema back and compare +orc_numeric_schema:.arrowkdb.orc.readOrcSchema[orc_numeric]; +orc_contiguous_schema:.arrowkdb.orc.readOrcSchema[orc_contiguous]; +orc_compound_schema:.arrowkdb.orc.readOrcSchema[orc_compound]; + +show .arrowkdb.sc.equalSchemas[numeric_schema;orc_numeric_schema] +show .arrowkdb.sc.equalSchemas[contiguous_schema;orc_contiguous_schema] +show .arrowkdb.sc.equalSchemas[compound_schema;orc_compound_schema] + +show numeric_schema~orc_numeric_schema +show contiguous_schema~orc_contiguous_schema +show compound_schema~orc_compound_schema + +// Read the array data back and compare +orc_numeric_data:.arrowkdb.orc.readOrcData[orc_numeric;compound_options]; +orc_contiguous_data:.arrowkdb.orc.readOrcData[orc_contiguous;compound_options]; +orc_compound_data:.arrowkdb.orc.readOrcData[orc_compound;compound_options]; + +show numeric_data~first orc_numeric_data +show contiguous_data~first orc_contiguous_data +show compound_data~first orc_compound_data + +// Compare null bitmaps of arrow data +orc_numeric_nulls:last orc_numeric_data; +orc_contiguous_nulls:last orc_contiguous_data; +orc_list_nulls:last[orc_compound_data][0] +orc_struct_nulls:last[orc_compound_data][1] +orc_map_nulls:last[orc_compound_data][2] + +show compound_numeric_nulls~compound_numeric_nulls & orc_numeric_nulls +show compound_contiguous_nulls~compound_contiguous_nulls & orc_contiguous_nulls +show 
compound_list_nulls~orc_list_nulls +show compound_struct_nulls~compound_struct_nulls & orc_struct_nulls +show compound_map_nulls~orc_map_nulls + +rm orc_numeric; +rm orc_contiguous; +rm orc_compound; + + +-1 "\n+----------------------------------------+\n"; + +// Process off +exit 0; From c96e0af9f2cdf64aacf3183273ee7b09354b6b8f Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 24 Mar 2023 11:28:44 +0300 Subject: [PATCH 63/71] Supporting nulls in ORC files --- src/TableData.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/TableData.cpp b/src/TableData.cpp index 0e34b97..7302038 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -861,6 +861,20 @@ K readORCData(K orc_file, K options) kK(data)[i] = kx::arrowkdb::ReadChunkedArray(chunked_array, type_overrides); } + int64_t with_null_bitmap = 0; + read_options.GetIntOption( kx::arrowkdb::Options::WITH_NULL_BITMAP, with_null_bitmap ); + if( with_null_bitmap ){ + K bitmap = ktn( 0, col_num ); + for( auto i = 0; i < col_num; ++i ){ + auto chunked_array = table->column( i ); + kK( bitmap )[i] = kx::arrowkdb::ReadChunkedArrayNullBitmap( chunked_array, type_overrides ); + } + K array = data; + data = ktn( 0, 2 ); + kK( data )[0] = array; + kK( data )[1] = bitmap; + } + return data; #endif From d82b98f62f26e7ccda511107ed602492cf87c37e Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 24 Mar 2023 17:07:10 +0300 Subject: [PATCH 64/71] Unit-tests of null support for ORCs --- examples/orc_null_support.q | 30 +++--- tests/orc_dataloader/orc_compound_nulls.t | 104 ++++++++++++++++++++ tests/orc_dataloader/orc_contiguous_nulls.t | 79 +++++++++++++++ tests/orc_dataloader/orc_numeric_nulls.t | 84 ++++++++++++++++ 4 files changed, 282 insertions(+), 15 deletions(-) create mode 100644 tests/orc_dataloader/orc_compound_nulls.t create mode 100644 tests/orc_dataloader/orc_contiguous_nulls.t create mode 100644 tests/orc_dataloader/orc_numeric_nulls.t diff --git 
a/examples/orc_null_support.q b/examples/orc_null_support.q index ff7000b..fe03d38 100644 --- a/examples/orc_null_support.q +++ b/examples/orc_null_support.q @@ -186,11 +186,11 @@ show contiguous_data~first arrow_contiguous_data show compound_data~first arrow_compound_data // Compare null bitmaps of arrow data -compound_numeric_nulls:(00000b;10000b;01000b;00100b;00010b); -compound_contiguous_nulls:(10000b;01000b;00100b); -compound_list_nulls:(enlist 0b;01b;000b); -compound_struct_nulls:(100b;010b;001b); -compound_map_nulls:((enlist 0b)!(enlist 0b);00b!00b;000b!010b) +numeric_nulls:(00000b;10000b;01000b;00100b;00010b); +contiguous_nulls:(10000b;01000b;00100b); +list_nulls:(enlist 0b;01b;000b); +struct_nulls:(100b;010b;001b); +map_nulls:((enlist 0b)!(enlist 0b);00b!00b;000b!010b) arrow_numeric_nulls:last arrow_numeric_data; arrow_contiguous_nulls:last arrow_contiguous_data; @@ -198,11 +198,11 @@ arrow_list_nulls:last[arrow_compound_data][0] arrow_struct_nulls:last[arrow_compound_data][1] arrow_map_nulls:last[arrow_compound_data][2] -show compound_numeric_nulls~compound_numeric_nulls & arrow_numeric_nulls -show compound_contiguous_nulls~compound_contiguous_nulls & arrow_contiguous_nulls -show compound_list_nulls~arrow_list_nulls -show compound_struct_nulls~compound_struct_nulls & arrow_struct_nulls -show compound_map_nulls~arrow_map_nulls +show numeric_nulls~numeric_nulls & arrow_numeric_nulls +show contiguous_nulls~contiguous_nulls & arrow_contiguous_nulls +show list_nulls~arrow_list_nulls +show struct_nulls~struct_nulls & arrow_struct_nulls +show map_nulls~arrow_map_nulls rm arrow_numeric; rm arrow_contiguous; @@ -256,11 +256,11 @@ orc_list_nulls:last[orc_compound_data][0] orc_struct_nulls:last[orc_compound_data][1] orc_map_nulls:last[orc_compound_data][2] -show compound_numeric_nulls~compound_numeric_nulls & orc_numeric_nulls -show compound_contiguous_nulls~compound_contiguous_nulls & orc_contiguous_nulls -show compound_list_nulls~orc_list_nulls -show 
compound_struct_nulls~compound_struct_nulls & orc_struct_nulls -show compound_map_nulls~orc_map_nulls +show numeric_nulls~numeric_nulls & orc_numeric_nulls +show contiguous_nulls~contiguous_nulls & orc_contiguous_nulls +show list_nulls~orc_list_nulls +show struct_nulls~struct_nulls & orc_struct_nulls +show map_nulls~orc_map_nulls rm orc_numeric; rm orc_contiguous; diff --git a/tests/orc_dataloader/orc_compound_nulls.t b/tests/orc_dataloader/orc_compound_nulls.t new file mode 100644 index 0000000..451483d --- /dev/null +++ b/tests/orc_dataloader/orc_compound_nulls.t @@ -0,0 +1,104 @@ +// orc_compound_nulls.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping in Apache ORC ||----------+\n"; +comp_opts:(`bool`int8`int64`float32`float64`date32)!(1b;0x02;5;9.87e;6.54;2012.11.10); + +compound_options:(``NULL_MAPPING)!((::);comp_opts); + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +i8_dt:.arrowkdb.dt.int8[]; + +bool_dt:.arrowkdb.dt.boolean[]; +f32_dt:.arrowkdb.dt.float32[]; +d32_dt:.arrowkdb.dt.date32[]; + +i64_dt:.arrowkdb.dt.int64[]; +f64_dt:.arrowkdb.dt.float64[]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +i8_fd:.arrowkdb.fd.field[`int8;i8_dt]; + +bool_fd:.arrowkdb.fd.field[`bool;bool_dt]; +f32_fd:.arrowkdb.fd.field[`float32;f32_dt]; +d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; + +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; + +-1"\n+----------|| Create a field containing the list datatype ||----------+\n"; +list_dt:.arrowkdb.dt.list[i8_fd]; +list_fd:.arrowkdb.fd.field[`list_field;list_dt]; + +-1"\n+----------|| Create a field containing the struct datatype ||----------+\n"; +struct_dt:.arrowkdb.dt.struct[(bool_fd,f32_fd,d32_fd)]; 
+struct_fd:.arrowkdb.fd.field[`struct_field;struct_dt]; + +-1"\n+----------|| Create fields containing the map datatype ||----------+\n"; +map_dt:.arrowkdb.dt.map[i64_dt;f64_dt] +map_fd:.arrowkdb.fd.field[`map;map_dt]; + +-1"\n+----------|| Create the schema containing the list and struct fields ||----------+\n"; +compound_schema:.arrowkdb.sc.schema[(list_fd,struct_fd,map_fd)]; + +-1"\n+----------|| Number of items in each array ||----------+\n"; +N:3 + +bool_data:N?(0b;1b); +bool_data[0]:1b; +f32_data:N?100e; +f32_data[1]:9.87e; +d32_data:N?(2012.11.10;2010.07.18;2011.07.16;2014.07.15;2016.07.11); +d32_data[2]:2012.11.10; + +-1"\n+----------|| Combine the array data for the list and struct columns ||----------+\n"; +list_array:(enlist 0x00;(0x0102);(0x030405)); +struct_array:(bool_data;f32_data;d32_data); +map_array:((enlist 1)!(enlist 1.23);(2 2)!(4.56 7.89);(3 3 3)!(9.87 6.54 3.21)) +compound_data:(list_array;struct_array;map_array); + +-1"\n+----------|| Write the schema and array data to a ORC file ||----------+\n"; +compound_options[`ORC_CHUNK_SIZE]:1024 + +orc_compound:"compound_bitmap.orc"; +.arrowkdb.orc.writeOrc[orc_compound;compound_schema;compound_data;compound_options] + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +compound_options[`WITH_NULL_BITMAP]:1; + +orc_compound_schema:.arrowkdb.orc.readOrcSchema[orc_compound]; +.arrowkdb.sc.equalSchemas[compound_schema;orc_compound_schema] +compound_schema~orc_compound_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +orc_compound_data:.arrowkdb.orc.readOrcData[orc_compound;compound_options]; +compound_data~first orc_compound_data + +-1"\n+----------|| Compare null bitmaps of arrow data ||----------+\n"; +list_nulls:(enlist 0b;01b;000b); +struct_nulls:(100b;010b;001b); +map_nulls:((enlist 0b)!(enlist 0b);00b!00b;000b!010b) + +orc_list_nulls:last[orc_compound_data][0] +orc_struct_nulls:last[orc_compound_data][1] 
+orc_map_nulls:last[orc_compound_data][2] + +list_nulls~orc_list_nulls +struct_nulls~struct_nulls & orc_struct_nulls +map_nulls~orc_map_nulls + +rm orc_compound; + + +-1 "\n+----------|| Test utils ||----------+\n"; + +.arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/orc_dataloader/orc_contiguous_nulls.t b/tests/orc_dataloader/orc_contiguous_nulls.t new file mode 100644 index 0000000..bcc489f --- /dev/null +++ b/tests/orc_dataloader/orc_contiguous_nulls.t @@ -0,0 +1,79 @@ +// orc_contiguous_nulls.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping in Apache ORC ||----------+\n"; +cont_opts:(`utf8`binary`decimal)!("start";"x"$"alert";3.21f); + +contiguous_options:(``NULL_MAPPING)!((::);cont_opts); + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +str_dt:.arrowkdb.dt.utf8[]; +bin_dt:.arrowkdb.dt.binary[]; +dec_dt:.arrowkdb.dt.decimal128[38i;2i]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +str_fd:.arrowkdb.fd.field[`string;str_dt]; +bin_fd:.arrowkdb.fd.field[`binary;bin_dt]; +dec_fd:.arrowkdb.fd.field[`decimal;dec_dt]; + +contiguous_schema:.arrowkdb.sc.schema[(str_fd, bin_fd, dec_fd)]; + +-1"\n+----------|| Number of items in each array ||----------+\n"; +N:5 + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +str_data:N?("start";"stop";"alert";"acknowledge";""); +str_data[0]:"start" +bin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +bin_data[1]:"x"$"alert" +dec_data:{"F"$.Q.f[2]x} each N?(10f) +dec_data[2]:3.21f + 
+-1"\n+----------|| Combine the data for contiguous columns ||----------+\n"; +contiguous_options[`DECIMAL128_AS_DOUBLE]:1 + +contiguous_data:(str_data;bin_data;dec_data); + +-1"\n+----------|| Write the schema and array data to a ORC file ||----------+\n"; +contiguous_options[`ORC_CHUNK_SIZE]:1024 + +orc_contiguous:"contiguous_bitmap.orc"; +.arrowkdb.orc.writeOrc[orc_contiguous;contiguous_schema;contiguous_data;contiguous_options] + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +contiguous_options[`WITH_NULL_BITMAP]:1; + +orc_contiguous_schema:.arrowkdb.orc.readOrcSchema[orc_contiguous]; +.arrowkdb.sc.equalSchemas[contiguous_schema;orc_contiguous_schema] +contiguous_schema~orc_contiguous_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +orc_contiguous_data:.arrowkdb.orc.readOrcData[orc_contiguous;contiguous_options]; +contiguous_data~first orc_contiguous_data + +-1"\n+----------|| Compare null bitmaps of arrow data ||----------+\n"; +contiguous_nulls:(10000b;01000b;00100b); +orc_contiguous_nulls:last orc_contiguous_data; +contiguous_nulls~contiguous_nulls & orc_contiguous_nulls + +rm orc_contiguous; + + +-1 "\n+----------|| Test utils ||----------+\n"; + +.arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/orc_dataloader/orc_numeric_nulls.t b/tests/orc_dataloader/orc_numeric_nulls.t new file mode 100644 index 0000000..bb5a4b5 --- /dev/null +++ b/tests/orc_dataloader/orc_numeric_nulls.t @@ -0,0 +1,84 @@ +// orc_numeric_nulls.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping in Apache ORC ||----------+\n"; +num_opts:(`int8`int16`int32`int64`float64)!(0x02;3h;4i;5;6.54); + 
+numeric_options:(``NULL_MAPPING)!((::);num_opts); + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +i8_dt:.arrowkdb.dt.int8[]; +i16_dt:.arrowkdb.dt.int16[]; +i32_dt:.arrowkdb.dt.int32[]; +i64_dt:.arrowkdb.dt.int64[]; +f64_dt:.arrowkdb.dt.float64[]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +i8_fd:.arrowkdb.fd.field[`int8;i8_dt]; +i16_fd:.arrowkdb.fd.field[`int16;i16_dt]; +i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; + +-1"\n+----------|| Create the schema for the list of fields ||----------+\n"; +numeric_schema:.arrowkdb.sc.schema[(ts_fd, i16_fd, i32_fd, i64_fd, f64_fd)]; + +-1"\n+----------|| Number of items in each array ||----------+\n"; +N:5 + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +i16_data:N?100h; +i16_data[0]:3h; +i32_data:N?100i; +i32_data[1]:4i; +i64_data:N?100; +i64_data[2]:5; +f64_data:N?100f; +f64_data[3]:6.54f; + +-1"\n+----------|| Combine the data for numeric columns ||----------+\n"; +numeric_data:(ts_data;i16_data;i32_data;i64_data;f64_data); + +-1"\n+----------|| Write the schema and array data to a ORC file ||----------+\n"; +numeric_options[`ORC_CHUNK_SIZE]:1024 + +orc_numeric:"numeric_bitmap.orc"; +.arrowkdb.orc.writeOrc[orc_numeric;numeric_schema;numeric_data;numeric_options] + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +numeric_options[`WITH_NULL_BITMAP]:1; + +orc_numeric_schema:.arrowkdb.orc.readOrcSchema[orc_numeric]; +.arrowkdb.sc.equalSchemas[numeric_schema;orc_numeric_schema] +numeric_schema~orc_numeric_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +orc_numeric_data:.arrowkdb.orc.readOrcData[orc_numeric;numeric_options]; +numeric_data~first orc_numeric_data + +-1"\n+----------|| Compare null 
bitmaps of arrow data ||----------+\n"; +numeric_nulls:(00000b;10000b;01000b;00100b;00010b); +orc_numeric_nulls:last orc_numeric_data; +numeric_nulls~numeric_nulls & orc_numeric_nulls + +rm orc_numeric; + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; From 4762c181203341657841b4c4d9af5d90d72b533e Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 28 Mar 2023 07:33:48 +0000 Subject: [PATCH 65/71] Update for WITH_NULL_BITMAP --- q/arrowkdb.q | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/q/arrowkdb.q b/q/arrowkdb.q index cfff1c2..d2f92f3 100644 --- a/q/arrowkdb.q +++ b/q/arrowkdb.q @@ -106,6 +106,19 @@ tb.prettyPrintTable_:`arrowkdb 2:(`prettyPrintTable;3); tb.prettyPrintTable:{[x;y;z] -1 tb.prettyPrintTable_[x;y;z];}; tb.prettyPrintTableFromTable:{[table;options] tb.prettyPrintTable[sc.inferSchema[table];value flip table;options]}; +// ORC files +orc.writeOrc:`arrowkdb 2:(`writeORC;4); +orc.writeOrcFromTable:{[filename;table;options] orc.writeOrc[filename;sc.inferSchema[table];value flip table;options]}; +orc.readOrcSchema:`arrowkdb 2:(`readORCSchema;1); +orc.readOrcData:`arrowkdb 2:(`readORCData;2); +orc.readOrcToTable:{[filename;options] + fields:fd.fieldName each sc.schemaFields[orc.readOrcSchema[filename]]; + data:orc.readOrcData[filename;options]; + $[1~options`WITH_NULL_BITMAP; + (flip fields!first data;flip fields!last data); + flip fields!data + ] + }; // parquet files pq.writeParquet:`arrowkdb 2:(`writeParquet;4); @@ -132,14 +145,6 @@ pq.readParquetRowGroupsToTable:{[filename;row_groups;columns;options] ] }; -// ORC files -orc.writeOrc:`arrowkdb 2:(`writeORC;4); -orc.writeOrcFromTable:{[filename;table;options] orc.writeOrc[filename;sc.inferSchema[table];value flip table;options]}; -orc.readOrcSchema:`arrowkdb 2:(`readORCSchema;1); -orc.readOrcData:`arrowkdb 
2:(`readORCData;2); -orc.readOrcToTable:{[filename;options] flip (fd.fieldName each sc.schemaFields[orc.readOrcSchema[filename]])!(orc.readOrcData[filename;options])}; -// orc.readColumn (Functionality is different since dealing with stripes) - // arrow files ipc.writeArrow:`arrowkdb 2:(`writeArrow;4); ipc.writeArrowFromTable:{[filename;table;options] ipc.writeArrow[filename;sc.inferSchema[table];value flip table;options]}; From 5f7303fd55806ab09ad3b428ed4db51ed67d0fb1 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 29 Mar 2023 13:56:15 +0000 Subject: [PATCH 66/71] Cleaning up ORC example of null support out of arrow writing --- examples/orc_null_support.q | 76 +++++-------------------------------- 1 file changed, 10 insertions(+), 66 deletions(-) diff --git a/examples/orc_null_support.q b/examples/orc_null_support.q index fe03d38..53aee10 100644 --- a/examples/orc_null_support.q +++ b/examples/orc_null_support.q @@ -144,81 +144,19 @@ compound_options[`DECIMAL128_AS_DOUBLE]:1 -1"\nCompound table:"; .arrowkdb.tb.prettyPrintTable[compound_schema;compound_data;compound_options] -//-------------------------// -// Example-1. 
Arrow IPC file // -//-------------------------// - -// Write the schema and array data to a arrow file -arrow_numeric:"numeric_bitmap.arrow"; -arrow_contiguous:"contiguous_bitmap.arrow"; -arrow_compound:"compound_bitmap.arrow"; - -.arrowkdb.ipc.writeArrow[arrow_numeric;numeric_schema;numeric_data;compound_options]; -.arrowkdb.ipc.writeArrow[arrow_contiguous;contiguous_schema;contiguous_data;compound_options]; -.arrowkdb.ipc.writeArrow[arrow_compound;compound_schema;compound_data;compound_options]; - -show ls arrow_numeric -show ls arrow_contiguous -show ls arrow_compound - -// Read the schema back and compare -compound_options[`WITH_NULL_BITMAP]:1; - -arrow_numeric_schema:.arrowkdb.ipc.readArrowSchema[arrow_numeric]; -arrow_contiguous_schema:.arrowkdb.ipc.readArrowSchema[arrow_contiguous]; -arrow_compound_schema:.arrowkdb.ipc.readArrowSchema[arrow_compound]; - -show .arrowkdb.sc.equalSchemas[numeric_schema;arrow_numeric_schema] -show .arrowkdb.sc.equalSchemas[contiguous_schema;arrow_contiguous_schema] -show .arrowkdb.sc.equalSchemas[compound_schema;arrow_compound_schema] - -show numeric_schema~arrow_numeric_schema -show contiguous_schema~arrow_contiguous_schema -show compound_schema~arrow_compound_schema - -// Read the array data back and compare -arrow_numeric_data:.arrowkdb.ipc.readArrowData[arrow_numeric;compound_options]; -arrow_contiguous_data:.arrowkdb.ipc.readArrowData[arrow_contiguous;compound_options]; -arrow_compound_data:.arrowkdb.ipc.readArrowData[arrow_compound;compound_options]; - -show numeric_data~first arrow_numeric_data -show contiguous_data~first arrow_contiguous_data -show compound_data~first arrow_compound_data - -// Compare null bitmaps of arrow data -numeric_nulls:(00000b;10000b;01000b;00100b;00010b); -contiguous_nulls:(10000b;01000b;00100b); -list_nulls:(enlist 0b;01b;000b); -struct_nulls:(100b;010b;001b); -map_nulls:((enlist 0b)!(enlist 0b);00b!00b;000b!010b) - -arrow_numeric_nulls:last arrow_numeric_data; -arrow_contiguous_nulls:last 
arrow_contiguous_data; -arrow_list_nulls:last[arrow_compound_data][0] -arrow_struct_nulls:last[arrow_compound_data][1] -arrow_map_nulls:last[arrow_compound_data][2] - -show numeric_nulls~numeric_nulls & arrow_numeric_nulls -show contiguous_nulls~contiguous_nulls & arrow_contiguous_nulls -show list_nulls~arrow_list_nulls -show struct_nulls~struct_nulls & arrow_struct_nulls -show map_nulls~arrow_map_nulls - -rm arrow_numeric; -rm arrow_contiguous; -rm arrow_compound; - //---------------------------// -// Example-2. Apache ORC file// +// Example-1. Apache ORC file// //---------------------------// // Write the schema and array data to a ORC file -compound_options[`ORC_CHUNK_SIZE]:1024 +compound_options[`WITH_NULL_BITMAP]:1; orc_numeric:"numeric_bitmap.orc"; orc_contiguous:"contiguous_bitmap.orc"; orc_compound:"compound_bitmap.orc"; +compound_options[`ORC_CHUNK_SIZE]:1024 + .arrowkdb.orc.writeOrc[orc_numeric;numeric_schema;numeric_data;compound_options] .arrowkdb.orc.writeOrc[orc_contiguous;contiguous_schema;contiguous_data;compound_options] .arrowkdb.orc.writeOrc[orc_compound;compound_schema;compound_data;compound_options] @@ -250,6 +188,12 @@ show contiguous_data~first orc_contiguous_data show compound_data~first orc_compound_data // Compare null bitmaps of arrow data +numeric_nulls:(00000b;10000b;01000b;00100b;00010b); +contiguous_nulls:(10000b;01000b;00100b); +list_nulls:(enlist 0b;01b;000b); +struct_nulls:(100b;010b;001b); +map_nulls:((enlist 0b)!(enlist 0b);00b!00b;000b!010b) + orc_numeric_nulls:last orc_numeric_data; orc_contiguous_nulls:last orc_contiguous_data; orc_list_nulls:last[orc_compound_data][0] From 819d5783ee06a04ab8dcb661595e5b77ace32913 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 29 Mar 2023 13:56:53 +0000 Subject: [PATCH 67/71] Use binary format for Arrow so version --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5679b99..4077f38 100644 --- a/CMakeLists.txt 
+++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 3.1.3) project(arrowkdb CXX) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -DKXVER=3") -if(ARROW_SO_VERSION LESS "10.0") +if(ARROW_SO_VERSION LESS "1000") set(CMAKE_CXX_STANDARD 14) else() set(CMAKE_CXX_STANDARD 17) From e1e40914c915eb04bde8b96c63ea528517813e02 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 29 Mar 2023 15:18:53 +0000 Subject: [PATCH 68/71] Unit-test typo for Travis moan --- tests/orc_dataloader/orc_compound_nulls.t | 2 +- tests/orc_dataloader/orc_contiguous_nulls.t | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/orc_dataloader/orc_compound_nulls.t b/tests/orc_dataloader/orc_compound_nulls.t index 451483d..4053aa2 100644 --- a/tests/orc_dataloader/orc_compound_nulls.t +++ b/tests/orc_dataloader/orc_compound_nulls.t @@ -97,7 +97,7 @@ rm orc_compound; -1 "\n+----------|| Test utils ||----------+\n"; -.arrowkdb.util.buildInfo[] +show .arrowkdb.util.buildInfo[] (type .arrowkdb.util.buildInfo[])~99h diff --git a/tests/orc_dataloader/orc_contiguous_nulls.t b/tests/orc_dataloader/orc_contiguous_nulls.t index bcc489f..42e28d4 100644 --- a/tests/orc_dataloader/orc_contiguous_nulls.t +++ b/tests/orc_dataloader/orc_contiguous_nulls.t @@ -72,7 +72,7 @@ rm orc_contiguous; -1 "\n+----------|| Test utils ||----------+\n"; -.arrowkdb.util.buildInfo[] +show .arrowkdb.util.buildInfo[] (type .arrowkdb.util.buildInfo[])~99h From e296ac0e9b4397363e4ef7eb107147668b6648d2 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 29 Mar 2023 15:26:13 +0000 Subject: [PATCH 69/71] Supply prefix path for Arrow --- .gitignore | 1 + CMakeLists.txt | 15 +++++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 701d371..391b645 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ arrowkdb.code-workspace .vscode/ build/ +tests/*.q test.q unit.q *.user diff --git a/CMakeLists.txt b/CMakeLists.txt index 
4077f38..7472f08 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,12 +5,8 @@ endif() cmake_minimum_required(VERSION 3.1.3) project(arrowkdb CXX) +set(CMAKE_PREFIX_PATH "${CMAKE_PREFIX_PATH};${ARROW_INSTALL}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -DKXVER=3") -if(ARROW_SO_VERSION LESS "1000") - set(CMAKE_CXX_STANDARD 14) -else() - set(CMAKE_CXX_STANDARD 17) -endif() set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) @@ -29,6 +25,13 @@ include_directories ( ${CMAKE_BINARY_DIR} # For 'k.h', downloaded below ) +find_package(Arrow REQUIRED) +if(ARROW_SO_VERSION LESS "1000") + set(CMAKE_CXX_STANDARD 14) +else() + set(CMAKE_CXX_STANDARD 17) +endif() + find_library(ARROW_LIBRARY NAMES arrow HINTS "${ARROW_INSTALL}/lib/" @@ -65,7 +68,7 @@ else() set(OSFLAG l) endif() -target_link_libraries(${MY_LIBRARY_NAME} ${ARROW_LIBRARY} ${PARQUET_LIBRARY} ${LINK_LIBS}) +target_link_libraries(${MY_LIBRARY_NAME} Arrow::arrow_shared ${PARQUET_LIBRARY} ${LINK_LIBS}) set_target_properties(${MY_LIBRARY_NAME} PROPERTIES PREFIX "") # Check if 32-bit/64-bit machine From eba33d28f358fa044381a9998354acb94ce90f95 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 29 Mar 2023 16:16:05 +0000 Subject: [PATCH 70/71] Fixing shared library complain of Travis --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7472f08..8320421 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,7 +68,7 @@ else() set(OSFLAG l) endif() -target_link_libraries(${MY_LIBRARY_NAME} Arrow::arrow_shared ${PARQUET_LIBRARY} ${LINK_LIBS}) +target_link_libraries(${MY_LIBRARY_NAME} ${ARROW_LIBRARY} ${PARQUET_LIBRARY} ${LINK_LIBS}) set_target_properties(${MY_LIBRARY_NAME} PROPERTIES PREFIX "") # Check if 32-bit/64-bit machine From 04a77d253a24923ee0e0560bff8d62af60631c8e Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 29 Mar 2023 16:48:35 +0000 Subject: [PATCH 71/71] Add extra warning that package requires 
Arrow 9.0 [ci skip] --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index dc5e95f..8bbb573 100644 --- a/README.md +++ b/README.md @@ -44,10 +44,11 @@ Conversely, Arrow is an in-memory format meant for direct and efficient use for ### Requirements - kdb+ ≥ 3.5 64-bit (Linux/MacOS/Windows) -- Apache Arrow = 9.0.0 (or ≥ 6.0.0 if building `arrowkdb` from source) +- Apache Arrow ≥ 9.0.0 (or ≥ 6.0.0 if building `arrowkdb` from source) - C++14 or later - CMake ≥ 3.1.3 +> :warning: If using the packaged version of `arrowkdb` you should install version 9.0.0 of Apache Arrow ### Third-party library installation