From f094434b41303ceae59698250e5a77b01a3c8faf Mon Sep 17 00:00:00 2001 From: Grigoriy Pisarenko Date: Thu, 12 Sep 2024 19:38:42 +0000 Subject: [PATCH 1/6] Partly moved ydb/core/formats into ydb/library/formats --- .../arrow/accessor/abstract/constructor.h | 7 +- .../formats/arrow/accessor/abstract/request.h | 3 +- .../formats/arrow/accessor/abstract/ya.make | 6 +- .../accessor/composite_serial/accessor.h | 2 +- .../arrow/accessor/composite_serial/ya.make | 1 + .../formats/arrow/accessor/plain/accessor.h | 4 +- .../arrow/accessor/plain/constructor.cpp | 5 +- .../arrow/accessor/plain/constructor.h | 2 +- .../formats/arrow/accessor/plain/request.h | 2 +- ydb/core/formats/arrow/accessor/plain/ya.make | 3 +- .../arrow/accessor/sparsed/accessor.cpp | 2 +- .../formats/arrow/accessor/sparsed/accessor.h | 2 +- .../arrow/accessor/sparsed/constructor.h | 2 +- .../formats/arrow/accessor/sparsed/request.h | 2 +- .../formats/arrow/accessor/sparsed/ya.make | 3 +- ydb/core/formats/arrow/accessor/ya.make | 1 - ydb/core/formats/arrow/arrow_filter.cpp | 2 +- ydb/core/formats/arrow/arrow_helpers.cpp | 818 +----------------- ydb/core/formats/arrow/arrow_helpers.h | 83 +- ydb/core/formats/arrow/common/adapter.h | 3 +- ydb/core/formats/arrow/common/container.cpp | 4 +- ydb/core/formats/arrow/common/container.h | 5 +- ydb/core/formats/arrow/common/validation.h | 3 - ydb/core/formats/arrow/common/ya.make | 2 +- ydb/core/formats/arrow/converter.cpp | 2 +- .../formats/arrow/dictionary/conversion.cpp | 11 +- .../formats/arrow/dictionary/conversion.h | 1 - ydb/core/formats/arrow/dictionary/object.cpp | 2 +- ydb/core/formats/arrow/dictionary/object.h | 2 +- ydb/core/formats/arrow/dictionary/ya.make | 4 +- ydb/core/formats/arrow/hash/calcer.cpp | 2 +- ydb/core/formats/arrow/hash/calcer.h | 4 +- ydb/core/formats/arrow/hash/ya.make | 5 +- ydb/core/formats/arrow/permutations.cpp | 201 +---- ydb/core/formats/arrow/permutations.h | 138 +-- ydb/core/formats/arrow/process_columns.cpp | 5 +- 
ydb/core/formats/arrow/reader/merger.cpp | 1 + ydb/core/formats/arrow/reader/position.h | 3 +- .../formats/arrow/reader/result_builder.cpp | 3 +- ydb/core/formats/arrow/reader/ya.make | 2 +- ydb/core/formats/arrow/save_load/loader.cpp | 2 +- ydb/core/formats/arrow/save_load/loader.h | 2 +- ydb/core/formats/arrow/save_load/saver.h | 2 +- ydb/core/formats/arrow/save_load/ya.make | 2 + ydb/core/formats/arrow/serializer/abstract.h | 2 +- ydb/core/formats/arrow/serializer/native.cpp | 2 +- ydb/core/formats/arrow/serializer/ya.make | 2 +- ydb/core/formats/arrow/size_calcer.cpp | 193 +---- ydb/core/formats/arrow/size_calcer.h | 48 +- ydb/core/formats/arrow/special_keys.h | 5 +- ydb/core/formats/arrow/splitter/scheme_info.h | 2 +- ydb/core/formats/arrow/splitter/simple.cpp | 2 +- ydb/core/formats/arrow/splitter/simple.h | 1 - ydb/core/formats/arrow/splitter/ya.make | 4 +- ydb/core/formats/arrow/switch/switch_type.h | 172 +--- ydb/core/formats/arrow/switch/ya.make | 2 +- .../formats/arrow/transformer/dictionary.h | 2 +- ydb/core/formats/arrow/transformer/ya.make | 3 +- ydb/core/formats/arrow/ut/ut_arrow.cpp | 21 - ydb/core/formats/arrow/ut/ut_dictionary.cpp | 6 +- ydb/core/formats/arrow/ut/ut_hash.cpp | 2 +- ydb/core/formats/arrow/ut/ya.make | 2 +- ydb/core/formats/arrow/ya.make | 7 +- .../kqp/compute_actor/kqp_compute_actor.cpp | 2 +- .../kqp/compute_actor/kqp_compute_events.h | 2 +- ydb/core/kqp/compute_actor/ya.make | 3 +- ydb/core/kqp/opt/kqp_query_plan.cpp | 2 +- ydb/core/kqp/opt/ya.make | 2 +- .../kqp/query_compiler/kqp_olap_compiler.cpp | 2 +- ydb/core/kqp/ut/common/columnshard.h | 7 +- ydb/core/kqp/ut/olap/helpers/typed_local.h | 6 +- ydb/core/protos/flat_scheme_op.proto | 2 +- ydb/core/protos/tx_columnshard.proto | 2 +- ydb/core/protos/tx_datashard.proto | 2 +- ydb/core/protos/ya.make | 2 +- ydb/core/tx/columnshard/common/scalars.cpp | 2 +- ydb/core/tx/columnshard/common/scalars.h | 2 +- ydb/core/tx/columnshard/common/ya.make | 2 +- 
.../data_sharing/source/session/cursor.cpp | 2 +- .../changes/compaction/common/context.h | 2 +- .../engines/changes/compaction/merger.cpp | 4 +- .../engines/changes/compaction/merger.h | 2 +- .../compaction/plain/column_cursor.cpp | 2 +- .../compaction/plain/column_portion_chunk.cpp | 2 +- .../compaction/plain/column_portion_chunk.h | 2 +- .../engines/changes/compaction/plain/logic.h | 4 +- .../changes/compaction/sparsed/logic.h | 4 +- .../engines/protos/portion_info.proto | 2 +- .../tx/columnshard/engines/protos/ya.make | 2 +- .../engines/scheme/indexes/abstract/checker.h | 2 +- .../engines/scheme/indexes/abstract/ya.make | 2 +- .../engines/storage/indexes/portions/ya.make | 2 +- .../tx/columnshard/test_helper/helper.cpp | 2 +- ydb/core/tx/program/program.h | 2 +- ydb/core/tx/program/ya.make | 2 +- .../arrow/accessor/abstract/accessor.cpp | 12 +- .../arrow/accessor/abstract/accessor.h | 0 .../formats/arrow/accessor/abstract/ya.make | 14 + .../arrow/accessor/common/chunk_data.cpp | 0 .../arrow/accessor/common/chunk_data.h | 0 .../formats/arrow/accessor/common/const.cpp | 0 .../formats/arrow/accessor/common/const.h | 0 .../formats/arrow/accessor/common/ya.make | 0 .../arrow/accessor/composite/accessor.cpp | 0 .../arrow/accessor/composite/accessor.h | 2 +- .../formats/arrow/accessor/composite/ya.make | 2 +- ydb/library/formats/arrow/accessor/ya.make | 8 + ydb/library/formats/arrow/arrow_helpers.cpp | 807 +++++++++++++++++ ydb/library/formats/arrow/arrow_helpers.h | 101 +++ ydb/library/formats/arrow/common/validation.h | 3 + .../formats/arrow/common/vector_operations.h | 0 ydb/library/formats/arrow/common/ya.make | 12 + .../formats/arrow/hash/xx_hash.cpp | 0 .../formats/arrow/hash/xx_hash.h | 0 ydb/library/formats/arrow/hash/ya.make | 17 + .../formats/arrow/input_stream.h | 0 .../formats/arrow/modifier/schema.cpp | 2 +- .../formats/arrow/modifier/schema.h | 0 .../formats/arrow/modifier/subset.cpp | 0 .../formats/arrow/modifier/subset.h | 2 +- 
.../formats/arrow/modifier/ya.make | 4 +- ydb/library/formats/arrow/permutations.cpp | 214 +++++ ydb/library/formats/arrow/permutations.h | 149 ++++ .../formats/arrow/protos/accessor.proto | 0 .../formats/arrow/protos/fields.proto | 0 .../formats/arrow/protos/ssa.proto | 0 .../formats/arrow/protos/ya.make | 0 .../formats/arrow/replace_key.cpp | 0 .../formats/arrow/replace_key.h | 33 +- .../formats/arrow/scalar/serialization.cpp | 2 +- .../formats/arrow/scalar/serialization.h | 0 .../formats/arrow/scalar/ya.make | 2 +- .../formats/arrow/simple_arrays_cache.cpp | 2 + .../formats/arrow/simple_arrays_cache.h | 0 .../formats/arrow/simple_builder/array.cpp | 0 .../formats/arrow/simple_builder/array.h | 0 .../formats/arrow/simple_builder/batch.cpp | 0 .../formats/arrow/simple_builder/batch.h | 0 .../formats/arrow/simple_builder/filler.cpp | 0 .../formats/arrow/simple_builder/filler.h | 0 .../formats/arrow/simple_builder/ya.make | 0 ydb/library/formats/arrow/size_calcer.cpp | 208 +++++ ydb/library/formats/arrow/size_calcer.h | 63 ++ .../formats/arrow/splitter/similar_packer.cpp | 0 .../formats/arrow/splitter/similar_packer.h | 0 .../formats/arrow/splitter/stats.cpp | 0 .../formats/arrow/splitter/stats.h | 0 ydb/library/formats/arrow/splitter/ya.make | 14 + .../formats/arrow/switch/compare.cpp | 0 .../formats/arrow/switch/compare.h | 0 .../formats/arrow/switch/switch_type.cpp | 5 + .../formats/arrow/switch/switch_type.h | 184 ++++ ydb/library/formats/arrow/switch/ya.make | 13 + .../formats/arrow/switch_type.h | 0 .../formats/arrow/transformer/abstract.cpp | 0 .../formats/arrow/transformer/abstract.h | 0 .../formats/arrow/transformer/composite.cpp | 0 .../formats/arrow/transformer/composite.h | 0 ydb/library/formats/arrow/transformer/ya.make | 12 + ydb/library/formats/arrow/ut/ut_arrow.cpp | 302 +++++++ .../formats/arrow/ut/ut_size_calcer.cpp | 11 +- ydb/library/formats/arrow/ut/ya.make | 29 + .../formats/arrow/validation/validation.cpp | 0 
.../formats/arrow/validation/validation.h | 0 .../formats/arrow/validation/ya.make | 0 ydb/library/formats/arrow/ya.make | 52 ++ ydb/library/formats/ya.make | 9 + ydb/library/ya.make | 1 + 168 files changed, 2395 insertions(+), 1815 deletions(-) delete mode 100644 ydb/core/formats/arrow/common/validation.h rename ydb/{core => library}/formats/arrow/accessor/abstract/accessor.cpp (95%) rename ydb/{core => library}/formats/arrow/accessor/abstract/accessor.h (100%) create mode 100644 ydb/library/formats/arrow/accessor/abstract/ya.make rename ydb/{core => library}/formats/arrow/accessor/common/chunk_data.cpp (100%) rename ydb/{core => library}/formats/arrow/accessor/common/chunk_data.h (100%) rename ydb/{core => library}/formats/arrow/accessor/common/const.cpp (100%) rename ydb/{core => library}/formats/arrow/accessor/common/const.h (100%) rename ydb/{core => library}/formats/arrow/accessor/common/ya.make (100%) rename ydb/{core => library}/formats/arrow/accessor/composite/accessor.cpp (100%) rename ydb/{core => library}/formats/arrow/accessor/composite/accessor.h (97%) rename ydb/{core => library}/formats/arrow/accessor/composite/ya.make (69%) create mode 100644 ydb/library/formats/arrow/accessor/ya.make create mode 100644 ydb/library/formats/arrow/arrow_helpers.cpp create mode 100644 ydb/library/formats/arrow/arrow_helpers.h create mode 100644 ydb/library/formats/arrow/common/validation.h rename ydb/{core => library}/formats/arrow/common/vector_operations.h (100%) create mode 100644 ydb/library/formats/arrow/common/ya.make rename ydb/{core => library}/formats/arrow/hash/xx_hash.cpp (100%) rename ydb/{core => library}/formats/arrow/hash/xx_hash.h (100%) create mode 100644 ydb/library/formats/arrow/hash/ya.make rename ydb/{core => library}/formats/arrow/input_stream.h (100%) rename ydb/{core => library}/formats/arrow/modifier/schema.cpp (97%) rename ydb/{core => library}/formats/arrow/modifier/schema.h (100%) rename ydb/{core => 
library}/formats/arrow/modifier/subset.cpp (100%) rename ydb/{core => library}/formats/arrow/modifier/subset.h (96%) rename ydb/{core => library}/formats/arrow/modifier/ya.make (67%) create mode 100644 ydb/library/formats/arrow/permutations.cpp create mode 100644 ydb/library/formats/arrow/permutations.h rename ydb/{core => library}/formats/arrow/protos/accessor.proto (100%) rename ydb/{core => library}/formats/arrow/protos/fields.proto (100%) rename ydb/{core => library}/formats/arrow/protos/ssa.proto (100%) rename ydb/{core => library}/formats/arrow/protos/ya.make (100%) rename ydb/{core => library}/formats/arrow/replace_key.cpp (100%) rename ydb/{core => library}/formats/arrow/replace_key.h (88%) rename ydb/{core => library}/formats/arrow/scalar/serialization.cpp (97%) rename ydb/{core => library}/formats/arrow/scalar/serialization.h (100%) rename ydb/{core => library}/formats/arrow/scalar/ya.make (79%) rename ydb/{core => library}/formats/arrow/simple_arrays_cache.cpp (98%) rename ydb/{core => library}/formats/arrow/simple_arrays_cache.h (100%) rename ydb/{core => library}/formats/arrow/simple_builder/array.cpp (100%) rename ydb/{core => library}/formats/arrow/simple_builder/array.h (100%) rename ydb/{core => library}/formats/arrow/simple_builder/batch.cpp (100%) rename ydb/{core => library}/formats/arrow/simple_builder/batch.h (100%) rename ydb/{core => library}/formats/arrow/simple_builder/filler.cpp (100%) rename ydb/{core => library}/formats/arrow/simple_builder/filler.h (100%) rename ydb/{core => library}/formats/arrow/simple_builder/ya.make (100%) create mode 100644 ydb/library/formats/arrow/size_calcer.cpp create mode 100644 ydb/library/formats/arrow/size_calcer.h rename ydb/{core => library}/formats/arrow/splitter/similar_packer.cpp (100%) rename ydb/{core => library}/formats/arrow/splitter/similar_packer.h (100%) rename ydb/{core => library}/formats/arrow/splitter/stats.cpp (100%) rename ydb/{core => library}/formats/arrow/splitter/stats.h (100%) create 
mode 100644 ydb/library/formats/arrow/splitter/ya.make rename ydb/{core => library}/formats/arrow/switch/compare.cpp (100%) rename ydb/{core => library}/formats/arrow/switch/compare.h (100%) create mode 100644 ydb/library/formats/arrow/switch/switch_type.cpp create mode 100644 ydb/library/formats/arrow/switch/switch_type.h create mode 100644 ydb/library/formats/arrow/switch/ya.make rename ydb/{core => library}/formats/arrow/switch_type.h (100%) rename ydb/{core => library}/formats/arrow/transformer/abstract.cpp (100%) rename ydb/{core => library}/formats/arrow/transformer/abstract.h (100%) rename ydb/{core => library}/formats/arrow/transformer/composite.cpp (100%) rename ydb/{core => library}/formats/arrow/transformer/composite.h (100%) create mode 100644 ydb/library/formats/arrow/transformer/ya.make create mode 100644 ydb/library/formats/arrow/ut/ut_arrow.cpp rename ydb/{core => library}/formats/arrow/ut/ut_size_calcer.cpp (91%) create mode 100644 ydb/library/formats/arrow/ut/ya.make rename ydb/{core => library}/formats/arrow/validation/validation.cpp (100%) rename ydb/{core => library}/formats/arrow/validation/validation.h (100%) rename ydb/{core => library}/formats/arrow/validation/ya.make (100%) create mode 100644 ydb/library/formats/arrow/ya.make create mode 100644 ydb/library/formats/ya.make diff --git a/ydb/core/formats/arrow/accessor/abstract/constructor.h b/ydb/core/formats/arrow/accessor/abstract/constructor.h index 7f9883402c25..aa99260e097a 100644 --- a/ydb/core/formats/arrow/accessor/abstract/constructor.h +++ b/ydb/core/formats/arrow/accessor/abstract/constructor.h @@ -1,9 +1,8 @@ #pragma once -#include "accessor.h" - -#include -#include +#include +#include +#include #include #include diff --git a/ydb/core/formats/arrow/accessor/abstract/request.h b/ydb/core/formats/arrow/accessor/abstract/request.h index c13105fe8e21..42fbf3551623 100644 --- a/ydb/core/formats/arrow/accessor/abstract/request.h +++ b/ydb/core/formats/arrow/accessor/abstract/request.h 
@@ -1,8 +1,7 @@ #pragma once #include "constructor.h" -#include - +#include #include #include diff --git a/ydb/core/formats/arrow/accessor/abstract/ya.make b/ydb/core/formats/arrow/accessor/abstract/ya.make index fd68f1eeb3bf..c40f1f297c18 100644 --- a/ydb/core/formats/arrow/accessor/abstract/ya.make +++ b/ydb/core/formats/arrow/accessor/abstract/ya.make @@ -1,15 +1,15 @@ LIBRARY() PEERDIR( - ydb/core/formats/arrow/protos - ydb/core/formats/arrow/accessor/common contrib/libs/apache/arrow ydb/library/conclusion ydb/services/metadata/abstract + ydb/library/formats/arrow/accessor/abstract + ydb/library/formats/arrow/accessor/common + ydb/library/formats/arrow/protos ) SRCS( - accessor.cpp constructor.cpp request.cpp ) diff --git a/ydb/core/formats/arrow/accessor/composite_serial/accessor.h b/ydb/core/formats/arrow/accessor/composite_serial/accessor.h index 63a4cda06bcb..ac7e0193d7bd 100644 --- a/ydb/core/formats/arrow/accessor/composite_serial/accessor.h +++ b/ydb/core/formats/arrow/accessor/composite_serial/accessor.h @@ -1,6 +1,6 @@ #pragma once -#include #include +#include namespace NKikimr::NArrow::NAccessor { diff --git a/ydb/core/formats/arrow/accessor/composite_serial/ya.make b/ydb/core/formats/arrow/accessor/composite_serial/ya.make index 49c2e1e41ea4..e8095e990285 100644 --- a/ydb/core/formats/arrow/accessor/composite_serial/ya.make +++ b/ydb/core/formats/arrow/accessor/composite_serial/ya.make @@ -2,6 +2,7 @@ LIBRARY() PEERDIR( contrib/libs/apache/arrow + ydb/library/formats/arrow/accessor/abstract ydb/core/formats/arrow/common ydb/core/formats/arrow/save_load ) diff --git a/ydb/core/formats/arrow/accessor/plain/accessor.h b/ydb/core/formats/arrow/accessor/plain/accessor.h index 323073dc0704..a00826161c40 100644 --- a/ydb/core/formats/arrow/accessor/plain/accessor.h +++ b/ydb/core/formats/arrow/accessor/plain/accessor.h @@ -1,6 +1,6 @@ #pragma once -#include -#include +#include +#include namespace NKikimr::NArrow::NAccessor { diff --git 
a/ydb/core/formats/arrow/accessor/plain/constructor.cpp b/ydb/core/formats/arrow/accessor/plain/constructor.cpp index 7e756d1f30bf..3ecf41502b33 100644 --- a/ydb/core/formats/arrow/accessor/plain/constructor.cpp +++ b/ydb/core/formats/arrow/accessor/plain/constructor.cpp @@ -1,9 +1,8 @@ #include "accessor.h" #include "constructor.h" -#include -#include - +#include +#include #include namespace NKikimr::NArrow::NAccessor::NPlain { diff --git a/ydb/core/formats/arrow/accessor/plain/constructor.h b/ydb/core/formats/arrow/accessor/plain/constructor.h index cf84f5021bd7..57c366689eb0 100644 --- a/ydb/core/formats/arrow/accessor/plain/constructor.h +++ b/ydb/core/formats/arrow/accessor/plain/constructor.h @@ -1,6 +1,6 @@ #pragma once #include -#include +#include namespace NKikimr::NArrow::NAccessor::NPlain { diff --git a/ydb/core/formats/arrow/accessor/plain/request.h b/ydb/core/formats/arrow/accessor/plain/request.h index 02f6cce8560a..19a8390f2df2 100644 --- a/ydb/core/formats/arrow/accessor/plain/request.h +++ b/ydb/core/formats/arrow/accessor/plain/request.h @@ -1,6 +1,6 @@ #pragma once #include -#include +#include namespace NKikimr::NArrow::NAccessor::NPlain { diff --git a/ydb/core/formats/arrow/accessor/plain/ya.make b/ydb/core/formats/arrow/accessor/plain/ya.make index b7d600885be5..5b3dd0bf6081 100644 --- a/ydb/core/formats/arrow/accessor/plain/ya.make +++ b/ydb/core/formats/arrow/accessor/plain/ya.make @@ -1,8 +1,9 @@ LIBRARY() PEERDIR( - ydb/core/formats/arrow/protos ydb/core/formats/arrow/accessor/abstract + ydb/library/formats/arrow + ydb/library/formats/arrow/protos ) SRCS( diff --git a/ydb/core/formats/arrow/accessor/sparsed/accessor.cpp b/ydb/core/formats/arrow/accessor/sparsed/accessor.cpp index b360ee3ea155..62c796b811d1 100644 --- a/ydb/core/formats/arrow/accessor/sparsed/accessor.cpp +++ b/ydb/core/formats/arrow/accessor/sparsed/accessor.cpp @@ -1,9 +1,9 @@ #include "accessor.h" -#include #include #include #include +#include namespace 
NKikimr::NArrow::NAccessor { diff --git a/ydb/core/formats/arrow/accessor/sparsed/accessor.h b/ydb/core/formats/arrow/accessor/sparsed/accessor.h index 3f531375613f..040224962239 100644 --- a/ydb/core/formats/arrow/accessor/sparsed/accessor.h +++ b/ydb/core/formats/arrow/accessor/sparsed/accessor.h @@ -1,8 +1,8 @@ #pragma once -#include #include #include +#include #include #include diff --git a/ydb/core/formats/arrow/accessor/sparsed/constructor.h b/ydb/core/formats/arrow/accessor/sparsed/constructor.h index 05743cb4b373..0ccf5efdd70f 100644 --- a/ydb/core/formats/arrow/accessor/sparsed/constructor.h +++ b/ydb/core/formats/arrow/accessor/sparsed/constructor.h @@ -1,6 +1,6 @@ #pragma once #include -#include +#include namespace NKikimr::NArrow::NAccessor::NSparsed { diff --git a/ydb/core/formats/arrow/accessor/sparsed/request.h b/ydb/core/formats/arrow/accessor/sparsed/request.h index 205949bca97a..4be2d897b090 100644 --- a/ydb/core/formats/arrow/accessor/sparsed/request.h +++ b/ydb/core/formats/arrow/accessor/sparsed/request.h @@ -1,6 +1,6 @@ #pragma once #include -#include +#include namespace NKikimr::NArrow::NAccessor::NSparsed { diff --git a/ydb/core/formats/arrow/accessor/sparsed/ya.make b/ydb/core/formats/arrow/accessor/sparsed/ya.make index c4916a29c36c..c68f5f84f9a6 100644 --- a/ydb/core/formats/arrow/accessor/sparsed/ya.make +++ b/ydb/core/formats/arrow/accessor/sparsed/ya.make @@ -1,8 +1,9 @@ LIBRARY() PEERDIR( - ydb/core/formats/arrow/protos ydb/core/formats/arrow/accessor/abstract + ydb/library/formats/arrow + ydb/library/formats/arrow/protos ) SRCS( diff --git a/ydb/core/formats/arrow/accessor/ya.make b/ydb/core/formats/arrow/accessor/ya.make index 8d9536da5157..197b97d9efe7 100644 --- a/ydb/core/formats/arrow/accessor/ya.make +++ b/ydb/core/formats/arrow/accessor/ya.make @@ -3,7 +3,6 @@ LIBRARY() PEERDIR( ydb/core/formats/arrow/accessor/abstract ydb/core/formats/arrow/accessor/plain - ydb/core/formats/arrow/accessor/composite 
ydb/core/formats/arrow/accessor/composite_serial ydb/core/formats/arrow/accessor/sparsed ) diff --git a/ydb/core/formats/arrow/arrow_filter.cpp b/ydb/core/formats/arrow/arrow_filter.cpp index 58cd7116baed..c404a016f4bd 100644 --- a/ydb/core/formats/arrow/arrow_filter.cpp +++ b/ydb/core/formats/arrow/arrow_filter.cpp @@ -1,5 +1,5 @@ #include "arrow_filter.h" -#include "switch_type.h" +#include "switch/switch_type.h" #include "common/container.h" #include "common/adapter.h" diff --git a/ydb/core/formats/arrow/arrow_helpers.cpp b/ydb/core/formats/arrow/arrow_helpers.cpp index b57af6d8ef53..1246b01e334c 100644 --- a/ydb/core/formats/arrow/arrow_helpers.cpp +++ b/ydb/core/formats/arrow/arrow_helpers.cpp @@ -1,13 +1,14 @@ #include "arrow_helpers.h" -#include "switch_type.h" -#include "common/validation.h" +#include "switch/switch_type.h" #include "permutations.h" #include "common/adapter.h" #include "serializer/native.h" #include "serializer/abstract.h" #include "serializer/stream.h" -#include "simple_arrays_cache.h" +#include +#include +#include #include #include @@ -105,11 +106,6 @@ arrow::Result> MakeArrowSchema(const std::vector< return fields.status(); } -TString SerializeSchema(const arrow::Schema& schema) { - auto buffer = TStatusValidator::GetValid(arrow::ipc::SerializeSchema(schema)); - return buffer->ToString(); -} - std::shared_ptr DeserializeSchema(const TString& str) { std::shared_ptr buffer(std::make_shared(str)); arrow::io::BufferReader reader(buffer); @@ -143,148 +139,6 @@ std::shared_ptr DeserializeBatch(const TString& blob, const } } -std::shared_ptr MakeEmptyBatch(const std::shared_ptr& schema, const ui32 rowsCount) { - std::vector> columns; - columns.reserve(schema->num_fields()); - - for (auto& field : schema->fields()) { - auto result = NArrow::TThreadSimpleArraysCache::GetNull(field->type(), rowsCount); - columns.emplace_back(result); - Y_ABORT_UNLESS(result); - } - return arrow::RecordBatch::Make(schema, rowsCount, columns); -} - -std::shared_ptr 
CombineBatches(const std::vector>& batches) { - if (batches.empty()) { - return nullptr; - } - auto table = TStatusValidator::GetValid(arrow::Table::FromRecordBatches(batches)); - return table ? ToBatch(table, true) : nullptr; -} - -std::shared_ptr ToBatch(const std::shared_ptr& tableExt, const bool combine) { - if (!tableExt) { - return nullptr; - } - std::shared_ptr table; - if (combine) { - auto res = tableExt->CombineChunks(); - Y_ABORT_UNLESS(res.ok()); - table = *res; - } else { - table = tableExt; - } - std::vector> columns; - columns.reserve(table->num_columns()); - for (auto& col : table->columns()) { - AFL_VERIFY(col->num_chunks() == 1)("size", col->num_chunks())("size_bytes", GetTableDataSize(tableExt)) - ("schema", tableExt->schema()->ToString())("size_new", GetTableDataSize(table)); - columns.push_back(col->chunk(0)); - } - return arrow::RecordBatch::Make(table->schema(), table->num_rows(), columns); -} - -// Check if the permutation doesn't reorder anything -bool IsTrivial(const arrow::UInt64Array& permutation, const ui64 originalLength) { - if ((ui64)permutation.length() != originalLength) { - return false; - } - for (i64 i = 0; i < permutation.length(); ++i) { - if (permutation.Value(i) != (ui64)i) { - return false; - } - } - return true; -} - -std::shared_ptr Reorder(const std::shared_ptr& batch, - const std::shared_ptr& permutation, const bool canRemove) { - Y_ABORT_UNLESS(permutation->length() == batch->num_rows() || canRemove); - - auto res = IsTrivial(*permutation, batch->num_rows()) ? 
batch : arrow::compute::Take(batch, permutation); - Y_ABORT_UNLESS(res.ok()); - return (*res).record_batch(); -} - -THashMap> ShardingSplit(const std::shared_ptr& batch, const THashMap>& shardRows) { - AFL_VERIFY(batch); - std::shared_ptr permutation; - { - arrow::UInt64Builder builder; - Y_VERIFY_OK(builder.Reserve(batch->num_rows())); - - for (auto&& [shardId, rowIdxs]: shardRows) { - for (auto& row : rowIdxs) { - Y_VERIFY_OK(builder.Append(row)); - } - } - Y_VERIFY_OK(builder.Finish(&permutation)); - } - - auto reorderedBatch = Reorder(batch, permutation, false); - - THashMap> out; - - int offset = 0; - for (auto&& [shardId, shardRowIdxs] : shardRows) { - if (shardRowIdxs.empty()) { - continue; - } - out.emplace(shardId, reorderedBatch->Slice(offset, shardRowIdxs.size())); - offset += shardRowIdxs.size(); - } - - Y_ABORT_UNLESS(offset == batch->num_rows()); - return out; -} - -std::vector> ShardingSplit(const std::shared_ptr& batch, const std::vector>& shardRows, const ui32 numShards) { - AFL_VERIFY(batch); - std::shared_ptr permutation; - { - arrow::UInt64Builder builder; - Y_VERIFY_OK(builder.Reserve(batch->num_rows())); - - for (ui32 shardNo = 0; shardNo < numShards; ++shardNo) { - for (auto& row : shardRows[shardNo]) { - Y_VERIFY_OK(builder.Append(row)); - } - } - Y_VERIFY_OK(builder.Finish(&permutation)); - } - - auto reorderedBatch = Reorder(batch, permutation, false); - - std::vector> out(numShards); - - int offset = 0; - for (ui32 shardNo = 0; shardNo < numShards; ++shardNo) { - int length = shardRows[shardNo].size(); - if (length) { - out[shardNo] = reorderedBatch->Slice(offset, length); - offset += length; - } - } - - Y_ABORT_UNLESS(offset == batch->num_rows()); - return out; -} - -std::vector> ShardingSplit(const std::shared_ptr& batch, - const std::vector& sharding, ui32 numShards) { - AFL_VERIFY(batch); - Y_ABORT_UNLESS((size_t)batch->num_rows() == sharding.size()); - - std::vector> shardRows(numShards); - for (size_t row = 0; row < sharding.size(); 
++row) { - ui32 shardNo = sharding[row]; - Y_ABORT_UNLESS(shardNo < numShards); - shardRows[shardNo].push_back(row); - } - return ShardingSplit(batch, shardRows, numShards); -} - void DedupSortedBatch(const std::shared_ptr& batch, const std::shared_ptr& sortingKey, std::vector>& out) { @@ -322,37 +176,6 @@ void DedupSortedBatch(const std::shared_ptr& batch, Y_DEBUG_ABORT_UNLESS(NArrow::IsSortedAndUnique(out.back(), sortingKey)); } -template -static bool IsSelfSorted(const std::shared_ptr& batch) { - if (batch->num_rows() < 2) { - return true; - } - auto& columns = batch->columns(); - - for (int i = 1; i < batch->num_rows(); ++i) { - TRawReplaceKey prev(&columns, i - 1); - TRawReplaceKey current(&columns, i); - if constexpr (desc) { - if (prev < current) { - AFL_DEBUG(NKikimrServices::ARROW_HELPER)("event", "prev < current")("current", current.DebugString())("prev", prev.DebugString()); - return false; - } - } else { - if (current < prev) { - AFL_DEBUG(NKikimrServices::ARROW_HELPER)("event", "current < prev")("current", current.DebugString())("prev", prev.DebugString()); - return false; - } - } - if constexpr (uniq) { - if (prev == current) { - AFL_DEBUG(NKikimrServices::ARROW_HELPER)("event", "equal")("current", current.DebugString())("prev", prev.DebugString()); - return false; - } - } - } - return true; -} - bool IsSorted(const std::shared_ptr& batch, const std::shared_ptr& sortingKey, bool desc) { auto keyBatch = TColumnOperator().Adapt(batch, sortingKey).DetachResult(); @@ -373,312 +196,6 @@ bool IsSortedAndUnique(const std::shared_ptr& batch, } } -bool HasAllColumns(const std::shared_ptr& batch, const std::shared_ptr& schema) { - for (auto& field : schema->fields()) { - if (batch->schema()->GetFieldIndex(field->name()) < 0) { - return false; - } - } - return true; -} - -std::vector> MakeBuilders(const std::shared_ptr& schema, - size_t reserve, const std::map& sizeByColumn) { - std::vector> builders; - builders.reserve(schema->num_fields()); - - for (auto& 
field : schema->fields()) { - std::unique_ptr builder; - TStatusValidator::Validate(arrow::MakeBuilder(arrow::default_memory_pool(), field->type(), &builder)); - if (sizeByColumn.size()) { - auto it = sizeByColumn.find(field->name()); - if (it != sizeByColumn.end()) { - AFL_VERIFY(NArrow::ReserveData(*builder, it->second))("size", it->second)("field", field->name()); - } - } - - if (reserve) { - TStatusValidator::Validate(builder->Reserve(reserve)); - } - - builders.emplace_back(std::move(builder)); - - } - return builders; -} - -std::unique_ptr MakeBuilder(const std::shared_ptr& field) { - AFL_VERIFY(field); - return MakeBuilder(field->type()); -} - -std::unique_ptr MakeBuilder(const std::shared_ptr& type) { - AFL_VERIFY(type); - std::unique_ptr builder; - TStatusValidator::Validate(arrow::MakeBuilder(arrow::default_memory_pool(), type, &builder)); - return std::move(builder); -} - -std::vector> Finish(std::vector>&& builders) { - std::vector> out; - for (auto& builder : builders) { - std::shared_ptr array; - TStatusValidator::Validate(builder->Finish(&array)); - out.emplace_back(array); - } - return out; -} - -std::vector ColumnNames(const std::shared_ptr& schema) { - std::vector out; - out.reserve(schema->num_fields()); - for (int i = 0; i < schema->num_fields(); ++i) { - auto& name = schema->field(i)->name(); - out.emplace_back(TString(name.data(), name.size())); - } - return out; -} - -std::shared_ptr MakeUI64Array(ui64 value, i64 size) { - auto res = arrow::MakeArrayFromScalar(arrow::UInt64Scalar(value), size); - Y_ABORT_UNLESS(res.ok()); - return std::static_pointer_cast(*res); -} - -std::pair FindMinMaxPosition(const std::shared_ptr& array) { - if (array->length() == 0) { - return {-1, -1}; - } - - int minPos = 0; - int maxPos = 0; - SwitchType(array->type_id(), [&](const auto& type) { - using TWrap = std::decay_t; - using TArray = typename arrow::TypeTraits::ArrayType; - - auto& column = static_cast(*array); - - for (int i = 1; i < column.length(); ++i) { 
- const auto& value = column.GetView(i); - if (value < column.GetView(minPos)) { - minPos = i; - } - if (value > column.GetView(maxPos)) { - maxPos = i; - } - } - return true; - }); - return {minPos, maxPos}; -} - -std::shared_ptr MinScalar(const std::shared_ptr& type) { - std::shared_ptr out; - SwitchType(type->id(), [&](const auto& t) { - using TWrap = std::decay_t; - using T = typename TWrap::T; - using TScalar = typename arrow::TypeTraits::ScalarType; - - if constexpr (std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v) { - out = std::make_shared(arrow::Buffer::FromString(""), type); - } else if constexpr (std::is_same_v) { - std::string s(static_cast(*type).byte_width(), '\0'); - out = std::make_shared(arrow::Buffer::FromString(s), type); - } else if constexpr (std::is_same_v) { - return false; - } else if constexpr (arrow::is_temporal_type::value) { - using TCType = typename arrow::TypeTraits::CType; - out = std::make_shared(Min(), type); - } else if constexpr (arrow::has_c_type::value) { - using TCType = typename arrow::TypeTraits::CType; - out = std::make_shared(Min()); - } else { - return false; - } - return true; - }); - Y_ABORT_UNLESS(out); - return out; -} - -namespace { - -template -class TDefaultScalarValue { -public: - static constexpr T Value = 0; -}; - -template <> -class TDefaultScalarValue { -public: - static constexpr bool Value = false; -}; - -} - -std::shared_ptr DefaultScalar(const std::shared_ptr& type) { - std::shared_ptr out; - SwitchType(type->id(), [&](const auto& t) { - using TWrap = std::decay_t; - using T = typename TWrap::T; - using TScalar = typename arrow::TypeTraits::ScalarType; - - if constexpr (std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v) { - out = std::make_shared(arrow::Buffer::FromString(""), type); - } else if constexpr (std::is_same_v) { - std::string s(static_cast(*type).byte_width(), '\0'); - out = std::make_shared(arrow::Buffer::FromString(s), type); - } else if 
constexpr (std::is_same_v) { - return false; - } else if constexpr (arrow::is_temporal_type::value) { - using TCType = typename arrow::TypeTraits::CType; - out = std::make_shared(TDefaultScalarValue::Value, type); - } else if constexpr (arrow::has_c_type::value) { - using TCType = typename arrow::TypeTraits::CType; - out = std::make_shared(TDefaultScalarValue::Value); - } else { - return false; - } - return true; - }); - AFL_VERIFY(out)("type", type->ToString()); - return out; -} - -std::shared_ptr GetScalar(const std::shared_ptr& array, int position) { - auto res = array->GetScalar(position); - Y_ABORT_UNLESS(res.ok()); - return *res; -} - -bool IsGoodScalar(const std::shared_ptr& x) { - if (!x) { - return false; - } - - return SwitchType(x->type->id(), [&](const auto& type) { - using TWrap = std::decay_t; - using TScalar = typename arrow::TypeTraits::ScalarType; - using TValue = std::decay_t(*x).value)>; - - if constexpr (arrow::has_string_view()) { - const auto& xval = static_cast(*x).value; - return xval && xval->data(); - } - if constexpr (std::is_arithmetic_v) { - return true; - } - return false; - }); -} - -bool ScalarLess(const std::shared_ptr& x, const std::shared_ptr& y) { - Y_ABORT_UNLESS(x); - Y_ABORT_UNLESS(y); - return ScalarLess(*x, *y); -} - -bool ScalarLess(const arrow::Scalar& x, const arrow::Scalar& y) { - return ScalarCompare(x, y) < 0; -} - -bool ColumnEqualsScalar( - const std::shared_ptr& c, const ui32 position, const std::shared_ptr& s) { - AFL_VERIFY(c); - if (!s) { - return c->IsNull(position) ; - } - AFL_VERIFY(c->type()->Equals(s->type))("s", s->type->ToString())("c", c->type()->ToString()); - - return SwitchTypeImpl(c->type()->id(), [&](const auto& type) { - using TWrap = std::decay_t; - using TScalar = typename arrow::TypeTraits::ScalarType; - using TArrayType = typename arrow::TypeTraits::ArrayType; - using TValue = std::decay_t(*s).value)>; - - if constexpr (arrow::has_string_view()) { - const auto& cval = 
static_cast(*c).GetView(position); - const auto& sval = static_cast(*s).value; - AFL_VERIFY(sval); - TStringBuf cBuf(reinterpret_cast(cval.data()), cval.size()); - TStringBuf sBuf(reinterpret_cast(sval->data()), sval->size()); - return cBuf == sBuf; - } - if constexpr (std::is_arithmetic_v) { - const auto cval = static_cast(*c).GetView(position); - const auto sval = static_cast(*s).value; - return (cval == sval); - } - Y_ABORT_UNLESS(false); // TODO: non primitive types - return false; - }); -} - -int ScalarCompare(const arrow::Scalar& x, const arrow::Scalar& y) { - Y_VERIFY_S(x.type->Equals(y.type), x.type->ToString() + " vs " + y.type->ToString()); - - return SwitchTypeImpl(x.type->id(), [&](const auto& type) { - using TWrap = std::decay_t; - using TScalar = typename arrow::TypeTraits::ScalarType; - using TValue = std::decay_t(x).value)>; - - if constexpr (arrow::has_string_view()) { - const auto& xval = static_cast(x).value; - const auto& yval = static_cast(y).value; - Y_ABORT_UNLESS(xval); - Y_ABORT_UNLESS(yval); - TStringBuf xBuf(reinterpret_cast(xval->data()), xval->size()); - TStringBuf yBuf(reinterpret_cast(yval->data()), yval->size()); - if (xBuf < yBuf) { - return -1; - } else if (yBuf < xBuf) { - return 1; - } else { - return 0; - } - } - if constexpr (std::is_arithmetic_v) { - const auto& xval = static_cast(x).value; - const auto& yval = static_cast(y).value; - if (xval < yval) { - return -1; - } else if (yval < xval) { - return 1; - } else { - return 0; - } - } - Y_ABORT_UNLESS(false); // TODO: non primitive types - return 0; - }); -} - -int ScalarCompare(const std::shared_ptr& x, const std::shared_ptr& y) { - Y_ABORT_UNLESS(x); - Y_ABORT_UNLESS(y); - return ScalarCompare(*x, *y); -} - -int ScalarCompareNullable(const std::shared_ptr& x, const std::shared_ptr& y) { - if (!x && !!y) { - return -1; - } - if (!!x && !y) { - return 1; - } - if (!x && !y) { - return 0; - } - return ScalarCompare(*x, *y); -} - std::shared_ptr SortBatch(const std::shared_ptr& 
batch, const std::shared_ptr& sortingKey, const bool andUnique) { auto sortPermutation = MakeSortPermutation(batch, sortingKey, andUnique); @@ -689,217 +206,6 @@ std::shared_ptr SortBatch(const std::shared_ptr BoolVecToArray(const std::vector& vec) { - std::shared_ptr out; - arrow::BooleanBuilder builder; - for (const auto val : vec) { - Y_ABORT_UNLESS(builder.Append(val).ok()); - } - Y_ABORT_UNLESS(builder.Finish(&out).ok()); - return out; -} - - -bool ArrayScalarsEqual(const std::shared_ptr& lhs, const std::shared_ptr& rhs) { - bool res = lhs->length() == rhs->length(); - for (int64_t i = 0; i < lhs->length() && res; ++i) { - res &= arrow::ScalarEquals(*lhs->GetScalar(i).ValueOrDie(), *rhs->GetScalar(i).ValueOrDie()); - } - return res; -} - -bool ReserveData(arrow::ArrayBuilder& builder, const size_t size) { - arrow::Status result = arrow::Status::OK(); - if (builder.type()->id() == arrow::Type::BINARY || - builder.type()->id() == arrow::Type::STRING) - { - static_assert(std::is_convertible_v&>, - "Expected StringBuilder to be BaseBinaryBuilder"); - auto& bBuilder = static_cast&>(builder); - result = bBuilder.ReserveData(size); - } - - if (!result.ok()) { - AFL_ERROR(NKikimrServices::ARROW_HELPER)("event", "ReserveData")("error", result.ToString()); - } - return result.ok(); -} - -template -bool MergeBatchColumnsImpl(const std::vector>& batches, std::shared_ptr& result, - const std::vector& columnsOrder, const bool orderFieldsAreNecessary, const TBuilder& builder) { - if (batches.empty()) { - result = nullptr; - return true; - } - if (batches.size() == 1) { - result = batches.front(); - return true; - } - std::vector> fields; - std::vector> columns; - std::map fieldNames; - for (auto&& i : batches) { - Y_ABORT_UNLESS(i); - for (auto&& f : i->schema()->fields()) { - if (!fieldNames.emplace(f->name(), fields.size()).second) { - AFL_ERROR(NKikimrServices::ARROW_HELPER)("event", "duplicated column")("name", f->name()); - return false; - } - fields.emplace_back(f); - 
} - if (i->num_rows() != batches.front()->num_rows()) { - AFL_ERROR(NKikimrServices::ARROW_HELPER)("event", "inconsistency record sizes")("i", i->num_rows())("front", batches.front()->num_rows()); - return false; - } - for (auto&& c : i->columns()) { - columns.emplace_back(c); - } - } - - Y_ABORT_UNLESS(fields.size() == columns.size()); - if (columnsOrder.size()) { - std::vector> fieldsOrdered; - std::vector> columnsOrdered; - for (auto&& i : columnsOrder) { - auto it = fieldNames.find(i); - if (orderFieldsAreNecessary) { - Y_ABORT_UNLESS(it != fieldNames.end()); - } else if (it == fieldNames.end()) { - continue; - } - fieldsOrdered.emplace_back(fields[it->second]); - columnsOrdered.emplace_back(columns[it->second]); - } - std::swap(fieldsOrdered, fields); - std::swap(columnsOrdered, columns); - } - result = builder(std::make_shared(fields), batches.front()->num_rows(), std::move(columns)); - return true; -} - -bool MergeBatchColumns(const std::vector>& batches, std::shared_ptr& result, const std::vector& columnsOrder, const bool orderFieldsAreNecessary) { - const auto builder = [](const std::shared_ptr& schema, const ui32 recordsCount, std::vector>&& columns) { - return arrow::Table::Make(schema, columns, recordsCount); - }; - - return MergeBatchColumnsImpl(batches, result, columnsOrder, orderFieldsAreNecessary, builder); -} - -bool MergeBatchColumns(const std::vector>& batches, std::shared_ptr& result, const std::vector& columnsOrder, const bool orderFieldsAreNecessary) { - const auto builder = [](const std::shared_ptr& schema, const ui32 recordsCount, std::vector>&& columns) { - return arrow::RecordBatch::Make(schema, recordsCount, columns); - }; - - return MergeBatchColumnsImpl(batches, result, columnsOrder, orderFieldsAreNecessary, builder); -} - -std::partial_ordering ColumnsCompare(const std::vector>& x, const ui32 xRow, const std::vector>& y, const ui32 yRow) { - return TRawReplaceKey(&x, xRow).CompareNotNull(TRawReplaceKey(&y, yRow)); -} - 
-NJson::TJsonValue DebugJson(std::shared_ptr array, const ui32 position) { - NJson::TJsonValue result = NJson::JSON_ARRAY; - for (auto&& i : array->columns()) { - result.AppendValue(DebugJson(i, position)); - } - return result; -} - -TString DebugString(std::shared_ptr array, const ui32 position) { - if (!array) { - return "_NO_DATA"; - } - Y_ABORT_UNLESS(position < array->length()); - TStringBuilder result; - SwitchType(array->type_id(), [&](const auto& type) { - using TWrap = std::decay_t; - using TArray = typename arrow::TypeTraits::ArrayType; - - auto& column = static_cast(*array); - if constexpr (arrow::has_string_view()) { - auto value = column.GetString(position); - result << TString(value.data(), value.size()); - } - if constexpr (arrow::has_c_type()) { - result << column.Value(position); - } - return true; - }); - return result; -} - -NJson::TJsonValue DebugJson(std::shared_ptr array, const ui32 position) { - if (!array) { - return NJson::JSON_NULL; - } - Y_ABORT_UNLESS(position < array->length()); - NJson::TJsonValue result = NJson::JSON_MAP; - SwitchType(array->type_id(), [&](const auto& type) { - using TWrap = std::decay_t; - using TArray = typename arrow::TypeTraits::ArrayType; - - auto& column = static_cast(*array); - result.InsertValue("type", typeid(TArray).name()); - if constexpr (arrow::has_string_view()) { - auto value = column.GetString(position); - result.InsertValue("value", TString(value.data(), value.size())); - } - if constexpr (arrow::has_c_type()) { - result.InsertValue("value", column.Value(position)); - } - return true; - }); - return result; -} - -NJson::TJsonValue DebugJson(std::shared_ptr array, const ui32 head, const ui32 tail) { - if (!array) { - return NJson::JSON_NULL; - } - NJson::TJsonValue resultFull = NJson::JSON_MAP; - resultFull.InsertValue("length", array->length()); - SwitchType(array->type_id(), [&](const auto& type) { - using TWrap = std::decay_t; - using TArray = typename arrow::TypeTraits::ArrayType; - - auto& column 
= static_cast(*array); - resultFull.InsertValue("type", typeid(TArray).name()); - resultFull.InsertValue("head", head); - resultFull.InsertValue("tail", tail); - auto& result = resultFull.InsertValue("data", NJson::JSON_ARRAY); - for (int i = 0; i < column.length(); ++i) { - if (i >= (int)head && i + (int)tail < column.length()) { - continue; - } - if constexpr (arrow::has_string_view()) { - auto value = column.GetString(i); - result.AppendValue(TString(value.data(), value.size())); - } - if constexpr (arrow::has_c_type()) { - result.AppendValue(column.Value(i)); - } - } - return true; - }); - return resultFull; -} - -NJson::TJsonValue DebugJson(std::shared_ptr batch, const ui32 head, const ui32 tail) { - if (!batch) { - return NJson::JSON_NULL; - } - NJson::TJsonValue result = NJson::JSON_ARRAY; - ui32 idx = 0; - for (auto&& i : batch->columns()) { - auto& jsonColumn = result.AppendValue(NJson::JSON_MAP); - jsonColumn.InsertValue("name", batch->column_name(idx)); - jsonColumn.InsertValue("data", DebugJson(i, head, tail)); - ++idx; - } - return result; -} - std::shared_ptr ReallocateBatch(std::shared_ptr original) { if (!original) { return nullptr; @@ -919,120 +225,4 @@ std::shared_ptr ReallocateBatch(const std::shared_ptr MergeColumns(const std::vector>& batches) { - std::vector> columns; - std::vector> fields; - std::optional recordsCount; - std::set columnNames; - for (auto&& batch : batches) { - if (!batch) { - continue; - } - for (auto&& column : batch->columns()) { - columns.emplace_back(column); - if (!recordsCount) { - recordsCount = column->length(); - } else { - Y_ABORT_UNLESS(*recordsCount == column->length()); - } - } - for (auto&& field : batch->schema()->fields()) { - AFL_VERIFY(columnNames.emplace(field->name()).second)("field_name", field->name()); - fields.emplace_back(field); - } - } - if (columns.empty()) { - return nullptr; - } - auto schema = std::make_shared(fields); - return arrow::RecordBatch::Make(schema, *recordsCount, columns); -} - 
-std::vector> SliceToRecordBatches(const std::shared_ptr& t) { - if (!t->num_rows()) { - return {}; - } - std::vector positions; - { - for (auto&& i : t->columns()) { - ui32 pos = 0; - for (auto&& arr : i->chunks()) { - positions.emplace_back(pos); - pos += arr->length(); - } - AFL_VERIFY(pos == t->num_rows()); - } - positions.emplace_back(t->num_rows()); - } - std::sort(positions.begin(), positions.end()); - positions.erase(std::unique(positions.begin(), positions.end()), positions.end()); - AFL_VERIFY(positions.size() > 1)("size", positions.size())("positions", JoinSeq(",", positions)); - std::vector>> slicedData; - slicedData.resize(positions.size() - 1); - for (auto&& i : t->columns()) { - ui32 currentPosition = 0; - auto it = i->chunks().begin(); - ui32 length = 0; - const auto initializeIt = [&length, &it, &i]() { - for (; it != i->chunks().end() && !(*it)->length(); ++it) { - } - if (it != i->chunks().end()) { - length = (*it)->length(); - } - }; - initializeIt(); - for (ui32 idx = 0; idx + 1 < positions.size(); ++idx) { - AFL_VERIFY(it != i->chunks().end()); - AFL_VERIFY(positions[idx + 1] - currentPosition <= length)("length", length)("idx+1", positions[idx + 1])("pos", currentPosition); - auto chunk = (*it)->Slice(positions[idx] - currentPosition, positions[idx + 1] - positions[idx]); - AFL_VERIFY_DEBUG(chunk->length() == positions[idx + 1] - positions[idx])("length", chunk->length())("expect", positions[idx + 1] - positions[idx]); - if (positions[idx + 1] - currentPosition == length) { - ++it; - initializeIt(); - currentPosition = positions[idx + 1]; - } - slicedData[idx].emplace_back(chunk); - } - } - std::vector> result; - ui32 count = 0; - for (auto&& i : slicedData) { - AFL_VERIFY(i.size()); - AFL_VERIFY(i.front()->length()); - result.emplace_back(arrow::RecordBatch::Make(t->schema(), i.front()->length(), i)); - count += result.back()->num_rows(); - } - AFL_VERIFY(count == t->num_rows())("count", count)("t", t->num_rows())("sd_size", 
slicedData.size())("columns", t->num_columns())( - "schema", t->schema()->ToString()); - return result; -} - -std::shared_ptr ToTable(const std::shared_ptr& batch) { - if (!batch) { - return nullptr; - } - return TStatusValidator::GetValid(arrow::Table::FromRecordBatches(batch->schema(), {batch})); -} - -bool HasNulls(const std::shared_ptr& column) { - AFL_VERIFY(column); - return column->null_bitmap_data(); -} - -std::vector ConvertStrings(const std::vector& input) { - std::vector result; - for (auto&& i : input) { - result.emplace_back(i); - } - return result; -} - -std::vector ConvertStrings(const std::vector& input) { - std::vector result; - for (auto&& i : input) { - result.emplace_back(i); - } - return result; -} - } diff --git a/ydb/core/formats/arrow/arrow_helpers.h b/ydb/core/formats/arrow/arrow_helpers.h index 08a9387a7f30..ea1f7a825bb2 100644 --- a/ydb/core/formats/arrow/arrow_helpers.h +++ b/ydb/core/formats/arrow/arrow_helpers.h @@ -1,5 +1,4 @@ #pragma once -#include "switch_type.h" #include "process_columns.h" #include #include @@ -9,44 +8,16 @@ #include #include #include +#include namespace NKikimr::NArrow { -using TArrayVec = std::vector>; - arrow::Result> GetArrowType(NScheme::TTypeInfo typeInfo); arrow::Result> GetCSVArrowType(NScheme::TTypeInfo typeId); -template -inline bool ArrayEqualValue(const std::shared_ptr& x, const std::shared_ptr& y) { - auto& arrX = static_cast(*x); - auto& arrY = static_cast(*y); - for (int i = 0; i < x->length(); ++i) { - if (arrX.Value(i) != arrY.Value(i)) { - return false; - } - } - return true; -} - -template -inline bool ArrayEqualView(const std::shared_ptr& x, const std::shared_ptr& y) { - auto& arrX = static_cast(*x); - auto& arrY = static_cast(*y); - for (int i = 0; i < x->length(); ++i) { - if (arrX.GetView(i) != arrY.GetView(i)) { - return false; - } - } - return true; -} - -struct TSortDescription; - arrow::Result MakeArrowFields(const std::vector>& columns, const std::set& notNullColumns = {}); 
arrow::Result> MakeArrowSchema(const std::vector>& columns, const std::set& notNullColumns = {}); -TString SerializeSchema(const arrow::Schema& schema); std::shared_ptr DeserializeSchema(const TString& str); TString SerializeBatch(const std::shared_ptr& batch, const arrow::ipc::IpcWriteOptions& options); @@ -54,28 +25,6 @@ TString SerializeBatchNoCompression(const std::shared_ptr& b std::shared_ptr DeserializeBatch(const TString& blob, const std::shared_ptr& schema); -std::shared_ptr MakeEmptyBatch(const std::shared_ptr& schema, const ui32 rowsCount = 0); -std::shared_ptr ToTable(const std::shared_ptr& batch); - -std::shared_ptr ToBatch(const std::shared_ptr& combinedTable, const bool combine); -std::shared_ptr CombineBatches(const std::vector>& batches); -std::shared_ptr MergeColumns(const std::vector>& rb); -std::vector> ShardingSplit(const std::shared_ptr& batch, const std::vector& sharding, ui32 numShards); -std::vector> ShardingSplit(const std::shared_ptr& batch, const std::vector>& shardRows, const ui32 numShards); -THashMap> ShardingSplit(const std::shared_ptr& batch, const THashMap>& shardRows); - -std::unique_ptr MakeBuilder(const std::shared_ptr& field); -std::unique_ptr MakeBuilder(const std::shared_ptr& type); - -std::vector> MakeBuilders(const std::shared_ptr& schema, - size_t reserve = 0, const std::map& sizeByColumn = {}); -std::vector> Finish(std::vector>&& builders); - -std::shared_ptr MakeUI64Array(ui64 value, i64 size); -std::vector ColumnNames(const std::shared_ptr& schema); -bool ReserveData(arrow::ArrayBuilder& builder, const size_t size); -bool MergeBatchColumns(const std::vector>& batches, std::shared_ptr& result, const std::vector& columnsOrder = {}, const bool orderFieldsAreNecessary = true); -bool MergeBatchColumns(const std::vector>& batches, std::shared_ptr& result, const std::vector& columnsOrder = {}, const bool orderFieldsAreNecessary = true); std::shared_ptr SortBatch(const std::shared_ptr& batch, const std::shared_ptr& sortingKey, 
const bool andUnique); @@ -88,38 +37,8 @@ bool IsSortedAndUnique(const std::shared_ptr& batch, void DedupSortedBatch(const std::shared_ptr& batch, const std::shared_ptr& sortingKey, std::vector>& out); -bool HasAllColumns(const std::shared_ptr& batch, const std::shared_ptr& schema); -std::pair FindMinMaxPosition(const std::shared_ptr& column); - -std::shared_ptr DefaultScalar(const std::shared_ptr& type); -std::shared_ptr MinScalar(const std::shared_ptr& type); -std::shared_ptr GetScalar(const std::shared_ptr& array, int position); -bool IsGoodScalar(const std::shared_ptr& x); -int ScalarCompare(const arrow::Scalar& x, const arrow::Scalar& y); -int ScalarCompare(const std::shared_ptr& x, const std::shared_ptr& y); -int ScalarCompareNullable(const std::shared_ptr& x, const std::shared_ptr& y); -std::partial_ordering ColumnsCompare( - const std::vector>& x, const ui32 xRow, const std::vector>& y, const ui32 yRow); -bool ColumnEqualsScalar( - const std::shared_ptr& c, const ui32 position, const std::shared_ptr& s); -bool ScalarLess(const std::shared_ptr& x, const std::shared_ptr& y); -bool ScalarLess(const arrow::Scalar& x, const arrow::Scalar& y); std::shared_ptr ReallocateBatch(std::shared_ptr original); std::shared_ptr ReallocateBatch(const std::shared_ptr& original); -bool HasNulls(const std::shared_ptr& column); - -std::vector> SliceToRecordBatches(const std::shared_ptr& t); - -bool ArrayScalarsEqual(const std::shared_ptr& lhs, const std::shared_ptr& rhs); -std::shared_ptr BoolVecToArray(const std::vector& vec); - -NJson::TJsonValue DebugJson(std::shared_ptr array, const ui32 head, const ui32 tail); -NJson::TJsonValue DebugJson(std::shared_ptr batch, const ui32 head, const ui32 tail); - -NJson::TJsonValue DebugJson(std::shared_ptr array, const ui32 position); -TString DebugString(std::shared_ptr array, const ui32 position); -NJson::TJsonValue DebugJson(std::shared_ptr array, const ui32 position); - } diff --git a/ydb/core/formats/arrow/common/adapter.h 
b/ydb/core/formats/arrow/common/adapter.h index 0a32dc4eb885..18b2deeacc9b 100644 --- a/ydb/core/formats/arrow/common/adapter.h +++ b/ydb/core/formats/arrow/common/adapter.h @@ -1,10 +1,9 @@ #pragma once #include "container.h" -#include "validation.h" -#include #include +#include #include #include diff --git a/ydb/core/formats/arrow/common/container.cpp b/ydb/core/formats/arrow/common/container.cpp index 77ad64c09297..7b159f2eef06 100644 --- a/ydb/core/formats/arrow/common/container.cpp +++ b/ydb/core/formats/arrow/common/container.cpp @@ -1,11 +1,11 @@ #include "container.h" -#include #include #include -#include #include +#include +#include namespace NKikimr::NArrow { diff --git a/ydb/core/formats/arrow/common/container.h b/ydb/core/formats/arrow/common/container.h index 572c3e3c4373..dacd5d62c0b0 100644 --- a/ydb/core/formats/arrow/common/container.h +++ b/ydb/core/formats/arrow/common/container.h @@ -1,11 +1,10 @@ #pragma once -#include - -#include #include #include #include +#include +#include #include #include diff --git a/ydb/core/formats/arrow/common/validation.h b/ydb/core/formats/arrow/common/validation.h deleted file mode 100644 index 344128547d7c..000000000000 --- a/ydb/core/formats/arrow/common/validation.h +++ /dev/null @@ -1,3 +0,0 @@ -#pragma once - -#include diff --git a/ydb/core/formats/arrow/common/ya.make b/ydb/core/formats/arrow/common/ya.make index 76f8805b572f..fc34c380aeb0 100644 --- a/ydb/core/formats/arrow/common/ya.make +++ b/ydb/core/formats/arrow/common/ya.make @@ -5,8 +5,8 @@ PEERDIR( ydb/core/formats/arrow/switch ydb/library/actors/core ydb/library/conclusion + ydb/library/formats/arrow ydb/core/formats/arrow/splitter - ydb/core/formats/arrow/validation ) SRCS( diff --git a/ydb/core/formats/arrow/converter.cpp b/ydb/core/formats/arrow/converter.cpp index 1bd0c92e2ceb..f0a38e2c8149 100644 --- a/ydb/core/formats/arrow/converter.cpp +++ b/ydb/core/formats/arrow/converter.cpp @@ -1,5 +1,5 @@ #include "converter.h" -#include "switch_type.h" 
+#include "switch/switch_type.h" #include #include diff --git a/ydb/core/formats/arrow/dictionary/conversion.cpp b/ydb/core/formats/arrow/dictionary/conversion.cpp index 026ae6ba5d6e..b1decbf14bb6 100644 --- a/ydb/core/formats/arrow/dictionary/conversion.cpp +++ b/ydb/core/formats/arrow/dictionary/conversion.cpp @@ -1,8 +1,8 @@ #include "conversion.h" #include -#include -#include #include +#include +#include namespace NKikimr::NArrow { @@ -131,11 +131,4 @@ bool IsDictionableArray(const std::shared_ptr& data) { return result; } -ui64 GetDictionarySize(const std::shared_ptr& data) { - if (!data) { - return 0; - } - return GetArrayDataSize(data->dictionary()) + GetArrayDataSize(data->indices()); -} - } diff --git a/ydb/core/formats/arrow/dictionary/conversion.h b/ydb/core/formats/arrow/dictionary/conversion.h index ee044bfd514b..dfedb4aa31a0 100644 --- a/ydb/core/formats/arrow/dictionary/conversion.h +++ b/ydb/core/formats/arrow/dictionary/conversion.h @@ -7,7 +7,6 @@ namespace NKikimr::NArrow { bool IsDictionableArray(const std::shared_ptr& data); -ui64 GetDictionarySize(const std::shared_ptr& data); std::shared_ptr ArrayToDictionary(const std::shared_ptr& data); std::shared_ptr ArrayToDictionary(const std::shared_ptr& data); std::shared_ptr DictionaryToArray(const std::shared_ptr& data); diff --git a/ydb/core/formats/arrow/dictionary/object.cpp b/ydb/core/formats/arrow/dictionary/object.cpp index 4a72802b2aec..36c9fe3fc276 100644 --- a/ydb/core/formats/arrow/dictionary/object.cpp +++ b/ydb/core/formats/arrow/dictionary/object.cpp @@ -1,6 +1,6 @@ #include "object.h" -#include #include +#include #include namespace NKikimr::NArrow::NDictionary { diff --git a/ydb/core/formats/arrow/dictionary/object.h b/ydb/core/formats/arrow/dictionary/object.h index 2fd4d6a12924..09f5efebb56f 100644 --- a/ydb/core/formats/arrow/dictionary/object.h +++ b/ydb/core/formats/arrow/dictionary/object.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include namespace 
NKikimr::NArrow::NDictionary { diff --git a/ydb/core/formats/arrow/dictionary/ya.make b/ydb/core/formats/arrow/dictionary/ya.make index ea71f4c7dacf..35639ea43bea 100644 --- a/ydb/core/formats/arrow/dictionary/ya.make +++ b/ydb/core/formats/arrow/dictionary/ya.make @@ -3,9 +3,11 @@ LIBRARY() PEERDIR( contrib/libs/apache/arrow ydb/core/protos - ydb/core/formats/arrow/simple_builder ydb/core/formats/arrow/switch ydb/library/actors/core + ydb/library/formats/arrow/transformer + ydb/library/formats/arrow/common + ydb/library/formats/arrow/simple_builder ) SRCS( diff --git a/ydb/core/formats/arrow/hash/calcer.cpp b/ydb/core/formats/arrow/hash/calcer.cpp index 71af0492cfea..d5fa4a8dd6a3 100644 --- a/ydb/core/formats/arrow/hash/calcer.cpp +++ b/ydb/core/formats/arrow/hash/calcer.cpp @@ -1,8 +1,8 @@ #include "calcer.h" -#include "xx_hash.h" #include #include #include +#include #include #include #include diff --git a/ydb/core/formats/arrow/hash/calcer.h b/ydb/core/formats/arrow/hash/calcer.h index d82f669fbee1..51dfe7858f8c 100644 --- a/ydb/core/formats/arrow/hash/calcer.h +++ b/ydb/core/formats/arrow/hash/calcer.h @@ -1,11 +1,11 @@ #pragma once -#include "xx_hash.h" #include -#include #include #include #include +#include +#include #include #include diff --git a/ydb/core/formats/arrow/hash/ya.make b/ydb/core/formats/arrow/hash/ya.make index 6d9a98b836a6..d7337f6b5588 100644 --- a/ydb/core/formats/arrow/hash/ya.make +++ b/ydb/core/formats/arrow/hash/ya.make @@ -2,17 +2,18 @@ LIBRARY() PEERDIR( contrib/libs/apache/arrow - ydb/core/formats/arrow/simple_builder ydb/core/formats/arrow/switch ydb/core/formats/arrow/reader ydb/library/actors/core ydb/library/services ydb/library/actors/protos + ydb/library/formats/arrow/hash + ydb/library/formats/arrow/common + ydb/library/formats/arrow/simple_builder ) SRCS( calcer.cpp - xx_hash.cpp ) END() diff --git a/ydb/core/formats/arrow/permutations.cpp b/ydb/core/formats/arrow/permutations.cpp index 623bc15c9221..c36a09779c6d 100644 --- 
a/ydb/core/formats/arrow/permutations.cpp +++ b/ydb/core/formats/arrow/permutations.cpp @@ -1,13 +1,13 @@ #include "permutations.h" #include "arrow_helpers.h" -#include "replace_key.h" #include "size_calcer.h" #include "hash/calcer.h" -#include #include +#include +#include #include #include @@ -15,28 +15,6 @@ namespace NKikimr::NArrow { -std::shared_ptr MakePermutation(const int size, const bool reverse) { - arrow::UInt64Builder builder; - TStatusValidator::Validate(builder.Reserve(size)); - - if (size) { - if (reverse) { - ui64 value = size - 1; - for (i64 i = 0; i < size; ++i, --value) { - TStatusValidator::Validate(builder.Append(value)); - } - } else { - for (i64 i = 0; i < size; ++i) { - TStatusValidator::Validate(builder.Append(i)); - } - } - } - - std::shared_ptr out; - TStatusValidator::Validate(builder.Finish(&out)); - return out; -} - std::shared_ptr MakeSortPermutation(const std::shared_ptr& batch, const std::shared_ptr& sortingKey, const bool andUnique) { auto keyBatch = TColumnOperator().VerifyIfAbsent().Adapt(batch, sortingKey).DetachResult(); auto keyColumns = std::make_shared(keyBatch->columns()); @@ -100,84 +78,6 @@ std::shared_ptr MakeSortPermutation(const std::shared_ptr -std::shared_ptr MakeFilterPermutationImpl(const std::vector& indexes) { - if (indexes.empty()) { - return {}; - } - - arrow::UInt64Builder builder; - if (!builder.Reserve(indexes.size()).ok()) { - return {}; - } - - for (auto&& i : indexes) { - TStatusValidator::Validate(builder.Append(i)); - } - std::shared_ptr out; - TStatusValidator::Validate(builder.Finish(&out)); - return out; -} - -std::shared_ptr MakeFilterPermutation(const std::vector& indexes) { - return MakeFilterPermutationImpl(indexes); -} - -std::shared_ptr MakeFilterPermutation(const std::vector& indexes) { - return MakeFilterPermutationImpl(indexes); -} - -std::shared_ptr CopyRecords(const std::shared_ptr& source, const std::vector& indexes) { - Y_ABORT_UNLESS(!!source); - auto schema = source->schema(); - 
std::vector> columns; - for (auto&& i : source->columns()) { - columns.emplace_back(CopyRecords(i, indexes)); - } - return arrow::RecordBatch::Make(schema, indexes.size(), columns); -} - -std::shared_ptr CopyRecords(const std::shared_ptr& source, const std::vector& indexes) { - if (!source) { - return source; - } - std::shared_ptr result; - SwitchType(source->type_id(), [&](const auto& type) { - using TWrap = std::decay_t; - using TArray = typename arrow::TypeTraits::ArrayType; - using TBuilder = typename arrow::TypeTraits::BuilderType; - auto& column = static_cast(*source); - - std::unique_ptr builder; - TStatusValidator::Validate(arrow::MakeBuilder(arrow::default_memory_pool(), source->type(), &builder)); - auto& builderImpl = static_cast(*builder); - - if constexpr (arrow::has_string_view::value) { - ui64 sumByIndexes = 0; - for (auto&& idx : indexes) { - Y_ABORT_UNLESS(idx < (ui64)column.length()); - sumByIndexes += column.GetView(idx).size(); - } - TStatusValidator::Validate(builderImpl.ReserveData(sumByIndexes)); - } - - TStatusValidator::Validate(builder->Reserve(indexes.size())); - - { - const ui32 arraySize = column.length(); - for (auto&& i : indexes) { - Y_ABORT_UNLESS(i < arraySize); - builderImpl.UnsafeAppend(column.GetView(i)); - } - } - - TStatusValidator::Validate(builder->Finish(&result)); - return true; - }); - Y_ABORT_UNLESS(result); - return result; -} - namespace { template @@ -213,101 +113,4 @@ bool THashConstructor::BuildHashUI64(std::shared_ptr& batch, return BuildHashUI64Impl(batch, fieldNames, hashFieldName); } -ui64 TShardedRecordBatch::GetMemorySize() const { - return NArrow::GetTableMemorySize(RecordBatch); -} - -TShardedRecordBatch::TShardedRecordBatch(const std::shared_ptr& batch) { - AFL_VERIFY(batch); - RecordBatch = TStatusValidator::GetValid(arrow::Table::FromRecordBatches(batch->schema(), {batch})); -} - - -TShardedRecordBatch::TShardedRecordBatch(const std::shared_ptr& batch) - : RecordBatch(batch) -{ - AFL_VERIFY(RecordBatch); 
-} - -TShardedRecordBatch::TShardedRecordBatch(const std::shared_ptr& batch, std::vector>&& splittedByShards) - : RecordBatch(batch) - , SplittedByShards(std::move(splittedByShards)) -{ - AFL_VERIFY(RecordBatch); - AFL_VERIFY(SplittedByShards.size()); -} - -std::vector> TShardingSplitIndex::Apply(const std::shared_ptr& input) { - AFL_VERIFY(input); - AFL_VERIFY(input->num_rows() == RecordsCount); - auto permutation = BuildPermutation(); - auto resultBatch = NArrow::TStatusValidator::GetValid(arrow::compute::Take(input, *permutation)).table(); - AFL_VERIFY(resultBatch->num_rows() == RecordsCount); - std::vector> result; - ui64 startIndex = 0; - for (auto&& i : Remapping) { - result.emplace_back(resultBatch->Slice(startIndex, i.size())); - startIndex += i.size(); - } - AFL_VERIFY(startIndex == RecordsCount); - return result; -} - -NKikimr::NArrow::TShardedRecordBatch TShardingSplitIndex::Apply(const ui32 shardsCount, const std::shared_ptr& input, const std::string& hashColumnName) { - AFL_VERIFY(input); - if (shardsCount == 1) { - return TShardedRecordBatch(input); - } - auto hashColumn = input->GetColumnByName(hashColumnName); - if (!hashColumn) { - return TShardedRecordBatch(input); - } - std::optional splitter; - if (hashColumn->type()->id() == arrow::Type::UINT64) { - splitter = TShardingSplitIndex::Build(shardsCount, *hashColumn); - } else if (hashColumn->type()->id() == arrow::Type::UINT32) { - splitter = TShardingSplitIndex::Build(shardsCount, *hashColumn); - } else if (hashColumn->type()->id() == arrow::Type::INT64) { - splitter = TShardingSplitIndex::Build(shardsCount, *hashColumn); - } else if (hashColumn->type()->id() == arrow::Type::INT32) { - splitter = TShardingSplitIndex::Build(shardsCount, *hashColumn); - } else { - Y_ABORT_UNLESS(false); - } - auto resultBatch = NArrow::TStatusValidator::GetValid(input->RemoveColumn(input->schema()->GetFieldIndex(hashColumnName))); - return TShardedRecordBatch(resultBatch, splitter->DetachRemapping()); -} - 
-TShardedRecordBatch TShardingSplitIndex::Apply(const ui32 shardsCount, const std::shared_ptr& input, const std::string& hashColumnName) { - return Apply(shardsCount, TStatusValidator::GetValid(arrow::Table::FromRecordBatches(input->schema(), {input})) - , hashColumnName); -} - -std::shared_ptr TShardingSplitIndex::BuildPermutation() const { - arrow::UInt64Builder builder; - Y_ABORT_UNLESS(builder.Reserve(RecordsCount).ok()); - - for (auto&& i : Remapping) { - for (auto&& idx : i) { - TStatusValidator::Validate(builder.Append(idx)); - } - } - - std::shared_ptr out; - Y_ABORT_UNLESS(builder.Finish(&out).ok()); - return out; -} - -std::shared_ptr ReverseRecords(const std::shared_ptr& batch) { - AFL_VERIFY(batch); - auto permutation = NArrow::MakePermutation(batch->num_rows(), true); - return NArrow::TStatusValidator::GetValid(arrow::compute::Take(batch, permutation)).record_batch(); -} - -std::shared_ptr ReverseRecords(const std::shared_ptr& batch) { - AFL_VERIFY(batch); - auto permutation = NArrow::MakePermutation(batch->num_rows(), true); - return NArrow::TStatusValidator::GetValid(arrow::compute::Take(batch, permutation)).table(); -} - } diff --git a/ydb/core/formats/arrow/permutations.h b/ydb/core/formats/arrow/permutations.h index 73a433ee52a2..f8c62fb87107 100644 --- a/ydb/core/formats/arrow/permutations.h +++ b/ydb/core/formats/arrow/permutations.h @@ -2,6 +2,7 @@ #include "arrow_helpers.h" #include +#include #include #include #include @@ -15,143 +16,6 @@ class THashConstructor { }; -class TShardedRecordBatch { -private: - YDB_READONLY_DEF(std::shared_ptr, RecordBatch); - YDB_READONLY_DEF(std::vector>, SplittedByShards); -public: - TShardedRecordBatch(const std::shared_ptr& batch); - TShardedRecordBatch(const std::shared_ptr& batch); - - void Cut(const ui32 limit) { - RecordBatch = RecordBatch->Slice(0, limit); - for (auto&& i : SplittedByShards) { - auto it = std::lower_bound(i.begin(), i.end(), limit); - if (it != i.end()) { - i.erase(it, i.end()); - } - } - 
} - - bool IsSharded() const { - return SplittedByShards.size() > 1; - } - - TShardedRecordBatch(const std::shared_ptr& batch, std::vector>&& splittedByShards); - - ui64 GetMemorySize() const; - - ui64 GetRecordsCount() const { - return RecordBatch->num_rows(); - } -}; - -class TShardingSplitIndex { -private: - ui32 ShardsCount = 0; - std::vector> Remapping; - ui32 RecordsCount = 0; - - template - std::vector MergeLists(const std::vector& base, const TIterator itFrom, const TIterator itTo) { - std::vector result; - result.reserve(base.size() + (itTo - itFrom)); - auto itBase = base.begin(); - auto itExt = itFrom; - while (itBase != base.end() && itExt != itTo) { - if (*itBase < *itExt) { - result.emplace_back(*itBase); - ++itBase; - } else { - result.emplace_back(*itExt); - ++itExt; - } - } - if (itBase == base.end()) { - result.insert(result.end(), itExt, itTo); - } else if (itExt == itTo) { - result.insert(result.end(), itBase, base.end()); - } - return result; - } - - template - void Initialize(const arrow::ChunkedArray& arrowHashArrayChunked) { - Y_ABORT_UNLESS(ShardsCount); - Remapping.resize(ShardsCount); - const ui32 expectation = arrowHashArrayChunked.length() / ShardsCount + 1; - for (auto&& i : Remapping) { - i.reserve(2 * expectation); - } - for (auto&& arrowHashArrayAbstract : arrowHashArrayChunked.chunks()) { - auto& arrowHashArray = static_cast(*arrowHashArrayAbstract); - ui64 offset = 0; - for (ui64 i = 0; i < (ui64)arrowHashArray.length(); ++i) { - const i64 v = arrowHashArray.GetView(i); - const ui32 idx = ((v < 0) ? 
(-v) : v) % ShardsCount; - Remapping[idx].emplace_back(offset + i); - } - offset += (ui64)arrowHashArray.length(); - } - std::deque*> sizeCorrection; - for (auto&& i : Remapping) { - sizeCorrection.emplace_back(&i); - } - const auto pred = [](const std::vector* l, const std::vector* r) { - return l->size() < r->size(); - }; - std::sort(sizeCorrection.begin(), sizeCorrection.end(), pred); - while (sizeCorrection.size() > 1 && sizeCorrection.back()->size() > expectation && sizeCorrection.front()->size() < expectation) { - const ui32 uselessRecords = sizeCorrection.back()->size() - expectation; - const ui32 needRecords = expectation - sizeCorrection.front()->size(); - const ui32 moveRecords = std::min(needRecords, uselessRecords); - if (moveRecords == 0) { - break; - } - *sizeCorrection.front() = MergeLists(*sizeCorrection.front(), sizeCorrection.back()->end() - moveRecords, sizeCorrection.back()->end()); - sizeCorrection.back()->resize(sizeCorrection.back()->size() - moveRecords); - if (sizeCorrection.back()->size() <= expectation) { - sizeCorrection.pop_back(); - } - if (sizeCorrection.front()->size() >= expectation) { - sizeCorrection.pop_front(); - } - } - } - - TShardingSplitIndex(const ui32 shardsCount, const arrow::ChunkedArray& arrowHashArray) - : ShardsCount(shardsCount) - , RecordsCount(arrowHashArray.length()) { - } - -public: - - std::vector> DetachRemapping() { - return std::move(Remapping); - } - - template - static TShardingSplitIndex Build(const ui32 shardsCount, const arrow::ChunkedArray& arrowHashArray) { - TShardingSplitIndex result(shardsCount, arrowHashArray); - result.Initialize(arrowHashArray); - return result; - } - - std::shared_ptr BuildPermutation() const; - - std::vector> Apply(const std::shared_ptr& input); - static TShardedRecordBatch Apply(const ui32 shardsCount, const std::shared_ptr& input, const std::string& hashColumnName); - static TShardedRecordBatch Apply(const ui32 shardsCount, const std::shared_ptr& input, const std::string& 
hashColumnName); -}; - -std::shared_ptr MakePermutation(const int size, const bool reverse = false); -std::shared_ptr MakeFilterPermutation(const std::vector& indexes); -std::shared_ptr MakeFilterPermutation(const std::vector& indexes); std::shared_ptr MakeSortPermutation(const std::shared_ptr& batch, const std::shared_ptr& sortingKey, const bool andUnique); -std::shared_ptr ReverseRecords(const std::shared_ptr& batch); -std::shared_ptr ReverseRecords(const std::shared_ptr& batch); - -std::shared_ptr CopyRecords(const std::shared_ptr& source, const std::vector& indexes); -std::shared_ptr CopyRecords(const std::shared_ptr& source, const std::vector& indexes); } diff --git a/ydb/core/formats/arrow/process_columns.cpp b/ydb/core/formats/arrow/process_columns.cpp index 846542608304..c032d1d8006c 100644 --- a/ydb/core/formats/arrow/process_columns.cpp +++ b/ydb/core/formats/arrow/process_columns.cpp @@ -1,8 +1,9 @@ #include "process_columns.h" #include "common/adapter.h" -#include "modifier/schema.h" -#include "modifier/subset.h" + +#include +#include #include diff --git a/ydb/core/formats/arrow/reader/merger.cpp b/ydb/core/formats/arrow/reader/merger.cpp index 68e6fb842faa..16b9733ad4c0 100644 --- a/ydb/core/formats/arrow/reader/merger.cpp +++ b/ydb/core/formats/arrow/reader/merger.cpp @@ -1,5 +1,6 @@ #include "merger.h" #include "result_builder.h" +#include #include namespace NKikimr::NArrow::NMerger { diff --git a/ydb/core/formats/arrow/reader/position.h b/ydb/core/formats/arrow/reader/position.h index ef5c0990eb0c..78233e50b4a5 100644 --- a/ydb/core/formats/arrow/reader/position.h +++ b/ydb/core/formats/arrow/reader/position.h @@ -1,10 +1,9 @@ #pragma once -#include #include #include -#include #include +#include #include #include diff --git a/ydb/core/formats/arrow/reader/result_builder.cpp b/ydb/core/formats/arrow/reader/result_builder.cpp index 523539b843e1..9b412902b1ed 100644 --- a/ydb/core/formats/arrow/reader/result_builder.cpp +++ 
b/ydb/core/formats/arrow/reader/result_builder.cpp @@ -1,9 +1,8 @@ #include "result_builder.h" -#include - #include #include +#include #include diff --git a/ydb/core/formats/arrow/reader/ya.make b/ydb/core/formats/arrow/reader/ya.make index d57bb4e501ca..8dba6acf2efb 100644 --- a/ydb/core/formats/arrow/reader/ya.make +++ b/ydb/core/formats/arrow/reader/ya.make @@ -2,11 +2,11 @@ LIBRARY() PEERDIR( contrib/libs/apache/arrow - ydb/core/formats/arrow/simple_builder ydb/core/formats/arrow/switch ydb/core/formats/arrow/common ydb/library/actors/core ydb/library/services + ydb/library/formats/arrow ) SRCS( diff --git a/ydb/core/formats/arrow/save_load/loader.cpp b/ydb/core/formats/arrow/save_load/loader.cpp index d6f200b35c05..c9328f751d4a 100644 --- a/ydb/core/formats/arrow/save_load/loader.cpp +++ b/ydb/core/formats/arrow/save_load/loader.cpp @@ -1,6 +1,6 @@ #include "loader.h" -#include +#include namespace NKikimr::NArrow::NAccessor { diff --git a/ydb/core/formats/arrow/save_load/loader.h b/ydb/core/formats/arrow/save_load/loader.h index 1b42e41fc106..2d3119ac3fa8 100644 --- a/ydb/core/formats/arrow/save_load/loader.h +++ b/ydb/core/formats/arrow/save_load/loader.h @@ -1,9 +1,9 @@ #pragma once #include #include -#include #include +#include #include diff --git a/ydb/core/formats/arrow/save_load/saver.h b/ydb/core/formats/arrow/save_load/saver.h index 3532a0195fa3..dd9feb4114f3 100644 --- a/ydb/core/formats/arrow/save_load/saver.h +++ b/ydb/core/formats/arrow/save_load/saver.h @@ -1,8 +1,8 @@ #pragma once #include -#include #include +#include #include #include diff --git a/ydb/core/formats/arrow/save_load/ya.make b/ydb/core/formats/arrow/save_load/ya.make index db2d6667519a..7947aa1ab826 100644 --- a/ydb/core/formats/arrow/save_load/ya.make +++ b/ydb/core/formats/arrow/save_load/ya.make @@ -10,6 +10,8 @@ PEERDIR( contrib/libs/apache/arrow ydb/library/accessor ydb/library/conclusion + ydb/library/formats/arrow/transformer + ydb/library/formats/arrow/common 
ydb/core/formats/arrow/transformer ydb/core/formats/arrow/serializer ) diff --git a/ydb/core/formats/arrow/serializer/abstract.h b/ydb/core/formats/arrow/serializer/abstract.h index db60152224ea..9811aaaf0f20 100644 --- a/ydb/core/formats/arrow/serializer/abstract.h +++ b/ydb/core/formats/arrow/serializer/abstract.h @@ -4,9 +4,9 @@ #include #include #include -#include #include +#include #include #include diff --git a/ydb/core/formats/arrow/serializer/native.cpp b/ydb/core/formats/arrow/serializer/native.cpp index 7b422a8c1cb1..4b90286001d2 100644 --- a/ydb/core/formats/arrow/serializer/native.cpp +++ b/ydb/core/formats/arrow/serializer/native.cpp @@ -2,10 +2,10 @@ #include "stream.h" #include "parsing.h" #include -#include #include #include +#include #include #include diff --git a/ydb/core/formats/arrow/serializer/ya.make b/ydb/core/formats/arrow/serializer/ya.make index 79a3ae1a3ddf..8c9fb49fe08f 100644 --- a/ydb/core/formats/arrow/serializer/ya.make +++ b/ydb/core/formats/arrow/serializer/ya.make @@ -2,9 +2,9 @@ LIBRARY() PEERDIR( contrib/libs/apache/arrow - ydb/core/formats/arrow/validation ydb/services/metadata/abstract ydb/library/actors/core + ydb/library/formats/arrow/common ydb/core/protos ) diff --git a/ydb/core/formats/arrow/size_calcer.cpp b/ydb/core/formats/arrow/size_calcer.cpp index 283d0ff2d3c1..a79f52eb5799 100644 --- a/ydb/core/formats/arrow/size_calcer.cpp +++ b/ydb/core/formats/arrow/size_calcer.cpp @@ -1,5 +1,5 @@ #include "size_calcer.h" -#include "switch_type.h" +#include "switch/switch_type.h" #include "arrow_helpers.h" #include "dictionary/conversion.h" #include @@ -50,197 +50,6 @@ TConclusion> SplitByBlobSize(const std::shared_ptr return result; } -ui32 TRowSizeCalculator::GetRowBitWidth(const ui32 row) const { - Y_ABORT_UNLESS(Prepared); - ui32 result = CommonSize; - for (auto&& c : BinaryColumns) { - result += GetBitWidthAligned(c->GetView(row).size() * 8); - } - for (auto&& c : StringColumns) { - result += 
GetBitWidthAligned(c->GetView(row).size() * 8); - } - return result; -} - -bool TRowSizeCalculator::InitBatch(const std::shared_ptr& batch) { - Batch = batch; - CommonSize = 0; - BinaryColumns.clear(); - StringColumns.clear(); - Prepared = false; - for (ui32 i = 0; i < (ui32)Batch->num_columns(); ++i) { - auto fSize = std::dynamic_pointer_cast(Batch->column(i)->type()); - if (fSize) { - CommonSize += GetBitWidthAligned(fSize->bit_width()); - } else { - auto c = Batch->column(i); - if (c->type()->id() == arrow::Type::BINARY) { - const arrow::BinaryArray& viewArray = static_cast(*c); - BinaryColumns.emplace_back(&viewArray); - } else if (c->type()->id() == arrow::Type::STRING) { - const arrow::StringArray& viewArray = static_cast(*c); - StringColumns.emplace_back(&viewArray); - } else { - return false; - } - } - } - Prepared = true; - return true; -} - -ui32 TRowSizeCalculator::GetRowBytesSize(const ui32 row) const { - const ui32 bitsWidth = GetRowBitWidth(row); - ui32 result = bitsWidth / 8; - if (bitsWidth % 8) { - ++result; - } - return result; -} - -ui64 GetArrayMemorySize(const std::shared_ptr& data) { - if (!data) { - return 0; - } - ui64 result = 0; - for (auto&& i : data->buffers) { - if (i) { - result += i->capacity(); - } - } - for (auto&& i : data->child_data) { - for (auto&& b : i->buffers) { - if (b) { - result += b->capacity(); - } - } - } - if (data->dictionary) { - for (auto&& b : data->dictionary->buffers) { - if (b) { - result += b->capacity(); - } - } - } - return result; -} - - -ui64 GetBatchDataSize(const std::shared_ptr& batch) { - if (!batch) { - return 0; - } - ui64 bytes = 0; - for (auto& column : batch->columns()) { - bytes += GetArrayDataSize(column); - } - return bytes; -} - -ui64 GetBatchMemorySize(const std::shared_ptr& batch) { - if (!batch) { - return 0; - } - ui64 bytes = 0; - for (auto& column : batch->column_data()) { - bytes += GetArrayMemorySize(column); - } - return bytes; -} - -ui64 GetTableMemorySize(const std::shared_ptr& 
batch) { - if (!batch) { - return 0; - } - ui64 bytes = 0; - for (auto& column : batch->columns()) { - for (auto&& chunk : column->chunks()) { - bytes += GetArrayMemorySize(chunk->data()); - } - } - return bytes; -} - -ui64 GetTableDataSize(const std::shared_ptr& batch) { - if (!batch) { - return 0; - } - ui64 bytes = 0; - for (auto& column : batch->columns()) { - for (auto&& chunk : column->chunks()) { - bytes += GetArrayDataSize(chunk); - } - } - return bytes; -} - -template -ui64 GetArrayDataSizeImpl(const std::shared_ptr& column) { - return sizeof(typename TType::c_type) * column->length(); -} - -template <> -ui64 GetArrayDataSizeImpl(const std::shared_ptr& column) { - return column->length() * 8; // Special value for empty lines -} - -template <> -ui64 GetArrayDataSizeImpl(const std::shared_ptr& column) { - auto typedColumn = std::static_pointer_cast(column); - return typedColumn->total_values_length() + sizeof(arrow::StringArray::offset_type) * column->length(); -} - -template <> -ui64 GetArrayDataSizeImpl(const std::shared_ptr& column) { - auto typedColumn = std::static_pointer_cast(column); - return typedColumn->total_values_length() + sizeof(arrow::LargeStringArray::offset_type) * column->length(); -} - -template <> -ui64 GetArrayDataSizeImpl(const std::shared_ptr& column) { - auto typedColumn = std::static_pointer_cast(column); - return typedColumn->total_values_length() + sizeof(arrow::BinaryArray::offset_type) * column->length(); -} - -template <> -ui64 GetArrayDataSizeImpl(const std::shared_ptr& column) { - auto typedColumn = std::static_pointer_cast(column); - return typedColumn->total_values_length() + sizeof(arrow::LargeBinaryArray::offset_type) * column->length(); -} - -template <> -ui64 GetArrayDataSizeImpl(const std::shared_ptr& column) { - auto typedColumn = std::static_pointer_cast(column); - return typedColumn->byte_width() * typedColumn->length(); -} - -template <> -ui64 GetArrayDataSizeImpl(const std::shared_ptr& column) { - return 
sizeof(ui64) * 2 * column->length(); -} - -ui64 GetArrayDataSize(const std::shared_ptr& column) { - auto type = column->type(); - if (type->id() == arrow::Type::DICTIONARY) { - auto dictArray = static_pointer_cast(column); - return GetDictionarySize(dictArray); - } - ui64 bytes = 0; - bool success = SwitchTypeWithNull(type->id(), [&](TTypeWrapper typeHolder) { - Y_UNUSED(typeHolder); - bytes = GetArrayDataSizeImpl(column); - return true; - }); - - // Add null bit mask overhead if any. - if (HasNulls(column)) { - bytes += column->length() / 8 + 1; - } - - Y_DEBUG_ABORT_UNLESS(success, "Unsupported arrow type %s", type->ToString().data()); - return bytes; -} - NKikimr::NArrow::TSerializedBatch TSerializedBatch::Build(std::shared_ptr batch, const TBatchSplitttingContext& context) { std::optional specialKeysPayload; std::optional specialKeysFull; diff --git a/ydb/core/formats/arrow/size_calcer.h b/ydb/core/formats/arrow/size_calcer.h index 3ae1c212405e..00e55f9ca20b 100644 --- a/ydb/core/formats/arrow/size_calcer.h +++ b/ydb/core/formats/arrow/size_calcer.h @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -11,43 +12,6 @@ namespace NKikimr::NArrow { -class TRowSizeCalculator { -private: - std::shared_ptr Batch; - ui32 CommonSize = 0; - std::vector BinaryColumns; - std::vector StringColumns; - bool Prepared = false; - const ui32 AlignBitsCount = 1; - - ui32 GetBitWidthAligned(const ui32 bitWidth) const { - if (AlignBitsCount == 1) { - return bitWidth; - } - ui32 result = bitWidth / AlignBitsCount; - if (bitWidth % AlignBitsCount) { - result += 1; - } - result *= AlignBitsCount; - return result; - } - -public: - - ui64 GetApproxSerializeSize(const ui64 dataSize) const { - return Max(dataSize * 1.05, dataSize + Batch->num_columns() * 8); - } - - TRowSizeCalculator(const ui32 alignBitsCount) - : AlignBitsCount(alignBitsCount) - { - - } - bool InitBatch(const std::shared_ptr& batch); - ui32 GetRowBitWidth(const ui32 row) const; - ui32 GetRowBytesSize(const 
ui32 row) const; -}; - class TBatchSplitttingContext { private: YDB_ACCESSOR(ui64, SizeLimit, 6 * 1024 * 1024); @@ -114,14 +78,4 @@ class TSerializedBatch { TConclusion> SplitByBlobSize(const std::shared_ptr& batch, const TBatchSplitttingContext& context); -// Return size in bytes including size of bitmap mask -ui64 GetBatchDataSize(const std::shared_ptr& batch); -ui64 GetTableDataSize(const std::shared_ptr& batch); -// Return size in bytes including size of bitmap mask -ui64 GetArrayMemorySize(const std::shared_ptr& data); -ui64 GetBatchMemorySize(const std::shared_ptr&batch); -ui64 GetTableMemorySize(const std::shared_ptr& batch); -// Return size in bytes *not* including size of bitmap mask -ui64 GetArrayDataSize(const std::shared_ptr& column); - } diff --git a/ydb/core/formats/arrow/special_keys.h b/ydb/core/formats/arrow/special_keys.h index e7ac96eaf9f9..7559b4a5f7fb 100644 --- a/ydb/core/formats/arrow/special_keys.h +++ b/ydb/core/formats/arrow/special_keys.h @@ -1,5 +1,8 @@ #pragma once -#include + +#include "arrow_helpers.h" + +#include #include namespace NKikimr::NArrow { diff --git a/ydb/core/formats/arrow/splitter/scheme_info.h b/ydb/core/formats/arrow/splitter/scheme_info.h index 400bdfcc7862..0bb30e97300a 100644 --- a/ydb/core/formats/arrow/splitter/scheme_info.h +++ b/ydb/core/formats/arrow/splitter/scheme_info.h @@ -1,9 +1,9 @@ #pragma once -#include "stats.h" #include #include +#include #include namespace NKikimr::NArrow::NSplitter { diff --git a/ydb/core/formats/arrow/splitter/simple.cpp b/ydb/core/formats/arrow/splitter/simple.cpp index 8af1ca704212..a113084b531b 100644 --- a/ydb/core/formats/arrow/splitter/simple.cpp +++ b/ydb/core/formats/arrow/splitter/simple.cpp @@ -1,8 +1,8 @@ #include "simple.h" -#include #include +#include #include namespace NKikimr::NArrow::NSplitter { diff --git a/ydb/core/formats/arrow/splitter/simple.h b/ydb/core/formats/arrow/splitter/simple.h index 5be5c0b9b5d2..1405d3a6dc20 100644 --- 
a/ydb/core/formats/arrow/splitter/simple.h +++ b/ydb/core/formats/arrow/splitter/simple.h @@ -1,6 +1,5 @@ #pragma once #include -#include "stats.h" #include "scheme_info.h" namespace NKikimr::NArrow::NSplitter { diff --git a/ydb/core/formats/arrow/splitter/ya.make b/ydb/core/formats/arrow/splitter/ya.make index 078d7ea83737..70db86e75d4d 100644 --- a/ydb/core/formats/arrow/splitter/ya.make +++ b/ydb/core/formats/arrow/splitter/ya.make @@ -1,16 +1,16 @@ LIBRARY() SRCS( - stats.cpp simple.cpp scheme_info.cpp - similar_packer.cpp ) PEERDIR( contrib/libs/apache/arrow ydb/library/actors/core ydb/library/conclusion + ydb/library/formats/arrow/splitter + ydb/library/formats/arrow/common ydb/core/formats/arrow/serializer ) diff --git a/ydb/core/formats/arrow/switch/switch_type.h b/ydb/core/formats/arrow/switch/switch_type.h index fec6d1c93b39..75090fbc0a5b 100644 --- a/ydb/core/formats/arrow/switch/switch_type.h +++ b/ydb/core/formats/arrow/switch/switch_type.h @@ -1,7 +1,7 @@ #pragma once #include #include -#include +#include #include #include @@ -13,106 +13,6 @@ extern "C" { namespace NKikimr::NArrow { -template -struct TTypeWrapper -{ - using T = TType; -}; - -template -TResult SwitchTypeImpl(arrow::Type::type typeId, TFunc&& f) { - switch (typeId) { - case arrow::Type::NA: { - if constexpr (EnableNull) { - return f(TTypeWrapper()); - } - break; - } - case arrow::Type::BOOL: - return f(TTypeWrapper()); - case arrow::Type::UINT8: - return f(TTypeWrapper()); - case arrow::Type::INT8: - return f(TTypeWrapper()); - case arrow::Type::UINT16: - return f(TTypeWrapper()); - case arrow::Type::INT16: - return f(TTypeWrapper()); - case arrow::Type::UINT32: - return f(TTypeWrapper()); - case arrow::Type::INT32: - return f(TTypeWrapper()); - case arrow::Type::UINT64: - return f(TTypeWrapper()); - case arrow::Type::INT64: - return f(TTypeWrapper()); - case arrow::Type::HALF_FLOAT: - return f(TTypeWrapper()); - case arrow::Type::FLOAT: - return f(TTypeWrapper()); - case 
arrow::Type::DOUBLE: - return f(TTypeWrapper()); - case arrow::Type::STRING: - return f(TTypeWrapper()); - case arrow::Type::BINARY: - return f(TTypeWrapper()); - case arrow::Type::FIXED_SIZE_BINARY: - return f(TTypeWrapper()); - case arrow::Type::DATE32: - return f(TTypeWrapper()); - case arrow::Type::DATE64: - return f(TTypeWrapper()); - case arrow::Type::TIMESTAMP: - return f(TTypeWrapper()); - case arrow::Type::TIME32: - return f(TTypeWrapper()); - case arrow::Type::TIME64: - return f(TTypeWrapper()); - case arrow::Type::INTERVAL_MONTHS: - return f(TTypeWrapper()); - case arrow::Type::DECIMAL: - return f(TTypeWrapper()); - case arrow::Type::DURATION: - return f(TTypeWrapper()); - case arrow::Type::LARGE_STRING: - return f(TTypeWrapper()); - case arrow::Type::LARGE_BINARY: - return f(TTypeWrapper()); - case arrow::Type::DECIMAL256: - case arrow::Type::DENSE_UNION: - case arrow::Type::DICTIONARY: - case arrow::Type::EXTENSION: - case arrow::Type::FIXED_SIZE_LIST: - case arrow::Type::INTERVAL_DAY_TIME: - case arrow::Type::LARGE_LIST: - case arrow::Type::LIST: - case arrow::Type::MAP: - case arrow::Type::MAX_ID: - case arrow::Type::SPARSE_UNION: - case arrow::Type::STRUCT: - break; - } - - return defaultValue; -} - -template -bool SwitchType(arrow::Type::type typeId, TFunc&& f) { - return SwitchTypeImpl(typeId, std::move(f)); -} - -template -bool SwitchTypeWithNull(arrow::Type::type typeId, TFunc&& f) { - return SwitchType(typeId, std::move(f)); -} - -template -bool SwitchArrayType(const arrow::Datum& column, TFunc&& f) { - auto type = column.type(); - Y_ABORT_UNLESS(type); - return SwitchType(type->id(), std::forward(f)); -} - /** * @brief Function to switch yql type correctly and uniformly converting it to arrow type using callback * @@ -227,74 +127,4 @@ inline bool IsPrimitiveYqlType(const NScheme::TTypeInfo& typeInfo) { return false; } -template -bool Append(arrow::ArrayBuilder& builder, const typename T::c_type& value) { - using TBuilder = typename 
arrow::TypeTraits::BuilderType; - - TStatusValidator::Validate(static_cast(builder).Append(value)); - return true; -} - -template -bool Append(arrow::ArrayBuilder& builder, arrow::util::string_view value) { - using TBuilder = typename arrow::TypeTraits::BuilderType; - - TStatusValidator::Validate(static_cast(builder).Append(value)); - return true; -} - -template -bool Append(arrow::ArrayBuilder& builder, const typename T::c_type* values, size_t size) { - using TBuilder = typename arrow::NumericBuilder; - - TStatusValidator::Validate(static_cast(builder).AppendValues(values, size)); - return true; -} - -template -bool Append(arrow::ArrayBuilder& builder, const std::vector& values) { - using TBuilder = typename arrow::NumericBuilder; - - TStatusValidator::Validate(static_cast(builder).AppendValues(values.data(), values.size())); - return true; -} - -template -[[nodiscard]] bool Append(T& builder, const arrow::Array& array, int position, ui64* recordSize = nullptr) { - Y_DEBUG_ABORT_UNLESS(builder.type()->id() == array.type_id()); - return SwitchType(array.type_id(), [&](const auto& type) { - using TWrap = std::decay_t; - using TArray = typename arrow::TypeTraits::ArrayType; - using TBuilder = typename arrow::TypeTraits::BuilderType; - - auto& typedArray = static_cast(array); - auto& typedBuilder = static_cast(builder); - - if (typedArray.IsNull(position)) { - TStatusValidator::Validate(typedBuilder.AppendNull()); - if (recordSize) { - *recordSize += 4; - } - return true; - } else { - if constexpr (!arrow::has_string_view::value) { - TStatusValidator::Validate(typedBuilder.Append(typedArray.GetView(position))); - if (recordSize) { - *recordSize += sizeof(typedArray.GetView(position)); - } - return true; - } - if constexpr (arrow::has_string_view::value) { - TStatusValidator::Validate(typedBuilder.Append(typedArray.GetView(position))); - if (recordSize) { - *recordSize += typedArray.GetView(position).size(); - } - return true; - } - } - Y_ABORT_UNLESS(false, 
"unpredictable variant"); - return false; - }); -} - } diff --git a/ydb/core/formats/arrow/switch/ya.make b/ydb/core/formats/arrow/switch/ya.make index e11e5e070ca6..622e9bf2a604 100644 --- a/ydb/core/formats/arrow/switch/ya.make +++ b/ydb/core/formats/arrow/switch/ya.make @@ -4,11 +4,11 @@ PEERDIR( contrib/libs/apache/arrow ydb/core/scheme_types ydb/library/actors/core + ydb/library/formats/arrow/switch ) SRCS( switch_type.cpp - compare.cpp ) END() diff --git a/ydb/core/formats/arrow/transformer/dictionary.h b/ydb/core/formats/arrow/transformer/dictionary.h index da0c13a5189a..4229c0ed8071 100644 --- a/ydb/core/formats/arrow/transformer/dictionary.h +++ b/ydb/core/formats/arrow/transformer/dictionary.h @@ -1,5 +1,5 @@ #pragma once -#include "abstract.h" +#include namespace NKikimr::NArrow::NTransformation { diff --git a/ydb/core/formats/arrow/transformer/ya.make b/ydb/core/formats/arrow/transformer/ya.make index 3a1c0c4c12ab..8ca15c923dd9 100644 --- a/ydb/core/formats/arrow/transformer/ya.make +++ b/ydb/core/formats/arrow/transformer/ya.make @@ -3,12 +3,11 @@ LIBRARY() PEERDIR( contrib/libs/apache/arrow ydb/core/formats/arrow/dictionary + ydb/library/formats/arrow/transformer ) SRCS( - abstract.cpp dictionary.cpp - composite.cpp ) END() diff --git a/ydb/core/formats/arrow/ut/ut_arrow.cpp b/ydb/core/formats/arrow/ut/ut_arrow.cpp index da620d70fa30..b12fc5561b12 100644 --- a/ydb/core/formats/arrow/ut/ut_arrow.cpp +++ b/ydb/core/formats/arrow/ut/ut_arrow.cpp @@ -185,11 +185,6 @@ struct TDataRow { } }; - -std::shared_ptr GetColumn(const arrow::Table& table, int i, int chunk = 0) { - return table.column(i)->chunk(chunk); -} - std::shared_ptr GetColumn(const arrow::RecordBatch& batch, int i) { return batch.column(i); } @@ -526,22 +521,6 @@ bool CheckSorted(const std::shared_ptr& batch, bool desc = f } Y_UNIT_TEST_SUITE(ArrowTest) { - Y_UNIT_TEST(Basic) { - std::vector rows = TestRows(); - - std::shared_ptr table = TDataRowTableBuilder::Build(rows); - - auto 
expectedSchema = TDataRow::MakeArrowSchema(); - UNIT_ASSERT_EQUAL(expectedSchema->Equals(*table->schema()), true); - - std::vector readRows = ToVector(table); - - UNIT_ASSERT_EQUAL(rows.size(), readRows.size()); - for (size_t i = 0; i < rows.size(); ++i) { - UNIT_ASSERT_EQUAL(rows[i], readRows[i]); - } - } - Y_UNIT_TEST(BatchBuilder) { std::vector rows = TestRows(); diff --git a/ydb/core/formats/arrow/ut/ut_dictionary.cpp b/ydb/core/formats/arrow/ut/ut_dictionary.cpp index c3df2c6a30f0..02a9dc9b3e69 100644 --- a/ydb/core/formats/arrow/ut/ut_dictionary.cpp +++ b/ydb/core/formats/arrow/ut/ut_dictionary.cpp @@ -1,10 +1,10 @@ #include #include #include -#include -#include -#include #include +#include +#include +#include Y_UNIT_TEST_SUITE(Dictionary) { diff --git a/ydb/core/formats/arrow/ut/ut_hash.cpp b/ydb/core/formats/arrow/ut/ut_hash.cpp index 3255d430352d..4c64c74895cb 100644 --- a/ydb/core/formats/arrow/ut/ut_hash.cpp +++ b/ydb/core/formats/arrow/ut/ut_hash.cpp @@ -1,7 +1,7 @@ #include #include -#include #include +#include Y_UNIT_TEST_SUITE(Hash) { diff --git a/ydb/core/formats/arrow/ut/ya.make b/ydb/core/formats/arrow/ut/ya.make index c4c993ec3220..54fa4d357730 100644 --- a/ydb/core/formats/arrow/ut/ya.make +++ b/ydb/core/formats/arrow/ut/ya.make @@ -5,6 +5,7 @@ SIZE(SMALL) PEERDIR( contrib/libs/apache/arrow ydb/library/arrow_kernels + ydb/library/formats/arrow/simple_builder ydb/core/base # for NYql::NUdf alloc stuff used in binary_json @@ -26,7 +27,6 @@ SRCS( ut_arrow.cpp ut_program_step.cpp ut_dictionary.cpp - ut_size_calcer.cpp ut_column_filter.cpp ut_hash.cpp ) diff --git a/ydb/core/formats/arrow/ya.make b/ydb/core/formats/arrow/ya.make index 8bb86947266e..fa66d3a97154 100644 --- a/ydb/core/formats/arrow/ya.make +++ b/ydb/core/formats/arrow/ya.make @@ -9,19 +9,17 @@ PEERDIR( ydb/core/scheme ydb/core/formats/arrow/accessor ydb/core/formats/arrow/serializer - ydb/core/formats/arrow/simple_builder ydb/core/formats/arrow/dictionary 
ydb/core/formats/arrow/transformer ydb/core/formats/arrow/reader ydb/core/formats/arrow/save_load ydb/core/formats/arrow/splitter - ydb/core/formats/arrow/modifier - ydb/core/formats/arrow/scalar ydb/core/formats/arrow/hash ydb/library/actors/core ydb/library/arrow_kernels ydb/library/binary_json ydb/library/dynumber + ydb/library/formats/arrow ydb/library/services ydb/library/yql/core/arrow_kernels/request ) @@ -49,14 +47,11 @@ SRCS( converter.cpp converter.h custom_registry.cpp - input_stream.h permutations.cpp program.cpp - replace_key.cpp size_calcer.cpp ssa_program_optimizer.cpp special_keys.cpp - simple_arrays_cache.cpp process_columns.cpp ) diff --git a/ydb/core/kqp/compute_actor/kqp_compute_actor.cpp b/ydb/core/kqp/compute_actor/kqp_compute_actor.cpp index de81712d0d7a..3376baff7be6 100644 --- a/ydb/core/kqp/compute_actor/kqp_compute_actor.cpp +++ b/ydb/core/kqp/compute_actor/kqp_compute_actor.cpp @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include diff --git a/ydb/core/kqp/compute_actor/kqp_compute_events.h b/ydb/core/kqp/compute_actor/kqp_compute_events.h index 0adb559f132f..a9dd127a64b0 100644 --- a/ydb/core/kqp/compute_actor/kqp_compute_events.h +++ b/ydb/core/kqp/compute_actor/kqp_compute_events.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include #include #include diff --git a/ydb/core/kqp/compute_actor/ya.make b/ydb/core/kqp/compute_actor/ya.make index 50d029c6bacf..7b45cfa31530 100644 --- a/ydb/core/kqp/compute_actor/ya.make +++ b/ydb/core/kqp/compute_actor/ya.make @@ -22,7 +22,8 @@ PEERDIR( ydb/core/kqp/runtime ydb/core/tx/datashard ydb/core/tx/scheme_cache - ydb/core/formats/arrow/protos + ydb/library/formats/arrow/protos + ydb/library/formats/arrow/common ydb/library/yql/dq/actors/compute ydb/library/yql/providers/generic/actors ydb/library/yql/providers/s3/actors_factory diff --git a/ydb/core/kqp/opt/kqp_query_plan.cpp b/ydb/core/kqp/opt/kqp_query_plan.cpp index c69dab844fa2..eec08a2a40d9 100644 --- 
a/ydb/core/kqp/opt/kqp_query_plan.cpp +++ b/ydb/core/kqp/opt/kqp_query_plan.cpp @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include diff --git a/ydb/core/kqp/opt/ya.make b/ydb/core/kqp/opt/ya.make index 534b45272730..503c88745451 100644 --- a/ydb/core/kqp/opt/ya.make +++ b/ydb/core/kqp/opt/ya.make @@ -27,7 +27,7 @@ PEERDIR( ydb/library/yql/providers/s3/expr_nodes ydb/library/yql/utils/plan ydb/core/kqp/provider - ydb/core/formats/arrow/protos + ydb/library/formats/arrow/protos ) YQL_LAST_ABI_VERSION() diff --git a/ydb/core/kqp/query_compiler/kqp_olap_compiler.cpp b/ydb/core/kqp/query_compiler/kqp_olap_compiler.cpp index 7ea483df9b9b..5b2cfc5bb7a2 100644 --- a/ydb/core/kqp/query_compiler/kqp_olap_compiler.cpp +++ b/ydb/core/kqp/query_compiler/kqp_olap_compiler.cpp @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include diff --git a/ydb/core/kqp/ut/common/columnshard.h b/ydb/core/kqp/ut/common/columnshard.h index d1bfa68c3a28..83dd86d3d98b 100644 --- a/ydb/core/kqp/ut/common/columnshard.h +++ b/ydb/core/kqp/ut/common/columnshard.h @@ -2,15 +2,14 @@ #include "kqp_ut_common.h" #include +#include +#include +#include #include #include #include #include -#include -#include -#include - #include namespace NKikimr { diff --git a/ydb/core/kqp/ut/olap/helpers/typed_local.h b/ydb/core/kqp/ut/olap/helpers/typed_local.h index 285c7d062b94..df81c5cd6a86 100644 --- a/ydb/core/kqp/ut/olap/helpers/typed_local.h +++ b/ydb/core/kqp/ut/olap/helpers/typed_local.h @@ -1,9 +1,9 @@ #pragma once #include #include -#include -#include -#include +#include +#include +#include #include diff --git a/ydb/core/protos/flat_scheme_op.proto b/ydb/core/protos/flat_scheme_op.proto index 8fc7935f7d8b..c5e25cdd5717 100644 --- a/ydb/core/protos/flat_scheme_op.proto +++ b/ydb/core/protos/flat_scheme_op.proto @@ -20,7 +20,7 @@ import "ydb/library/mkql_proto/protos/minikql.proto"; import "ydb/core/protos/index_builder.proto"; import 
"ydb/core/tx/columnshard/engines/scheme/defaults/protos/data.proto"; import "ydb/core/tx/columnshard/common/protos/snapshot.proto"; -import "ydb/core/formats/arrow/protos/accessor.proto"; +import "ydb/library/formats/arrow/protos/accessor.proto"; import "google/protobuf/struct.proto"; diff --git a/ydb/core/protos/tx_columnshard.proto b/ydb/core/protos/tx_columnshard.proto index f246a1cd848a..1875cca96112 100644 --- a/ydb/core/protos/tx_columnshard.proto +++ b/ydb/core/protos/tx_columnshard.proto @@ -3,7 +3,7 @@ import "ydb/core/protos/flat_scheme_op.proto"; import "ydb/core/protos/long_tx_service.proto"; import "ydb/core/protos/subdomains.proto"; import "ydb/core/protos/tx.proto"; -import "ydb/core/formats/arrow/protos/fields.proto"; +import "ydb/library/formats/arrow/protos/fields.proto"; package NKikimrTxColumnShard; option java_package = "ru.yandex.kikimr.proto"; diff --git a/ydb/core/protos/tx_datashard.proto b/ydb/core/protos/tx_datashard.proto index a585b0d1cef0..6c0231ff1a11 100644 --- a/ydb/core/protos/tx_datashard.proto +++ b/ydb/core/protos/tx_datashard.proto @@ -7,7 +7,7 @@ import "ydb/core/scheme/protos/key_range.proto"; import "ydb/core/scheme/protos/pathid.proto"; import "ydb/core/protos/data_events.proto"; import "ydb/core/protos/kqp.proto"; -import "ydb/core/formats/arrow/protos/ssa.proto"; +import "ydb/library/formats/arrow/protos/ssa.proto"; import "ydb/core/protos/tablet.proto"; import "ydb/core/protos/tx.proto"; import "ydb/core/protos/flat_scheme_op.proto"; diff --git a/ydb/core/protos/ya.make b/ydb/core/protos/ya.make index 360efcd51fa5..4f7710acf4f4 100644 --- a/ydb/core/protos/ya.make +++ b/ydb/core/protos/ya.make @@ -171,7 +171,7 @@ PEERDIR( ydb/library/ydb_issue/proto ydb/core/tx/columnshard/engines/scheme/defaults/protos ydb/core/tx/columnshard/engines/protos - ydb/core/formats/arrow/protos + ydb/library/formats/arrow/protos ydb/core/tx/columnshard/common/protos ) diff --git a/ydb/core/tx/columnshard/common/scalars.cpp 
b/ydb/core/tx/columnshard/common/scalars.cpp index d614253e9ec1..d85622edeeec 100644 --- a/ydb/core/tx/columnshard/common/scalars.cpp +++ b/ydb/core/tx/columnshard/common/scalars.cpp @@ -1,6 +1,6 @@ #include "scalars.h" -#include +#include #include #include diff --git a/ydb/core/tx/columnshard/common/scalars.h b/ydb/core/tx/columnshard/common/scalars.h index 328296048620..7635caa19e11 100644 --- a/ydb/core/tx/columnshard/common/scalars.h +++ b/ydb/core/tx/columnshard/common/scalars.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include #include diff --git a/ydb/core/tx/columnshard/common/ya.make b/ydb/core/tx/columnshard/common/ya.make index 300691ed711e..c7d8a27bf3ee 100644 --- a/ydb/core/tx/columnshard/common/ya.make +++ b/ydb/core/tx/columnshard/common/ya.make @@ -11,7 +11,7 @@ SRCS( ) PEERDIR( - ydb/core/formats/arrow/protos + ydb/library/formats/arrow/protos contrib/libs/apache/arrow ydb/core/formats/arrow ydb/core/tx/columnshard/common/protos diff --git a/ydb/core/tx/columnshard/data_sharing/source/session/cursor.cpp b/ydb/core/tx/columnshard/data_sharing/source/session/cursor.cpp index 1072d6ff1cb6..5bc37cd29122 100644 --- a/ydb/core/tx/columnshard/data_sharing/source/session/cursor.cpp +++ b/ydb/core/tx/columnshard/data_sharing/source/session/cursor.cpp @@ -1,7 +1,7 @@ #include "source.h" #include #include -#include +#include namespace NKikimr::NOlap::NDataSharing { diff --git a/ydb/core/tx/columnshard/engines/changes/compaction/common/context.h b/ydb/core/tx/columnshard/engines/changes/compaction/common/context.h index ebe3394f299e..73117725614d 100644 --- a/ydb/core/tx/columnshard/engines/changes/compaction/common/context.h +++ b/ydb/core/tx/columnshard/engines/changes/compaction/common/context.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include #include #include diff --git a/ydb/core/tx/columnshard/engines/changes/compaction/merger.cpp b/ydb/core/tx/columnshard/engines/changes/compaction/merger.cpp index 72ca7d2019ed..439426439867 
100644 --- a/ydb/core/tx/columnshard/engines/changes/compaction/merger.cpp +++ b/ydb/core/tx/columnshard/engines/changes/compaction/merger.cpp @@ -6,8 +6,8 @@ #include #include -#include -#include +#include +#include #include namespace NKikimr::NOlap::NCompaction { diff --git a/ydb/core/tx/columnshard/engines/changes/compaction/merger.h b/ydb/core/tx/columnshard/engines/changes/compaction/merger.h index ed862f2e25dd..9c84799fe8ad 100644 --- a/ydb/core/tx/columnshard/engines/changes/compaction/merger.h +++ b/ydb/core/tx/columnshard/engines/changes/compaction/merger.h @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/ydb/core/tx/columnshard/engines/changes/compaction/plain/column_cursor.cpp b/ydb/core/tx/columnshard/engines/changes/compaction/plain/column_cursor.cpp index 95b7acab74dc..9fd0c4d301e1 100644 --- a/ydb/core/tx/columnshard/engines/changes/compaction/plain/column_cursor.cpp +++ b/ydb/core/tx/columnshard/engines/changes/compaction/plain/column_cursor.cpp @@ -1,5 +1,5 @@ #include "column_cursor.h" -#include +#include namespace NKikimr::NOlap::NCompaction { diff --git a/ydb/core/tx/columnshard/engines/changes/compaction/plain/column_portion_chunk.cpp b/ydb/core/tx/columnshard/engines/changes/compaction/plain/column_portion_chunk.cpp index dde08cabb4fc..3db4127653b8 100644 --- a/ydb/core/tx/columnshard/engines/changes/compaction/plain/column_portion_chunk.cpp +++ b/ydb/core/tx/columnshard/engines/changes/compaction/plain/column_portion_chunk.cpp @@ -1,7 +1,7 @@ #include "column_portion_chunk.h" #include -#include +#include #include #include diff --git a/ydb/core/tx/columnshard/engines/changes/compaction/plain/column_portion_chunk.h b/ydb/core/tx/columnshard/engines/changes/compaction/plain/column_portion_chunk.h index ce10642ae95d..59c6a3b460da 100644 --- a/ydb/core/tx/columnshard/engines/changes/compaction/plain/column_portion_chunk.h +++ 
b/ydb/core/tx/columnshard/engines/changes/compaction/plain/column_portion_chunk.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include #include #include diff --git a/ydb/core/tx/columnshard/engines/changes/compaction/plain/logic.h b/ydb/core/tx/columnshard/engines/changes/compaction/plain/logic.h index 9e3ec9a7c184..5b3c53f2eec9 100644 --- a/ydb/core/tx/columnshard/engines/changes/compaction/plain/logic.h +++ b/ydb/core/tx/columnshard/engines/changes/compaction/plain/logic.h @@ -1,8 +1,8 @@ #pragma once #include "column_cursor.h" -#include -#include +#include +#include #include namespace NKikimr::NOlap::NCompaction { diff --git a/ydb/core/tx/columnshard/engines/changes/compaction/sparsed/logic.h b/ydb/core/tx/columnshard/engines/changes/compaction/sparsed/logic.h index bf7be222ff48..35a3649d61f3 100644 --- a/ydb/core/tx/columnshard/engines/changes/compaction/sparsed/logic.h +++ b/ydb/core/tx/columnshard/engines/changes/compaction/sparsed/logic.h @@ -1,6 +1,6 @@ #pragma once -#include -#include +#include +#include #include #include diff --git a/ydb/core/tx/columnshard/engines/protos/portion_info.proto b/ydb/core/tx/columnshard/engines/protos/portion_info.proto index dc599633eb7a..f7f38bb96ed7 100644 --- a/ydb/core/tx/columnshard/engines/protos/portion_info.proto +++ b/ydb/core/tx/columnshard/engines/protos/portion_info.proto @@ -1,4 +1,4 @@ -import "ydb/core/formats/arrow/protos/ssa.proto"; +import "ydb/library/formats/arrow/protos/ssa.proto"; package NKikimrTxColumnShard; diff --git a/ydb/core/tx/columnshard/engines/protos/ya.make b/ydb/core/tx/columnshard/engines/protos/ya.make index ad664077a031..5719eb76af10 100644 --- a/ydb/core/tx/columnshard/engines/protos/ya.make +++ b/ydb/core/tx/columnshard/engines/protos/ya.make @@ -5,7 +5,7 @@ SRCS( ) PEERDIR( - ydb/core/formats/arrow/protos + ydb/library/formats/arrow/protos ) diff --git a/ydb/core/tx/columnshard/engines/scheme/indexes/abstract/checker.h 
b/ydb/core/tx/columnshard/engines/scheme/indexes/abstract/checker.h index a67a72df8e2f..88815c374eb2 100644 --- a/ydb/core/tx/columnshard/engines/scheme/indexes/abstract/checker.h +++ b/ydb/core/tx/columnshard/engines/scheme/indexes/abstract/checker.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include #include #include diff --git a/ydb/core/tx/columnshard/engines/scheme/indexes/abstract/ya.make b/ydb/core/tx/columnshard/engines/scheme/indexes/abstract/ya.make index e758f9ecc430..a9991e37e26a 100644 --- a/ydb/core/tx/columnshard/engines/scheme/indexes/abstract/ya.make +++ b/ydb/core/tx/columnshard/engines/scheme/indexes/abstract/ya.make @@ -11,7 +11,7 @@ SRCS( PEERDIR( ydb/core/formats/arrow - ydb/core/formats/arrow/protos + ydb/library/formats/arrow/protos ) YQL_LAST_ABI_VERSION() diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/portions/ya.make b/ydb/core/tx/columnshard/engines/storage/indexes/portions/ya.make index 076d439d54af..0ce6d8f9987f 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/portions/ya.make +++ b/ydb/core/tx/columnshard/engines/storage/indexes/portions/ya.make @@ -7,7 +7,7 @@ SRCS( PEERDIR( ydb/core/formats/arrow - ydb/core/formats/arrow/protos + ydb/library/formats/arrow/protos ydb/core/tx/columnshard/engines/storage/chunks ydb/core/tx/columnshard/engines/scheme/indexes/abstract ydb/core/tx/columnshard/engines/portions diff --git a/ydb/core/tx/columnshard/test_helper/helper.cpp b/ydb/core/tx/columnshard/test_helper/helper.cpp index 879090b7fb68..d64003c5d525 100644 --- a/ydb/core/tx/columnshard/test_helper/helper.cpp +++ b/ydb/core/tx/columnshard/test_helper/helper.cpp @@ -1,7 +1,7 @@ #include "helper.h" #include -#include +#include #include #include #include diff --git a/ydb/core/tx/program/program.h b/ydb/core/tx/program/program.h index 99d72de0a0b0..3ab18eccc9d1 100644 --- a/ydb/core/tx/program/program.h +++ b/ydb/core/tx/program/program.h @@ -2,7 +2,7 @@ #include "registry.h" #include -#include +#include 
#include #include #include diff --git a/ydb/core/tx/program/ya.make b/ydb/core/tx/program/ya.make index 4e69528543a3..d8ef7ed60696 100644 --- a/ydb/core/tx/program/ya.make +++ b/ydb/core/tx/program/ya.make @@ -8,7 +8,7 @@ SRCS( PEERDIR( ydb/core/formats/arrow ydb/core/protos - ydb/core/formats/arrow/protos + ydb/library/formats/arrow/protos ydb/core/tablet_flat ydb/library/yql/minikql/comp_nodes ydb/library/yql/core/arrow_kernels/registry diff --git a/ydb/core/formats/arrow/accessor/abstract/accessor.cpp b/ydb/library/formats/arrow/accessor/abstract/accessor.cpp similarity index 95% rename from ydb/core/formats/arrow/accessor/abstract/accessor.cpp rename to ydb/library/formats/arrow/accessor/abstract/accessor.cpp index 566bbb4e7ad2..9d4c33efa42e 100644 --- a/ydb/core/formats/arrow/accessor/abstract/accessor.cpp +++ b/ydb/library/formats/arrow/accessor/abstract/accessor.cpp @@ -1,12 +1,10 @@ #include "accessor.h" -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include diff --git a/ydb/core/formats/arrow/accessor/abstract/accessor.h b/ydb/library/formats/arrow/accessor/abstract/accessor.h similarity index 100% rename from ydb/core/formats/arrow/accessor/abstract/accessor.h rename to ydb/library/formats/arrow/accessor/abstract/accessor.h diff --git a/ydb/library/formats/arrow/accessor/abstract/ya.make b/ydb/library/formats/arrow/accessor/abstract/ya.make new file mode 100644 index 000000000000..c3ebb89dace4 --- /dev/null +++ b/ydb/library/formats/arrow/accessor/abstract/ya.make @@ -0,0 +1,14 @@ +LIBRARY() + +PEERDIR( + ydb/library/formats/arrow/protos + ydb/library/formats/arrow/accessor/common + contrib/libs/apache/arrow + ydb/library/conclusion +) + +SRCS( + accessor.cpp +) + +END() diff --git a/ydb/core/formats/arrow/accessor/common/chunk_data.cpp b/ydb/library/formats/arrow/accessor/common/chunk_data.cpp similarity index 100% rename from ydb/core/formats/arrow/accessor/common/chunk_data.cpp 
rename to ydb/library/formats/arrow/accessor/common/chunk_data.cpp diff --git a/ydb/core/formats/arrow/accessor/common/chunk_data.h b/ydb/library/formats/arrow/accessor/common/chunk_data.h similarity index 100% rename from ydb/core/formats/arrow/accessor/common/chunk_data.h rename to ydb/library/formats/arrow/accessor/common/chunk_data.h diff --git a/ydb/core/formats/arrow/accessor/common/const.cpp b/ydb/library/formats/arrow/accessor/common/const.cpp similarity index 100% rename from ydb/core/formats/arrow/accessor/common/const.cpp rename to ydb/library/formats/arrow/accessor/common/const.cpp diff --git a/ydb/core/formats/arrow/accessor/common/const.h b/ydb/library/formats/arrow/accessor/common/const.h similarity index 100% rename from ydb/core/formats/arrow/accessor/common/const.h rename to ydb/library/formats/arrow/accessor/common/const.h diff --git a/ydb/core/formats/arrow/accessor/common/ya.make b/ydb/library/formats/arrow/accessor/common/ya.make similarity index 100% rename from ydb/core/formats/arrow/accessor/common/ya.make rename to ydb/library/formats/arrow/accessor/common/ya.make diff --git a/ydb/core/formats/arrow/accessor/composite/accessor.cpp b/ydb/library/formats/arrow/accessor/composite/accessor.cpp similarity index 100% rename from ydb/core/formats/arrow/accessor/composite/accessor.cpp rename to ydb/library/formats/arrow/accessor/composite/accessor.cpp diff --git a/ydb/core/formats/arrow/accessor/composite/accessor.h b/ydb/library/formats/arrow/accessor/composite/accessor.h similarity index 97% rename from ydb/core/formats/arrow/accessor/composite/accessor.h rename to ydb/library/formats/arrow/accessor/composite/accessor.h index 9b253b265e84..a86c36025d61 100644 --- a/ydb/core/formats/arrow/accessor/composite/accessor.h +++ b/ydb/library/formats/arrow/accessor/composite/accessor.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include namespace NKikimr::NArrow::NAccessor { diff --git a/ydb/core/formats/arrow/accessor/composite/ya.make 
b/ydb/library/formats/arrow/accessor/composite/ya.make similarity index 69% rename from ydb/core/formats/arrow/accessor/composite/ya.make rename to ydb/library/formats/arrow/accessor/composite/ya.make index 828c9a8e531d..ed407a50f688 100644 --- a/ydb/core/formats/arrow/accessor/composite/ya.make +++ b/ydb/library/formats/arrow/accessor/composite/ya.make @@ -2,7 +2,7 @@ LIBRARY() PEERDIR( contrib/libs/apache/arrow - ydb/core/formats/arrow/common + ydb/library/formats/arrow/common ) SRCS( diff --git a/ydb/library/formats/arrow/accessor/ya.make b/ydb/library/formats/arrow/accessor/ya.make new file mode 100644 index 000000000000..17f8331064bf --- /dev/null +++ b/ydb/library/formats/arrow/accessor/ya.make @@ -0,0 +1,8 @@ +LIBRARY() + +PEERDIR( + ydb/library/formats/arrow/accessor/abstract + ydb/library/formats/arrow/accessor/composite +) + +END() diff --git a/ydb/library/formats/arrow/arrow_helpers.cpp b/ydb/library/formats/arrow/arrow_helpers.cpp new file mode 100644 index 000000000000..d27b18af5bc9 --- /dev/null +++ b/ydb/library/formats/arrow/arrow_helpers.cpp @@ -0,0 +1,807 @@ +#include "arrow_helpers.h" +#include "switch_type.h" +#include "common/validation.h" +#include "permutations.h" +#include "simple_arrays_cache.h" +#include "replace_key.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define Y_VERIFY_OK(status) Y_ABORT_UNLESS(status.ok(), "%s", status.ToString().c_str()) + +namespace NKikimr::NArrow { + +TString SerializeSchema(const arrow::Schema& schema) { + auto buffer = TStatusValidator::GetValid(arrow::ipc::SerializeSchema(schema)); + return buffer->ToString(); +} + +std::shared_ptr MakeEmptyBatch(const std::shared_ptr& schema, const ui32 rowsCount) { + std::vector> columns; + columns.reserve(schema->num_fields()); + + for (auto& field : schema->fields()) { + auto result = NArrow::TThreadSimpleArraysCache::GetNull(field->type(), rowsCount); + 
columns.emplace_back(result); + Y_ABORT_UNLESS(result); + } + return arrow::RecordBatch::Make(schema, rowsCount, columns); +} + +std::shared_ptr CombineBatches(const std::vector>& batches) { + if (batches.empty()) { + return nullptr; + } + auto table = TStatusValidator::GetValid(arrow::Table::FromRecordBatches(batches)); + return table ? ToBatch(table, true) : nullptr; +} + +std::shared_ptr ToBatch(const std::shared_ptr& tableExt, const bool combine) { + if (!tableExt) { + return nullptr; + } + std::shared_ptr table; + if (combine) { + auto res = tableExt->CombineChunks(); + Y_ABORT_UNLESS(res.ok()); + table = *res; + } else { + table = tableExt; + } + std::vector> columns; + columns.reserve(table->num_columns()); + for (auto& col : table->columns()) { + AFL_VERIFY(col->num_chunks() == 1)("size", col->num_chunks())("size_bytes", GetTableDataSize(tableExt)) + ("schema", tableExt->schema()->ToString())("size_new", GetTableDataSize(table)); + columns.push_back(col->chunk(0)); + } + return arrow::RecordBatch::Make(table->schema(), table->num_rows(), columns); +} + +// Check if the permutation doesn't reorder anything +bool IsTrivial(const arrow::UInt64Array& permutation, const ui64 originalLength) { + if ((ui64)permutation.length() != originalLength) { + return false; + } + for (i64 i = 0; i < permutation.length(); ++i) { + if (permutation.Value(i) != (ui64)i) { + return false; + } + } + return true; +} + +std::shared_ptr Reorder(const std::shared_ptr& batch, + const std::shared_ptr& permutation, const bool canRemove) { + Y_ABORT_UNLESS(permutation->length() == batch->num_rows() || canRemove); + + auto res = IsTrivial(*permutation, batch->num_rows()) ? 
batch : arrow::compute::Take(batch, permutation); + Y_ABORT_UNLESS(res.ok()); + return (*res).record_batch(); +} + +THashMap> ShardingSplit(const std::shared_ptr& batch, const THashMap>& shardRows) { + AFL_VERIFY(batch); + std::shared_ptr permutation; + { + arrow::UInt64Builder builder; + Y_VERIFY_OK(builder.Reserve(batch->num_rows())); + + for (auto&& [shardId, rowIdxs]: shardRows) { + for (auto& row : rowIdxs) { + Y_VERIFY_OK(builder.Append(row)); + } + } + Y_VERIFY_OK(builder.Finish(&permutation)); + } + + auto reorderedBatch = Reorder(batch, permutation, false); + + THashMap> out; + + int offset = 0; + for (auto&& [shardId, shardRowIdxs] : shardRows) { + if (shardRowIdxs.empty()) { + continue; + } + out.emplace(shardId, reorderedBatch->Slice(offset, shardRowIdxs.size())); + offset += shardRowIdxs.size(); + } + + Y_ABORT_UNLESS(offset == batch->num_rows()); + return out; +} + +std::vector> ShardingSplit(const std::shared_ptr& batch, const std::vector>& shardRows, const ui32 numShards) { + AFL_VERIFY(batch); + std::shared_ptr permutation; + { + arrow::UInt64Builder builder; + Y_VERIFY_OK(builder.Reserve(batch->num_rows())); + + for (ui32 shardNo = 0; shardNo < numShards; ++shardNo) { + for (auto& row : shardRows[shardNo]) { + Y_VERIFY_OK(builder.Append(row)); + } + } + Y_VERIFY_OK(builder.Finish(&permutation)); + } + + auto reorderedBatch = Reorder(batch, permutation, false); + + std::vector> out(numShards); + + int offset = 0; + for (ui32 shardNo = 0; shardNo < numShards; ++shardNo) { + int length = shardRows[shardNo].size(); + if (length) { + out[shardNo] = reorderedBatch->Slice(offset, length); + offset += length; + } + } + + Y_ABORT_UNLESS(offset == batch->num_rows()); + return out; +} + +std::vector> ShardingSplit(const std::shared_ptr& batch, + const std::vector& sharding, ui32 numShards) { + AFL_VERIFY(batch); + Y_ABORT_UNLESS((size_t)batch->num_rows() == sharding.size()); + + std::vector> shardRows(numShards); + for (size_t row = 0; row < sharding.size(); 
++row) { + ui32 shardNo = sharding[row]; + Y_ABORT_UNLESS(shardNo < numShards); + shardRows[shardNo].push_back(row); + } + return ShardingSplit(batch, shardRows, numShards); +} + +bool HasAllColumns(const std::shared_ptr& batch, const std::shared_ptr& schema) { + for (auto& field : schema->fields()) { + if (batch->schema()->GetFieldIndex(field->name()) < 0) { + return false; + } + } + return true; +} + +std::vector> MakeBuilders(const std::shared_ptr& schema, + size_t reserve, const std::map& sizeByColumn) { + std::vector> builders; + builders.reserve(schema->num_fields()); + + for (auto& field : schema->fields()) { + std::unique_ptr builder; + TStatusValidator::Validate(arrow::MakeBuilder(arrow::default_memory_pool(), field->type(), &builder)); + if (sizeByColumn.size()) { + auto it = sizeByColumn.find(field->name()); + if (it != sizeByColumn.end()) { + AFL_VERIFY(NArrow::ReserveData(*builder, it->second))("size", it->second)("field", field->name()); + } + } + + if (reserve) { + TStatusValidator::Validate(builder->Reserve(reserve)); + } + + builders.emplace_back(std::move(builder)); + + } + return builders; +} + +std::unique_ptr MakeBuilder(const std::shared_ptr& field) { + AFL_VERIFY(field); + return MakeBuilder(field->type()); +} + +std::unique_ptr MakeBuilder(const std::shared_ptr& type) { + AFL_VERIFY(type); + std::unique_ptr builder; + TStatusValidator::Validate(arrow::MakeBuilder(arrow::default_memory_pool(), type, &builder)); + return std::move(builder); +} + +std::vector> Finish(std::vector>&& builders) { + std::vector> out; + for (auto& builder : builders) { + std::shared_ptr array; + TStatusValidator::Validate(builder->Finish(&array)); + out.emplace_back(array); + } + return out; +} + +std::vector ColumnNames(const std::shared_ptr& schema) { + std::vector out; + out.reserve(schema->num_fields()); + for (int i = 0; i < schema->num_fields(); ++i) { + auto& name = schema->field(i)->name(); + out.emplace_back(TString(name.data(), name.size())); + } + return 
out; +} + +std::shared_ptr MakeUI64Array(ui64 value, i64 size) { + auto res = arrow::MakeArrayFromScalar(arrow::UInt64Scalar(value), size); + Y_ABORT_UNLESS(res.ok()); + return std::static_pointer_cast(*res); +} + +std::pair FindMinMaxPosition(const std::shared_ptr& array) { + if (array->length() == 0) { + return {-1, -1}; + } + + int minPos = 0; + int maxPos = 0; + SwitchType(array->type_id(), [&](const auto& type) { + using TWrap = std::decay_t; + using TArray = typename arrow::TypeTraits::ArrayType; + + auto& column = static_cast(*array); + + for (int i = 1; i < column.length(); ++i) { + const auto& value = column.GetView(i); + if (value < column.GetView(minPos)) { + minPos = i; + } + if (value > column.GetView(maxPos)) { + maxPos = i; + } + } + return true; + }); + return {minPos, maxPos}; +} + +std::shared_ptr MinScalar(const std::shared_ptr& type) { + std::shared_ptr out; + SwitchType(type->id(), [&](const auto& t) { + using TWrap = std::decay_t; + using T = typename TWrap::T; + using TScalar = typename arrow::TypeTraits::ScalarType; + + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + out = std::make_shared(arrow::Buffer::FromString(""), type); + } else if constexpr (std::is_same_v) { + std::string s(static_cast(*type).byte_width(), '\0'); + out = std::make_shared(arrow::Buffer::FromString(s), type); + } else if constexpr (std::is_same_v) { + return false; + } else if constexpr (arrow::is_temporal_type::value) { + using TCType = typename arrow::TypeTraits::CType; + out = std::make_shared(Min(), type); + } else if constexpr (arrow::has_c_type::value) { + using TCType = typename arrow::TypeTraits::CType; + out = std::make_shared(Min()); + } else { + return false; + } + return true; + }); + Y_ABORT_UNLESS(out); + return out; +} + +namespace { + +template +class TDefaultScalarValue { +public: + static constexpr T Value = 0; +}; + +template <> +class TDefaultScalarValue { +public: + static constexpr bool Value = false; 
+}; + +} + +std::shared_ptr DefaultScalar(const std::shared_ptr& type) { + std::shared_ptr out; + SwitchType(type->id(), [&](const auto& t) { + using TWrap = std::decay_t; + using T = typename TWrap::T; + using TScalar = typename arrow::TypeTraits::ScalarType; + + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + out = std::make_shared(arrow::Buffer::FromString(""), type); + } else if constexpr (std::is_same_v) { + std::string s(static_cast(*type).byte_width(), '\0'); + out = std::make_shared(arrow::Buffer::FromString(s), type); + } else if constexpr (std::is_same_v) { + return false; + } else if constexpr (arrow::is_temporal_type::value) { + using TCType = typename arrow::TypeTraits::CType; + out = std::make_shared(TDefaultScalarValue::Value, type); + } else if constexpr (arrow::has_c_type::value) { + using TCType = typename arrow::TypeTraits::CType; + out = std::make_shared(TDefaultScalarValue::Value); + } else { + return false; + } + return true; + }); + AFL_VERIFY(out)("type", type->ToString()); + return out; +} + +std::shared_ptr GetScalar(const std::shared_ptr& array, int position) { + auto res = array->GetScalar(position); + Y_ABORT_UNLESS(res.ok()); + return *res; +} + +bool IsGoodScalar(const std::shared_ptr& x) { + if (!x) { + return false; + } + + return SwitchType(x->type->id(), [&](const auto& type) { + using TWrap = std::decay_t; + using TScalar = typename arrow::TypeTraits::ScalarType; + using TValue = std::decay_t(*x).value)>; + + if constexpr (arrow::has_string_view()) { + const auto& xval = static_cast(*x).value; + return xval && xval->data(); + } + if constexpr (std::is_arithmetic_v) { + return true; + } + return false; + }); +} + +bool ScalarLess(const std::shared_ptr& x, const std::shared_ptr& y) { + Y_ABORT_UNLESS(x); + Y_ABORT_UNLESS(y); + return ScalarLess(*x, *y); +} + +bool ScalarLess(const arrow::Scalar& x, const arrow::Scalar& y) { + return ScalarCompare(x, y) < 0; +} + +bool 
ColumnEqualsScalar( + const std::shared_ptr& c, const ui32 position, const std::shared_ptr& s) { + AFL_VERIFY(c); + if (!s) { + return c->IsNull(position) ; + } + AFL_VERIFY(c->type()->Equals(s->type))("s", s->type->ToString())("c", c->type()->ToString()); + + return SwitchTypeImpl(c->type()->id(), [&](const auto& type) { + using TWrap = std::decay_t; + using TScalar = typename arrow::TypeTraits::ScalarType; + using TArrayType = typename arrow::TypeTraits::ArrayType; + using TValue = std::decay_t(*s).value)>; + + if constexpr (arrow::has_string_view()) { + const auto& cval = static_cast(*c).GetView(position); + const auto& sval = static_cast(*s).value; + AFL_VERIFY(sval); + TStringBuf cBuf(reinterpret_cast(cval.data()), cval.size()); + TStringBuf sBuf(reinterpret_cast(sval->data()), sval->size()); + return cBuf == sBuf; + } + if constexpr (std::is_arithmetic_v) { + const auto cval = static_cast(*c).GetView(position); + const auto sval = static_cast(*s).value; + return (cval == sval); + } + Y_ABORT_UNLESS(false); // TODO: non primitive types + return false; + }); +} + +int ScalarCompare(const arrow::Scalar& x, const arrow::Scalar& y) { + Y_VERIFY_S(x.type->Equals(y.type), x.type->ToString() + " vs " + y.type->ToString()); + + return SwitchTypeImpl(x.type->id(), [&](const auto& type) { + using TWrap = std::decay_t; + using TScalar = typename arrow::TypeTraits::ScalarType; + using TValue = std::decay_t(x).value)>; + + if constexpr (arrow::has_string_view()) { + const auto& xval = static_cast(x).value; + const auto& yval = static_cast(y).value; + Y_ABORT_UNLESS(xval); + Y_ABORT_UNLESS(yval); + TStringBuf xBuf(reinterpret_cast(xval->data()), xval->size()); + TStringBuf yBuf(reinterpret_cast(yval->data()), yval->size()); + if (xBuf < yBuf) { + return -1; + } else if (yBuf < xBuf) { + return 1; + } else { + return 0; + } + } + if constexpr (std::is_arithmetic_v) { + const auto& xval = static_cast(x).value; + const auto& yval = static_cast(y).value; + if (xval < yval) { + 
return -1; + } else if (yval < xval) { + return 1; + } else { + return 0; + } + } + Y_ABORT_UNLESS(false); // TODO: non primitive types + return 0; + }); +} + +int ScalarCompare(const std::shared_ptr& x, const std::shared_ptr& y) { + Y_ABORT_UNLESS(x); + Y_ABORT_UNLESS(y); + return ScalarCompare(*x, *y); +} + +int ScalarCompareNullable(const std::shared_ptr& x, const std::shared_ptr& y) { + if (!x && !!y) { + return -1; + } + if (!!x && !y) { + return 1; + } + if (!x && !y) { + return 0; + } + return ScalarCompare(*x, *y); +} + +std::shared_ptr BoolVecToArray(const std::vector& vec) { + std::shared_ptr out; + arrow::BooleanBuilder builder; + for (const auto val : vec) { + Y_ABORT_UNLESS(builder.Append(val).ok()); + } + Y_ABORT_UNLESS(builder.Finish(&out).ok()); + return out; +} + + +bool ArrayScalarsEqual(const std::shared_ptr& lhs, const std::shared_ptr& rhs) { + bool res = lhs->length() == rhs->length(); + for (int64_t i = 0; i < lhs->length() && res; ++i) { + res &= arrow::ScalarEquals(*lhs->GetScalar(i).ValueOrDie(), *rhs->GetScalar(i).ValueOrDie()); + } + return res; +} + +bool ReserveData(arrow::ArrayBuilder& builder, const size_t size) { + arrow::Status result = arrow::Status::OK(); + if (builder.type()->id() == arrow::Type::BINARY || + builder.type()->id() == arrow::Type::STRING) + { + static_assert(std::is_convertible_v&>, + "Expected StringBuilder to be BaseBinaryBuilder"); + auto& bBuilder = static_cast&>(builder); + result = bBuilder.ReserveData(size); + } + + if (!result.ok()) { + AFL_ERROR(NKikimrServices::ARROW_HELPER)("event", "ReserveData")("error", result.ToString()); + } + return result.ok(); +} + +template +bool MergeBatchColumnsImpl(const std::vector>& batches, std::shared_ptr& result, + const std::vector& columnsOrder, const bool orderFieldsAreNecessary, const TBuilder& builder) { + if (batches.empty()) { + result = nullptr; + return true; + } + if (batches.size() == 1) { + result = batches.front(); + return true; + } + std::vector> fields; + 
std::vector> columns; + std::map fieldNames; + for (auto&& i : batches) { + Y_ABORT_UNLESS(i); + for (auto&& f : i->schema()->fields()) { + if (!fieldNames.emplace(f->name(), fields.size()).second) { + AFL_ERROR(NKikimrServices::ARROW_HELPER)("event", "duplicated column")("name", f->name()); + return false; + } + fields.emplace_back(f); + } + if (i->num_rows() != batches.front()->num_rows()) { + AFL_ERROR(NKikimrServices::ARROW_HELPER)("event", "inconsistency record sizes")("i", i->num_rows())("front", batches.front()->num_rows()); + return false; + } + for (auto&& c : i->columns()) { + columns.emplace_back(c); + } + } + + Y_ABORT_UNLESS(fields.size() == columns.size()); + if (columnsOrder.size()) { + std::vector> fieldsOrdered; + std::vector> columnsOrdered; + for (auto&& i : columnsOrder) { + auto it = fieldNames.find(i); + if (orderFieldsAreNecessary) { + Y_ABORT_UNLESS(it != fieldNames.end()); + } else if (it == fieldNames.end()) { + continue; + } + fieldsOrdered.emplace_back(fields[it->second]); + columnsOrdered.emplace_back(columns[it->second]); + } + std::swap(fieldsOrdered, fields); + std::swap(columnsOrdered, columns); + } + result = builder(std::make_shared(fields), batches.front()->num_rows(), std::move(columns)); + return true; +} + +bool MergeBatchColumns(const std::vector>& batches, std::shared_ptr& result, const std::vector& columnsOrder, const bool orderFieldsAreNecessary) { + const auto builder = [](const std::shared_ptr& schema, const ui32 recordsCount, std::vector>&& columns) { + return arrow::Table::Make(schema, columns, recordsCount); + }; + + return MergeBatchColumnsImpl(batches, result, columnsOrder, orderFieldsAreNecessary, builder); +} + +bool MergeBatchColumns(const std::vector>& batches, std::shared_ptr& result, const std::vector& columnsOrder, const bool orderFieldsAreNecessary) { + const auto builder = [](const std::shared_ptr& schema, const ui32 recordsCount, std::vector>&& columns) { + return arrow::RecordBatch::Make(schema, 
recordsCount, columns); + }; + + return MergeBatchColumnsImpl(batches, result, columnsOrder, orderFieldsAreNecessary, builder); +} + +std::partial_ordering ColumnsCompare(const std::vector>& x, const ui32 xRow, const std::vector>& y, const ui32 yRow) { + return TRawReplaceKey(&x, xRow).CompareNotNull(TRawReplaceKey(&y, yRow)); +} + +NJson::TJsonValue DebugJson(std::shared_ptr array, const ui32 position) { + NJson::TJsonValue result = NJson::JSON_ARRAY; + for (auto&& i : array->columns()) { + result.AppendValue(DebugJson(i, position)); + } + return result; +} + +TString DebugString(std::shared_ptr array, const ui32 position) { + if (!array) { + return "_NO_DATA"; + } + Y_ABORT_UNLESS(position < array->length()); + TStringBuilder result; + SwitchType(array->type_id(), [&](const auto& type) { + using TWrap = std::decay_t; + using TArray = typename arrow::TypeTraits::ArrayType; + + auto& column = static_cast(*array); + if constexpr (arrow::has_string_view()) { + auto value = column.GetString(position); + result << TString(value.data(), value.size()); + } + if constexpr (arrow::has_c_type()) { + result << column.Value(position); + } + return true; + }); + return result; +} + +NJson::TJsonValue DebugJson(std::shared_ptr array, const ui32 position) { + if (!array) { + return NJson::JSON_NULL; + } + Y_ABORT_UNLESS(position < array->length()); + NJson::TJsonValue result = NJson::JSON_MAP; + SwitchType(array->type_id(), [&](const auto& type) { + using TWrap = std::decay_t; + using TArray = typename arrow::TypeTraits::ArrayType; + + auto& column = static_cast(*array); + result.InsertValue("type", typeid(TArray).name()); + if constexpr (arrow::has_string_view()) { + auto value = column.GetString(position); + result.InsertValue("value", TString(value.data(), value.size())); + } + if constexpr (arrow::has_c_type()) { + result.InsertValue("value", column.Value(position)); + } + return true; + }); + return result; +} + +NJson::TJsonValue DebugJson(std::shared_ptr array, const ui32 
head, const ui32 tail) { + if (!array) { + return NJson::JSON_NULL; + } + NJson::TJsonValue resultFull = NJson::JSON_MAP; + resultFull.InsertValue("length", array->length()); + SwitchType(array->type_id(), [&](const auto& type) { + using TWrap = std::decay_t; + using TArray = typename arrow::TypeTraits::ArrayType; + + auto& column = static_cast(*array); + resultFull.InsertValue("type", typeid(TArray).name()); + resultFull.InsertValue("head", head); + resultFull.InsertValue("tail", tail); + auto& result = resultFull.InsertValue("data", NJson::JSON_ARRAY); + for (int i = 0; i < column.length(); ++i) { + if (i >= (int)head && i + (int)tail < column.length()) { + continue; + } + if constexpr (arrow::has_string_view()) { + auto value = column.GetString(i); + result.AppendValue(TString(value.data(), value.size())); + } + if constexpr (arrow::has_c_type()) { + result.AppendValue(column.Value(i)); + } + } + return true; + }); + return resultFull; +} + +NJson::TJsonValue DebugJson(std::shared_ptr batch, const ui32 head, const ui32 tail) { + if (!batch) { + return NJson::JSON_NULL; + } + NJson::TJsonValue result = NJson::JSON_ARRAY; + ui32 idx = 0; + for (auto&& i : batch->columns()) { + auto& jsonColumn = result.AppendValue(NJson::JSON_MAP); + jsonColumn.InsertValue("name", batch->column_name(idx)); + jsonColumn.InsertValue("data", DebugJson(i, head, tail)); + ++idx; + } + return result; +} + +std::shared_ptr MergeColumns(const std::vector>& batches) { + std::vector> columns; + std::vector> fields; + std::optional recordsCount; + std::set columnNames; + for (auto&& batch : batches) { + if (!batch) { + continue; + } + for (auto&& column : batch->columns()) { + columns.emplace_back(column); + if (!recordsCount) { + recordsCount = column->length(); + } else { + Y_ABORT_UNLESS(*recordsCount == column->length()); + } + } + for (auto&& field : batch->schema()->fields()) { + AFL_VERIFY(columnNames.emplace(field->name()).second)("field_name", field->name()); + 
fields.emplace_back(field); + } + } + if (columns.empty()) { + return nullptr; + } + auto schema = std::make_shared(fields); + return arrow::RecordBatch::Make(schema, *recordsCount, columns); +} + +std::vector> SliceToRecordBatches(const std::shared_ptr& t) { + if (!t->num_rows()) { + return {}; + } + std::vector positions; + { + for (auto&& i : t->columns()) { + ui32 pos = 0; + for (auto&& arr : i->chunks()) { + positions.emplace_back(pos); + pos += arr->length(); + } + AFL_VERIFY(pos == t->num_rows()); + } + positions.emplace_back(t->num_rows()); + } + std::sort(positions.begin(), positions.end()); + positions.erase(std::unique(positions.begin(), positions.end()), positions.end()); + AFL_VERIFY(positions.size() > 1)("size", positions.size())("positions", JoinSeq(",", positions)); + std::vector>> slicedData; + slicedData.resize(positions.size() - 1); + for (auto&& i : t->columns()) { + ui32 currentPosition = 0; + auto it = i->chunks().begin(); + ui32 length = 0; + const auto initializeIt = [&length, &it, &i]() { + for (; it != i->chunks().end() && !(*it)->length(); ++it) { + } + if (it != i->chunks().end()) { + length = (*it)->length(); + } + }; + initializeIt(); + for (ui32 idx = 0; idx + 1 < positions.size(); ++idx) { + AFL_VERIFY(it != i->chunks().end()); + AFL_VERIFY(positions[idx + 1] - currentPosition <= length)("length", length)("idx+1", positions[idx + 1])("pos", currentPosition); + auto chunk = (*it)->Slice(positions[idx] - currentPosition, positions[idx + 1] - positions[idx]); + AFL_VERIFY_DEBUG(chunk->length() == positions[idx + 1] - positions[idx])("length", chunk->length())("expect", positions[idx + 1] - positions[idx]); + if (positions[idx + 1] - currentPosition == length) { + ++it; + initializeIt(); + currentPosition = positions[idx + 1]; + } + slicedData[idx].emplace_back(chunk); + } + } + std::vector> result; + ui32 count = 0; + for (auto&& i : slicedData) { + AFL_VERIFY(i.size()); + AFL_VERIFY(i.front()->length()); + 
result.emplace_back(arrow::RecordBatch::Make(t->schema(), i.front()->length(), i)); + count += result.back()->num_rows(); + } + AFL_VERIFY(count == t->num_rows())("count", count)("t", t->num_rows())("sd_size", slicedData.size())("columns", t->num_columns())( + "schema", t->schema()->ToString()); + return result; +} + +std::shared_ptr ToTable(const std::shared_ptr& batch) { + if (!batch) { + return nullptr; + } + return TStatusValidator::GetValid(arrow::Table::FromRecordBatches(batch->schema(), {batch})); +} + +bool HasNulls(const std::shared_ptr& column) { + AFL_VERIFY(column); + return column->null_bitmap_data(); +} + +std::vector ConvertStrings(const std::vector& input) { + std::vector result; + for (auto&& i : input) { + result.emplace_back(i); + } + return result; +} + +std::vector ConvertStrings(const std::vector& input) { + std::vector result; + for (auto&& i : input) { + result.emplace_back(i); + } + return result; +} + +} diff --git a/ydb/library/formats/arrow/arrow_helpers.h b/ydb/library/formats/arrow/arrow_helpers.h new file mode 100644 index 000000000000..8bceee2d836e --- /dev/null +++ b/ydb/library/formats/arrow/arrow_helpers.h @@ -0,0 +1,101 @@ +#pragma once +#include "switch_type.h" +#include +#include +#include +#include +#include +#include +#include + +namespace NKikimr::NArrow { + +using TArrayVec = std::vector>; + +template +inline bool ArrayEqualValue(const std::shared_ptr& x, const std::shared_ptr& y) { + auto& arrX = static_cast(*x); + auto& arrY = static_cast(*y); + for (int i = 0; i < x->length(); ++i) { + if (arrX.Value(i) != arrY.Value(i)) { + return false; + } + } + return true; +} + +template +inline bool ArrayEqualView(const std::shared_ptr& x, const std::shared_ptr& y) { + auto& arrX = static_cast(*x); + auto& arrY = static_cast(*y); + for (int i = 0; i < x->length(); ++i) { + if (arrX.GetView(i) != arrY.GetView(i)) { + return false; + } + } + return true; +} + +struct TSortDescription; + +TString SerializeSchema(const arrow::Schema& 
schema); + +std::shared_ptr MakeEmptyBatch(const std::shared_ptr& schema, const ui32 rowsCount = 0); +std::shared_ptr ToTable(const std::shared_ptr& batch); + +std::shared_ptr ToBatch(const std::shared_ptr& combinedTable, const bool combine); +std::shared_ptr CombineBatches(const std::vector>& batches); +std::shared_ptr MergeColumns(const std::vector>& rb); +std::vector> ShardingSplit(const std::shared_ptr& batch, const std::vector& sharding, ui32 numShards); +std::vector> ShardingSplit(const std::shared_ptr& batch, const std::vector>& shardRows, const ui32 numShards); +THashMap> ShardingSplit(const std::shared_ptr& batch, const THashMap>& shardRows); + +std::unique_ptr MakeBuilder(const std::shared_ptr& field); +std::unique_ptr MakeBuilder(const std::shared_ptr& type); + +std::vector> MakeBuilders(const std::shared_ptr& schema, + size_t reserve = 0, const std::map& sizeByColumn = {}); +std::vector> Finish(std::vector>&& builders); + +std::shared_ptr MakeUI64Array(ui64 value, i64 size); +std::vector ColumnNames(const std::shared_ptr& schema); +bool ReserveData(arrow::ArrayBuilder& builder, const size_t size); +bool MergeBatchColumns(const std::vector>& batches, std::shared_ptr& result, const std::vector& columnsOrder = {}, const bool orderFieldsAreNecessary = true); +bool MergeBatchColumns(const std::vector>& batches, std::shared_ptr& result, const std::vector& columnsOrder = {}, const bool orderFieldsAreNecessary = true); + +bool HasAllColumns(const std::shared_ptr& batch, const std::shared_ptr& schema); + +std::pair FindMinMaxPosition(const std::shared_ptr& column); + +std::shared_ptr DefaultScalar(const std::shared_ptr& type); +std::shared_ptr MinScalar(const std::shared_ptr& type); +std::shared_ptr GetScalar(const std::shared_ptr& array, int position); +bool IsGoodScalar(const std::shared_ptr& x); +int ScalarCompare(const arrow::Scalar& x, const arrow::Scalar& y); +int ScalarCompare(const std::shared_ptr& x, const std::shared_ptr& y); +int 
ScalarCompareNullable(const std::shared_ptr& x, const std::shared_ptr& y); +std::partial_ordering ColumnsCompare( + const std::vector>& x, const ui32 xRow, const std::vector>& y, const ui32 yRow); +bool ColumnEqualsScalar( + const std::shared_ptr& c, const ui32 position, const std::shared_ptr& s); +bool ScalarLess(const std::shared_ptr& x, const std::shared_ptr& y); +bool ScalarLess(const arrow::Scalar& x, const arrow::Scalar& y); + +bool HasNulls(const std::shared_ptr& column); + +std::vector> SliceToRecordBatches(const std::shared_ptr& t); + +bool ArrayScalarsEqual(const std::shared_ptr& lhs, const std::shared_ptr& rhs); +std::shared_ptr BoolVecToArray(const std::vector& vec); + +NJson::TJsonValue DebugJson(std::shared_ptr array, const ui32 head, const ui32 tail); +NJson::TJsonValue DebugJson(std::shared_ptr batch, const ui32 head, const ui32 tail); + +NJson::TJsonValue DebugJson(std::shared_ptr array, const ui32 position); +TString DebugString(std::shared_ptr array, const ui32 position); +NJson::TJsonValue DebugJson(std::shared_ptr array, const ui32 position); + +std::shared_ptr Reorder(const std::shared_ptr& batch, + const std::shared_ptr& permutation, const bool canRemove); + +} diff --git a/ydb/library/formats/arrow/common/validation.h b/ydb/library/formats/arrow/common/validation.h new file mode 100644 index 000000000000..171b50041db4 --- /dev/null +++ b/ydb/library/formats/arrow/common/validation.h @@ -0,0 +1,3 @@ +#pragma once + +#include diff --git a/ydb/core/formats/arrow/common/vector_operations.h b/ydb/library/formats/arrow/common/vector_operations.h similarity index 100% rename from ydb/core/formats/arrow/common/vector_operations.h rename to ydb/library/formats/arrow/common/vector_operations.h diff --git a/ydb/library/formats/arrow/common/ya.make b/ydb/library/formats/arrow/common/ya.make new file mode 100644 index 000000000000..d2fa92cfc9f2 --- /dev/null +++ b/ydb/library/formats/arrow/common/ya.make @@ -0,0 +1,12 @@ +LIBRARY() + +PEERDIR( + 
contrib/libs/apache/arrow + ydb/library/formats/arrow/switch + ydb/library/actors/core + ydb/library/conclusion + ydb/library/formats/arrow/splitter + ydb/library/formats/arrow/validation +) + +END() diff --git a/ydb/core/formats/arrow/hash/xx_hash.cpp b/ydb/library/formats/arrow/hash/xx_hash.cpp similarity index 100% rename from ydb/core/formats/arrow/hash/xx_hash.cpp rename to ydb/library/formats/arrow/hash/xx_hash.cpp diff --git a/ydb/core/formats/arrow/hash/xx_hash.h b/ydb/library/formats/arrow/hash/xx_hash.h similarity index 100% rename from ydb/core/formats/arrow/hash/xx_hash.h rename to ydb/library/formats/arrow/hash/xx_hash.h diff --git a/ydb/library/formats/arrow/hash/ya.make b/ydb/library/formats/arrow/hash/ya.make new file mode 100644 index 000000000000..f9a20aba9be8 --- /dev/null +++ b/ydb/library/formats/arrow/hash/ya.make @@ -0,0 +1,17 @@ +LIBRARY() + +PEERDIR( + contrib/libs/apache/arrow + ydb/library/formats/arrow/simple_builder + ydb/library/formats/arrow/switch + ydb/library/actors/core + ydb/library/services + ydb/library/actors/protos +) + +SRCS( + xx_hash.cpp +) + +END() + diff --git a/ydb/core/formats/arrow/input_stream.h b/ydb/library/formats/arrow/input_stream.h similarity index 100% rename from ydb/core/formats/arrow/input_stream.h rename to ydb/library/formats/arrow/input_stream.h diff --git a/ydb/core/formats/arrow/modifier/schema.cpp b/ydb/library/formats/arrow/modifier/schema.cpp similarity index 97% rename from ydb/core/formats/arrow/modifier/schema.cpp rename to ydb/library/formats/arrow/modifier/schema.cpp index 728eff839592..87b7de842c2c 100644 --- a/ydb/core/formats/arrow/modifier/schema.cpp +++ b/ydb/library/formats/arrow/modifier/schema.cpp @@ -1,6 +1,6 @@ #include "schema.h" #include -#include +#include #include namespace NKikimr::NArrow::NModifier { diff --git a/ydb/core/formats/arrow/modifier/schema.h b/ydb/library/formats/arrow/modifier/schema.h similarity index 100% rename from ydb/core/formats/arrow/modifier/schema.h rename 
to ydb/library/formats/arrow/modifier/schema.h diff --git a/ydb/core/formats/arrow/modifier/subset.cpp b/ydb/library/formats/arrow/modifier/subset.cpp similarity index 100% rename from ydb/core/formats/arrow/modifier/subset.cpp rename to ydb/library/formats/arrow/modifier/subset.cpp diff --git a/ydb/core/formats/arrow/modifier/subset.h b/ydb/library/formats/arrow/modifier/subset.h similarity index 96% rename from ydb/core/formats/arrow/modifier/subset.h rename to ydb/library/formats/arrow/modifier/subset.h index fc15d44e4fb8..23430af5524f 100644 --- a/ydb/core/formats/arrow/modifier/subset.h +++ b/ydb/library/formats/arrow/modifier/subset.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include #include diff --git a/ydb/core/formats/arrow/modifier/ya.make b/ydb/library/formats/arrow/modifier/ya.make similarity index 67% rename from ydb/core/formats/arrow/modifier/ya.make rename to ydb/library/formats/arrow/modifier/ya.make index 4b2b53f5513a..4a509475a173 100644 --- a/ydb/core/formats/arrow/modifier/ya.make +++ b/ydb/library/formats/arrow/modifier/ya.make @@ -3,8 +3,8 @@ LIBRARY() PEERDIR( contrib/libs/apache/arrow ydb/library/conclusion - ydb/core/formats/arrow/switch - ydb/core/formats/arrow/protos + ydb/library/formats/arrow/switch + ydb/library/formats/arrow/protos ydb/library/actors/core ) diff --git a/ydb/library/formats/arrow/permutations.cpp b/ydb/library/formats/arrow/permutations.cpp new file mode 100644 index 000000000000..8eb270c2a424 --- /dev/null +++ b/ydb/library/formats/arrow/permutations.cpp @@ -0,0 +1,214 @@ +#include "permutations.h" + +#include "arrow_helpers.h" +#include "replace_key.h" +#include "size_calcer.h" + +#include +#include + +#include + +#include +#include + +namespace NKikimr::NArrow { + +std::shared_ptr MakePermutation(const int size, const bool reverse) { + arrow::UInt64Builder builder; + TStatusValidator::Validate(builder.Reserve(size)); + + if (size) { + if (reverse) { + ui64 value = size - 1; + for (i64 i = 0; i < size; ++i, 
--value) { + TStatusValidator::Validate(builder.Append(value)); + } + } else { + for (i64 i = 0; i < size; ++i) { + TStatusValidator::Validate(builder.Append(i)); + } + } + } + + std::shared_ptr out; + TStatusValidator::Validate(builder.Finish(&out)); + return out; +} + +template +std::shared_ptr MakeFilterPermutationImpl(const std::vector& indexes) { + if (indexes.empty()) { + return {}; + } + + arrow::UInt64Builder builder; + if (!builder.Reserve(indexes.size()).ok()) { + return {}; + } + + for (auto&& i : indexes) { + TStatusValidator::Validate(builder.Append(i)); + } + std::shared_ptr out; + TStatusValidator::Validate(builder.Finish(&out)); + return out; +} + +std::shared_ptr MakeFilterPermutation(const std::vector& indexes) { + return MakeFilterPermutationImpl(indexes); +} + +std::shared_ptr MakeFilterPermutation(const std::vector& indexes) { + return MakeFilterPermutationImpl(indexes); +} + +std::shared_ptr CopyRecords(const std::shared_ptr& source, const std::vector& indexes) { + Y_ABORT_UNLESS(!!source); + auto schema = source->schema(); + std::vector> columns; + for (auto&& i : source->columns()) { + columns.emplace_back(CopyRecords(i, indexes)); + } + return arrow::RecordBatch::Make(schema, indexes.size(), columns); +} + +std::shared_ptr CopyRecords(const std::shared_ptr& source, const std::vector& indexes) { + if (!source) { + return source; + } + std::shared_ptr result; + SwitchType(source->type_id(), [&](const auto& type) { + using TWrap = std::decay_t; + using TArray = typename arrow::TypeTraits::ArrayType; + using TBuilder = typename arrow::TypeTraits::BuilderType; + auto& column = static_cast(*source); + + std::unique_ptr builder; + TStatusValidator::Validate(arrow::MakeBuilder(arrow::default_memory_pool(), source->type(), &builder)); + auto& builderImpl = static_cast(*builder); + + if constexpr (arrow::has_string_view::value) { + ui64 sumByIndexes = 0; + for (auto&& idx : indexes) { + Y_ABORT_UNLESS(idx < (ui64)column.length()); + sumByIndexes += 
column.GetView(idx).size(); + } + TStatusValidator::Validate(builderImpl.ReserveData(sumByIndexes)); + } + + TStatusValidator::Validate(builder->Reserve(indexes.size())); + + { + const ui32 arraySize = column.length(); + for (auto&& i : indexes) { + Y_ABORT_UNLESS(i < arraySize); + builderImpl.UnsafeAppend(column.GetView(i)); + } + } + + TStatusValidator::Validate(builder->Finish(&result)); + return true; + }); + Y_ABORT_UNLESS(result); + return result; +} + +ui64 TShardedRecordBatch::GetMemorySize() const { + return NArrow::GetTableMemorySize(RecordBatch); +} + +TShardedRecordBatch::TShardedRecordBatch(const std::shared_ptr& batch) { + AFL_VERIFY(batch); + RecordBatch = TStatusValidator::GetValid(arrow::Table::FromRecordBatches(batch->schema(), {batch})); +} + + +TShardedRecordBatch::TShardedRecordBatch(const std::shared_ptr& batch) + : RecordBatch(batch) +{ + AFL_VERIFY(RecordBatch); +} + +TShardedRecordBatch::TShardedRecordBatch(const std::shared_ptr& batch, std::vector>&& splittedByShards) + : RecordBatch(batch) + , SplittedByShards(std::move(splittedByShards)) +{ + AFL_VERIFY(RecordBatch); + AFL_VERIFY(SplittedByShards.size()); +} + +std::vector> TShardingSplitIndex::Apply(const std::shared_ptr& input) { + AFL_VERIFY(input); + AFL_VERIFY(input->num_rows() == RecordsCount); + auto permutation = BuildPermutation(); + auto resultBatch = NArrow::TStatusValidator::GetValid(arrow::compute::Take(input, *permutation)).table(); + AFL_VERIFY(resultBatch->num_rows() == RecordsCount); + std::vector> result; + ui64 startIndex = 0; + for (auto&& i : Remapping) { + result.emplace_back(resultBatch->Slice(startIndex, i.size())); + startIndex += i.size(); + } + AFL_VERIFY(startIndex == RecordsCount); + return result; +} + +NKikimr::NArrow::TShardedRecordBatch TShardingSplitIndex::Apply(const ui32 shardsCount, const std::shared_ptr& input, const std::string& hashColumnName) { + AFL_VERIFY(input); + if (shardsCount == 1) { + return TShardedRecordBatch(input); + } + auto 
hashColumn = input->GetColumnByName(hashColumnName); + if (!hashColumn) { + return TShardedRecordBatch(input); + } + std::optional splitter; + if (hashColumn->type()->id() == arrow::Type::UINT64) { + splitter = TShardingSplitIndex::Build(shardsCount, *hashColumn); + } else if (hashColumn->type()->id() == arrow::Type::UINT32) { + splitter = TShardingSplitIndex::Build(shardsCount, *hashColumn); + } else if (hashColumn->type()->id() == arrow::Type::INT64) { + splitter = TShardingSplitIndex::Build(shardsCount, *hashColumn); + } else if (hashColumn->type()->id() == arrow::Type::INT32) { + splitter = TShardingSplitIndex::Build(shardsCount, *hashColumn); + } else { + Y_ABORT_UNLESS(false); + } + auto resultBatch = NArrow::TStatusValidator::GetValid(input->RemoveColumn(input->schema()->GetFieldIndex(hashColumnName))); + return TShardedRecordBatch(resultBatch, splitter->DetachRemapping()); +} + +TShardedRecordBatch TShardingSplitIndex::Apply(const ui32 shardsCount, const std::shared_ptr& input, const std::string& hashColumnName) { + return Apply(shardsCount, TStatusValidator::GetValid(arrow::Table::FromRecordBatches(input->schema(), {input})) + , hashColumnName); +} + +std::shared_ptr TShardingSplitIndex::BuildPermutation() const { + arrow::UInt64Builder builder; + Y_ABORT_UNLESS(builder.Reserve(RecordsCount).ok()); + + for (auto&& i : Remapping) { + for (auto&& idx : i) { + TStatusValidator::Validate(builder.Append(idx)); + } + } + + std::shared_ptr out; + Y_ABORT_UNLESS(builder.Finish(&out).ok()); + return out; +} + +std::shared_ptr ReverseRecords(const std::shared_ptr& batch) { + AFL_VERIFY(batch); + auto permutation = NArrow::MakePermutation(batch->num_rows(), true); + return NArrow::TStatusValidator::GetValid(arrow::compute::Take(batch, permutation)).record_batch(); +} + +std::shared_ptr ReverseRecords(const std::shared_ptr& batch) { + AFL_VERIFY(batch); + auto permutation = NArrow::MakePermutation(batch->num_rows(), true); + return 
NArrow::TStatusValidator::GetValid(arrow::compute::Take(batch, permutation)).table(); +} + +} diff --git a/ydb/library/formats/arrow/permutations.h b/ydb/library/formats/arrow/permutations.h new file mode 100644 index 000000000000..48dca1fa427d --- /dev/null +++ b/ydb/library/formats/arrow/permutations.h @@ -0,0 +1,149 @@ +#pragma once +#include "arrow_helpers.h" + +#include +#include +#include +#include + +namespace NKikimr::NArrow { + +class TShardedRecordBatch { +private: + YDB_READONLY_DEF(std::shared_ptr, RecordBatch); + YDB_READONLY_DEF(std::vector>, SplittedByShards); +public: + TShardedRecordBatch(const std::shared_ptr& batch); + TShardedRecordBatch(const std::shared_ptr& batch); + + void Cut(const ui32 limit) { + RecordBatch = RecordBatch->Slice(0, limit); + for (auto&& i : SplittedByShards) { + auto it = std::lower_bound(i.begin(), i.end(), limit); + if (it != i.end()) { + i.erase(it, i.end()); + } + } + } + + bool IsSharded() const { + return SplittedByShards.size() > 1; + } + + TShardedRecordBatch(const std::shared_ptr& batch, std::vector>&& splittedByShards); + + ui64 GetMemorySize() const; + + ui64 GetRecordsCount() const { + return RecordBatch->num_rows(); + } +}; + +class TShardingSplitIndex { +private: + ui32 ShardsCount = 0; + std::vector> Remapping; + ui32 RecordsCount = 0; + + template + std::vector MergeLists(const std::vector& base, const TIterator itFrom, const TIterator itTo) { + std::vector result; + result.reserve(base.size() + (itTo - itFrom)); + auto itBase = base.begin(); + auto itExt = itFrom; + while (itBase != base.end() && itExt != itTo) { + if (*itBase < *itExt) { + result.emplace_back(*itBase); + ++itBase; + } else { + result.emplace_back(*itExt); + ++itExt; + } + } + if (itBase == base.end()) { + result.insert(result.end(), itExt, itTo); + } else if (itExt == itTo) { + result.insert(result.end(), itBase, base.end()); + } + return result; + } + + template + void Initialize(const arrow::ChunkedArray& arrowHashArrayChunked) { + 
Y_ABORT_UNLESS(ShardsCount); + Remapping.resize(ShardsCount); + const ui32 expectation = arrowHashArrayChunked.length() / ShardsCount + 1; + for (auto&& i : Remapping) { + i.reserve(2 * expectation); + } + for (auto&& arrowHashArrayAbstract : arrowHashArrayChunked.chunks()) { + auto& arrowHashArray = static_cast(*arrowHashArrayAbstract); + ui64 offset = 0; + for (ui64 i = 0; i < (ui64)arrowHashArray.length(); ++i) { + const i64 v = arrowHashArray.GetView(i); + const ui32 idx = ((v < 0) ? (-v) : v) % ShardsCount; + Remapping[idx].emplace_back(offset + i); + } + offset += (ui64)arrowHashArray.length(); + } + std::deque*> sizeCorrection; + for (auto&& i : Remapping) { + sizeCorrection.emplace_back(&i); + } + const auto pred = [](const std::vector* l, const std::vector* r) { + return l->size() < r->size(); + }; + std::sort(sizeCorrection.begin(), sizeCorrection.end(), pred); + while (sizeCorrection.size() > 1 && sizeCorrection.back()->size() > expectation && sizeCorrection.front()->size() < expectation) { + const ui32 uselessRecords = sizeCorrection.back()->size() - expectation; + const ui32 needRecords = expectation - sizeCorrection.front()->size(); + const ui32 moveRecords = std::min(needRecords, uselessRecords); + if (moveRecords == 0) { + break; + } + *sizeCorrection.front() = MergeLists(*sizeCorrection.front(), sizeCorrection.back()->end() - moveRecords, sizeCorrection.back()->end()); + sizeCorrection.back()->resize(sizeCorrection.back()->size() - moveRecords); + if (sizeCorrection.back()->size() <= expectation) { + sizeCorrection.pop_back(); + } + if (sizeCorrection.front()->size() >= expectation) { + sizeCorrection.pop_front(); + } + } + } + + TShardingSplitIndex(const ui32 shardsCount, const arrow::ChunkedArray& arrowHashArray) + : ShardsCount(shardsCount) + , RecordsCount(arrowHashArray.length()) { + } + +public: + + std::vector> DetachRemapping() { + return std::move(Remapping); + } + + template + static TShardingSplitIndex Build(const ui32 shardsCount, const 
arrow::ChunkedArray& arrowHashArray) { + TShardingSplitIndex result(shardsCount, arrowHashArray); + result.Initialize(arrowHashArray); + return result; + } + + std::shared_ptr BuildPermutation() const; + + std::vector> Apply(const std::shared_ptr& input); + static TShardedRecordBatch Apply(const ui32 shardsCount, const std::shared_ptr& input, const std::string& hashColumnName); + static TShardedRecordBatch Apply(const ui32 shardsCount, const std::shared_ptr& input, const std::string& hashColumnName); +}; + +std::shared_ptr MakePermutation(const int size, const bool reverse = false); +std::shared_ptr MakeFilterPermutation(const std::vector& indexes); +std::shared_ptr MakeFilterPermutation(const std::vector& indexes); +std::shared_ptr ReverseRecords(const std::shared_ptr& batch); +std::shared_ptr ReverseRecords(const std::shared_ptr& batch); + +std::shared_ptr CopyRecords(const std::shared_ptr& source, const std::vector& indexes); +std::shared_ptr CopyRecords(const std::shared_ptr& source, const std::vector& indexes); + +} diff --git a/ydb/core/formats/arrow/protos/accessor.proto b/ydb/library/formats/arrow/protos/accessor.proto similarity index 100% rename from ydb/core/formats/arrow/protos/accessor.proto rename to ydb/library/formats/arrow/protos/accessor.proto diff --git a/ydb/core/formats/arrow/protos/fields.proto b/ydb/library/formats/arrow/protos/fields.proto similarity index 100% rename from ydb/core/formats/arrow/protos/fields.proto rename to ydb/library/formats/arrow/protos/fields.proto diff --git a/ydb/core/formats/arrow/protos/ssa.proto b/ydb/library/formats/arrow/protos/ssa.proto similarity index 100% rename from ydb/core/formats/arrow/protos/ssa.proto rename to ydb/library/formats/arrow/protos/ssa.proto diff --git a/ydb/core/formats/arrow/protos/ya.make b/ydb/library/formats/arrow/protos/ya.make similarity index 100% rename from ydb/core/formats/arrow/protos/ya.make rename to ydb/library/formats/arrow/protos/ya.make diff --git 
a/ydb/core/formats/arrow/replace_key.cpp b/ydb/library/formats/arrow/replace_key.cpp similarity index 100% rename from ydb/core/formats/arrow/replace_key.cpp rename to ydb/library/formats/arrow/replace_key.cpp diff --git a/ydb/core/formats/arrow/replace_key.h b/ydb/library/formats/arrow/replace_key.h similarity index 88% rename from ydb/core/formats/arrow/replace_key.h rename to ydb/library/formats/arrow/replace_key.h index 8fb54a18dddd..20db01a98b9c 100644 --- a/ydb/core/formats/arrow/replace_key.h +++ b/ydb/library/formats/arrow/replace_key.h @@ -5,8 +5,6 @@ #include "common/validation.h" #include "switch/compare.h" -#include - #include #include @@ -278,5 +276,36 @@ class TReplaceKeyHelper { static size_t LowerBound(const std::vector& batchKeys, const TReplaceKey& key, size_t offset); }; +template +static bool IsSelfSorted(const std::shared_ptr& batch) { + if (batch->num_rows() < 2) { + return true; + } + auto& columns = batch->columns(); + + for (int i = 1; i < batch->num_rows(); ++i) { + TRawReplaceKey prev(&columns, i - 1); + TRawReplaceKey current(&columns, i); + if constexpr (desc) { + if (prev < current) { + AFL_DEBUG(NKikimrServices::ARROW_HELPER)("event", "prev < current")("current", current.DebugString())("prev", prev.DebugString()); + return false; + } + } else { + if (current < prev) { + AFL_DEBUG(NKikimrServices::ARROW_HELPER)("event", "current < prev")("current", current.DebugString())("prev", prev.DebugString()); + return false; + } + } + if constexpr (uniq) { + if (prev == current) { + AFL_DEBUG(NKikimrServices::ARROW_HELPER)("event", "equal")("current", current.DebugString())("prev", prev.DebugString()); + return false; + } + } + } + return true; +} + } diff --git a/ydb/core/formats/arrow/scalar/serialization.cpp b/ydb/library/formats/arrow/scalar/serialization.cpp similarity index 97% rename from ydb/core/formats/arrow/scalar/serialization.cpp rename to ydb/library/formats/arrow/scalar/serialization.cpp index 2b8fb74a92ab..e488dfabd687 100644 --- 
a/ydb/core/formats/arrow/scalar/serialization.cpp +++ b/ydb/library/formats/arrow/scalar/serialization.cpp @@ -1,5 +1,5 @@ #include "serialization.h" -#include +#include #include namespace NKikimr::NArrow::NScalar { diff --git a/ydb/core/formats/arrow/scalar/serialization.h b/ydb/library/formats/arrow/scalar/serialization.h similarity index 100% rename from ydb/core/formats/arrow/scalar/serialization.h rename to ydb/library/formats/arrow/scalar/serialization.h diff --git a/ydb/core/formats/arrow/scalar/ya.make b/ydb/library/formats/arrow/scalar/ya.make similarity index 79% rename from ydb/core/formats/arrow/scalar/ya.make rename to ydb/library/formats/arrow/scalar/ya.make index d6284ba9ca44..41ca57bd3b7d 100644 --- a/ydb/core/formats/arrow/scalar/ya.make +++ b/ydb/library/formats/arrow/scalar/ya.make @@ -3,7 +3,7 @@ LIBRARY() PEERDIR( contrib/libs/apache/arrow ydb/library/conclusion - ydb/core/formats/arrow/switch + ydb/library/formats/arrow/switch ydb/library/actors/core ) diff --git a/ydb/core/formats/arrow/simple_arrays_cache.cpp b/ydb/library/formats/arrow/simple_arrays_cache.cpp similarity index 98% rename from ydb/core/formats/arrow/simple_arrays_cache.cpp rename to ydb/library/formats/arrow/simple_arrays_cache.cpp index 5c06b061061f..e963f50607c3 100644 --- a/ydb/core/formats/arrow/simple_arrays_cache.cpp +++ b/ydb/library/formats/arrow/simple_arrays_cache.cpp @@ -1,6 +1,8 @@ #include "simple_arrays_cache.h" #include "common/validation.h" +#include + namespace NKikimr::NArrow { std::shared_ptr TThreadSimpleArraysCache::GetNullImpl(const std::shared_ptr& type, const ui32 recordsCount) { diff --git a/ydb/core/formats/arrow/simple_arrays_cache.h b/ydb/library/formats/arrow/simple_arrays_cache.h similarity index 100% rename from ydb/core/formats/arrow/simple_arrays_cache.h rename to ydb/library/formats/arrow/simple_arrays_cache.h diff --git a/ydb/core/formats/arrow/simple_builder/array.cpp b/ydb/library/formats/arrow/simple_builder/array.cpp similarity index 
100% rename from ydb/core/formats/arrow/simple_builder/array.cpp rename to ydb/library/formats/arrow/simple_builder/array.cpp diff --git a/ydb/core/formats/arrow/simple_builder/array.h b/ydb/library/formats/arrow/simple_builder/array.h similarity index 100% rename from ydb/core/formats/arrow/simple_builder/array.h rename to ydb/library/formats/arrow/simple_builder/array.h diff --git a/ydb/core/formats/arrow/simple_builder/batch.cpp b/ydb/library/formats/arrow/simple_builder/batch.cpp similarity index 100% rename from ydb/core/formats/arrow/simple_builder/batch.cpp rename to ydb/library/formats/arrow/simple_builder/batch.cpp diff --git a/ydb/core/formats/arrow/simple_builder/batch.h b/ydb/library/formats/arrow/simple_builder/batch.h similarity index 100% rename from ydb/core/formats/arrow/simple_builder/batch.h rename to ydb/library/formats/arrow/simple_builder/batch.h diff --git a/ydb/core/formats/arrow/simple_builder/filler.cpp b/ydb/library/formats/arrow/simple_builder/filler.cpp similarity index 100% rename from ydb/core/formats/arrow/simple_builder/filler.cpp rename to ydb/library/formats/arrow/simple_builder/filler.cpp diff --git a/ydb/core/formats/arrow/simple_builder/filler.h b/ydb/library/formats/arrow/simple_builder/filler.h similarity index 100% rename from ydb/core/formats/arrow/simple_builder/filler.h rename to ydb/library/formats/arrow/simple_builder/filler.h diff --git a/ydb/core/formats/arrow/simple_builder/ya.make b/ydb/library/formats/arrow/simple_builder/ya.make similarity index 100% rename from ydb/core/formats/arrow/simple_builder/ya.make rename to ydb/library/formats/arrow/simple_builder/ya.make diff --git a/ydb/library/formats/arrow/size_calcer.cpp b/ydb/library/formats/arrow/size_calcer.cpp new file mode 100644 index 000000000000..0f10f2e000fe --- /dev/null +++ b/ydb/library/formats/arrow/size_calcer.cpp @@ -0,0 +1,208 @@ +#include "size_calcer.h" +#include "switch_type.h" +#include "arrow_helpers.h" +#include +#include +#include + +namespace 
NKikimr::NArrow { + +ui32 TRowSizeCalculator::GetRowBitWidth(const ui32 row) const { + Y_ABORT_UNLESS(Prepared); + ui32 result = CommonSize; + for (auto&& c : BinaryColumns) { + result += GetBitWidthAligned(c->GetView(row).size() * 8); + } + for (auto&& c : StringColumns) { + result += GetBitWidthAligned(c->GetView(row).size() * 8); + } + return result; +} + +bool TRowSizeCalculator::InitBatch(const std::shared_ptr& batch) { + Batch = batch; + CommonSize = 0; + BinaryColumns.clear(); + StringColumns.clear(); + Prepared = false; + for (ui32 i = 0; i < (ui32)Batch->num_columns(); ++i) { + auto fSize = std::dynamic_pointer_cast(Batch->column(i)->type()); + if (fSize) { + CommonSize += GetBitWidthAligned(fSize->bit_width()); + } else { + auto c = Batch->column(i); + if (c->type()->id() == arrow::Type::BINARY) { + const arrow::BinaryArray& viewArray = static_cast(*c); + BinaryColumns.emplace_back(&viewArray); + } else if (c->type()->id() == arrow::Type::STRING) { + const arrow::StringArray& viewArray = static_cast(*c); + StringColumns.emplace_back(&viewArray); + } else { + return false; + } + } + } + Prepared = true; + return true; +} + +ui32 TRowSizeCalculator::GetRowBytesSize(const ui32 row) const { + const ui32 bitsWidth = GetRowBitWidth(row); + ui32 result = bitsWidth / 8; + if (bitsWidth % 8) { + ++result; + } + return result; +} + +ui64 GetArrayMemorySize(const std::shared_ptr& data) { + if (!data) { + return 0; + } + ui64 result = 0; + for (auto&& i : data->buffers) { + if (i) { + result += i->capacity(); + } + } + for (auto&& i : data->child_data) { + for (auto&& b : i->buffers) { + if (b) { + result += b->capacity(); + } + } + } + if (data->dictionary) { + for (auto&& b : data->dictionary->buffers) { + if (b) { + result += b->capacity(); + } + } + } + return result; +} + + +ui64 GetBatchDataSize(const std::shared_ptr& batch) { + if (!batch) { + return 0; + } + ui64 bytes = 0; + for (auto& column : batch->columns()) { + bytes += GetArrayDataSize(column); + } + 
return bytes; +} + +ui64 GetBatchMemorySize(const std::shared_ptr& batch) { + if (!batch) { + return 0; + } + ui64 bytes = 0; + for (auto& column : batch->column_data()) { + bytes += GetArrayMemorySize(column); + } + return bytes; +} + +ui64 GetTableMemorySize(const std::shared_ptr& batch) { + if (!batch) { + return 0; + } + ui64 bytes = 0; + for (auto& column : batch->columns()) { + for (auto&& chunk : column->chunks()) { + bytes += GetArrayMemorySize(chunk->data()); + } + } + return bytes; +} + +ui64 GetTableDataSize(const std::shared_ptr& batch) { + if (!batch) { + return 0; + } + ui64 bytes = 0; + for (auto& column : batch->columns()) { + for (auto&& chunk : column->chunks()) { + bytes += GetArrayDataSize(chunk); + } + } + return bytes; +} + +template +ui64 GetArrayDataSizeImpl(const std::shared_ptr& column) { + return sizeof(typename TType::c_type) * column->length(); +} + +template <> +ui64 GetArrayDataSizeImpl(const std::shared_ptr& column) { + return column->length() * 8; // Special value for empty lines +} + +template <> +ui64 GetArrayDataSizeImpl(const std::shared_ptr& column) { + auto typedColumn = std::static_pointer_cast(column); + return typedColumn->total_values_length() + sizeof(arrow::StringArray::offset_type) * column->length(); +} + +template <> +ui64 GetArrayDataSizeImpl(const std::shared_ptr& column) { + auto typedColumn = std::static_pointer_cast(column); + return typedColumn->total_values_length() + sizeof(arrow::LargeStringArray::offset_type) * column->length(); +} + +template <> +ui64 GetArrayDataSizeImpl(const std::shared_ptr& column) { + auto typedColumn = std::static_pointer_cast(column); + return typedColumn->total_values_length() + sizeof(arrow::BinaryArray::offset_type) * column->length(); +} + +template <> +ui64 GetArrayDataSizeImpl(const std::shared_ptr& column) { + auto typedColumn = std::static_pointer_cast(column); + return typedColumn->total_values_length() + sizeof(arrow::LargeBinaryArray::offset_type) * column->length(); +} + 
+template <> +ui64 GetArrayDataSizeImpl(const std::shared_ptr& column) { + auto typedColumn = std::static_pointer_cast(column); + return typedColumn->byte_width() * typedColumn->length(); +} + +template <> +ui64 GetArrayDataSizeImpl(const std::shared_ptr& column) { + return sizeof(ui64) * 2 * column->length(); +} + +ui64 GetArrayDataSize(const std::shared_ptr& column) { + auto type = column->type(); + if (type->id() == arrow::Type::DICTIONARY) { + auto dictArray = static_pointer_cast(column); + return GetDictionarySize(dictArray); + } + ui64 bytes = 0; + bool success = SwitchTypeWithNull(type->id(), [&](TTypeWrapper typeHolder) { + Y_UNUSED(typeHolder); + bytes = GetArrayDataSizeImpl(column); + return true; + }); + + // Add null bit mask overhead if any. + if (HasNulls(column)) { + bytes += column->length() / 8 + 1; + } + + Y_DEBUG_ABORT_UNLESS(success, "Unsupported arrow type %s", type->ToString().data()); + return bytes; +} + +ui64 GetDictionarySize(const std::shared_ptr& data) { + if (!data) { + return 0; + } + return GetArrayDataSize(data->dictionary()) + GetArrayDataSize(data->indices()); +} + +} diff --git a/ydb/library/formats/arrow/size_calcer.h b/ydb/library/formats/arrow/size_calcer.h new file mode 100644 index 000000000000..4101e55da4af --- /dev/null +++ b/ydb/library/formats/arrow/size_calcer.h @@ -0,0 +1,63 @@ +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +namespace NKikimr::NArrow { + +class TRowSizeCalculator { +private: + std::shared_ptr Batch; + ui32 CommonSize = 0; + std::vector BinaryColumns; + std::vector StringColumns; + bool Prepared = false; + const ui32 AlignBitsCount = 1; + + ui32 GetBitWidthAligned(const ui32 bitWidth) const { + if (AlignBitsCount == 1) { + return bitWidth; + } + ui32 result = bitWidth / AlignBitsCount; + if (bitWidth % AlignBitsCount) { + result += 1; + } + result *= AlignBitsCount; + return result; + } + +public: + + ui64 GetApproxSerializeSize(const ui64 dataSize) const { + 
return Max(dataSize * 1.05, dataSize + Batch->num_columns() * 8); + } + + TRowSizeCalculator(const ui32 alignBitsCount) + : AlignBitsCount(alignBitsCount) + { + + } + bool InitBatch(const std::shared_ptr& batch); + ui32 GetRowBitWidth(const ui32 row) const; + ui32 GetRowBytesSize(const ui32 row) const; +}; + +// Return size in bytes including size of bitmap mask +ui64 GetBatchDataSize(const std::shared_ptr& batch); +ui64 GetTableDataSize(const std::shared_ptr& batch); +// Return size in bytes including size of bitmap mask +ui64 GetArrayMemorySize(const std::shared_ptr& data); +ui64 GetBatchMemorySize(const std::shared_ptr&batch); +ui64 GetTableMemorySize(const std::shared_ptr& batch); +// Return size in bytes, including the null bitmap mask when the column has nulls +ui64 GetArrayDataSize(const std::shared_ptr& column); + +ui64 GetDictionarySize(const std::shared_ptr& data); + +} diff --git a/ydb/core/formats/arrow/splitter/similar_packer.cpp b/ydb/library/formats/arrow/splitter/similar_packer.cpp similarity index 100% rename from ydb/core/formats/arrow/splitter/similar_packer.cpp rename to ydb/library/formats/arrow/splitter/similar_packer.cpp diff --git a/ydb/core/formats/arrow/splitter/similar_packer.h b/ydb/library/formats/arrow/splitter/similar_packer.h similarity index 100% rename from ydb/core/formats/arrow/splitter/similar_packer.h rename to ydb/library/formats/arrow/splitter/similar_packer.h diff --git a/ydb/core/formats/arrow/splitter/stats.cpp b/ydb/library/formats/arrow/splitter/stats.cpp similarity index 100% rename from ydb/core/formats/arrow/splitter/stats.cpp rename to ydb/library/formats/arrow/splitter/stats.cpp diff --git a/ydb/core/formats/arrow/splitter/stats.h b/ydb/library/formats/arrow/splitter/stats.h similarity index 100% rename from ydb/core/formats/arrow/splitter/stats.h rename to ydb/library/formats/arrow/splitter/stats.h diff --git a/ydb/library/formats/arrow/splitter/ya.make b/ydb/library/formats/arrow/splitter/ya.make new file mode 100644 index
000000000000..9720a4366b75 --- /dev/null +++ b/ydb/library/formats/arrow/splitter/ya.make @@ -0,0 +1,14 @@ +LIBRARY() + +SRCS( + stats.cpp + similar_packer.cpp +) + +PEERDIR( + contrib/libs/apache/arrow + ydb/library/actors/core + ydb/library/conclusion +) + +END() diff --git a/ydb/core/formats/arrow/switch/compare.cpp b/ydb/library/formats/arrow/switch/compare.cpp similarity index 100% rename from ydb/core/formats/arrow/switch/compare.cpp rename to ydb/library/formats/arrow/switch/compare.cpp diff --git a/ydb/core/formats/arrow/switch/compare.h b/ydb/library/formats/arrow/switch/compare.h similarity index 100% rename from ydb/core/formats/arrow/switch/compare.h rename to ydb/library/formats/arrow/switch/compare.h diff --git a/ydb/library/formats/arrow/switch/switch_type.cpp b/ydb/library/formats/arrow/switch/switch_type.cpp new file mode 100644 index 000000000000..b8396151cb4e --- /dev/null +++ b/ydb/library/formats/arrow/switch/switch_type.cpp @@ -0,0 +1,5 @@ +#include "switch_type.h" + +namespace NKikimr::NArrow { + +} diff --git a/ydb/library/formats/arrow/switch/switch_type.h b/ydb/library/formats/arrow/switch/switch_type.h new file mode 100644 index 000000000000..ab9f6aa1bbfe --- /dev/null +++ b/ydb/library/formats/arrow/switch/switch_type.h @@ -0,0 +1,184 @@ +#pragma once +#include +#include + +#include +#include + +extern "C" { +#include +} + +namespace NKikimr::NArrow { + +template +struct TTypeWrapper +{ + using T = TType; +}; + +template +TResult SwitchTypeImpl(arrow::Type::type typeId, TFunc&& f) { + switch (typeId) { + case arrow::Type::NA: { + if constexpr (EnableNull) { + return f(TTypeWrapper()); + } + break; + } + case arrow::Type::BOOL: + return f(TTypeWrapper()); + case arrow::Type::UINT8: + return f(TTypeWrapper()); + case arrow::Type::INT8: + return f(TTypeWrapper()); + case arrow::Type::UINT16: + return f(TTypeWrapper()); + case arrow::Type::INT16: + return f(TTypeWrapper()); + case arrow::Type::UINT32: + return f(TTypeWrapper()); + case 
arrow::Type::INT32: + return f(TTypeWrapper()); + case arrow::Type::UINT64: + return f(TTypeWrapper()); + case arrow::Type::INT64: + return f(TTypeWrapper()); + case arrow::Type::HALF_FLOAT: + return f(TTypeWrapper()); + case arrow::Type::FLOAT: + return f(TTypeWrapper()); + case arrow::Type::DOUBLE: + return f(TTypeWrapper()); + case arrow::Type::STRING: + return f(TTypeWrapper()); + case arrow::Type::BINARY: + return f(TTypeWrapper()); + case arrow::Type::FIXED_SIZE_BINARY: + return f(TTypeWrapper()); + case arrow::Type::DATE32: + return f(TTypeWrapper()); + case arrow::Type::DATE64: + return f(TTypeWrapper()); + case arrow::Type::TIMESTAMP: + return f(TTypeWrapper()); + case arrow::Type::TIME32: + return f(TTypeWrapper()); + case arrow::Type::TIME64: + return f(TTypeWrapper()); + case arrow::Type::INTERVAL_MONTHS: + return f(TTypeWrapper()); + case arrow::Type::DECIMAL: + return f(TTypeWrapper()); + case arrow::Type::DURATION: + return f(TTypeWrapper()); + case arrow::Type::LARGE_STRING: + return f(TTypeWrapper()); + case arrow::Type::LARGE_BINARY: + return f(TTypeWrapper()); + case arrow::Type::DECIMAL256: + case arrow::Type::DENSE_UNION: + case arrow::Type::DICTIONARY: + case arrow::Type::EXTENSION: + case arrow::Type::FIXED_SIZE_LIST: + case arrow::Type::INTERVAL_DAY_TIME: + case arrow::Type::LARGE_LIST: + case arrow::Type::LIST: + case arrow::Type::MAP: + case arrow::Type::MAX_ID: + case arrow::Type::SPARSE_UNION: + case arrow::Type::STRUCT: + break; + } + + return defaultValue; +} + +template +bool SwitchType(arrow::Type::type typeId, TFunc&& f) { + return SwitchTypeImpl(typeId, std::move(f)); +} + +template +bool SwitchTypeWithNull(arrow::Type::type typeId, TFunc&& f) { + return SwitchType(typeId, std::move(f)); +} + +template +bool SwitchArrayType(const arrow::Datum& column, TFunc&& f) { + auto type = column.type(); + Y_ABORT_UNLESS(type); + return SwitchType(type->id(), std::forward(f)); +} + +template +bool Append(arrow::ArrayBuilder& builder, const 
typename T::c_type& value) { + using TBuilder = typename arrow::TypeTraits::BuilderType; + + TStatusValidator::Validate(static_cast(builder).Append(value)); + return true; +} + +template +bool Append(arrow::ArrayBuilder& builder, arrow::util::string_view value) { + using TBuilder = typename arrow::TypeTraits::BuilderType; + + TStatusValidator::Validate(static_cast(builder).Append(value)); + return true; +} + +template +bool Append(arrow::ArrayBuilder& builder, const typename T::c_type* values, size_t size) { + using TBuilder = typename arrow::NumericBuilder; + + TStatusValidator::Validate(static_cast(builder).AppendValues(values, size)); + return true; +} + +template +bool Append(arrow::ArrayBuilder& builder, const std::vector& values) { + using TBuilder = typename arrow::NumericBuilder; + + TStatusValidator::Validate(static_cast(builder).AppendValues(values.data(), values.size())); + return true; +} + +template +[[nodiscard]] bool Append(T& builder, const arrow::Array& array, int position, ui64* recordSize = nullptr) { + Y_DEBUG_ABORT_UNLESS(builder.type()->id() == array.type_id()); + return SwitchType(array.type_id(), [&](const auto& type) { + using TWrap = std::decay_t; + using TArray = typename arrow::TypeTraits::ArrayType; + using TBuilder = typename arrow::TypeTraits::BuilderType; + + auto& typedArray = static_cast(array); + auto& typedBuilder = static_cast(builder); + + if (typedArray.IsNull(position)) { + TStatusValidator::Validate(typedBuilder.AppendNull()); + if (recordSize) { + *recordSize += 4; + } + return true; + } else { + if constexpr (!arrow::has_string_view::value) { + TStatusValidator::Validate(typedBuilder.Append(typedArray.GetView(position))); + if (recordSize) { + *recordSize += sizeof(typedArray.GetView(position)); + } + return true; + } + if constexpr (arrow::has_string_view::value) { + TStatusValidator::Validate(typedBuilder.Append(typedArray.GetView(position))); + if (recordSize) { + *recordSize += typedArray.GetView(position).size(); + } 
+ return true; + } + } + Y_ABORT_UNLESS(false, "unpredictable variant"); + return false; + }); +} + +} diff --git a/ydb/library/formats/arrow/switch/ya.make b/ydb/library/formats/arrow/switch/ya.make new file mode 100644 index 000000000000..4a42fa96cacb --- /dev/null +++ b/ydb/library/formats/arrow/switch/ya.make @@ -0,0 +1,13 @@ +LIBRARY() + +PEERDIR( + contrib/libs/apache/arrow + ydb/library/actors/core +) + +SRCS( + switch_type.cpp + compare.cpp +) + +END() diff --git a/ydb/core/formats/arrow/switch_type.h b/ydb/library/formats/arrow/switch_type.h similarity index 100% rename from ydb/core/formats/arrow/switch_type.h rename to ydb/library/formats/arrow/switch_type.h diff --git a/ydb/core/formats/arrow/transformer/abstract.cpp b/ydb/library/formats/arrow/transformer/abstract.cpp similarity index 100% rename from ydb/core/formats/arrow/transformer/abstract.cpp rename to ydb/library/formats/arrow/transformer/abstract.cpp diff --git a/ydb/core/formats/arrow/transformer/abstract.h b/ydb/library/formats/arrow/transformer/abstract.h similarity index 100% rename from ydb/core/formats/arrow/transformer/abstract.h rename to ydb/library/formats/arrow/transformer/abstract.h diff --git a/ydb/core/formats/arrow/transformer/composite.cpp b/ydb/library/formats/arrow/transformer/composite.cpp similarity index 100% rename from ydb/core/formats/arrow/transformer/composite.cpp rename to ydb/library/formats/arrow/transformer/composite.cpp diff --git a/ydb/core/formats/arrow/transformer/composite.h b/ydb/library/formats/arrow/transformer/composite.h similarity index 100% rename from ydb/core/formats/arrow/transformer/composite.h rename to ydb/library/formats/arrow/transformer/composite.h diff --git a/ydb/library/formats/arrow/transformer/ya.make b/ydb/library/formats/arrow/transformer/ya.make new file mode 100644 index 000000000000..601adb56ae18 --- /dev/null +++ b/ydb/library/formats/arrow/transformer/ya.make @@ -0,0 +1,12 @@ +LIBRARY() + +PEERDIR( + contrib/libs/apache/arrow +) + 
+SRCS( + abstract.cpp + composite.cpp +) + +END() diff --git a/ydb/library/formats/arrow/ut/ut_arrow.cpp b/ydb/library/formats/arrow/ut/ut_arrow.cpp new file mode 100644 index 000000000000..22240b623530 --- /dev/null +++ b/ydb/library/formats/arrow/ut/ut_arrow.cpp @@ -0,0 +1,302 @@ +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace NKikimr { +namespace { + +struct TDataRow { + bool Bool; + i8 Int8; + i16 Int16; + i32 Int32; + i64 Int64; + ui8 UInt8; + ui16 UInt16; + ui32 UInt32; + ui64 UInt64; + float Float32; + double Float64; + std::string String; + std::string Utf8; + std::string Json; + std::string Yson; + ui16 Date; + ui32 Datetime; + i64 Timestamp; + i64 Interval; + std::string JsonDocument; + //ui64 Decimal[2]; + + bool operator == (const TDataRow& r) const { + return (Bool == r.Bool) && + (Int8 == r.Int8) && + (Int16 == r.Int16) && + (Int32 == r.Int32) && + (Int64 == r.Int64) && + (UInt8 == r.UInt8) && + (UInt16 == r.UInt16) && + (UInt32 == r.UInt32) && + (UInt64 == r.UInt64) && + (Float32 == r.Float32) && + (Float64 == r.Float64) && + (String == r.String) && + (Utf8 == r.Utf8) && + (Json == r.Json) && + (Yson == r.Yson) && + (Date == r.Date) && + (Datetime == r.Datetime) && + (Timestamp == r.Timestamp) && + (Interval == r.Interval) && + (JsonDocument == r.JsonDocument); + //(Decimal[0] == r.Decimal[0] && Decimal[1] == r.Decimal[1]); + } + + static std::shared_ptr MakeArrowSchema() { + std::vector> fields = { + arrow::field("bool", arrow::boolean()), + arrow::field("i8", arrow::int8()), + arrow::field("i16", arrow::int16()), + arrow::field("i32", arrow::int32()), + arrow::field("i64", arrow::int64()), + arrow::field("ui8", arrow::uint8()), + arrow::field("ui16", arrow::uint16()), + arrow::field("ui32", arrow::uint32()), + arrow::field("ui64", arrow::uint64()), + arrow::field("f32", arrow::float32()), + arrow::field("f64", arrow::float64()), + arrow::field("string", arrow::binary()), + arrow::field("utf8", 
arrow::utf8()), + arrow::field("json", arrow::utf8()), + arrow::field("yson", arrow::binary()), + arrow::field("date", arrow::uint16()), + arrow::field("datetime", arrow::uint32()), + arrow::field("ts", arrow::timestamp(arrow::TimeUnit::TimeUnit::MICRO)), + arrow::field("ival", arrow::duration(arrow::TimeUnit::TimeUnit::MICRO)), + arrow::field("json_doc", arrow::binary()), + //arrow::field("dec", arrow::decimal(NScheme::DECIMAL_PRECISION, NScheme::DECIMAL_SCALE)), + }; + + return std::make_shared(std::move(fields)); + } +}; + + +std::shared_ptr GetColumn(const arrow::Table& table, int i, int chunk = 0) { + return table.column(i)->chunk(chunk); +} + +template +std::vector ToVector(const std::shared_ptr& table) { + std::vector rows; + + auto arbool = std::static_pointer_cast(GetColumn(*table, 0)); + auto ari8 = std::static_pointer_cast(GetColumn(*table, 1)); + auto ari16 = std::static_pointer_cast(GetColumn(*table, 2)); + auto ari32 = std::static_pointer_cast(GetColumn(*table, 3)); + auto ari64 = std::static_pointer_cast(GetColumn(*table, 4)); + auto aru8 = std::static_pointer_cast(GetColumn(*table, 5)); + auto aru16 = std::static_pointer_cast(GetColumn(*table, 6)); + auto aru32 = std::static_pointer_cast(GetColumn(*table, 7)); + auto aru64 = std::static_pointer_cast(GetColumn(*table, 8)); + auto arf32 = std::static_pointer_cast(GetColumn(*table, 9)); + auto arf64 = std::static_pointer_cast(GetColumn(*table, 10)); + + auto arstr = std::static_pointer_cast(GetColumn(*table, 11)); + auto arutf = std::static_pointer_cast(GetColumn(*table, 12)); + auto arj = std::static_pointer_cast(GetColumn(*table, 13)); + auto ary = std::static_pointer_cast(GetColumn(*table, 14)); + + auto ard = std::static_pointer_cast(GetColumn(*table, 15)); + auto ardt = std::static_pointer_cast(GetColumn(*table, 16)); + auto arts = std::static_pointer_cast(GetColumn(*table, 17)); + auto arival = std::static_pointer_cast(GetColumn(*table, 18)); + + auto arjd = 
std::static_pointer_cast(GetColumn(*table, 19)); + //auto ardec = std::static_pointer_cast(GetColumn(*table, 19)); + + for (int64_t i = 0; i < table->num_rows(); ++i) { + //ui64 dec[2]; + //memcpy(dec, ardec->Value(i), 16); + TDataRow r{ arbool->Value(i), + ari8->Value(i), ari16->Value(i), ari32->Value(i), ari64->Value(i), + aru8->Value(i), aru16->Value(i), aru32->Value(i), aru64->Value(i), + arf32->Value(i), arf64->Value(i), + arstr->GetString(i), arutf->GetString(i), arj->GetString(i), ary->GetString(i), + ard->Value(i), ardt->Value(i), arts->Value(i), arival->Value(i), arjd->GetString(i) + //{dec[0], dec[1]} + }; + rows.emplace_back(std::move(r)); + } + + return rows; +} + +class TDataRowTableBuilder +{ +public: + TDataRowTableBuilder() + : Bts(arrow::timestamp(arrow::TimeUnit::TimeUnit::MICRO), arrow::default_memory_pool()) + , Bival(arrow::duration(arrow::TimeUnit::TimeUnit::MICRO), arrow::default_memory_pool()) + //, Bdec(arrow::decimal(NScheme::DECIMAL_PRECISION, NScheme::DECIMAL_SCALE), arrow::default_memory_pool()) + {} + + void AddRow(const TDataRow& row) { + UNIT_ASSERT(Bbool.Append(row.Bool).ok()); + UNIT_ASSERT(Bi8.Append(row.Int8).ok()); + UNIT_ASSERT(Bi16.Append(row.Int16).ok()); + UNIT_ASSERT(Bi32.Append(row.Int32).ok()); + UNIT_ASSERT(Bi64.Append(row.Int64).ok()); + UNIT_ASSERT(Bu8.Append(row.UInt8).ok()); + UNIT_ASSERT(Bu16.Append(row.UInt16).ok()); + UNIT_ASSERT(Bu32.Append(row.UInt32).ok()); + UNIT_ASSERT(Bu64.Append(row.UInt64).ok()); + UNIT_ASSERT(Bf32.Append(row.Float32).ok()); + UNIT_ASSERT(Bf64.Append(row.Float64).ok()); + + UNIT_ASSERT(Bstr.Append(row.String).ok()); + UNIT_ASSERT(Butf.Append(row.Utf8).ok()); + UNIT_ASSERT(Bj.Append(row.Json).ok()); + UNIT_ASSERT(By.Append(row.Yson).ok()); + + UNIT_ASSERT(Bd.Append(row.Date).ok()); + UNIT_ASSERT(Bdt.Append(row.Datetime).ok()); + UNIT_ASSERT(Bts.Append(row.Timestamp).ok()); + UNIT_ASSERT(Bival.Append(row.Interval).ok()); + + UNIT_ASSERT(Bjd.Append(row.JsonDocument).ok()); + 
//UNIT_ASSERT(Bdec.Append((const char *)&row.Decimal).ok()); + } + + std::shared_ptr Finish() { + std::shared_ptr arbool; + std::shared_ptr ari8; + std::shared_ptr ari16; + std::shared_ptr ari32; + std::shared_ptr ari64; + std::shared_ptr aru8; + std::shared_ptr aru16; + std::shared_ptr aru32; + std::shared_ptr aru64; + std::shared_ptr arf32; + std::shared_ptr arf64; + + std::shared_ptr arstr; + std::shared_ptr arutf; + std::shared_ptr arj; + std::shared_ptr ary; + + std::shared_ptr ard; + std::shared_ptr ardt; + std::shared_ptr arts; + std::shared_ptr arival; + + std::shared_ptr arjd; + //std::shared_ptr ardec; + + UNIT_ASSERT(Bbool.Finish(&arbool).ok()); + UNIT_ASSERT(Bi8.Finish(&ari8).ok()); + UNIT_ASSERT(Bi16.Finish(&ari16).ok()); + UNIT_ASSERT(Bi32.Finish(&ari32).ok()); + UNIT_ASSERT(Bi64.Finish(&ari64).ok()); + UNIT_ASSERT(Bu8.Finish(&aru8).ok()); + UNIT_ASSERT(Bu16.Finish(&aru16).ok()); + UNIT_ASSERT(Bu32.Finish(&aru32).ok()); + UNIT_ASSERT(Bu64.Finish(&aru64).ok()); + UNIT_ASSERT(Bf32.Finish(&arf32).ok()); + UNIT_ASSERT(Bf64.Finish(&arf64).ok()); + + UNIT_ASSERT(Bstr.Finish(&arstr).ok()); + UNIT_ASSERT(Butf.Finish(&arutf).ok()); + UNIT_ASSERT(Bj.Finish(&arj).ok()); + UNIT_ASSERT(By.Finish(&ary).ok()); + + UNIT_ASSERT(Bd.Finish(&ard).ok()); + UNIT_ASSERT(Bdt.Finish(&ardt).ok()); + UNIT_ASSERT(Bts.Finish(&arts).ok()); + UNIT_ASSERT(Bival.Finish(&arival).ok()); + + UNIT_ASSERT(Bjd.Finish(&arjd).ok()); + //UNIT_ASSERT(Bdec.Finish(&ardec).ok()); + + std::shared_ptr schema = TDataRow::MakeArrowSchema(); + return arrow::Table::Make(schema, { + arbool, + ari8, ari16, ari32, ari64, + aru8, aru16, aru32, aru64, + arf32, arf64, + arstr, arutf, arj, ary, + ard, ardt, arts, arival, arjd + //ardec + }); + } + + static std::shared_ptr Build(const std::vector& rows) { + TDataRowTableBuilder builder; + for (const TDataRow& row : rows) { + builder.AddRow(row); + } + return builder.Finish(); + } + +private: + arrow::BooleanBuilder Bbool; + arrow::Int8Builder Bi8; + 
arrow::Int16Builder Bi16; + arrow::Int32Builder Bi32; + arrow::Int64Builder Bi64; + arrow::UInt8Builder Bu8; + arrow::UInt16Builder Bu16; + arrow::UInt32Builder Bu32; + arrow::UInt64Builder Bu64; + arrow::FloatBuilder Bf32; + arrow::DoubleBuilder Bf64; + arrow::BinaryBuilder Bstr; + arrow::StringBuilder Butf; + arrow::BinaryBuilder Bj; + arrow::BinaryBuilder By; + arrow::UInt16Builder Bd; + arrow::UInt32Builder Bdt; + arrow::TimestampBuilder Bts; + arrow::DurationBuilder Bival; + arrow::BinaryBuilder Bjd; + //arrow::Decimal128Builder Bdec; +}; + +std::vector TestRows() { + std::vector rows = { + {false, -1, -1, -1, -1, 1, 1, 1, 1, -1.0f, -1.0, "s1", "u1", "{\"j\":1}", "{y:1}", 0, 0, 0, 0, "{\"jd\":1}" }, + {false, 2, 2, 2, 2, 2, 2, 2, 2, 2.0f, 2.0, "s2", "u2", "{\"j\":2}", "{y:2}", 0, 0, 0, 0, "{\"jd\":1}" }, + {false, -3, -3, -3, -3, 3, 3, 3, 3, -3.0f, -3.0, "s3", "u3", "{\"j\":3}", "{y:3}", 0, 0, 0, 0, "{\"jd\":1}" }, + {false, -4, -4, -4, -4, 4, 4, 4, 4, 4.0f, 4.0, "s4", "u4", "{\"j\":4}", "{y:4}", 0, 0, 0, 0, "{\"jd\":1}" }, + }; + return rows; +} + +} + +Y_UNIT_TEST_SUITE(ArrowTest) { + Y_UNIT_TEST(Basic) { + std::vector rows = TestRows(); + + std::shared_ptr table = TDataRowTableBuilder::Build(rows); + + auto expectedSchema = TDataRow::MakeArrowSchema(); + UNIT_ASSERT_EQUAL(expectedSchema->Equals(*table->schema()), true); + + std::vector readRows = ToVector(table); + + UNIT_ASSERT_EQUAL(rows.size(), readRows.size()); + for (size_t i = 0; i < rows.size(); ++i) { + UNIT_ASSERT_EQUAL(rows[i], readRows[i]); + } + } +} + +} diff --git a/ydb/core/formats/arrow/ut/ut_size_calcer.cpp b/ydb/library/formats/arrow/ut/ut_size_calcer.cpp similarity index 91% rename from ydb/core/formats/arrow/ut/ut_size_calcer.cpp rename to ydb/library/formats/arrow/ut/ut_size_calcer.cpp index 1db712f43c7a..be569d2d6bf3 100644 --- a/ydb/core/formats/arrow/ut/ut_size_calcer.cpp +++ b/ydb/library/formats/arrow/ut/ut_size_calcer.cpp @@ -1,10 +1,9 @@ #include -#include -#include -#include 
-#include -#include -#include +#include +#include +#include +#include +#include Y_UNIT_TEST_SUITE(SizeCalcer) { diff --git a/ydb/library/formats/arrow/ut/ya.make b/ydb/library/formats/arrow/ut/ya.make new file mode 100644 index 000000000000..ae8b7e80261f --- /dev/null +++ b/ydb/library/formats/arrow/ut/ya.make @@ -0,0 +1,29 @@ +UNITTEST_FOR(ydb/library/formats/arrow) + +SIZE(SMALL) + +PEERDIR( + contrib/libs/apache/arrow + ydb/library/arrow_kernels + + # for NYql::NUdf alloc stuff used in binary_json + ydb/library/yql/public/udf/service/exception_policy + ydb/library/yql/sql/pg_dummy +) + +ADDINCL( + ydb/library/arrow_clickhouse +) + +YQL_LAST_ABI_VERSION() + +CFLAGS( + -Wno-unused-parameter +) + +SRCS( + ut_arrow.cpp + ut_size_calcer.cpp +) + +END() diff --git a/ydb/core/formats/arrow/validation/validation.cpp b/ydb/library/formats/arrow/validation/validation.cpp similarity index 100% rename from ydb/core/formats/arrow/validation/validation.cpp rename to ydb/library/formats/arrow/validation/validation.cpp diff --git a/ydb/core/formats/arrow/validation/validation.h b/ydb/library/formats/arrow/validation/validation.h similarity index 100% rename from ydb/core/formats/arrow/validation/validation.h rename to ydb/library/formats/arrow/validation/validation.h diff --git a/ydb/core/formats/arrow/validation/ya.make b/ydb/library/formats/arrow/validation/ya.make similarity index 100% rename from ydb/core/formats/arrow/validation/ya.make rename to ydb/library/formats/arrow/validation/ya.make diff --git a/ydb/library/formats/arrow/ya.make b/ydb/library/formats/arrow/ya.make new file mode 100644 index 000000000000..453bdea5d62b --- /dev/null +++ b/ydb/library/formats/arrow/ya.make @@ -0,0 +1,52 @@ +RECURSE_FOR_TESTS( + ut +) + +# Not allowed: ydb/(?!library|services/bg_tasks/abstract/interface.h) +# In: ./ydb/library/formats + +LIBRARY() + +PEERDIR( + contrib/libs/apache/arrow + ydb/library/formats/arrow/accessor + ydb/library/formats/arrow/simple_builder + 
ydb/library/formats/arrow/transformer + ydb/library/formats/arrow/splitter + ydb/library/formats/arrow/modifier + ydb/library/formats/arrow/scalar + ydb/library/formats/arrow/hash + ydb/library/actors/core + ydb/library/arrow_kernels + ydb/library/binary_json + ydb/library/dynumber + ydb/library/services + ydb/library/yql/core/arrow_kernels/request +) + +IF (OS_WINDOWS) + ADDINCL( + ydb/library/yql/udfs/common/clickhouse/client/base + ydb/library/arrow_clickhouse + ) +ELSE() + PEERDIR( + ydb/library/arrow_clickhouse + ) + ADDINCL( + ydb/library/arrow_clickhouse + ) +ENDIF() + +YQL_LAST_ABI_VERSION() + +SRCS( + arrow_helpers.cpp + input_stream.h + permutations.cpp + replace_key.cpp + size_calcer.cpp + simple_arrays_cache.cpp +) + +END() diff --git a/ydb/library/formats/ya.make b/ydb/library/formats/ya.make new file mode 100644 index 000000000000..82dc38d8e6b4 --- /dev/null +++ b/ydb/library/formats/ya.make @@ -0,0 +1,9 @@ +RECURSE( + arrow +) + +LIBRARY() + +YQL_LAST_ABI_VERSION() + +END() diff --git a/ydb/library/ya.make b/ydb/library/ya.make index bfb595172feb..a9ab9de4308e 100644 --- a/ydb/library/ya.make +++ b/ydb/library/ya.make @@ -11,6 +11,7 @@ RECURSE( chunks_limiter dynumber folder_service + formats grpc http_proxy keys From 751ac1df950a9c027de63740ee9f2e216c105bcb Mon Sep 17 00:00:00 2001 From: Grigoriy Pisarenko Date: Thu, 12 Sep 2024 19:39:00 +0000 Subject: [PATCH 2/6] Refactored dependencies --- ydb/core/tx/columnshard/engines/filter.h | 2 +- ydb/core/tx/columnshard/engines/insert_table/meta.h | 2 +- ydb/core/tx/columnshard/engines/insert_table/ya.make | 2 +- ydb/core/tx/columnshard/engines/portions/column_record.h | 4 ++-- ydb/core/tx/columnshard/engines/portions/meta.h | 2 +- ydb/core/tx/columnshard/engines/portions/portion_info.cpp | 4 ++-- ydb/core/tx/columnshard/engines/portions/portion_info.h | 4 ++-- ydb/core/tx/columnshard/engines/predicate/container.h | 2 +- ydb/core/tx/columnshard/engines/predicate/predicate.cpp | 2 +- 
.../reader/plain_reader/constructor/read_metadata.h | 2 +- .../engines/reader/plain_reader/iterator/fetched_data.cpp | 4 ++-- .../engines/reader/plain_reader/iterator/fetching.cpp | 2 +- .../engines/reader/plain_reader/iterator/source.cpp | 2 +- .../tx/columnshard/engines/scheme/abstract/index_info.cpp | 2 +- ydb/core/tx/columnshard/engines/scheme/column/info.h | 4 ++-- ydb/core/tx/columnshard/engines/scheme/column_features.h | 4 ++-- ydb/core/tx/columnshard/engines/scheme/index_info.cpp | 2 +- ydb/core/tx/columnshard/engines/scheme/index_info.h | 2 +- .../tx/columnshard/engines/scheme/tiering/tier_info.h | 2 +- .../engines/scheme/versions/abstract_scheme.cpp | 2 +- .../engines/storage/actualizer/scheme/counters.h | 2 +- .../engines/storage/actualizer/tiering/counters.h | 2 +- .../columnshard/engines/storage/indexes/bloom/checker.cpp | 2 +- .../tx/columnshard/engines/storage/indexes/bloom/meta.cpp | 2 +- .../engines/storage/indexes/count_min_sketch/checker.cpp | 2 +- .../engines/storage/indexes/count_min_sketch/meta.cpp | 2 +- .../tx/columnshard/engines/storage/indexes/max/meta.cpp | 2 +- .../storage/optimizer/sbuckets/logic/abstract/logic.h | 2 +- .../engines/storage/optimizer/ut/ut_optimizer.cpp | 4 ++-- ydb/core/tx/columnshard/engines/ut/helper.cpp | 2 +- .../tx/columnshard/operations/batch_builder/merger.cpp | 2 +- ydb/core/tx/columnshard/splitter/abstract/chunk_meta.h | 2 +- ydb/core/tx/columnshard/splitter/batch_slice.h | 4 ++-- ydb/core/tx/columnshard/splitter/ut/ut_splitter.cpp | 8 ++++---- ydb/core/tx/columnshard/transactions/locks/interaction.h | 2 +- .../tx/columnshard/ut_rw/ut_columnshard_read_write.cpp | 6 +++--- ydb/core/tx/columnshard/ut_rw/ut_normalizer.cpp | 6 +++--- ydb/core/tx/data_events/write_data.h | 2 +- ydb/core/tx/sharding/unboxed_reader.h | 2 +- ydb/core/tx/sharding/ut/ut_sharding.cpp | 2 +- ydb/services/ext_index/ut/ut_ext_index.cpp | 2 +- 41 files changed, 56 insertions(+), 56 deletions(-) diff --git 
a/ydb/core/tx/columnshard/engines/filter.h b/ydb/core/tx/columnshard/engines/filter.h index 7670b0eab1d3..39167306b993 100644 --- a/ydb/core/tx/columnshard/engines/filter.h +++ b/ydb/core/tx/columnshard/engines/filter.h @@ -2,7 +2,7 @@ #include "defs.h" #include -#include +#include #include namespace NKikimr::NOlap { diff --git a/ydb/core/tx/columnshard/engines/insert_table/meta.h b/ydb/core/tx/columnshard/engines/insert_table/meta.h index cb55848c7067..a913e88c973a 100644 --- a/ydb/core/tx/columnshard/engines/insert_table/meta.h +++ b/ydb/core/tx/columnshard/engines/insert_table/meta.h @@ -1,6 +1,6 @@ #pragma once #include -#include +#include #include #include #include diff --git a/ydb/core/tx/columnshard/engines/insert_table/ya.make b/ydb/core/tx/columnshard/engines/insert_table/ya.make index 852761344626..e6fde75077d5 100644 --- a/ydb/core/tx/columnshard/engines/insert_table/ya.make +++ b/ydb/core/tx/columnshard/engines/insert_table/ya.make @@ -12,7 +12,7 @@ SRCS( PEERDIR( contrib/libs/apache/arrow - ydb/core/formats/arrow/modifier + ydb/library/formats/arrow/modifier ydb/core/protos ydb/core/formats/arrow ydb/core/tablet_flat diff --git a/ydb/core/tx/columnshard/engines/portions/column_record.h b/ydb/core/tx/columnshard/engines/portions/column_record.h index 2b984e4f05f3..18fd0984d61b 100644 --- a/ydb/core/tx/columnshard/engines/portions/column_record.h +++ b/ydb/core/tx/columnshard/engines/portions/column_record.h @@ -2,8 +2,8 @@ #include "common.h" -#include -#include +#include +#include #include #include #include diff --git a/ydb/core/tx/columnshard/engines/portions/meta.h b/ydb/core/tx/columnshard/engines/portions/meta.h index 9c2e5cd332d3..ad57ef1325c3 100644 --- a/ydb/core/tx/columnshard/engines/portions/meta.h +++ b/ydb/core/tx/columnshard/engines/portions/meta.h @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/ydb/core/tx/columnshard/engines/portions/portion_info.cpp 
b/ydb/core/tx/columnshard/engines/portions/portion_info.cpp index 0605d5ffcbab..6652bf9c4c5f 100644 --- a/ydb/core/tx/columnshard/engines/portions/portion_info.cpp +++ b/ydb/core/tx/columnshard/engines/portions/portion_info.cpp @@ -7,11 +7,11 @@ #include #include #include -#include +#include #include #include #include -#include +#include #include diff --git a/ydb/core/tx/columnshard/engines/portions/portion_info.h b/ydb/core/tx/columnshard/engines/portions/portion_info.h index c29013dd6a7e..6fa105745b05 100644 --- a/ydb/core/tx/columnshard/engines/portions/portion_info.h +++ b/ydb/core/tx/columnshard/engines/portions/portion_info.h @@ -5,9 +5,9 @@ #include #include -#include +#include #include -#include +#include #include #include #include diff --git a/ydb/core/tx/columnshard/engines/predicate/container.h b/ydb/core/tx/columnshard/engines/predicate/container.h index adab69f68dc9..7d969cf9a759 100644 --- a/ydb/core/tx/columnshard/engines/predicate/container.h +++ b/ydb/core/tx/columnshard/engines/predicate/container.h @@ -2,7 +2,7 @@ #include "predicate.h" #include -#include +#include #include diff --git a/ydb/core/tx/columnshard/engines/predicate/predicate.cpp b/ydb/core/tx/columnshard/engines/predicate/predicate.cpp index 535172afd526..a6831ca2ad50 100644 --- a/ydb/core/tx/columnshard/engines/predicate/predicate.cpp +++ b/ydb/core/tx/columnshard/engines/predicate/predicate.cpp @@ -2,7 +2,7 @@ #include #include -#include +#include #include diff --git a/ydb/core/tx/columnshard/engines/reader/plain_reader/constructor/read_metadata.h b/ydb/core/tx/columnshard/engines/reader/plain_reader/constructor/read_metadata.h index cbd397bf366e..5f5ad70db296 100644 --- a/ydb/core/tx/columnshard/engines/reader/plain_reader/constructor/read_metadata.h +++ b/ydb/core/tx/columnshard/engines/reader/plain_reader/constructor/read_metadata.h @@ -1,7 +1,7 @@ #pragma once #include #include -#include +#include #include #include diff --git 
a/ydb/core/tx/columnshard/engines/reader/plain_reader/iterator/fetched_data.cpp b/ydb/core/tx/columnshard/engines/reader/plain_reader/iterator/fetched_data.cpp index ac7fe2c16bf3..bf38c466b75b 100644 --- a/ydb/core/tx/columnshard/engines/reader/plain_reader/iterator/fetched_data.cpp +++ b/ydb/core/tx/columnshard/engines/reader/plain_reader/iterator/fetched_data.cpp @@ -1,8 +1,8 @@ #include "fetched_data.h" #include -#include -#include +#include +#include namespace NKikimr::NOlap { diff --git a/ydb/core/tx/columnshard/engines/reader/plain_reader/iterator/fetching.cpp b/ydb/core/tx/columnshard/engines/reader/plain_reader/iterator/fetching.cpp index e72e7b3cf2e8..2d0ec349aa6a 100644 --- a/ydb/core/tx/columnshard/engines/reader/plain_reader/iterator/fetching.cpp +++ b/ydb/core/tx/columnshard/engines/reader/plain_reader/iterator/fetching.cpp @@ -1,7 +1,7 @@ #include "fetching.h" #include "source.h" -#include +#include #include #include #include diff --git a/ydb/core/tx/columnshard/engines/reader/plain_reader/iterator/source.cpp b/ydb/core/tx/columnshard/engines/reader/plain_reader/iterator/source.cpp index 38a73b19d65e..bef10d38f6b1 100644 --- a/ydb/core/tx/columnshard/engines/reader/plain_reader/iterator/source.cpp +++ b/ydb/core/tx/columnshard/engines/reader/plain_reader/iterator/source.cpp @@ -4,7 +4,7 @@ #include "plain_read_data.h" #include "source.h" -#include +#include #include #include #include diff --git a/ydb/core/tx/columnshard/engines/scheme/abstract/index_info.cpp b/ydb/core/tx/columnshard/engines/scheme/abstract/index_info.cpp index f5473eaab885..974cf998d957 100644 --- a/ydb/core/tx/columnshard/engines/scheme/abstract/index_info.cpp +++ b/ydb/core/tx/columnshard/engines/scheme/abstract/index_info.cpp @@ -1,7 +1,7 @@ #include "index_info.h" #include #include -#include +#include #include #include diff --git a/ydb/core/tx/columnshard/engines/scheme/column/info.h b/ydb/core/tx/columnshard/engines/scheme/column/info.h index 5e3259cbd707..ef47445bf665 100644 
--- a/ydb/core/tx/columnshard/engines/scheme/column/info.h +++ b/ydb/core/tx/columnshard/engines/scheme/column/info.h @@ -1,11 +1,11 @@ #pragma once #include -#include +#include #include #include #include #include -#include +#include #include #include diff --git a/ydb/core/tx/columnshard/engines/scheme/column_features.h b/ydb/core/tx/columnshard/engines/scheme/column_features.h index 288ac6e195e4..c31c2a970a00 100644 --- a/ydb/core/tx/columnshard/engines/scheme/column_features.h +++ b/ydb/core/tx/columnshard/engines/scheme/column_features.h @@ -3,11 +3,11 @@ #include #include -#include +#include #include #include #include -#include +#include #include #include diff --git a/ydb/core/tx/columnshard/engines/scheme/index_info.cpp b/ydb/core/tx/columnshard/engines/scheme/index_info.cpp index c6203f9142a2..5059cc5eba79 100644 --- a/ydb/core/tx/columnshard/engines/scheme/index_info.cpp +++ b/ydb/core/tx/columnshard/engines/scheme/index_info.cpp @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/ydb/core/tx/columnshard/engines/scheme/index_info.h b/ydb/core/tx/columnshard/engines/scheme/index_info.h index 0c04b4abd8d1..869d0393b6c1 100644 --- a/ydb/core/tx/columnshard/engines/scheme/index_info.h +++ b/ydb/core/tx/columnshard/engines/scheme/index_info.h @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include #include diff --git a/ydb/core/tx/columnshard/engines/scheme/tiering/tier_info.h b/ydb/core/tx/columnshard/engines/scheme/tiering/tier_info.h index 1f5411e8e03c..8d290a8adcf2 100644 --- a/ydb/core/tx/columnshard/engines/scheme/tiering/tier_info.h +++ b/ydb/core/tx/columnshard/engines/scheme/tiering/tier_info.h @@ -2,7 +2,7 @@ #include "common.h" #include -#include +#include #include #include #include diff --git a/ydb/core/tx/columnshard/engines/scheme/versions/abstract_scheme.cpp b/ydb/core/tx/columnshard/engines/scheme/versions/abstract_scheme.cpp index c7b8e5cb6a53..8143c27a82ef 100644 --- 
a/ydb/core/tx/columnshard/engines/scheme/versions/abstract_scheme.cpp +++ b/ydb/core/tx/columnshard/engines/scheme/versions/abstract_scheme.cpp @@ -2,7 +2,7 @@ #include #include -#include +#include #include namespace NKikimr::NOlap { diff --git a/ydb/core/tx/columnshard/engines/storage/actualizer/scheme/counters.h b/ydb/core/tx/columnshard/engines/storage/actualizer/scheme/counters.h index 8f5ab2ff4311..95aa18603f46 100644 --- a/ydb/core/tx/columnshard/engines/storage/actualizer/scheme/counters.h +++ b/ydb/core/tx/columnshard/engines/storage/actualizer/scheme/counters.h @@ -1,6 +1,6 @@ #pragma once #include -#include +#include #include #include #include diff --git a/ydb/core/tx/columnshard/engines/storage/actualizer/tiering/counters.h b/ydb/core/tx/columnshard/engines/storage/actualizer/tiering/counters.h index 7d7a1cc3d830..a5c278799fdd 100644 --- a/ydb/core/tx/columnshard/engines/storage/actualizer/tiering/counters.h +++ b/ydb/core/tx/columnshard/engines/storage/actualizer/tiering/counters.h @@ -1,6 +1,6 @@ #pragma once #include -#include +#include #include #include #include diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom/checker.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bloom/checker.cpp index aad793c858c2..1613bd10e7d0 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom/checker.cpp +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom/checker.cpp @@ -1,6 +1,6 @@ #include "checker.h" #include -#include +#include #include #include diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/bloom/meta.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/bloom/meta.cpp index fc89fb8b8ada..a2d84cb10f6d 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/bloom/meta.cpp +++ b/ydb/core/tx/columnshard/engines/storage/indexes/bloom/meta.cpp @@ -1,6 +1,6 @@ #include "meta.h" #include "checker.h" -#include +#include #include #include #include diff --git 
a/ydb/core/tx/columnshard/engines/storage/indexes/count_min_sketch/checker.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/count_min_sketch/checker.cpp index af88ef8299d2..aa40668897d4 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/count_min_sketch/checker.cpp +++ b/ydb/core/tx/columnshard/engines/storage/indexes/count_min_sketch/checker.cpp @@ -1,6 +1,6 @@ #include "checker.h" #include -#include +#include #include #include diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/count_min_sketch/meta.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/count_min_sketch/meta.cpp index 5e0465848b29..80d154a751be 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/count_min_sketch/meta.cpp +++ b/ydb/core/tx/columnshard/engines/storage/indexes/count_min_sketch/meta.cpp @@ -1,6 +1,6 @@ #include "meta.h" #include "checker.h" -#include +#include #include #include #include diff --git a/ydb/core/tx/columnshard/engines/storage/indexes/max/meta.cpp b/ydb/core/tx/columnshard/engines/storage/indexes/max/meta.cpp index 553daec4f0fe..b672f278e017 100644 --- a/ydb/core/tx/columnshard/engines/storage/indexes/max/meta.cpp +++ b/ydb/core/tx/columnshard/engines/storage/indexes/max/meta.cpp @@ -1,6 +1,6 @@ #include "meta.h" -#include +#include #include #include #include diff --git a/ydb/core/tx/columnshard/engines/storage/optimizer/sbuckets/logic/abstract/logic.h b/ydb/core/tx/columnshard/engines/storage/optimizer/sbuckets/logic/abstract/logic.h index 3e4cef0f2970..b2d169db8698 100644 --- a/ydb/core/tx/columnshard/engines/storage/optimizer/sbuckets/logic/abstract/logic.h +++ b/ydb/core/tx/columnshard/engines/storage/optimizer/sbuckets/logic/abstract/logic.h @@ -2,7 +2,7 @@ #include #include -#include +#include namespace NKikimr::NOlap::NStorageOptimizer::NSBuckets { diff --git a/ydb/core/tx/columnshard/engines/storage/optimizer/ut/ut_optimizer.cpp b/ydb/core/tx/columnshard/engines/storage/optimizer/ut/ut_optimizer.cpp index c4aec7dd234e..420a9e5901e9 
100644 --- a/ydb/core/tx/columnshard/engines/storage/optimizer/ut/ut_optimizer.cpp +++ b/ydb/core/tx/columnshard/engines/storage/optimizer/ut/ut_optimizer.cpp @@ -3,8 +3,8 @@ #include #include #include -#include -#include +#include +#include #include #include diff --git a/ydb/core/tx/columnshard/engines/ut/helper.cpp b/ydb/core/tx/columnshard/engines/ut/helper.cpp index 66c67eb672d5..56a5c26ba492 100644 --- a/ydb/core/tx/columnshard/engines/ut/helper.cpp +++ b/ydb/core/tx/columnshard/engines/ut/helper.cpp @@ -1,5 +1,5 @@ #include "helper.h" -#include +#include namespace NKikimr::NOlap::NEngines::NTest { diff --git a/ydb/core/tx/columnshard/operations/batch_builder/merger.cpp b/ydb/core/tx/columnshard/operations/batch_builder/merger.cpp index f82c0bd42e05..823f6ac1cf3d 100644 --- a/ydb/core/tx/columnshard/operations/batch_builder/merger.cpp +++ b/ydb/core/tx/columnshard/operations/batch_builder/merger.cpp @@ -1,6 +1,6 @@ #include "merger.h" #include -#include +#include namespace NKikimr::NOlap { diff --git a/ydb/core/tx/columnshard/splitter/abstract/chunk_meta.h b/ydb/core/tx/columnshard/splitter/abstract/chunk_meta.h index 0e54b4c9e42a..526a2a037967 100644 --- a/ydb/core/tx/columnshard/splitter/abstract/chunk_meta.h +++ b/ydb/core/tx/columnshard/splitter/abstract/chunk_meta.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include #include diff --git a/ydb/core/tx/columnshard/splitter/batch_slice.h b/ydb/core/tx/columnshard/splitter/batch_slice.h index 3b2dd6f1bf8e..f1b019544d8c 100644 --- a/ydb/core/tx/columnshard/splitter/batch_slice.h +++ b/ydb/core/tx/columnshard/splitter/batch_slice.h @@ -3,8 +3,8 @@ #include "column_info.h" #include "blob_info.h" #include -#include -#include +#include +#include #include #include #include diff --git a/ydb/core/tx/columnshard/splitter/ut/ut_splitter.cpp b/ydb/core/tx/columnshard/splitter/ut/ut_splitter.cpp index 06c6f5020f29..7ca04ee36933 100644 --- a/ydb/core/tx/columnshard/splitter/ut/ut_splitter.cpp +++ 
b/ydb/core/tx/columnshard/splitter/ut/ut_splitter.cpp @@ -4,11 +4,11 @@ #include #include #include -#include -#include -#include +#include +#include +#include #include -#include +#include #include #include #include diff --git a/ydb/core/tx/columnshard/transactions/locks/interaction.h b/ydb/core/tx/columnshard/transactions/locks/interaction.h index bd48eb1c9460..7884568d7397 100644 --- a/ydb/core/tx/columnshard/transactions/locks/interaction.h +++ b/ydb/core/tx/columnshard/transactions/locks/interaction.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include #include diff --git a/ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp b/ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp index 8a110e996b96..ad5ec1f688fd 100644 --- a/ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp +++ b/ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp @@ -16,9 +16,9 @@ #include #include #include -#include -#include -#include +#include +#include +#include #include namespace NKikimr { diff --git a/ydb/core/tx/columnshard/ut_rw/ut_normalizer.cpp b/ydb/core/tx/columnshard/ut_rw/ut_normalizer.cpp index 68eecd9f7b73..734047952707 100644 --- a/ydb/core/tx/columnshard/ut_rw/ut_normalizer.cpp +++ b/ydb/core/tx/columnshard/ut_rw/ut_normalizer.cpp @@ -7,9 +7,9 @@ #include -#include -#include -#include +#include +#include +#include namespace NKikimr { diff --git a/ydb/core/tx/data_events/write_data.h b/ydb/core/tx/data_events/write_data.h index d409e87b5a7b..0acbec1bcf98 100644 --- a/ydb/core/tx/data_events/write_data.h +++ b/ydb/core/tx/data_events/write_data.h @@ -3,7 +3,7 @@ #include #include -#include +#include #include #include diff --git a/ydb/core/tx/sharding/unboxed_reader.h b/ydb/core/tx/sharding/unboxed_reader.h index fed3e972e2b9..11a31d7e3597 100644 --- a/ydb/core/tx/sharding/unboxed_reader.h +++ b/ydb/core/tx/sharding/unboxed_reader.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include namespace NKikimr::NMiniKQL { diff 
--git a/ydb/core/tx/sharding/ut/ut_sharding.cpp b/ydb/core/tx/sharding/ut/ut_sharding.cpp index 14dd61a48560..dce5bc6e7500 100644 --- a/ydb/core/tx/sharding/ut/ut_sharding.cpp +++ b/ydb/core/tx/sharding/ut/ut_sharding.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include diff --git a/ydb/services/ext_index/ut/ut_ext_index.cpp b/ydb/services/ext_index/ut/ut_ext_index.cpp index bede1b70619b..ec67f99c8478 100644 --- a/ydb/services/ext_index/ut/ut_ext_index.cpp +++ b/ydb/services/ext_index/ut/ut_ext_index.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include From 7d9d5f065f69f317180fd2fec8630a6d219d64e6 Mon Sep 17 00:00:00 2001 From: Grigoriy Pisarenko Date: Thu, 12 Sep 2024 19:45:25 +0000 Subject: [PATCH 3/6] Removed comments in ya.make --- ydb/library/formats/arrow/ya.make | 3 --- 1 file changed, 3 deletions(-) diff --git a/ydb/library/formats/arrow/ya.make b/ydb/library/formats/arrow/ya.make index 453bdea5d62b..c9b3d255c848 100644 --- a/ydb/library/formats/arrow/ya.make +++ b/ydb/library/formats/arrow/ya.make @@ -2,9 +2,6 @@ RECURSE_FOR_TESTS( ut ) -# Not allowed: ydb/(?!library|services/bg_tasks/abstract/interface.h) -# In: ./ydb/library/formats - LIBRARY() PEERDIR( From 26d1c339e6d216414152027d139dd852770c65ad Mon Sep 17 00:00:00 2001 From: Grigoriy Pisarenko Date: Thu, 12 Sep 2024 20:16:34 +0000 Subject: [PATCH 4/6] Added missing PEERDIR --- ydb/core/tx/schemeshard/olap/operations/alter/abstract/ya.make | 1 + 1 file changed, 1 insertion(+) diff --git a/ydb/core/tx/schemeshard/olap/operations/alter/abstract/ya.make b/ydb/core/tx/schemeshard/olap/operations/alter/abstract/ya.make index 66a40695a604..bb4459a412fd 100644 --- a/ydb/core/tx/schemeshard/olap/operations/alter/abstract/ya.make +++ b/ydb/core/tx/schemeshard/olap/operations/alter/abstract/ya.make @@ -12,6 +12,7 @@ PEERDIR( ydb/library/accessor ydb/core/protos ydb/library/actors/wilson + ydb/library/formats/arrow ) YQL_LAST_ABI_VERSION() From 
9b3003ec4cd254697fe3c0050fc09e47f63a0f6b Mon Sep 17 00:00:00 2001 From: Grigoriy Pisarenko Date: Thu, 12 Sep 2024 20:24:11 +0000 Subject: [PATCH 5/6] Fixed uresolved include 1 --- ydb/core/tx/schemeshard/olap/operations/alter/common/update.h | 2 +- .../tx/schemeshard/olap/operations/alter/standalone/update.cpp | 2 +- ydb/core/tx/schemeshard/olap/operations/alter_store.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ydb/core/tx/schemeshard/olap/operations/alter/common/update.h b/ydb/core/tx/schemeshard/olap/operations/alter/common/update.h index 013c5601780e..fd10245bc284 100644 --- a/ydb/core/tx/schemeshard/olap/operations/alter/common/update.h +++ b/ydb/core/tx/schemeshard/olap/operations/alter/common/update.h @@ -2,7 +2,7 @@ #include #include #include -#include +#include namespace NKikimr::NSchemeShard::NOlap::NAlter { diff --git a/ydb/core/tx/schemeshard/olap/operations/alter/standalone/update.cpp b/ydb/core/tx/schemeshard/olap/operations/alter/standalone/update.cpp index a442ca80392f..2902534fbb24 100644 --- a/ydb/core/tx/schemeshard/olap/operations/alter/standalone/update.cpp +++ b/ydb/core/tx/schemeshard/olap/operations/alter/standalone/update.cpp @@ -1,7 +1,7 @@ #include "update.h" #include #include -#include +#include namespace NKikimr::NSchemeShard::NOlap::NAlter { diff --git a/ydb/core/tx/schemeshard/olap/operations/alter_store.cpp b/ydb/core/tx/schemeshard/olap/operations/alter_store.cpp index 98a4b6b8d1c7..57f05068b162 100644 --- a/ydb/core/tx/schemeshard/olap/operations/alter_store.cpp +++ b/ydb/core/tx/schemeshard/olap/operations/alter_store.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include "checks.h" From 9a45cdf5f416a088777c35fd03a5f8558471fc3d Mon Sep 17 00:00:00 2001 From: Grigoriy Pisarenko Date: Thu, 12 Sep 2024 21:11:32 +0000 Subject: [PATCH 6/6] Fixed build 2 --- ydb/core/tx/columnshard/transactions/locks/interaction.h | 1 + 1 file changed, 1 insertion(+) diff --git 
a/ydb/core/tx/columnshard/transactions/locks/interaction.h b/ydb/core/tx/columnshard/transactions/locks/interaction.h index 7884568d7397..abd9ef92f6d5 100644 --- a/ydb/core/tx/columnshard/transactions/locks/interaction.h +++ b/ydb/core/tx/columnshard/transactions/locks/interaction.h @@ -1,4 +1,5 @@ #pragma once +#include #include #include