From cf89927e670128738c067df5baad08a2301b4b4a Mon Sep 17 00:00:00 2001 From: Seaven Date: Thu, 18 Jul 2024 21:12:51 +0800 Subject: [PATCH] upxxx Signed-off-by: Seaven --- be/src/column/column_access_path.cpp | 73 +- be/src/column/column_access_path.h | 13 +- be/src/column/json_column.cpp | 69 +- be/src/column/json_column.h | 15 +- be/src/common/config.h | 7 +- be/src/exprs/json_functions.cpp | 2 +- be/src/storage/CMakeLists.txt | 1 + be/src/storage/compaction_utils.cpp | 1 + be/src/storage/lake/delta_writer.cpp | 2 +- be/src/storage/lake/general_tablet_writer.cpp | 11 +- be/src/storage/lake/general_tablet_writer.h | 5 +- be/src/storage/lake/pk_tablet_writer.cpp | 4 +- be/src/storage/lake/tablet.cpp | 5 +- be/src/storage/lake/tablet_writer.h | 7 +- be/src/storage/lake/versioned_tablet.cpp | 4 +- be/src/storage/olap_common.h | 4 + be/src/storage/rowset/column_reader.cpp | 110 +- be/src/storage/rowset/column_reader.h | 4 + be/src/storage/rowset/column_writer.h | 1 + .../storage/rowset/json_column_compactor.cpp | 197 +++ be/src/storage/rowset/json_column_compactor.h | 79 ++ .../storage/rowset/json_column_iterator.cpp | 406 ++++-- be/src/storage/rowset/json_column_iterator.h | 24 +- be/src/storage/rowset/json_column_writer.cpp | 304 ++--- be/src/storage/rowset/json_column_writer.h | 49 +- be/src/storage/rowset/rowset_writer.cpp | 1 + be/src/storage/rowset/rowset_writer_context.h | 2 + be/src/storage/rowset/segment_writer.cpp | 2 + be/src/storage/rowset/segment_writer.h | 1 + be/src/storage/tablet_updates.cpp | 1 + be/src/util/json_flattener.cpp | 1188 ++++++++++++++--- be/src/util/json_flattener.h | 232 +++- be/test/CMakeLists.txt | 1 + be/test/exprs/flat_json_functions_test.cpp | 84 +- be/test/exprs/json_functions_test.cpp | 118 +- .../rowset/flat_json_column_rw_test.cpp | 788 ++++++++++- be/test/util/json_flattener_test.cpp | 272 ++++ gensrc/proto/segment.proto | 380 +++--- 38 files changed, 3584 insertions(+), 883 deletions(-) create mode 100644 be/src/storage/rowset/json_column_compactor.cpp create mode 100644 be/src/storage/rowset/json_column_compactor.h create mode 100644 be/test/util/json_flattener_test.cpp diff --git a/be/src/column/column_access_path.cpp b/be/src/column/column_access_path.cpp index eaae046f69628b..b8a706c7bb043c 100644 --- a/be/src/column/column_access_path.cpp +++ b/be/src/column/column_access_path.cpp @@ -15,6 +15,7 @@ #include "column/column_access_path.h" #include +#include #include #include "column/column.h" @@ -23,8 +24,10 @@ #include "column/vectorized_fwd.h" #include "common/object_pool.h" #include "common/status.h" +#include "common/statusor.h" #include "exprs/expr.h" #include "exprs/expr_context.h" +#include "gen_cpp/PlanNodes_types.h" #include "runtime/runtime_state.h" #include "runtime/types.h" #include "types/logical_type.h" @@ -66,22 +69,13 @@ Status ColumnAccessPath::init(const std::string& parent_path, const TColumnAcces for (const auto& child : column_path.children) { ColumnAccessPathPtr child_path = std::make_unique(); - RETURN_IF_ERROR(child_path->init(_absolute_path + "/", child, state, pool)); + RETURN_IF_ERROR(child_path->init(_absolute_path + ".", child, state, pool)); _children.emplace_back(std::move(child_path)); } return Status::OK(); } -Status ColumnAccessPath::init(TAccessPathType::type type, const std::string& path, uint32_t index) { - _type = type; - _path = path; - _column_index = index; - _absolute_path = path; - _value_type = TypeDescriptor(LogicalType::TYPE_JSON); - return Status::OK(); -} - ColumnAccessPath* 
ColumnAccessPath::get_child(const std::string& path) { for (const auto& child : _children) { if (child->_path == path) { @@ -175,6 +169,16 @@ size_t ColumnAccessPath::leaf_size() const { return size; } +void ColumnAccessPath::get_all_leafs(std::vector* result) { + if (_children.empty()) { + result->emplace_back(this); + return; + } + for (const auto& child : _children) { + child->get_all_leafs(result); + } +} + const std::string ColumnAccessPath::to_string() const { std::stringstream ss; ss << _path << "(" << _type << ")"; @@ -184,15 +188,58 @@ const std::string ColumnAccessPath::to_string() const { StatusOr> ColumnAccessPath::create(const TColumnAccessPath& column_path, RuntimeState* state, ObjectPool* pool) { auto path = std::make_unique(); - RETURN_IF_ERROR(path->init("/", column_path, state, pool)); + RETURN_IF_ERROR(path->init("", column_path, state, pool)); return path; } StatusOr> ColumnAccessPath::create(const TAccessPathType::type& type, const std::string& path, uint32_t index) { auto p = std::make_unique(); - RETURN_IF_ERROR(p->init(type, path, index)); - return p; + p->_type = type; + p->_path = path; + p->_column_index = index; + p->_absolute_path = path; + p->_value_type = TypeDescriptor(LogicalType::TYPE_JSON); + p->_children.clear(); + return std::move(p); +} + +ColumnAccessPath* insert_json_path_impl(const std::string& path, ColumnAccessPath* root) { + if (path.empty()) { + return root; + } + + size_t pos = 0; + if (path.starts_with("\"")) { + pos = path.find('\"', 1); + DCHECK(pos != std::string::npos); + } + pos = path.find('.', pos); + std::string key; + std::string next; + if (pos == std::string::npos) { + key = path; + } else { + key = path.substr(0, pos); + next = path.substr(pos + 1); + } + + auto child = root->get_child(key); + if (child == nullptr) { + auto n = ColumnAccessPath::create(TAccessPathType::FIELD, key, 0); + DCHECK(n.ok()); + root->children().emplace_back(std::move(n.value())); + child = root->children().back().get(); + } + return insert_json_path_impl(next, child); +} + +void ColumnAccessPath::insert_json_path(ColumnAccessPath* root, LogicalType type, const std::string& path) { + auto leaf = insert_json_path_impl(path, root); + leaf->_type = TAccessPathType::type::FIELD; + leaf->_column_index = 0; + leaf->_absolute_path = path; + leaf->_value_type = TypeDescriptor(type); } } // namespace starrocks diff --git a/be/src/column/column_access_path.h b/be/src/column/column_access_path.h index c05d75df4c7570..d6edddfccd45fc 100644 --- a/be/src/column/column_access_path.h +++ b/be/src/column/column_access_path.h @@ -18,9 +18,11 @@ #include #include +#include "column/column.h" #include "common/status.h" #include "gen_cpp/PlanNodes_types.h" #include "runtime/types.h" +#include "types/logical_type.h" namespace starrocks { @@ -41,15 +43,14 @@ class ColumnAccessPath { static StatusOr> create(const TColumnAccessPath& column_path, RuntimeState* state, ObjectPool* pool); - // for test - static StatusOr> create(const TAccessPathType::type& type, - const std::string& path, uint32_t index); - Status init(const std::string& parent_path, const TColumnAccessPath& column_path, RuntimeState* state, ObjectPool* pool); // for test - Status init(TAccessPathType::type type, const std::string& path, uint32_t index); + static StatusOr> create(const TAccessPathType::type& type, + const std::string& path, uint32_t index); + static void insert_json_path(ColumnAccessPath* root, LogicalType type, const std::string& path); + // end test const std::string& path() const { return _path; } 
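A minimal usage sketch (illustrative only, not part of this patch; the column name "js", the sub-paths and the value types are hypothetical) of how the test helpers added above — create, insert_json_path and get_all_leafs — compose into an access-path tree for a flat JSON read:

    // Build a path tree for a JSON column "js", requesting js.a.b as BIGINT and js.c as JSON.
    auto root = ColumnAccessPath::create(TAccessPathType::FIELD, "js", 0).value();
    ColumnAccessPath::insert_json_path(root.get(), LogicalType::TYPE_BIGINT, "a.b");
    ColumnAccessPath::insert_json_path(root.get(), LogicalType::TYPE_JSON, "c");

    std::vector<ColumnAccessPath*> leafs;
    root->get_all_leafs(&leafs);
    // Two leaves: absolute_path() == "a.b" (TYPE_BIGINT) and "c" (TYPE_JSON).
    // ColumnReader::new_iterator collects these absolute paths as the target paths
    // and target types of the flat/merge iterators introduced later in this patch.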
@@ -86,6 +87,8 @@ class ColumnAccessPath { size_t leaf_size() const; + void get_all_leafs(std::vector* result); + private: // path type, to mark the path is KEY/OFFSET/FIELD/ALL/INDEX TAccessPathType::type _type; diff --git a/be/src/column/json_column.cpp b/be/src/column/json_column.cpp index 12c7a836f6bf28..d4aea4cc6f8e48 100644 --- a/be/src/column/json_column.cpp +++ b/be/src/column/json_column.cpp @@ -66,7 +66,8 @@ std::string JsonColumn::debug_item(size_t idx) const { std::ostringstream ss; ss << "{"; size_t i = 0; - for (; i < _flat_column_paths.size() - i; i++) { + // flat json debug is different with normal, lose quota + for (; i < _flat_column_paths.size() - 1; i++) { ss << _flat_column_paths[i] << ": "; ss << get_flat_field(i)->debug_item(idx) << ", "; } @@ -163,51 +164,31 @@ const ColumnPtr& JsonColumn::get_flat_field(int index) const { return _flat_columns[index]; } +ColumnPtr& JsonColumn::get_remain() { + DCHECK(_flat_columns.size() == _flat_column_paths.size() + 1); + return _flat_columns[_flat_columns.size() - 1]; +} + +const ColumnPtr& JsonColumn::get_remain() const { + DCHECK(_flat_columns.size() == _flat_column_paths.size() + 1); + return _flat_columns[_flat_columns.size() - 1]; +} + LogicalType JsonColumn::get_flat_field_type(const std::string& path) const { DCHECK(_path_to_index.count(path) > 0); return _flat_column_types[_path_to_index.at(path)]; } -void JsonColumn::init_flat_columns(const std::vector& paths) { - if (_flat_column_paths.empty()) { - _flat_column_paths.insert(_flat_column_paths.cbegin(), paths.cbegin(), paths.cend()); - _flat_column_types.assign(paths.size(), LogicalType::TYPE_JSON); - for (size_t i = 0; i < _flat_column_paths.size(); i++) { - // nullable column - _flat_columns.emplace_back(NullableColumn::create(JsonColumn::create(), NullColumn::create())); - _path_to_index[_flat_column_paths[i]] = i; - } - } else { - DCHECK(_flat_column_paths.size() == paths.size()); - DCHECK(_flat_columns.size() == paths.size()); - DCHECK(_flat_column_types.size() == paths.size()); - for (size_t i = 0; i < _flat_column_paths.size(); i++) { - DCHECK(_flat_column_paths[i] == paths[i]); - DCHECK(_flat_columns[i]->is_nullable()); - DCHECK(_flat_column_types[i] == LogicalType::TYPE_JSON); - } - } -} - -void JsonColumn::init_flat_columns(const std::vector& paths, const std::vector& types) { - if (_flat_column_paths.empty()) { - DCHECK_EQ(paths.size(), types.size()); - _flat_column_paths.insert(_flat_column_paths.cbegin(), paths.cbegin(), paths.cend()); - _flat_column_types.insert(_flat_column_types.cbegin(), types.cbegin(), types.cend()); - for (size_t i = 0; i < _flat_column_paths.size(); i++) { - // nullable column - _flat_columns.emplace_back(ColumnHelper::create_column(TypeDescriptor(types[i]), true)); - _path_to_index[_flat_column_paths[i]] = i; - } - } else { - DCHECK(_flat_column_paths.size() == paths.size()); - DCHECK(_flat_columns.size() == paths.size()); - DCHECK(_flat_column_types.size() == paths.size()); - for (size_t i = 0; i < _flat_column_paths.size(); i++) { - DCHECK(_flat_column_paths[i] == paths[i]); - DCHECK(_flat_columns[i]->is_nullable()); - DCHECK(_flat_column_types[i] == types[i]); - } +void JsonColumn::set_flat_columns(const std::vector& paths, const std::vector& types, + const Columns& flat_columns) { + DCHECK_EQ(paths.size(), types.size()); + DCHECK_GE(paths.size(), flat_columns.size()); + DCHECK_LE(paths.size(), flat_columns.size() + 1); + _flat_column_paths.insert(_flat_column_paths.cbegin(), paths.cbegin(), paths.cend()); + 
_flat_column_types.insert(_flat_column_types.cbegin(), types.cbegin(), types.cend()); + _flat_columns.insert(_flat_columns.cbegin(), flat_columns.cbegin(), flat_columns.cend()); + for (size_t i = 0; i < _flat_column_paths.size(); i++) { + _path_to_index[_flat_column_paths[i]] = i; } } @@ -307,7 +288,11 @@ void JsonColumn::append(const Column& src, size_t offset, size_t count) { if (other_json->is_flat_json() && !is_flat_json()) { // only hit in AggregateIterator (Aggregate mode in storage) DCHECK_EQ(0, this->size()); - init_flat_columns(other_json->_flat_column_paths, other_json->_flat_column_types); + std::vector copy; + for (const auto& col : other_json->_flat_columns) { + copy.emplace_back(col->clone_empty()); + } + set_flat_columns(other_json->flat_column_paths(), other_json->flat_column_types(), copy); } if (is_flat_json()) { diff --git a/be/src/column/json_column.h b/be/src/column/json_column.h index 977e1771bf0fa1..833c22925eb054 100644 --- a/be/src/column/json_column.h +++ b/be/src/column/json_column.h @@ -113,27 +113,34 @@ class JsonColumn final : public ColumnFactory, JsonColum Columns& get_flat_fields() { return _flat_columns; }; + const Columns& get_flat_fields() const { return _flat_columns; }; + ColumnPtr& get_flat_field(int index); const ColumnPtr& get_flat_field(int index) const; + ColumnPtr& get_remain(); + + const ColumnPtr& get_remain() const; + const std::vector& flat_column_paths() const { return _flat_column_paths; } const std::vector& flat_column_types() const { return _flat_column_types; } bool has_flat_column(const std::string& path) const; - void init_flat_columns(const std::vector& paths); + bool has_remain() const { return _flat_columns.size() == (_flat_column_paths.size() + 1); } - void init_flat_columns(const std::vector& paths, const std::vector& types); + void set_flat_columns(const std::vector& paths, const std::vector& types, + const Columns& flat_columns); std::string debug_flat_paths() const; private: - // flat-columns + // flat-columns[sub_columns, remain_column] Columns _flat_columns; - // flat-column paths + // flat-column paths, doesn't contains remain column std::vector _flat_column_paths; std::vector _flat_column_types; std::unordered_map _path_to_index; diff --git a/be/src/common/config.h b/be/src/common/config.h index d51c6947699fe4..d953aae71c58d0 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1295,16 +1295,13 @@ CONF_mInt32(dictionary_cache_refresh_threadpool_size, "8"); CONF_mBool(enable_json_flat, "true"); // extract flat json column when row_num * null_factor < null_row_num -CONF_mDouble(json_flat_null_factor, "0.3"); +CONF_mDouble(json_flat_null_factor, "0.4"); // extract flat json column when row_num * sparsity_factor < hit_row_num CONF_mDouble(json_flat_sparsity_factor, "0.9"); -// only flatten json when the number of sub-field in the JSON exceeds the limit -CONF_mInt32(json_flat_internal_column_min_limit, "5"); - // the maximum number of extracted JSON sub-field -CONF_mInt32(json_flat_column_max, "20"); +CONF_mInt32(json_flat_column_max, "100"); // Allowable intervals for continuous generation of pk dumps // Disable when pk_dump_interval_seconds <= 0 diff --git a/be/src/exprs/json_functions.cpp b/be/src/exprs/json_functions.cpp index 11f77c2cd16835..46a76b331f6372 100644 --- a/be/src/exprs/json_functions.cpp +++ b/be/src/exprs/json_functions.cpp @@ -588,7 +588,7 @@ StatusOr JsonFunctions::_flat_json_query_impl(FunctionContext* contex chunk.append_column(flat_column, 0); return 
state->cast_expr->evaluate_checked(nullptr, &chunk); } - return flat_column; + return flat_column->clone(); } } diff --git a/be/src/storage/CMakeLists.txt b/be/src/storage/CMakeLists.txt index 9e31a771b6c255..2cb33120e98ad6 100644 --- a/be/src/storage/CMakeLists.txt +++ b/be/src/storage/CMakeLists.txt @@ -81,6 +81,7 @@ add_library(Storage STATIC rowset/bitshuffle_page.cpp rowset/bitshuffle_wrapper.cpp rowset/column_iterator.cpp + rowset/json_column_compactor.cpp rowset/json_column_iterator.cpp rowset/json_column_writer.cpp rowset/cast_column_iterator.cpp diff --git a/be/src/storage/compaction_utils.cpp b/be/src/storage/compaction_utils.cpp index 7e429bb150b634..f33528f9c48883 100644 --- a/be/src/storage/compaction_utils.cpp +++ b/be/src/storage/compaction_utils.cpp @@ -69,6 +69,7 @@ Status CompactionUtils::construct_output_rowset_writer(Tablet* tablet, uint32_t context.writer_type = (algorithm == VERTICAL_COMPACTION ? RowsetWriterType::kVertical : RowsetWriterType::kHorizontal); context.gtid = gtid; + context.is_compaction = true; Status st = RowsetFactory::create_rowset_writer(context, output_rowset_writer); if (!st.ok()) { std::stringstream ss; diff --git a/be/src/storage/lake/delta_writer.cpp b/be/src/storage/lake/delta_writer.cpp index faf88bd791b9f0..4baf6438b152c4 100644 --- a/be/src/storage/lake/delta_writer.cpp +++ b/be/src/storage/lake/delta_writer.cpp @@ -230,7 +230,7 @@ Status DeltaWriterImpl::build_schema_and_writer() { _txn_id, nullptr, false /** no compaction**/); } else { _tablet_writer = std::make_unique(_tablet_manager, _tablet_id, _write_schema, - _txn_id); + _txn_id, false); } RETURN_IF_ERROR(_tablet_writer->open()); _mem_table_sink = std::make_unique(_tablet_writer.get()); diff --git a/be/src/storage/lake/general_tablet_writer.cpp b/be/src/storage/lake/general_tablet_writer.cpp index 5b0f4e7dd07d9f..3a179f5f9ee05f 100644 --- a/be/src/storage/lake/general_tablet_writer.cpp +++ b/be/src/storage/lake/general_tablet_writer.cpp @@ -32,8 +32,8 @@ namespace starrocks::lake { HorizontalGeneralTabletWriter::HorizontalGeneralTabletWriter(TabletManager* tablet_mgr, int64_t tablet_id, std::shared_ptr schema, int64_t txn_id, - ThreadPool* flush_pool) - : TabletWriter(tablet_mgr, tablet_id, std::move(schema), txn_id, flush_pool) {} + bool is_compaction, ThreadPool* flush_pool) + : TabletWriter(tablet_mgr, tablet_id, std::move(schema), txn_id, is_compaction, flush_pool) {} HorizontalGeneralTabletWriter::~HorizontalGeneralTabletWriter() = default; @@ -82,6 +82,7 @@ Status HorizontalGeneralTabletWriter::reset_segment_writer() { DCHECK(_schema != nullptr); auto name = gen_segment_filename(_txn_id); SegmentWriterOptions opts; + opts.is_compaction = _is_compaction; WritableFileOptions wopts; if (config::enable_transparent_data_encryption) { ASSIGN_OR_RETURN(auto pair, KeyCache::instance().create_encryption_meta_pair_using_current_kek()); @@ -118,8 +119,9 @@ Status HorizontalGeneralTabletWriter::flush_segment_writer(SegmentPB* segment) { VerticalGeneralTabletWriter::VerticalGeneralTabletWriter(TabletManager* tablet_mgr, int64_t tablet_id, std::shared_ptr schema, int64_t txn_id, - uint32_t max_rows_per_segment, ThreadPool* flush_pool) - : TabletWriter(tablet_mgr, tablet_id, std::move(schema), txn_id, flush_pool), + uint32_t max_rows_per_segment, bool is_compaction, + ThreadPool* flush_pool) + : TabletWriter(tablet_mgr, tablet_id, std::move(schema), txn_id, _is_compaction, flush_pool), _max_rows_per_segment(max_rows_per_segment) {} VerticalGeneralTabletWriter::~VerticalGeneralTabletWriter() { 
@@ -267,6 +269,7 @@ StatusOr> VerticalGeneralTabletWriter::create_seg DCHECK(_schema != nullptr); auto name = gen_segment_filename(_txn_id); SegmentWriterOptions opts; + opts.is_compaction = _is_compaction; WritableFileOptions wopts; if (config::enable_transparent_data_encryption) { ASSIGN_OR_RETURN(auto pair, KeyCache::instance().create_encryption_meta_pair_using_current_kek()); diff --git a/be/src/storage/lake/general_tablet_writer.h b/be/src/storage/lake/general_tablet_writer.h index 8b640c75ebc96f..815117fa2dc867 100644 --- a/be/src/storage/lake/general_tablet_writer.h +++ b/be/src/storage/lake/general_tablet_writer.h @@ -34,7 +34,7 @@ class HorizontalGeneralTabletWriter : public TabletWriter { public: explicit HorizontalGeneralTabletWriter(TabletManager* tablet_mgr, int64_t tablet_id, std::shared_ptr schema, int64_t txn_id, - ThreadPool* flush_pool = nullptr); + bool is_compaction, ThreadPool* flush_pool = nullptr); ~HorizontalGeneralTabletWriter() override; @@ -84,7 +84,8 @@ class VerticalGeneralTabletWriter : public TabletWriter { public: explicit VerticalGeneralTabletWriter(TabletManager* tablet_mgr, int64_t tablet_id, std::shared_ptr schema, int64_t txn_id, - uint32_t max_rows_per_segment, ThreadPool* flush_pool = nullptr); + uint32_t max_rows_per_segment, bool is_compaction, + ThreadPool* flush_pool = nullptr); ~VerticalGeneralTabletWriter() override; diff --git a/be/src/storage/lake/pk_tablet_writer.cpp b/be/src/storage/lake/pk_tablet_writer.cpp index 9d86da16da1ce4..67e386eab160f6 100644 --- a/be/src/storage/lake/pk_tablet_writer.cpp +++ b/be/src/storage/lake/pk_tablet_writer.cpp @@ -32,7 +32,7 @@ namespace starrocks::lake { HorizontalPkTabletWriter::HorizontalPkTabletWriter(TabletManager* tablet_mgr, int64_t tablet_id, std::shared_ptr schema, int64_t txn_id, ThreadPool* flush_pool, bool is_compaction) - : HorizontalGeneralTabletWriter(tablet_mgr, tablet_id, std::move(schema), txn_id, flush_pool), + : HorizontalGeneralTabletWriter(tablet_mgr, tablet_id, std::move(schema), txn_id, is_compaction, flush_pool), _rowset_txn_meta(std::make_unique()) { if (is_compaction) { auto rows_mapper_filename = lake_rows_mapper_filename(tablet_id, txn_id); @@ -111,7 +111,7 @@ VerticalPkTabletWriter::VerticalPkTabletWriter(TabletManager* tablet_mgr, int64_ uint32_t max_rows_per_segment, ThreadPool* flush_pool, bool is_compaction) : VerticalGeneralTabletWriter(tablet_mgr, tablet_id, std::move(schema), txn_id, max_rows_per_segment, - flush_pool) { + is_compaction, flush_pool) { if (is_compaction) { auto rows_mapper_filename = lake_rows_mapper_filename(tablet_id, txn_id); if (rows_mapper_filename.ok()) { diff --git a/be/src/storage/lake/tablet.cpp b/be/src/storage/lake/tablet.cpp index 58644a00945517..7a9ffcb63f9987 100644 --- a/be/src/storage/lake/tablet.cpp +++ b/be/src/storage/lake/tablet.cpp @@ -85,11 +85,12 @@ StatusOr> Tablet::new_writer(WriterType type, int6 } } else { if (type == kHorizontal) { - return std::make_unique(_mgr, _id, tablet_schema, txn_id, flush_pool); + return std::make_unique(_mgr, _id, tablet_schema, txn_id, is_compaction, + flush_pool); } else { DCHECK(type == kVertical); return std::make_unique(_mgr, _id, tablet_schema, txn_id, max_rows_per_segment, - flush_pool); + is_compaction, flush_pool); } } } diff --git a/be/src/storage/lake/tablet_writer.h b/be/src/storage/lake/tablet_writer.h index 011af0425784d1..e4a208f854a487 100644 --- a/be/src/storage/lake/tablet_writer.h +++ b/be/src/storage/lake/tablet_writer.h @@ -41,12 +41,13 @@ enum WriterType : int { kHorizontal = 0, 
kVertical = 1 }; class TabletWriter { public: explicit TabletWriter(TabletManager* tablet_mgr, int64_t tablet_id, std::shared_ptr schema, - int64_t txn_id, ThreadPool* flush_pool = nullptr) + int64_t txn_id, bool is_compaction, ThreadPool* flush_pool = nullptr) : _tablet_mgr(tablet_mgr), _tablet_id(tablet_id), _schema(std::move(schema)), _txn_id(txn_id), - _flush_pool(flush_pool) {} + _flush_pool(flush_pool), + _is_compaction(is_compaction) {} virtual ~TabletWriter() = default; @@ -141,6 +142,8 @@ class TabletWriter { uint32_t _seg_id = 0; bool _finished = false; OlapWriterStatistics _stats; + + bool _is_compaction = false; }; } // namespace lake diff --git a/be/src/storage/lake/versioned_tablet.cpp b/be/src/storage/lake/versioned_tablet.cpp index 0dabb13dc3fade..fc2649f034e0fb 100644 --- a/be/src/storage/lake/versioned_tablet.cpp +++ b/be/src/storage/lake/versioned_tablet.cpp @@ -51,11 +51,11 @@ StatusOr> VersionedTablet::new_writer(WriterType t } else { if (type == kHorizontal) { return std::make_unique(_tablet_mgr, id(), tablet_schema, txn_id, - flush_pool); + is_compaction, flush_pool); } else { DCHECK(type == kVertical); return std::make_unique(_tablet_mgr, id(), tablet_schema, txn_id, - max_rows_per_segment, flush_pool); + max_rows_per_segment, is_compaction, flush_pool); } } } diff --git a/be/src/storage/olap_common.h b/be/src/storage/olap_common.h index 0110abb48971de..ebf1c111dd47ac 100644 --- a/be/src/storage/olap_common.h +++ b/be/src/storage/olap_common.h @@ -302,7 +302,11 @@ struct OlapReaderStatistics { // ------ for json type, to count flat column ------ // key: json absolute path, value: count int64_t json_flatten_ns = 0; + int64_t json_cast_ns = 0; + int64_t json_merge_ns = 0; + int64_t json_init_ns = 0; std::unordered_map flat_json_hits; + std::unordered_map merge_json_hits; std::unordered_map dynamic_json_hits; }; diff --git a/be/src/storage/rowset/column_reader.cpp b/be/src/storage/rowset/column_reader.cpp index 8b2946d388d254..8afdcb4646f593 100644 --- a/be/src/storage/rowset/column_reader.cpp +++ b/be/src/storage/rowset/column_reader.cpp @@ -45,6 +45,7 @@ #include "column/datum_convert.h" #include "common/compiler_util.h" #include "common/logging.h" +#include "common/statusor.h" #include "runtime/types.h" #include "storage/column_predicate.h" #include "storage/inverted/index_descriptor.hpp" @@ -131,6 +132,8 @@ Status ColumnReader::_init(ColumnMetaPB* meta, const TabletColumn* column) { // TODO(mofei) store format_version in ColumnReader const JsonMetaPB& json_meta = meta->json_meta(); CHECK_EQ(kJsonMetaDefaultFormatVersion, json_meta.format_version()) << "Only format_version=1 is supported"; + _is_flat_json = json_meta.is_flat(); + _has_remain = json_meta.has_remain(); } if (is_scalar_field_type(delegate_type(_column_type))) { RETURN_IF_ERROR(EncodingInfo::get(delegate_type(_column_type), meta->encoding(), &_encoding_info)); @@ -664,51 +667,100 @@ StatusOr> ColumnReader::new_iterator(ColumnAcces const TabletColumn* column) { if (_column_type == LogicalType::TYPE_JSON) { auto json_iter = std::make_unique(this); - if (path == nullptr || path->children().empty()) { - return json_iter; - } - std::vector> flat_iters; - // short name path, e.g. 
'a' - std::vector flat_paths; + // access sub columns + std::vector access_paths; + std::vector target_paths; std::vector target_types; - std::vector source_types; - { - for (auto& p : path->children()) { - if (UNLIKELY(!p->children().empty())) { - // @todo: support later - return Status::InvalidArgument("doesn't support multi-layer json access path: " + - p->absolute_path()); - } - flat_paths.emplace_back(p->path()); + if (path != nullptr && !path->children().empty()) { + path->get_all_leafs(&access_paths); + for (auto& p : access_paths) { + target_paths.emplace_back(p->absolute_path()); // use absolute path, not relative path target_types.emplace_back(p->value_type().type); } } - int start = is_nullable() ? 1 : 0; - for (auto& p : flat_paths) { - for (size_t i = start; i < _sub_readers->size(); i++) { + if (!_is_flat_json) { + if (path == nullptr || path->children().empty()) { + return json_iter; + } + // dynamic flattern + // we must dynamic flat json, because we don't know other segment wasn't the paths + return create_json_dynamic_flat_iterator(std::move(json_iter), target_paths, target_types); + } + + std::vector source_paths; + std::vector source_types; + std::unique_ptr null_iter; + std::vector> all_iters; + size_t start = is_nullable() ? 1 : 0; + size_t end = _has_remain ? _sub_readers->size() - 1 : _sub_readers->size(); + if (is_nullable()) { + ASSIGN_OR_RETURN(null_iter, (*_sub_readers)[0]->new_iterator()); + } + + if (path == nullptr || path->children().empty()) { + DCHECK(_is_flat_json); + for (size_t i = start; i < end; i++) { + const auto& rd = (*_sub_readers)[i]; + std::string name = rd->name(); + ASSIGN_OR_RETURN(auto iter, rd->new_iterator()); + source_paths.emplace_back(name); + source_types.emplace_back(rd->column_type()); + all_iters.emplace_back(std::move(iter)); + } + + if (_has_remain) { + const auto& rd = (*_sub_readers)[end]; + ASSIGN_OR_RETURN(auto iter, rd->new_iterator()); + all_iters.emplace_back(std::move(iter)); + } + // access whole json + return create_json_merge_iterator(std::move(null_iter), std::move(all_iters), source_paths, source_types); + } + + bool need_remain = false; + for (size_t k = 0; k < target_paths.size(); k++) { + auto& target = target_paths[k]; + size_t i = start; + for (; i < end; i++) { const auto& rd = (*_sub_readers)[i]; - if (rd->name() == p) { + std::string name = rd->name(); + // target: b.b2.b3 + // source: b.b2 + if (target == name || target.starts_with(name + ".")) { ASSIGN_OR_RETURN(auto iter, rd->new_iterator()); - flat_iters.emplace_back(std::move(iter)); + source_paths.emplace_back(name); source_types.emplace_back(rd->column_type()); + all_iters.emplace_back(std::move(iter)); break; + } else if (name.starts_with(target + ".")) { + // target: b.b2 + // source: b.b2.b3 + if (target_types[k] != TYPE_JSON && !is_string_type(target_types[k])) { + // don't need column and remain + break; + } + need_remain = true; + ASSIGN_OR_RETURN(auto iter, rd->new_iterator()); + source_paths.emplace_back(name); + source_types.emplace_back(rd->column_type()); + all_iters.emplace_back(std::move(iter)); } } + need_remain |= (i == end); } - if (flat_iters.size() != flat_paths.size()) { - // we must dynamic flat json, because we don't know other segment wasn't the paths - return create_json_dynamic_flat_iterator(std::move(json_iter), flat_paths, target_types, path); + if (_has_remain && need_remain) { + const auto& rd = (*_sub_readers)[end]; + ASSIGN_OR_RETURN(auto iter, rd->new_iterator()); + all_iters.emplace_back(std::move(iter)); } - 
std::unique_ptr null_iterator; - if (is_nullable()) { - ASSIGN_OR_RETURN(null_iterator, (*_sub_readers)[0]->new_iterator()); - } - return create_json_flat_iterator(this, std::move(null_iterator), std::move(flat_iters), flat_paths, - target_types, source_types, path); + DCHECK(!source_paths.empty()); + DCHECK(!all_iters.empty()); + return create_json_flat_iterator(this, std::move(null_iter), std::move(all_iters), target_paths, target_types, + source_paths, source_types); } else if (is_scalar_field_type(delegate_type(_column_type))) { return std::make_unique(this); } else if (_column_type == LogicalType::TYPE_ARRAY) { diff --git a/be/src/storage/rowset/column_reader.h b/be/src/storage/rowset/column_reader.h index e6e74ea82452e2..09aa158ca81608 100644 --- a/be/src/storage/rowset/column_reader.h +++ b/be/src/storage/rowset/column_reader.h @@ -194,6 +194,8 @@ class ColumnReader { const std::vector>* sub_readers() const { return _sub_readers.get(); } + bool has_remain_json() const { return _has_remain; } + private: const std::string& file_name() const { return _segment->file_name(); } template @@ -294,6 +296,8 @@ class ColumnReader { // only for json flat column std::string _name; + bool _is_flat_json = false; + bool _has_remain = false; // only used for inverted index load OnceFlag _inverted_index_load_once; diff --git a/be/src/storage/rowset/column_writer.h b/be/src/storage/rowset/column_writer.h index 474c0f4d208f09..b82c39218c4b2a 100644 --- a/be/src/storage/rowset/column_writer.h +++ b/be/src/storage/rowset/column_writer.h @@ -85,6 +85,7 @@ struct ColumnWriterOptions { GlobalDictMap* global_dict = nullptr; bool need_flat = false; + bool is_compaction = false; std::string field_name; }; diff --git a/be/src/storage/rowset/json_column_compactor.cpp b/be/src/storage/rowset/json_column_compactor.cpp new file mode 100644 index 00000000000000..309671601ed38a --- /dev/null +++ b/be/src/storage/rowset/json_column_compactor.cpp @@ -0,0 +1,197 @@ +// Copyright 2021-present StarRocks, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "storage/rowset/json_column_compactor.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "column/column.h" +#include "column/column_helper.h" +#include "column/column_viewer.h" +#include "column/json_column.h" +#include "column/nullable_column.h" +#include "column/vectorized_fwd.h" +#include "common/config.h" +#include "common/status.h" +#include "gen_cpp/segment.pb.h" +#include "gutil/casts.h" +#include "gutil/macros.h" +#include "runtime/types.h" +#include "storage/rowset/column_writer.h" +#include "storage/rowset/common.h" +#include "types/constexpr.h" +#include "types/logical_type.h" +#include "util/json_flattener.h" +#include "velocypack/vpack.h" + +namespace starrocks { +Status FlatJsonColumnCompactor::append(const Column& column) { + // compection will reuse column, must copy in there. 
+ auto clone = column.clone_empty(); + clone->append(column); + _json_datas.emplace_back(std::move(clone)); + + _estimate_size += column.byte_size(); + return Status::OK(); +} + +Status FlatJsonColumnCompactor::_compact_columns(std::vector& json_datas) { + // all json datas must full json + JsonPathDeriver deriver; + std::vector vc; + for (const auto& js : json_datas) { + vc.emplace_back(js.get()); + } + deriver.derived(vc); + + _flat_paths = deriver.flat_paths(); + _flat_types = deriver.flat_types(); + _has_remain = deriver.has_remain_json(); + + if (_flat_paths.empty()) { + // write json directly + _is_flat = false; + _json_meta->mutable_json_meta()->set_has_remain(false); + _json_meta->mutable_json_meta()->set_is_flat(false); + + for (auto& col : json_datas) { + JsonColumn* json_col; + if (col->is_nullable()) { + auto nullable_column = down_cast(col.get()); + json_col = down_cast(nullable_column->data_column().get()); + } else { + json_col = down_cast(col.get()); + } + + if (!json_col->is_flat_json()) { + RETURN_IF_ERROR(_json_writer->append(*col)); + } else { + JsonMerger merger(json_col->flat_column_paths(), json_col->flat_column_types(), json_col->has_remain()); + auto j = merger.merge(json_col->get_flat_fields()); + RETURN_IF_ERROR(_json_writer->append(*j)); + } + } + return Status::OK(); + } + + _is_flat = true; + RETURN_IF_ERROR(_init_flat_writers()); + + JsonFlattener flattener(deriver); + HyperJsonTransformer transformer(deriver); + + for (auto& col : json_datas) { + JsonColumn* json_col; + if (col->is_nullable()) { + auto nullable_column = down_cast(col.get()); + json_col = down_cast(nullable_column->data_column().get()); + } else { + json_col = down_cast(col.get()); + } + + if (!json_col->is_flat_json()) { + flattener.flatten(json_col); + _flat_columns = flattener.mutable_result(); + } else { + transformer.init_compaction_task(json_col); + RETURN_IF_ERROR(transformer.trans(json_col->get_flat_fields())); + _flat_columns = transformer.mutable_result(); + transformer.reset(); + } + + // recode null column in 1st + if (_json_meta->is_nullable()) { + auto nulls = NullColumn::create(); + uint8_t IS_NULL = 1; + uint8_t NOT_NULL = 0; + if (col->only_null()) { + nulls->append_value_multiple_times(&IS_NULL, col->size()); + } else if (col->is_nullable()) { + auto* nullable_column = down_cast(col.get()); + auto* nl = down_cast(nullable_column->null_column().get()); + nulls->append(*nl, 0, nl->size()); + } else { + nulls->append_value_multiple_times(&NOT_NULL, col->size()); + } + + _flat_columns.insert(_flat_columns.begin(), nulls); + } + + RETURN_IF_ERROR(_write_flat_column()); + _flat_columns.clear(); + col->resize_uninitialized(0); // release after write + } + + _json_datas.clear(); // release after write + return Status::OK(); +} + +Status FlatJsonColumnCompactor::finish() { + for (const auto& js : _json_datas) { + DCHECK_GT(js->size(), 0); + } + RETURN_IF_ERROR(_compact_columns(_json_datas)); + for (auto& iter : _flat_writers) { + RETURN_IF_ERROR(iter->finish()); + } + return _json_writer->finish(); +} + +uint64_t FlatJsonColumnCompactor::estimate_buffer_size() { + return _estimate_size; +} + +Status JsonColumnCompactor::append(const Column& column) { + const JsonColumn* json_col; + NullColumnPtr nulls = nullptr; + if (column.is_nullable()) { + auto nullable_column = down_cast(column); + nulls = nullable_column.null_column(); + json_col = down_cast(nullable_column.data_column().get()); + } else { + json_col = down_cast(&column); + } + + if (!json_col->is_flat_json()) { + return 
_json_writer->append(column); + } + + JsonMerger merger(json_col->flat_column_paths(), json_col->flat_column_types(), json_col->has_remain()); + auto p = merger.merge(json_col->get_flat_fields()); + + if (column.is_nullable()) { + auto n = NullableColumn::create(p, nulls); + return _json_writer->append(*n); + } else { + return _json_writer->append(*p); + } +} + +Status JsonColumnCompactor::finish() { + _json_meta->mutable_json_meta()->set_format_version(kJsonMetaDefaultFormatVersion); + _json_meta->mutable_json_meta()->set_has_remain(false); + _json_meta->mutable_json_meta()->set_is_flat(false); + return _json_writer->finish(); +} + +} // namespace starrocks diff --git a/be/src/storage/rowset/json_column_compactor.h b/be/src/storage/rowset/json_column_compactor.h new file mode 100644 index 00000000000000..3ea644ebf81b8a --- /dev/null +++ b/be/src/storage/rowset/json_column_compactor.h @@ -0,0 +1,79 @@ +// Copyright 2021-present StarRocks, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "storage/rowset/column_writer.h" +#include "storage/rowset/json_column_writer.h" + +namespace starrocks { +class FlatJsonColumnCompactor final : public FlatJsonColumnWriter { +public: + FlatJsonColumnCompactor(const ColumnWriterOptions& opts, const TypeInfoPtr& type_info, WritableFile* wfile, + std::unique_ptr json_writer) + : FlatJsonColumnWriter(opts, type_info, wfile, std::move(json_writer)) {} + + Status append(const Column& column) override; + uint64_t estimate_buffer_size() override; + + Status finish() override; + +private: + Status _compact_columns(std::vector& json_datas); + +private: + std::vector _json_datas; + size_t _estimate_size = 0; +}; + +class JsonColumnCompactor final : public ColumnWriter { +public: + JsonColumnCompactor(const ColumnWriterOptions& opts, const TypeInfoPtr& type_info, WritableFile* wfile, + std::unique_ptr json_writer) + : ColumnWriter(std::move(type_info), opts.meta->length(), opts.meta->is_nullable()), + _json_meta(opts.meta), + _json_writer(std::move(json_writer)) {} + + JsonColumnCompactor(const ColumnWriterOptions& opts, const TypeInfoPtr& type_info, WritableFile* wfile); + + ~JsonColumnCompactor() override = default; + + Status init() override { return _json_writer->init(); } + + Status append(const Column& column) override; + + Status finish_current_page() override { return _json_writer->finish_current_page(); } + + uint64_t estimate_buffer_size() override { return _json_writer->estimate_buffer_size(); } + + Status finish() override; + + Status write_data() override { return _json_writer->write_data(); } + Status write_ordinal_index() override { return _json_writer->write_ordinal_index(); } + Status write_zone_map() override { return _json_writer->write_zone_map(); } + Status write_bitmap_index() override { return _json_writer->write_bitmap_index(); } + Status write_bloom_filter_index() override { return _json_writer->write_bloom_filter_index(); } + ordinal_t get_next_rowid() const override { 
return _json_writer->get_next_rowid(); } + uint64_t total_mem_footprint() const override { return _json_writer->total_mem_footprint(); } + +private: + void _flat_column(std::vector& json_datas); + +private: + ColumnMetaPB* _json_meta; + std::unique_ptr _json_writer; +}; +} // namespace starrocks diff --git a/be/src/storage/rowset/json_column_iterator.cpp b/be/src/storage/rowset/json_column_iterator.cpp index ea70a04b4f9567..585a9c6ba70e5c 100644 --- a/be/src/storage/rowset/json_column_iterator.cpp +++ b/be/src/storage/rowset/json_column_iterator.cpp @@ -15,6 +15,7 @@ #include "storage/rowset/json_column_iterator.h" #include +#include #include #include #include @@ -46,19 +47,26 @@ namespace starrocks { class JsonFlatColumnIterator final : public ColumnIterator { public: - JsonFlatColumnIterator(ColumnReader* _reader, std::unique_ptr& null_iter, - std::vector>& field_iters, - std::vector& flat_paths, std::vector target_types, - std::vector source_types, ColumnAccessPath* path) - : _reader(_reader), + JsonFlatColumnIterator(ColumnReader* reader, std::unique_ptr null_iter, + std::vector> field_iters, + const std::vector& target_paths, const std::vector& target_types, + const std::vector& source_paths, const std::vector& source_types) + : _reader(reader), _null_iter(std::move(null_iter)), _flat_iters(std::move(field_iters)), - _flat_paths(std::move(flat_paths)), - _target_types(std::move(target_types)), - _source_types(std::move(source_types)), - _path(path) {} - - ~JsonFlatColumnIterator() override = default; + _target_paths(target_paths), + _target_types(target_types), + _source_paths(source_paths), + _source_types(source_types){}; + + ~JsonFlatColumnIterator() override { + if (transformer != nullptr) { + auto [c, m, f] = transformer->cost_ms(); + _opts.stats->json_cast_ns += c; + _opts.stats->json_merge_ns += m; + _opts.stats->json_flatten_ns += f; + } + } [[nodiscard]] Status init(const ColumnIteratorOptions& opts) override; @@ -82,22 +90,22 @@ class JsonFlatColumnIterator final : public ColumnIterator { private: template - Status _read_and_cast(JsonColumn* json_column, FUNC fn); + Status _read(JsonColumn* json_column, FUNC fn); private: ColumnReader* _reader; std::unique_ptr _null_iter; std::vector> _flat_iters; - std::vector _flat_paths; + std::vector _target_paths; std::vector _target_types; + std::vector _source_paths; std::vector _source_types; + + std::vector _source_column_modules; // to avoid create column with find type - std::vector _source_columns; - ColumnAccessPath* _path; - ObjectPool _pool; - std::vector _cast_exprs; + std::unique_ptr transformer; }; Status JsonFlatColumnIterator::init(const ColumnIteratorOptions& opts) { @@ -110,66 +118,56 @@ Status JsonFlatColumnIterator::init(const ColumnIteratorOptions& opts) { RETURN_IF_ERROR(iter->init(opts)); } - // update stats - DCHECK(_path != nullptr); - auto abs_path = _path->absolute_path(); - if (opts.stats->flat_json_hits.count(abs_path) == 0) { - opts.stats->flat_json_hits[abs_path] = 1; - } else { - opts.stats->flat_json_hits[abs_path] = opts.stats->flat_json_hits[abs_path] + 1; - } - - DCHECK(_target_types.size() == _source_types.size()); + bool has_remain = _source_paths.size() != _flat_iters.size(); + transformer = std::make_unique(_target_paths, _target_types, false); + { + SCOPED_RAW_TIMER(&_opts.stats->json_init_ns); + transformer->init_read_task(_source_paths, _source_types, has_remain); - for (int i = 0; i < _target_types.size(); i++) { - if (_target_types[i] == _source_types[i]) { - _cast_exprs.push_back(nullptr); - 
_source_columns.push_back(nullptr); - continue; + for (int i = 0; i < _source_paths.size(); i++) { + auto column = ColumnHelper::create_column(TypeDescriptor(_source_types[i]), true); + _source_column_modules.emplace_back(column); } + } - TypeDescriptor source_type(_source_types[i]); - TypeDescriptor target_type(_target_types[i]); - - SlotDescriptor source_slot(i, "mock_solt", source_type); - ColumnRef* col_ref = _pool.add(new ColumnRef(&source_slot)); + DCHECK_EQ(_source_column_modules.size(), _source_paths.size()); + if (has_remain) { + _source_column_modules.emplace_back(JsonColumn::create()); + } - auto cast_expr = VectorizedCastExprFactory::from_type(source_type, target_type, col_ref, &_pool); - _cast_exprs.push_back(cast_expr); - _source_columns.push_back(ColumnHelper::create_column(TypeDescriptor(_source_types[i]), true)); + // update stats + { + auto cp = transformer->cast_paths(); + for (int i = 0; i < cp.size(); i++) { + opts.stats->flat_json_hits[cp[i]] += 1; + } + auto mp = transformer->merge_paths(); + for (int i = 0; i < mp.size(); i++) { + opts.stats->merge_json_hits[mp[i]] += 1; + } + auto fp = transformer->flat_paths(); + for (int i = 0; i < fp.size(); i++) { + opts.stats->dynamic_json_hits[fp[i]] += 1; + } } return Status::OK(); } template -Status JsonFlatColumnIterator::_read_and_cast(JsonColumn* json_column, FUNC read_fn) { - json_column->init_flat_columns(_flat_paths, _target_types); - Chunk chunk; +Status JsonFlatColumnIterator::_read(JsonColumn* json_column, FUNC read_fn) { + std::vector columns; + for (int i = 0; i < _source_column_modules.size(); i++) { + columns.emplace_back(_source_column_modules[i]->clone_empty()); + } for (int i = 0; i < _flat_iters.size(); i++) { - if (_cast_exprs[i] != nullptr) { - ColumnPtr source = _source_columns[i]->clone_empty(); - RETURN_IF_ERROR(read_fn(i, source.get())); - - chunk.append_column(source, i); - ASSIGN_OR_RETURN(auto res, _cast_exprs[i]->evaluate_checked(nullptr, &chunk)); - auto target = json_column->get_flat_field(i); - target->set_delete_state(source->delete_state()); - if (res->only_null()) { - target->append_nulls(source->size()); - } else if (res->is_constant()) { - auto data = down_cast(res.get())->data_column(); - target->append_value_multiple_times(*data, 0, source->size()); - } else { - target->append(*res, 0, source->size()); - } - DCHECK_EQ(json_column->size(), target->size()); - } else { - auto* flat_column = json_column->get_flat_field(i).get(); - RETURN_IF_ERROR(read_fn(i, flat_column)); - } + RETURN_IF_ERROR(read_fn(_flat_iters[i].get(), columns[i].get())); } + + RETURN_IF_ERROR(transformer->trans(columns)); + auto result = transformer->mutable_result(); + json_column->set_flat_columns(_target_paths, _target_types, result); return Status::OK(); } @@ -192,8 +190,8 @@ Status JsonFlatColumnIterator::next_batch(size_t* n, Column* dst) { } // 2. Read flat column - auto read = [&](int index, Column* column) { return _flat_iters[index]->next_batch(n, column); }; - return _read_and_cast(json_column, read); + auto read = [&](ColumnIterator* iter, Column* column) { return iter->next_batch(n, column); }; + return _read(json_column, read); } Status JsonFlatColumnIterator::next_batch(const SparseRange<>& range, Column* dst) { @@ -216,8 +214,8 @@ Status JsonFlatColumnIterator::next_batch(const SparseRange<>& range, Column* ds } // 2. 
Read flat column - auto read = [&](int index, Column* column) { return _flat_iters[index]->next_batch(range, column); }; - return _read_and_cast(json_column, read); + auto read = [&](ColumnIterator* iter, Column* column) { return iter->next_batch(range, column); }; + return _read(json_column, read); } Status JsonFlatColumnIterator::fetch_values_by_rowid(const rowid_t* rowids, size_t size, Column* values) { @@ -235,11 +233,9 @@ Status JsonFlatColumnIterator::fetch_values_by_rowid(const rowid_t* rowids, size } // 2. Read flat column - auto read = [&](int index, Column* column) { - return _flat_iters[index]->fetch_values_by_rowid(rowids, size, column); - }; + auto read = [&](ColumnIterator* iter, Column* column) { return iter->fetch_values_by_rowid(rowids, size, column); }; - return _read_and_cast(json_column, read); + return _read(json_column, read); } Status JsonFlatColumnIterator::seek_to_first() { @@ -271,12 +267,11 @@ Status JsonFlatColumnIterator::get_row_ranges_by_zone_map(const std::vector& json_iter, std::vector flat_paths, - std::vector target_types, ColumnAccessPath* path) + JsonDynamicFlatIterator(std::unique_ptr& json_iter, std::vector target_paths, + std::vector target_types) : _json_iter(std::move(json_iter)), - _flat_paths(std::move(flat_paths)), - _target_types(std::move(target_types)), - _path(path){}; + _target_paths(std::move(target_paths)), + _target_types(std::move(target_types)){}; ~JsonDynamicFlatIterator() override = default; @@ -306,24 +301,20 @@ class JsonDynamicFlatIterator final : public ColumnIterator { private: std::unique_ptr _json_iter; - std::vector _flat_paths; + std::vector _target_paths; std::vector _target_types; - ColumnAccessPath* _path; - JsonFlattener _flattener; + std::unique_ptr _flattener; }; Status JsonDynamicFlatIterator::init(const ColumnIteratorOptions& opts) { RETURN_IF_ERROR(ColumnIterator::init(opts)); - DCHECK(_path != nullptr); - auto abs_path = _path->absolute_path(); - if (opts.stats->dynamic_json_hits.count(abs_path) == 0) { - opts.stats->dynamic_json_hits[abs_path] = 1; - } else { - opts.stats->dynamic_json_hits[abs_path] = opts.stats->dynamic_json_hits[abs_path] + 1; + for (auto& p : _target_paths) { + opts.stats->dynamic_json_hits[p] += 1; } - _flattener = JsonFlattener(_flat_paths, _target_types); + SCOPED_RAW_TIMER(&_opts.stats->json_init_ns); + _flattener = std::make_unique(_target_paths, _target_types, false); return _json_iter->init(opts); } @@ -350,8 +341,9 @@ Status JsonDynamicFlatIterator::_flat_json(Column* input, Column* output) { } // 2. 
flat - json_data->init_flat_columns(_flat_paths, _target_types); - _flattener.flatten(input, &(json_data->get_flat_fields())); + _flattener->flatten(input); + auto result = _flattener->mutable_result(); + json_data->set_flat_columns(_target_paths, _target_types, result); return Status::OK(); } @@ -390,18 +382,236 @@ Status JsonDynamicFlatIterator::get_row_ranges_by_zone_map(const std::vectorget_row_ranges_by_zone_map(predicates, del_predicate, row_ranges, pred_relation); } -StatusOr> create_json_flat_iterator( - ColumnReader* reader, std::unique_ptr null_iter, - std::vector> field_iters, std::vector& full_paths, - std::vector& target_types, std::vector& source_types, ColumnAccessPath* path) { - return std::make_unique(reader, null_iter, field_iters, full_paths, target_types, - source_types, path); +class JsonMergeIterator final : public ColumnIterator { +public: + JsonMergeIterator(std::unique_ptr null_iter, std::vector> all_iter, + const std::vector& src_paths, const std::vector& src_types, + bool is_merge) + : _null_iter(std::move(null_iter)), + _all_iter(std::move(all_iter)), + _src_paths(src_paths), + _src_types(src_types), + _is_merge(is_merge){}; + + ~JsonMergeIterator() override = default; + + [[nodiscard]] Status init(const ColumnIteratorOptions& opts) override; + + [[nodiscard]] Status next_batch(size_t* n, Column* dst) override; + + [[nodiscard]] Status next_batch(const SparseRange<>& range, Column* dst) override; + + [[nodiscard]] Status seek_to_first() override; + + [[nodiscard]] Status seek_to_ordinal(ordinal_t ord) override; + + ordinal_t get_current_ordinal() const override { return _all_iter[0]->get_current_ordinal(); } + + ordinal_t num_rows() const override { return _all_iter[0]->num_rows(); } + + /// for vectorized engine + [[nodiscard]] Status get_row_ranges_by_zone_map(const std::vector& predicates, + const ColumnPredicate* del_predicate, SparseRange<>* row_ranges, + CompoundNodeType pred_relation) override; + + [[nodiscard]] Status fetch_values_by_rowid(const rowid_t* rowids, size_t size, Column* values) override; + +private: + template + Status _merge(JsonColumn* dst, FUNC func); + +private: + ColumnReader* _reader; + + std::unique_ptr _null_iter; + std::vector> _all_iter; + std::vector _src_paths; + std::vector _src_types; + std::vector _src_column_modules; + + std::unique_ptr _merger; + bool _is_merge; +}; + +Status JsonMergeIterator::init(const ColumnIteratorOptions& opts) { + RETURN_IF_ERROR(ColumnIterator::init(opts)); + if (_null_iter != nullptr) { + RETURN_IF_ERROR(_null_iter->init(opts)); + } + + for (auto& iter : _all_iter) { + RETURN_IF_ERROR(iter->init(opts)); + } + + if (_is_merge) { + for (auto& p : _src_paths) { + opts.stats->merge_json_hits[p] += 1; + } + + SCOPED_RAW_TIMER(&_opts.stats->json_init_ns); + _merger = std::make_unique(_src_paths, _src_types, _all_iter.size() != _src_paths.size()); + } + + DCHECK(_all_iter.size() == _src_paths.size() || _all_iter.size() == _src_paths.size() + 1); + for (int i = 0; i < _src_paths.size(); i++) { + auto column = ColumnHelper::create_column(TypeDescriptor(_src_types[i]), true); + _src_column_modules.emplace_back(column); + } + + if (_all_iter.size() != _src_paths.size()) { + // remain + _src_column_modules.emplace_back(JsonColumn::create()); + } + + return Status::OK(); +} + +template +Status JsonMergeIterator::_merge(JsonColumn* dst, FUNC func) { + std::vector all_columns; + for (size_t i = 0; i < _all_iter.size(); i++) { + auto iter = _all_iter[i].get(); + auto c = _src_column_modules[i]->clone_empty(); + 
RETURN_IF_ERROR(func(iter, c.get())); + all_columns.emplace_back(std::move(c)); + } + + if (_is_merge) { + SCOPED_RAW_TIMER(&_opts.stats->json_merge_ns); + auto json = _merger->merge(all_columns); + dst->swap_column(*json); + } else { + dst->set_flat_columns(_src_paths, _src_types, all_columns); + } + return Status::OK(); +} + +Status JsonMergeIterator::next_batch(size_t* n, Column* dst) { + JsonColumn* json_column = nullptr; + NullColumn* null_column = nullptr; + if (dst->is_nullable()) { + auto* nullable_column = down_cast(dst); + json_column = down_cast(nullable_column->data_column().get()); + null_column = down_cast(nullable_column->null_column().get()); + } else { + json_column = down_cast(dst); + } + + CHECK((_null_iter == nullptr && null_column == nullptr) || (_null_iter != nullptr && null_column != nullptr)); + + // 1. Read null column + if (_null_iter != nullptr) { + RETURN_IF_ERROR(_null_iter->next_batch(n, null_column)); + down_cast(dst)->update_has_null(); + } + + auto func = [&](ColumnIterator* iter, Column* column) { return iter->next_batch(n, column); }; + return _merge(json_column, func); +} + +Status JsonMergeIterator::next_batch(const SparseRange<>& range, Column* dst) { + JsonColumn* json_column = nullptr; + NullColumn* null_column = nullptr; + if (dst->is_nullable()) { + auto* nullable_column = down_cast(dst); + json_column = down_cast(nullable_column->data_column().get()); + null_column = down_cast(nullable_column->null_column().get()); + } else { + json_column = down_cast(dst); + } + + CHECK((_null_iter == nullptr && null_column == nullptr) || (_null_iter != nullptr && null_column != nullptr)); + + // 1. Read null column + if (_null_iter != nullptr) { + RETURN_IF_ERROR(_null_iter->next_batch(range, null_column)); + down_cast(dst)->update_has_null(); + } + + auto func = [&](ColumnIterator* iter, Column* column) { return iter->next_batch(range, column); }; + return _merge(json_column, func); +} + +Status JsonMergeIterator::fetch_values_by_rowid(const rowid_t* rowids, size_t size, Column* dst) { + JsonColumn* json_column = nullptr; + NullColumn* null_column = nullptr; + if (dst->is_nullable()) { + auto* nullable_column = down_cast(dst); + json_column = down_cast(nullable_column->data_column().get()); + null_column = down_cast(nullable_column->null_column().get()); + } else { + json_column = down_cast(dst); + } + + CHECK((_null_iter == nullptr && null_column == nullptr) || (_null_iter != nullptr && null_column != nullptr)); + + // 1. 
Read null column + if (_null_iter != nullptr) { + RETURN_IF_ERROR(_null_iter->fetch_values_by_rowid(rowids, size, null_column)); + down_cast(dst)->update_has_null(); + } + + auto func = [&](ColumnIterator* iter, Column* column) { return iter->fetch_values_by_rowid(rowids, size, column); }; + return _merge(json_column, func); +} + +Status JsonMergeIterator::seek_to_first() { + for (auto& iter : _all_iter) { + RETURN_IF_ERROR(iter->seek_to_first()); + } + + if (_null_iter != nullptr) { + RETURN_IF_ERROR(_null_iter->seek_to_first()); + } + return Status::OK(); +} + +Status JsonMergeIterator::seek_to_ordinal(ordinal_t ord) { + for (auto& iter : _all_iter) { + RETURN_IF_ERROR(iter->seek_to_ordinal(ord)); + } + + if (_null_iter != nullptr) { + RETURN_IF_ERROR(_null_iter->seek_to_ordinal(ord)); + } + return Status::OK(); +} + +Status JsonMergeIterator::get_row_ranges_by_zone_map(const std::vector& predicates, + const ColumnPredicate* del_predicate, SparseRange<>* row_ranges, + CompoundNodeType pred_relation) { + row_ranges->add({0, static_cast(_reader->num_rows())}); + return Status::OK(); +} + +StatusOr> create_json_flat_iterator(ColumnReader* reader, + std::unique_ptr null_iter, + std::vector> iters, + const std::vector& target_paths, + const std::vector& target_types, + const std::vector& source_paths, + const std::vector& source_types) { + return std::make_unique(reader, std::move(null_iter), std::move(iters), target_paths, + target_types, source_paths, source_types); } StatusOr> create_json_dynamic_flat_iterator( - std::unique_ptr json_iter, std::vector& flat_paths, - std::vector& target_types, ColumnAccessPath* path) { - return std::make_unique(json_iter, flat_paths, target_types, path); + std::unique_ptr json_iter, const std::vector& target_paths, + const std::vector& target_types) { + return std::make_unique(json_iter, target_paths, target_types); } +StatusOr> create_json_merge_iterator( + std::unique_ptr null_iter, std::vector> all_iters, + const std::vector& merge_paths, const std::vector& merge_types) { + return std::make_unique(std::move(null_iter), std::move(all_iters), merge_paths, merge_types, + true); +} + +StatusOr> create_json_direct_iterator( + std::unique_ptr null_iter, std::vector> all_iters, + const std::vector& merge_paths, const std::vector& merge_types) { + return std::make_unique(std::move(null_iter), std::move(all_iters), merge_paths, merge_types, + false); +} } // namespace starrocks diff --git a/be/src/storage/rowset/json_column_iterator.h b/be/src/storage/rowset/json_column_iterator.h index fdb2ffaca6f416..5e42531b4b6798 100644 --- a/be/src/storage/rowset/json_column_iterator.h +++ b/be/src/storage/rowset/json_column_iterator.h @@ -22,12 +22,24 @@ namespace starrocks { -StatusOr> create_json_flat_iterator( - ColumnReader* reader, std::unique_ptr null_iter, - std::vector> field_iters, std::vector& full_paths, - std::vector& target_types, std::vector& source_types, ColumnAccessPath* path); +StatusOr> create_json_flat_iterator(ColumnReader* reader, + std::unique_ptr null_iter, + std::vector> iters, + const std::vector& target_paths, + const std::vector& target_types, + const std::vector& source_paths, + const std::vector& source_types); StatusOr> create_json_dynamic_flat_iterator( - std::unique_ptr json_iter, std::vector& flat_paths, - std::vector& target_types, ColumnAccessPath* path); + std::unique_ptr json_iter, const std::vector& target_paths, + const std::vector& target_types); + +StatusOr> create_json_merge_iterator( + std::unique_ptr null_iter, std::vector> 
all_iters, + const std::vector& merge_paths, const std::vector& merge_types); + +StatusOr> create_json_direct_iterator( + std::unique_ptr null_iter, std::vector> all_iters, + const std::vector& all_paths, const std::vector& all_types); + } // namespace starrocks diff --git a/be/src/storage/rowset/json_column_writer.cpp b/be/src/storage/rowset/json_column_writer.cpp index 3f85ae3b414c41..815fb20db47841 100644 --- a/be/src/storage/rowset/json_column_writer.cpp +++ b/be/src/storage/rowset/json_column_writer.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include "column/column.h" #include "column/column_helper.h" @@ -36,250 +37,249 @@ #include "gutil/casts.h" #include "runtime/types.h" #include "storage/rowset/column_writer.h" +#include "storage/rowset/common.h" +#include "storage/rowset/json_column_compactor.h" +#include "types/constexpr.h" #include "types/logical_type.h" #include "util/json_flattener.h" #include "velocypack/vpack.h" namespace starrocks { -class FlatJsonColumnWriter final : public ColumnWriter { -public: - FlatJsonColumnWriter(const ColumnWriterOptions& opts, const TypeInfoPtr& type_info, WritableFile* wfile, - std::unique_ptr json_writer); - - ~FlatJsonColumnWriter() override = default; - - Status init() override { return _json_column_writer->init(); }; - - Status append(const Column& column) override; - - Status finish_current_page() override; - - uint64_t estimate_buffer_size() override; - - Status finish() override; - - Status write_data() override; - Status write_ordinal_index() override; - Status write_zone_map() override; - Status write_bitmap_index() override; - Status write_bloom_filter_index() override; - ordinal_t get_next_rowid() const override { return _json_column_writer->get_next_rowid(); } - - bool is_global_dict_valid() override { return _json_column_writer->is_global_dict_valid(); } - - uint64_t total_mem_footprint() const override { return _json_column_writer->total_mem_footprint(); } - -private: - void _flat_column(std::vector& json_datas); - -private: - std::unique_ptr _json_column_writer; - ColumnMetaPB* _json_meta; - WritableFile* _wfile; - - std::vector _json_datas; - - std::vector> _flat_writers; - std::vector _flat_paths; - std::vector _flat_types; - std::vector _flat_columns; -}; - FlatJsonColumnWriter::FlatJsonColumnWriter(const ColumnWriterOptions& opts, const TypeInfoPtr& type_info, WritableFile* wfile, std::unique_ptr json_writer) - : ColumnWriter(std::move(type_info), opts.meta->length(), opts.meta->is_nullable()), - _json_column_writer(std::move(json_writer)), + : ColumnWriter(type_info, opts.meta->length(), opts.meta->is_nullable()), _json_meta(opts.meta), - _wfile(wfile) {} + _wfile(wfile), + _json_writer(std::move(json_writer)) {} + +Status FlatJsonColumnWriter::init() { + return _json_writer->init(); +} Status FlatJsonColumnWriter::append(const Column& column) { - RETURN_IF_ERROR(_json_column_writer->append(column)); - // write process/compection will reuse column, must copy in there. 
- // @Todo: avoid memory copy - auto clone = column.clone_empty(); - clone->append(column); - _json_datas.emplace_back(std::move(clone)); - return Status::OK(); + DCHECK(_flat_paths.empty()); + DCHECK(_flat_types.empty()); + DCHECK(_flat_writers.empty()); + + auto st = _flat_column(&column); + if (st.ok()) { + _is_flat = true; + RETURN_IF_ERROR(_init_flat_writers()); + return _write_flat_column(); + } else { + _is_flat = false; + return _json_writer->append(column); + } } -void FlatJsonColumnWriter::_flat_column(std::vector& json_datas) { - JsonFlattener flattener; - flattener.derived_paths(json_datas); +Status FlatJsonColumnWriter::_flat_column(const Column* json_data) { + // all json datas must full json + JsonPathDeriver deriver; + deriver.derived({json_data}); - _flat_paths = flattener.get_flat_paths(); - _flat_types = flattener.get_flat_types(); + _flat_paths = deriver.flat_paths(); + _flat_types = deriver.flat_types(); + _has_remain = deriver.has_remain_json(); if (_flat_paths.empty()) { - return; + return Status::InternalError("doesn't have flat column."); } - // extract flat column - for (size_t i = 0; i < _flat_paths.size(); i++) { - _flat_columns.emplace_back(ColumnHelper::create_column(TypeDescriptor(_flat_types[i]), true)); - } - - for (auto& col : json_datas) { - flattener.flatten(col.get(), &_flat_columns); - } + JsonFlattener flattener(deriver); + flattener.flatten(json_data); + _flat_columns = flattener.mutable_result(); // recode null column in 1st if (_json_meta->is_nullable()) { auto nulls = NullColumn::create(); uint8_t IS_NULL = 1; uint8_t NOT_NULL = 0; - for (auto& col : json_datas) { - if (col->only_null()) { - nulls->append_value_multiple_times(&IS_NULL, col->size()); - } else if (col->is_nullable()) { - auto* nullable_column = down_cast(col.get()); - auto* nl = down_cast(nullable_column->null_column().get()); - nulls->append(*nl, 0, nl->size()); - } else { - nulls->append_value_multiple_times(&NOT_NULL, col->size()); - } + if (json_data->only_null()) { + nulls->append_value_multiple_times(&IS_NULL, json_data->size()); + } else if (json_data->is_nullable()) { + auto* nullable_column = down_cast(json_data); + auto* nl = down_cast(nullable_column->null_column().get()); + nulls->append(*nl, 0, nl->size()); + } else { + nulls->append_value_multiple_times(&NOT_NULL, json_data->size()); } _flat_columns.insert(_flat_columns.begin(), nulls); - _flat_paths.insert(_flat_paths.begin(), "nulls"); - _flat_types.insert(_flat_types.begin(), LogicalType::TYPE_TINYINT); } + return Status::OK(); } -Status FlatJsonColumnWriter::finish() { - for (const auto& js : _json_datas) { - DCHECK_GT(js->size(), 0); - } - _flat_column(_json_datas); - _json_datas.clear(); // release column data - - if (!_flat_columns.empty()) { - // nulls - if (_json_meta->is_nullable()) { - ColumnWriterOptions opts; - opts.meta = _json_meta->add_children_columns(); - opts.meta->set_column_id(0); - opts.meta->set_unique_id(0); - opts.meta->set_type(LogicalType::TYPE_TINYINT); - opts.meta->set_length(get_type_info(LogicalType::TYPE_TINYINT)->size()); - opts.meta->set_is_nullable(false); - opts.meta->set_name("nulls"); - opts.meta->set_encoding(DEFAULT_ENCODING); - opts.meta->set_compression(_json_meta->compression()); +Status FlatJsonColumnWriter::_init_flat_writers() { + // update json meta + _json_meta->mutable_json_meta()->set_format_version(kJsonMetaDefaultFormatVersion); + _json_meta->mutable_json_meta()->set_has_remain(_has_remain); + _json_meta->mutable_json_meta()->set_is_flat(true); - TabletColumn 
col(StorageAggregateType::STORAGE_AGGREGATE_NONE, LogicalType::TYPE_TINYINT, true); - ASSIGN_OR_RETURN(auto fw, ColumnWriter::create(opts, &col, _wfile)); - _flat_writers.emplace_back(std::move(fw)); + // recode null column in 1st + if (_json_meta->is_nullable()) { + _flat_paths.insert(_flat_paths.begin(), "nulls"); + _flat_types.insert(_flat_types.begin(), LogicalType::TYPE_TINYINT); + } - RETURN_IF_ERROR(_flat_writers[0]->init()); - RETURN_IF_ERROR(_flat_writers[0]->append(*_flat_columns[0])); - RETURN_IF_ERROR(_flat_writers[0]->finish()); + if (_has_remain) { + _flat_paths.emplace_back("remain"); + _flat_types.emplace_back(LogicalType::TYPE_JSON); + } - VLOG(8) << "flush flat json nulls"; + for (size_t i = 0; i < _flat_columns.size(); i++) { + ColumnWriterOptions opts; + opts.meta = _json_meta->add_children_columns(); + opts.meta->set_column_id(i); + opts.meta->set_unique_id(i); + opts.meta->set_type(_flat_types[i]); + if (_flat_types[i] == TYPE_VARCHAR) { + opts.meta->set_length(config::olap_string_max_length); + } else { + DCHECK_NE(_flat_types[i], TYPE_CHAR); + // set length for non-string type (e.g. int, double, date, etc. + opts.meta->set_length(get_type_info(_flat_types[i])->size()); } - - int start = _json_meta->is_nullable() ? 1 : 0; - // flat datas - for (size_t i = start; i < _flat_columns.size(); i++) { - ColumnWriterOptions opts; - opts.meta = _json_meta->add_children_columns(); - opts.meta->set_column_id(i); - opts.meta->set_unique_id(i); - opts.meta->set_type(_flat_types[i]); - if (_flat_types[i] == TYPE_VARCHAR) { - opts.meta->set_length(config::olap_string_max_length); - } else { - DCHECK_NE(_flat_types[i], TYPE_CHAR); - // set length for non-string type (e.g. int, double, date, etc. - opts.meta->set_length(get_type_info(_flat_types[i])->size()); - } + if ((_json_meta->is_nullable() && i == 0) || (i == _flat_columns.size() - 1 && _has_remain)) { + opts.meta->set_is_nullable(false); + } else { opts.meta->set_is_nullable(true); - opts.meta->set_encoding(DEFAULT_ENCODING); - opts.meta->set_compression(_json_meta->compression()); + } + opts.meta->set_encoding(DEFAULT_ENCODING); + opts.meta->set_compression(_json_meta->compression()); - if (_flat_types[i] == LogicalType::TYPE_JSON) { - opts.meta->mutable_json_meta()->set_format_version(kJsonMetaDefaultFormatVersion); - } + if (_flat_types[i] == LogicalType::TYPE_JSON) { + opts.meta->mutable_json_meta()->set_format_version(kJsonMetaDefaultFormatVersion); + opts.meta->mutable_json_meta()->set_is_flat(false); + } - if (_flat_paths[i].find('.') != std::string::npos) { - // add escape - opts.meta->set_name(fmt::format("\"{}\"", _flat_paths[i])); - } else { - opts.meta->set_name(_flat_paths[i]); - } + opts.meta->set_name(_flat_paths[i]); + opts.need_flat = false; - opts.need_flat = false; + TabletColumn col(StorageAggregateType::STORAGE_AGGREGATE_NONE, _flat_types[i], true); + ASSIGN_OR_RETURN(auto fw, ColumnWriter::create(opts, &col, _wfile)); + _flat_writers.emplace_back(std::move(fw)); - TabletColumn col(StorageAggregateType::STORAGE_AGGREGATE_NONE, _flat_types[i], true); - ASSIGN_OR_RETURN(auto fw, ColumnWriter::create(opts, &col, _wfile)); - _flat_writers.emplace_back(std::move(fw)); + RETURN_IF_ERROR(_flat_writers[i]->init()); + } + return Status::OK(); +} - RETURN_IF_ERROR(_flat_writers[i]->init()); - RETURN_IF_ERROR(_flat_writers[i]->append(*_flat_columns[i])); - RETURN_IF_ERROR(_flat_writers[i]->finish()); +Status FlatJsonColumnWriter::_write_flat_column() { + DCHECK(!_flat_columns.empty()); + DCHECK(_flat_columns.size() 
== _flat_writers.size()); + // flat datas + for (size_t i = 0; i < _flat_columns.size(); i++) { + RETURN_IF_ERROR(_flat_writers[i]->append(*_flat_columns[i])); + } - VLOG(8) << "flush flat json: " << _flat_paths[i]; - } + return Status::OK(); +} + +Status FlatJsonColumnWriter::finish() { + DCHECK(_is_flat ? !_flat_writers.empty() : _flat_writers.empty()); + // flat datas + for (size_t i = 0; i < _flat_columns.size(); i++) { + RETURN_IF_ERROR(_flat_writers[i]->finish()); + VLOG(8) << "flush flat json: " << _flat_paths[i]; } - return _json_column_writer->finish(); + _flat_columns.clear(); + return _json_writer->finish(); +} + +ordinal_t FlatJsonColumnWriter::get_next_rowid() const { + DCHECK(_is_flat ? !_flat_writers.empty() : _flat_writers.empty()); + if (!_is_flat) { + return _json_writer->get_next_rowid(); + } + return _flat_writers[0]->get_next_rowid(); } uint64_t FlatJsonColumnWriter::estimate_buffer_size() { - uint64_t size = _json_column_writer->estimate_buffer_size(); + DCHECK(_is_flat ? !_flat_writers.empty() : _flat_writers.empty()); + uint64_t size = 0; for (auto& w : _flat_writers) { size += w->estimate_buffer_size(); } + size += _json_writer->estimate_buffer_size(); + return size; +} + +uint64_t FlatJsonColumnWriter::total_mem_footprint() const { + DCHECK(_is_flat ? !_flat_writers.empty() : _flat_writers.empty()); + uint64_t size = 0; + for (auto& w : _flat_writers) { + size += w->total_mem_footprint(); + } + size += _json_writer->total_mem_footprint(); return size; } Status FlatJsonColumnWriter::write_data() { + DCHECK(_is_flat ? !_flat_writers.empty() : _flat_writers.empty()); for (auto& w : _flat_writers) { RETURN_IF_ERROR(w->write_data()); } - return _json_column_writer->write_data(); + return _json_writer->write_data(); } Status FlatJsonColumnWriter::write_ordinal_index() { + DCHECK(_is_flat ? !_flat_writers.empty() : _flat_writers.empty()); for (auto& w : _flat_writers) { RETURN_IF_ERROR(w->write_ordinal_index()); } - return _json_column_writer->write_ordinal_index(); + return _json_writer->write_ordinal_index(); } Status FlatJsonColumnWriter::write_zone_map() { + DCHECK(_is_flat ? !_flat_writers.empty() : _flat_writers.empty()); for (auto& w : _flat_writers) { RETURN_IF_ERROR(w->write_zone_map()); } - return _json_column_writer->write_zone_map(); + return _json_writer->write_zone_map(); } Status FlatJsonColumnWriter::write_bitmap_index() { + DCHECK(_is_flat ? !_flat_writers.empty() : _flat_writers.empty()); for (auto& w : _flat_writers) { RETURN_IF_ERROR(w->write_bitmap_index()); } - return _json_column_writer->write_bitmap_index(); + return _json_writer->write_bitmap_index(); } Status FlatJsonColumnWriter::write_bloom_filter_index() { + DCHECK(_is_flat ? !_flat_writers.empty() : _flat_writers.empty()); for (auto& w : _flat_writers) { RETURN_IF_ERROR(w->write_bloom_filter_index()); } - return _json_column_writer->write_bloom_filter_index(); + return _json_writer->write_bloom_filter_index(); } Status FlatJsonColumnWriter::finish_current_page() { + DCHECK(_is_flat ? 
!_flat_writers.empty() : _flat_writers.empty()); for (auto& w : _flat_writers) { RETURN_IF_ERROR(w->finish_current_page()); } - return _json_column_writer->finish_current_page(); + return _json_writer->finish_current_page(); } StatusOr> create_json_column_writer(const ColumnWriterOptions& opts, const TypeInfoPtr& type_info, WritableFile* wfile, std::unique_ptr json_writer) { + // compaction + if (opts.is_compaction) { + if (opts.need_flat) { + return std::make_unique(opts, type_info, wfile, std::move(json_writer)); + } else { + return std::make_unique(opts, type_info, wfile, std::move(json_writer)); + } + } + + // load if (!opts.need_flat) { return std::move(json_writer); + } else { + return std::make_unique(opts, type_info, wfile, std::move(json_writer)); } - return std::make_unique(opts, type_info, wfile, std::move(json_writer)); } } // namespace starrocks diff --git a/be/src/storage/rowset/json_column_writer.h b/be/src/storage/rowset/json_column_writer.h index 9aa155d504ff79..0e374a2fe6e40b 100644 --- a/be/src/storage/rowset/json_column_writer.h +++ b/be/src/storage/rowset/json_column_writer.h @@ -21,4 +21,51 @@ namespace starrocks { StatusOr> create_json_column_writer(const ColumnWriterOptions& opts, const TypeInfoPtr& type_info, WritableFile* wfile, std::unique_ptr json_writer); -} + +class FlatJsonColumnWriter : public ColumnWriter { +public: + FlatJsonColumnWriter(const ColumnWriterOptions& opts, const TypeInfoPtr& type_info, WritableFile* wfile, + std::unique_ptr json_writer); + + ~FlatJsonColumnWriter() override = default; + + Status init() override; + + Status append(const Column& column) override; + + Status finish_current_page() override; + + uint64_t estimate_buffer_size() override; + + Status finish() override; + + Status write_data() override; + Status write_ordinal_index() override; + Status write_zone_map() override; + Status write_bitmap_index() override; + Status write_bloom_filter_index() override; + ordinal_t get_next_rowid() const override; + + uint64_t total_mem_footprint() const override; + +protected: + Status _init_flat_writers(); + Status _write_flat_column(); + +private: + Status _flat_column(const Column* json_data); + +protected: + ColumnMetaPB* _json_meta; + WritableFile* _wfile; + std::unique_ptr _json_writer; + + std::vector> _flat_writers; + std::vector _flat_paths; + std::vector _flat_types; + std::vector _flat_columns; + + bool _has_remain; + bool _is_flat = false; +}; +} // namespace starrocks diff --git a/be/src/storage/rowset/rowset_writer.cpp b/be/src/storage/rowset/rowset_writer.cpp index 4667be1db87360..045813081f0ff1 100644 --- a/be/src/storage/rowset/rowset_writer.cpp +++ b/be/src/storage/rowset/rowset_writer.cpp @@ -122,6 +122,7 @@ Status RowsetWriter::init() { _writer_options.global_dicts = _context.global_dicts != nullptr ? 
_context.global_dicts : nullptr; _writer_options.referenced_column_ids = _context.referenced_column_ids; + _writer_options.is_compaction = _context.is_compaction; if (_context.tablet_schema->keys_type() == KeysType::PRIMARY_KEYS && (_context.is_partial_update || !_context.merge_condition.empty() || _context.miss_auto_increment_column)) { diff --git a/be/src/storage/rowset/rowset_writer_context.h b/be/src/storage/rowset/rowset_writer_context.h index caf535ab288bdc..d294dfaf9813e7 100644 --- a/be/src/storage/rowset/rowset_writer_context.h +++ b/be/src/storage/rowset/rowset_writer_context.h @@ -95,6 +95,8 @@ class RowsetWriterContext { int64_t gtid = 0; // Is pk compaction output writer bool is_pk_compaction = false; + // is compaction job + bool is_compaction = false; }; } // namespace starrocks diff --git a/be/src/storage/rowset/segment_writer.cpp b/be/src/storage/rowset/segment_writer.cpp index c8699bbcf01450..bc23b77fe377a0 100644 --- a/be/src/storage/rowset/segment_writer.cpp +++ b/be/src/storage/rowset/segment_writer.cpp @@ -94,6 +94,7 @@ void SegmentWriter::_init_column_meta(ColumnMetaPB* meta, uint32_t column_id, co if (column.type() == TYPE_JSON) { JsonMetaPB* json_meta = meta->mutable_json_meta(); json_meta->set_format_version(kJsonMetaDefaultFormatVersion); + json_meta->set_is_flat(false); } for (uint32_t i = 0; i < column.subcolumn_count(); ++i) { @@ -199,6 +200,7 @@ Status SegmentWriter::init(const std::vector& column_indexes, bool has } opts.need_flat = config::enable_json_flat; + opts.is_compaction = _opts.is_compaction; ASSIGN_OR_RETURN(auto writer, ColumnWriter::create(opts, &column, _wfile.get())); RETURN_IF_ERROR(writer->init()); _column_writers.push_back(std::move(writer)); diff --git a/be/src/storage/rowset/segment_writer.h b/be/src/storage/rowset/segment_writer.h index e8f53b3bfe7555..f045ceb0253f32 100644 --- a/be/src/storage/rowset/segment_writer.h +++ b/be/src/storage/rowset/segment_writer.h @@ -77,6 +77,7 @@ struct SegmentWriterOptions { std::vector referenced_column_ids; SegmentFileMark segment_file_mark; std::string encryption_meta; + bool is_compaction = false; }; // SegmentWriter is responsible for writing data into single segment by all or partital columns. diff --git a/be/src/storage/tablet_updates.cpp b/be/src/storage/tablet_updates.cpp index 0615f9f8e82bdf..a904e2b270e45a 100644 --- a/be/src/storage/tablet_updates.cpp +++ b/be/src/storage/tablet_updates.cpp @@ -1928,6 +1928,7 @@ Status TabletUpdates::_do_compaction(std::unique_ptr* pinfo) { context.writer_type = (algorithm == VERTICAL_COMPACTION ? 
RowsetWriterType::kVertical : RowsetWriterType::kHorizontal); context.is_pk_compaction = true; + context.is_compaction = true; std::unique_ptr rowset_writer; Status st = RowsetFactory::create_rowset_writer(context, &rowset_writer); if (!st.ok()) { diff --git a/be/src/util/json_flattener.cpp b/be/src/util/json_flattener.cpp index 7143924673eae7..7c026fb25f1fbb 100644 --- a/be/src/util/json_flattener.cpp +++ b/be/src/util/json_flattener.cpp @@ -20,10 +20,15 @@ #include #include #include +#include #include +#include #include #include #include +#include +#include +#include #include "column/column_helper.h" #include "column/column_viewer.h" @@ -33,15 +38,25 @@ #include "column/vectorized_fwd.h" #include "common/compiler_util.h" #include "common/status.h" +#include "common/statusor.h" +#include "exprs/cast_expr.h" +#include "exprs/column_ref.h" +#include "exprs/expr_context.h" #include "gutil/casts.h" +#include "runtime/types.h" #include "types/logical_type.h" #include "util/json.h" #include "util/json_converter.h" +#include "util/runtime_profile.h" namespace starrocks { +namespace flat_json { +using JsonFlatExtractFunc = void (*)(const vpack::Slice* json, NullableColumn* result); +using JsonFlatMergeFunc = void (*)(vpack::Builder* builder, const std::string& name, const Column* src, size_t idx); + template -void append_to_number(const vpack::Slice* json, NullableColumn* result) { +void extract_number(const vpack::Slice* json, NullableColumn* result) { try { if (LIKELY(json->isNumber() || json->isString())) { auto st = get_number_from_vpjson(*json); @@ -64,7 +79,7 @@ void append_to_number(const vpack::Slice* json, NullableColumn* result) { } } -void append_to_string(const vpack::Slice* json, NullableColumn* result) { +void extract_string(const vpack::Slice* json, NullableColumn* result) { try { if (json->isNone() || json->isNull()) { result->append_nulls(1); @@ -85,7 +100,7 @@ void append_to_string(const vpack::Slice* json, NullableColumn* result) { } } -void append_to_json(const vpack::Slice* json, NullableColumn* result) { +void extract_json(const vpack::Slice* json, NullableColumn* result) { if (json->isNone()) { result->append_nulls(1); } else { @@ -94,14 +109,43 @@ void append_to_json(const vpack::Slice* json, NullableColumn* result) { } } -using JsonFlatAppendFunc = void (*)(const vpack::Slice* json, NullableColumn* result); +template +void merge_number(vpack::Builder* builder, const std::string& name, const Column* src, size_t idx) { + DCHECK(src->is_nullable()); + auto* nullable_column = down_cast(src); + auto* col = down_cast*>(nullable_column->data_column().get()); + + if constexpr (TYPE == LogicalType::TYPE_LARGEINT) { + // the value is from json, must be uint64_t + builder->add(name, vpack::Value((uint64_t)col->get_data()[idx])); + } else { + builder->add(name, vpack::Value(col->get_data()[idx])); + } +} + +void merge_string(vpack::Builder* builder, const std::string& name, const Column* src, size_t idx) { + DCHECK(src->is_nullable()); + auto* nullable_column = down_cast(src); + auto* col = down_cast(nullable_column->data_column().get()); + builder->add(name, vpack::Value(col->get_slice(idx))); +} + +void merge_json(vpack::Builder* builder, const std::string& name, const Column* src, size_t idx) { + DCHECK(src->is_nullable()); + auto* nullable_column = down_cast(src); + auto* col = down_cast(nullable_column->data_column().get()); + builder->add(name, vpack::Value(col->get_object(idx)->get_slice())); +} + +using JsonFlatExtractFunc = void (*)(const vpack::Slice* json, NullableColumn* 
result); +using JsonFlatMergeFunc = void (*)(vpack::Builder* builder, const std::string& name, const Column* src, size_t idx); static const uint8_t JSON_BASE_TYPE_BITS = 0; // least flat to JSON type static const uint8_t JSON_BIGINT_TYPE_BITS = 225; // 011000 10, bigint compatible type // clang-format off // bool will flatting as string, because it's need save string-literal(true/false) // int & string compatible type is json, because int cast to string will add double quote, it's different with json -static const std::unordered_map JSON_TYPE_BITS{ +static const std::unordered_map JSON_TYPE_BITS { {vpack::ValueType::None, 255}, // 111111 11, 255 {vpack::ValueType::SmallInt, 241}, // 111100 01, 241 {vpack::ValueType::Int, 225}, // 111000 01, 225 @@ -121,80 +165,109 @@ static const std::unordered_map JSON_BITS_TO_LOGICAL_TYPE {JSON_BASE_TYPE_BITS, LogicalType::TYPE_JSON}, }; -static const std::unordered_map JSON_BITS_FUNC { - {JSON_TYPE_BITS.at(vpack::ValueType::None), &append_to_number}, - {JSON_TYPE_BITS.at(vpack::ValueType::SmallInt), &append_to_number}, - {JSON_TYPE_BITS.at(vpack::ValueType::Int), &append_to_number}, - {JSON_TYPE_BITS.at(vpack::ValueType::UInt), &append_to_number}, - {JSON_TYPE_BITS.at(vpack::ValueType::Double), &append_to_number}, - {JSON_TYPE_BITS.at(vpack::ValueType::String), &append_to_string}, - {JSON_BASE_TYPE_BITS, &append_to_json}, +static const std::unordered_map LOGICAL_TYPE_TO_JSON_BITS { + {LogicalType::TYPE_TINYINT, JSON_TYPE_BITS.at(vpack::ValueType::None)}, + {LogicalType::TYPE_BIGINT, JSON_TYPE_BITS.at(vpack::ValueType::Int)}, + {LogicalType::TYPE_LARGEINT, JSON_TYPE_BITS.at(vpack::ValueType::UInt)}, + {LogicalType::TYPE_DOUBLE, JSON_TYPE_BITS.at(vpack::ValueType::Double)}, + {LogicalType::TYPE_VARCHAR, JSON_TYPE_BITS.at(vpack::ValueType::String)}, + {LogicalType::TYPE_JSON, JSON_BASE_TYPE_BITS}, +}; + +static const std::unordered_map JSON_EXTRACT_FUNC { + {LogicalType::TYPE_TINYINT, &extract_number}, + {LogicalType::TYPE_BIGINT, &extract_number}, + {LogicalType::TYPE_LARGEINT, &extract_number}, + {LogicalType::TYPE_DOUBLE, &extract_number}, + {LogicalType::TYPE_VARCHAR, &extract_string}, + {LogicalType::TYPE_CHAR, &extract_string}, + {LogicalType::TYPE_JSON, &extract_json}, +}; + +// should match with extract function +static const std::unordered_map JSON_MERGE_FUNC { + {LogicalType::TYPE_TINYINT, &merge_number}, + {LogicalType::TYPE_BIGINT, &merge_number}, + {LogicalType::TYPE_LARGEINT, &merge_number}, + {LogicalType::TYPE_DOUBLE, &merge_number}, + {LogicalType::TYPE_VARCHAR, &merge_string}, + {LogicalType::TYPE_JSON, &merge_json}, }; // clang-format on -uint8_t JsonFlattener::get_compatibility_type(vpack::ValueType type1, uint8_t type2) { +uint8_t get_compatibility_type(vpack::ValueType type1, uint8_t type2) { if (JSON_TYPE_BITS.contains(type1)) { return JSON_TYPE_BITS.at(type1) & type2; } return JSON_BASE_TYPE_BITS; } -JsonFlattener::JsonFlattener(std::vector& paths) : _flat_paths(paths) { - _flat_types.resize(paths.size(), JSON_BASE_TYPE_BITS); - for (int i = 0; i < _flat_paths.size(); i++) { - _flat_index[_flat_paths[i]] = i; - } -}; +} // namespace flat_json -JsonFlattener::JsonFlattener(std::vector& paths, const std::vector& types) - : _flat_paths(paths) { - for (const auto& t : types) { - for (const auto& [k, v] : JSON_BITS_TO_LOGICAL_TYPE) { - if (t == v) { - _flat_types.emplace_back(k); - break; - } - } +std::pair JsonFlatPath::_split_path(const std::string& path) { + size_t pos = 0; + if (path.starts_with("\"")) { + pos = path.find('\"', 1); + 
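+        // A quoted leading key is skipped first so that a '.' inside the quotes is not
+        // treated as a separator: e.g. `"a.b".c` splits into key `"a.b"` and next `c`,
+        // while a plain `a.b.c` splits into key `a` and next `b.c`.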
DCHECK(pos != std::string::npos); } - DCHECK_EQ(_flat_types.size(), types.size()); - for (int i = 0; i < _flat_paths.size(); i++) { - _flat_index[_flat_paths[i]] = i; + pos = path.find('.', pos); + std::string key; + std::string next; + if (pos == std::string::npos) { + key = path; + } else { + key = path.substr(0, pos); + next = path.substr(pos + 1); } -}; -std::vector JsonFlattener::get_flat_types() { - std::vector types; - for (const auto& t : _flat_types) { - types.emplace_back(JSON_BITS_TO_LOGICAL_TYPE.at(t)); - } - return types; + return {key, next}; } -struct FlatColumnDesc { - // json compatible type - uint8_t type = JsonFlattener::JSON_NULL_TYPE_BITS; - // column path hit count, some json may be null or none, so hit use to record the actual value - // e.g: {"a": 1, "b": 2}, path "$.c" not exist, so hit is 0 - uint64_t hits = 0; - // how many rows need to be cast to a compatible type - uint16_t casts = 0; - - // for json-uint, json-uint is uint64_t, check the maximum value and downgrade to bigint - uint64_t max = 0; - - // same key may appear many times in json, so we need avoid duplicate compute hits - uint64_t last_row = -1; - uint64_t multi_times = 0; -}; +JsonFlatPath* JsonFlatPath::normalize_from_path(const std::string& path, JsonFlatPath* root) { + if (path.empty()) { + return root; + } + auto [key, next] = _split_path(path); + auto iter = root->children.find(key); + JsonFlatPath* child_path = nullptr; -void JsonFlattener::derived_paths(std::vector& json_datas) { - _flat_paths.clear(); - _flat_types.clear(); + if (iter == root->children.end()) { + root->children.emplace(key, std::make_unique()); + child_path = root->children[key].get(); + } else { + child_path = iter->second.get(); + } + return normalize_from_path(next, child_path); +} - if (json_datas.empty()) { +/* +* to mark new root +* root(Ig) +* / | \ +* a(Ex) b(Ig) c(Ex) +* / / \ \ +* any b1(Ex) b2(N) any +* / \ +* b3(IN) b4(IN) +*/ +void JsonFlatPath::set_root(const std::string& new_root_path, JsonFlatPath* node) { + node->op = OP_IGNORE; + if (new_root_path.empty()) { + node->op = OP_ROOT; return; } + auto [key, next] = _split_path(new_root_path); + auto iter = node->children.begin(); + for (; iter != node->children.end(); iter++) { + iter->second->op = OP_EXCLUDE; + if (iter->first == key) { + set_root(next, iter->second.get()); + } + } +} + +bool check_null_factor(const std::vector& json_datas) { size_t total_rows = 0; size_t null_count = 0; @@ -204,7 +277,7 @@ void JsonFlattener::derived_paths(std::vector& json_datas) { null_count += column->size(); continue; } else if (column->is_nullable()) { - auto* nullable_column = down_cast(column.get()); + auto* nullable_column = down_cast(column); null_count += nullable_column->null_count(); } } @@ -213,104 +286,262 @@ void JsonFlattener::derived_paths(std::vector& json_datas) { if (null_count > total_rows * config::json_flat_null_factor) { VLOG(8) << "flat json, null_count[" << null_count << "], row[" << total_rows << "], null_factor: " << config::json_flat_null_factor; + return false; + } + + return true; +} + +JsonPathDeriver::JsonPathDeriver(const std::vector& paths, const std::vector& types, + bool has_remain) + : _has_remain(has_remain), _paths(paths), _types(types) { + for (size_t i = 0; i < paths.size(); i++) { + auto* leaf = JsonFlatPath::normalize_from_path(paths[i], _path_root.get()); + leaf->type = types[i]; + leaf->index = i; + } +} + +void JsonPathDeriver::derived(const std::vector& json_datas) { + DCHECK(_paths.empty()); + DCHECK(_types.empty()); + 
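+    // Rough flow of the derivation:
+    //   1. bail out when the overall null ratio exceeds config::json_flat_null_factor;
+    //   2. seed the path tree from input columns that are already flat json;
+    //   3. walk every row's vpack object, accumulating per-path hit counts and compatible types;
+    //   4. _finalize() keeps paths whose hit ratio clears config::json_flat_sparsity_factor
+    //      (at most config::json_flat_column_max of them) and folds the rest into the remain json.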
DCHECK(_derived_maps.empty()); + DCHECK(_path_root == nullptr); + + if (json_datas.empty()) { + return; + } + + if (!check_null_factor(json_datas)) { return; } - size_t rows = 0; + _path_root = std::make_shared(); + _total_rows = 0; + // init path by flat json + _derived_on_flat_json(json_datas); + // extract common keys, type - std::unordered_map derived_maps; for (size_t k = 0; k < json_datas.size(); k++) { - size_t row_count = json_datas[k]->size(); + _derived(json_datas[k], _total_rows); + _total_rows += json_datas[k]->size(); + } - ColumnViewer viewer(json_datas[k]); - for (size_t i = 0; i < row_count; ++i) { - rows++; - if (viewer.is_null(i)) { - continue; - } + _finalize(); +} - JsonValue* json = viewer.value(i); - auto vslice = json->to_vslice(); +void JsonPathDeriver::_derived_on_flat_json(const std::vector& json_datas) { + // extract flat paths + for (size_t k = 0; k < json_datas.size(); k++) { + auto col = json_datas[k]; + const JsonColumn* json_col; + size_t hits = 0; + if (col->is_nullable()) { + auto nullable = down_cast(col); + hits = nullable->null_count(); + json_col = down_cast(nullable->data_column().get()); + } else { + hits = col->size(); + json_col = down_cast(col); + } - if (vslice.isNull() || vslice.isNone() || vslice.isEmptyObject() || !vslice.isObject()) { - continue; - } + if (!json_col->is_flat_json()) { + continue; + } - vpack::ObjectIterator iter(vslice); - for (const auto& it : iter) { - std::string_view name = it.key.stringView(); - derived_maps[name].hits++; - uint8_t base_type = derived_maps[name].type; - vpack::ValueType json_type = it.value.type(); - uint8_t compatibility_type = JsonFlattener::get_compatibility_type(json_type, base_type); - derived_maps[name].type = compatibility_type; - derived_maps[name].casts += (base_type != compatibility_type); + auto paths = json_col->flat_column_paths(); + auto types = json_col->flat_column_types(); - derived_maps[name].multi_times += (derived_maps[name].last_row == rows); - derived_maps[name].last_row = rows; + for (size_t i = 0; i < paths.size(); i++) { + auto leaf = JsonFlatPath::normalize_from_path(paths[i], _path_root.get()); + _derived_maps[leaf].type &= flat_json::LOGICAL_TYPE_TO_JSON_BITS.at(types[i]); + _derived_maps[leaf].hits += hits; + } + } +} - if (json_type == vpack::ValueType::UInt) { - derived_maps[name].max = std::max(derived_maps[name].max, it.value.getUIntUnchecked()); - } - } +void JsonPathDeriver::_derived(const Column* col, size_t mark_row) { + size_t row_count = col->size(); + const JsonColumn* json_col; + + if (col->is_nullable()) { + auto nullable = down_cast(col); + json_col = down_cast(nullable->data_column().get()); + } else { + json_col = down_cast(col); + } + + if (json_col->is_flat_json()) { + if (json_col->has_remain()) { + json_col = down_cast(json_col->get_remain().get()); + } else { + return; } } - if (derived_maps.size() <= config::json_flat_internal_column_min_limit) { - VLOG(8) << "flat json, internal column too less: " << derived_maps.size() - << ", at least: " << config::json_flat_internal_column_min_limit; - return; + for (size_t i = 0; i < row_count; ++i) { + if (col->is_null(i)) { + continue; + } + + JsonValue* json = json_col->get_object(i); + auto vslice = json->to_vslice(); + + if (vslice.isNull() || vslice.isNone()) { + continue; + } + + if (vslice.isEmptyObject() || !vslice.isObject()) { + _has_remain = true; + continue; + } + + _visit_json_paths(vslice, _path_root.get(), mark_row + i); + } +} + +void JsonPathDeriver::_visit_json_paths(vpack::Slice value, 
JsonFlatPath* root, size_t mark_row) { + vpack::ObjectIterator it(value, false); + + for (; it.valid(); it.next()) { + auto current = (*it); + // sub-object? + auto v = current.value; + auto k = current.key.copyString(); + + if (!root->children.contains(k)) { + root->children.emplace(k, std::make_unique()); + } + auto child = root->children[k].get(); + if (v.isObject()) { + _visit_json_paths(v, child, mark_row); + } else { + _derived_maps[child].hits++; + uint8_t base_type = _derived_maps[child].type; + vpack::ValueType json_type = v.type(); + uint8_t compatibility_type = flat_json::get_compatibility_type(json_type, base_type); + _derived_maps[child].type = compatibility_type; + _derived_maps[child].casts += (base_type != compatibility_type); + + _derived_maps[child].multi_times += (_derived_maps[child].last_row == mark_row); + _derived_maps[child].last_row = mark_row; + + if (json_type == vpack::ValueType::UInt) { + _derived_maps[child].max = std::max(_derived_maps[child].max, v.getUIntUnchecked()); + } + } } +} +void JsonPathDeriver::_finalize() { // try downgrade json-uint to bigint int128_t max = RunTimeTypeLimits::max_value(); - for (auto& [name, desc] : derived_maps) { - if (desc.type == JSON_TYPE_BITS.at(vpack::ValueType::UInt) && desc.max <= max) { - desc.type = JSON_BIGINT_TYPE_BITS; + for (auto& [name, desc] : _derived_maps) { + if (desc.type == flat_json::JSON_TYPE_BITS.at(vpack::ValueType::UInt) && desc.max <= max) { + desc.type = flat_json::JSON_BIGINT_TYPE_BITS; } } - // sort by hit, casts - std::vector> top_hits(derived_maps.begin(), derived_maps.end()); - std::sort(top_hits.begin(), top_hits.end(), - [](const pair& a, const pair& b) { - // check hits, the higher the hit rate, the higher the priority. - if (a.second.hits != b.second.hits) { - return a.second.hits > b.second.hits; - } - // check type, the scalar type has the highest priority. - if (a.second.type != b.second.type) { - return a.second.type > b.second.type; - } - // check casts, the fewer the types of inference cast, the higher the priority. - if (a.second.casts != b.second.casts) { - return a.second.casts < b.second.casts; - } + std::vector update_stack; + std::vector> stack; + std::vector> hit_leaf; - // sort by name, just for stable order - return a.first < b.first; - }); + stack.emplace_back(_path_root.get(), ""); + while (!stack.empty()) { + auto [node, path] = stack.back(); + stack.pop_back(); - for (int i = 0; i < top_hits.size() && i < config::json_flat_column_max; i++) { - const auto& [name, desc] = top_hits[i]; - // check sparsity - // same key may appear many times in json, so we need avoid duplicate compute hits - if (desc.multi_times <= 0 && desc.hits >= total_rows * config::json_flat_sparsity_factor) { - _flat_paths.emplace_back(name); - _flat_types.emplace_back(desc.type); + if (node->children.empty()) { + // leaf node + // check sparsity, same key may appear many times in json, so we need avoid duplicate compute hits + auto desc = _derived_maps[node]; + if (desc.multi_times <= 0 && desc.hits >= _total_rows * config::json_flat_sparsity_factor) { + hit_leaf.emplace_back(node, path); + node->type = flat_json::JSON_BITS_TO_LOGICAL_TYPE.at(desc.type); + node->remain = false; // later update + } else { + node->remain = true; + _has_remain = true; + } + VLOG(8) << "flat json[" << path << "], hit[" << desc.hits << "], row[" << _total_rows << "]"; + } else { + update_stack.push_back(node); + for (auto& [key, child] : node->children) { + stack.emplace_back(child.get(), path + "." 
+ key); + } + } + } + + // sort by name, just for stable order + size_t limit = config::json_flat_column_max > 0 ? config::json_flat_column_max : std::numeric_limits::max(); + std::sort(hit_leaf.begin(), hit_leaf.end(), [](const auto& a, const auto& b) { return a.second < b.second; }); + for (auto& [node, path] : hit_leaf) { + if (_paths.size() >= limit) { + node->remain = true; + _has_remain = true; + continue; + } + node->index = _paths.size(); + _paths.emplace_back(path.substr(1)); + _types.emplace_back(node->type); + } + + // remove & update remain json + while (!update_stack.empty()) { + auto* node = update_stack.back(); + update_stack.pop_back(); + + auto iter = node->children.begin(); + while (iter != node->children.end()) { + node->remain |= iter->second->remain; + if (iter->second->remain && iter->second->children.empty()) { + iter = node->children.erase(iter); + } else { + ++iter; + } } - VLOG(8) << "flat json[" << name << "], hit[" << desc.hits << "], row[" << total_rows << "]"; + } +} + +JsonFlattener::JsonFlattener(JsonPathDeriver& deriver) { + DCHECK(deriver.flat_path_root() != nullptr); + _dst_root = deriver.flat_path_root(); + _dst_paths = deriver.flat_paths(); + _has_remain = deriver.has_remain_json(); + + auto paths = deriver.flat_paths(); + auto types = deriver.flat_types(); + + for (size_t i = 0; i < paths.size(); i++) { + _flat_columns.emplace_back(ColumnHelper::create_column(TypeDescriptor(types[i]), true)); } - // init index map - for (int i = 0; i < _flat_paths.size(); i++) { - _flat_index[_flat_paths[i]] = i; + if (_has_remain) { + _flat_columns.emplace_back(ColumnHelper::create_column(TypeDescriptor(LogicalType::TYPE_JSON), false)); + _remain = down_cast(_flat_columns.back().get()); } } -void JsonFlattener::flatten(const Column* json_column, std::vector* result) { - DCHECK(result->size() == _flat_paths.size()); +JsonFlattener::JsonFlattener(const std::vector& paths, const std::vector& types, + bool has_remain) + : _has_remain(has_remain), _dst_paths(paths) { + _dst_root = std::make_shared(); + for (size_t i = 0; i < paths.size(); i++) { + auto* leaf = JsonFlatPath::normalize_from_path(paths[i], _dst_root.get()); + leaf->type = types[i]; + leaf->index = i; + + _flat_columns.emplace_back(ColumnHelper::create_column(TypeDescriptor(types[i]), true)); + } + + if (_has_remain) { + _flat_columns.emplace_back(ColumnHelper::create_column(TypeDescriptor(LogicalType::TYPE_JSON), false)); + _remain = down_cast(_flat_columns.back().get()); + } +} + +void JsonFlattener::flatten(const Column* json_column) { // input const JsonColumn* json_data = nullptr; if (json_column->is_nullable()) { @@ -321,73 +552,706 @@ void JsonFlattener::flatten(const Column* json_column, std::vector* r json_data = down_cast(json_column); } - std::vector flat_jsons; - for (size_t i = 0; i < _flat_paths.size(); i++) { - flat_jsons.emplace_back(down_cast((*result)[i].get())); + // may not empty rows when compaction + size_t base_rows = _flat_columns[0]->size(); + // output + if (_has_remain) { + _flatten(json_column, json_data); + for (size_t i = 0; i < _flat_columns.size() - 1; i++) { + down_cast(_flat_columns[i].get())->update_has_null(); + } + } else { + _flatten(json_column, json_data); + for (size_t i = 0; i < _flat_columns.size(); i++) { + down_cast(_flat_columns[i].get())->update_has_null(); + } + } + + for (auto& col : _flat_columns) { + DCHECK_EQ(col->size(), json_column->size() + base_rows); + } +} + +template +bool JsonFlattener::_flatten_json(const vpack::Slice& value, const JsonFlatPath* 
root, vpack::Builder* builder, + uint32_t* flat_hit) { + vpack::ObjectIterator it(value, false); + for (; it.valid(); it.next()) { + auto current = (*it); + // sub-object + auto v = current.value; + auto k = current.key.copyString(); + + auto child = root->children.find(k); + if constexpr (REMAIN) { + if (child == root->children.end()) { + builder->add(k, v); + continue; + } + } else { + if (*flat_hit == 0) { + return false; + } + if (child == root->children.end()) { + continue; + } + } + + if (child->second->children.empty()) { + // leaf node + auto index = child->second->index; + DCHECK(_flat_columns.size() > index); + DCHECK(_flat_columns[index]->is_nullable()); + auto* c = down_cast(_flat_columns[index].get()); + auto func = flat_json::JSON_EXTRACT_FUNC.at(child->second->type); + func(&v, c); + *flat_hit ^= (1 << index); + // not leaf node, should goto deep + } else if (v.isObject()) { + if constexpr (REMAIN) { + builder->add(k, vpack::Value(vpack::ValueType::Object)); + _flatten_json(v, child->second.get(), builder, flat_hit); + builder->close(); + } else { + if (!_flatten_json(v, child->second.get(), builder, flat_hit)) { + return false; + } + } + } else { + if constexpr (REMAIN) { + builder->add(k, v); + } + } } + return true; +} +template +void JsonFlattener::_flatten(const Column* json_column, const JsonColumn* json_data) { + DCHECK(!_dst_paths.empty()); // may not empty rows when compaction - size_t base_rows = flat_jsons[0]->size(); + size_t base_rows = _flat_columns[0]->size(); // output - DCHECK_LE(_flat_paths.size(), std::numeric_limits::max()); + DCHECK_LE(_dst_paths.size(), std::numeric_limits::max()); for (size_t row = 0; row < json_column->size(); row++) { if (json_column->is_null(row)) { - for (size_t k = 0; k < result->size(); k++) { - (*result)[k]->append_nulls(1); + for (size_t k = 0; k < _flat_columns.size(); k++) { // all is null + _flat_columns[k]->append_nulls(1); } continue; } auto* obj = json_data->get_object(row); auto vslice = obj->to_vslice(); - if (vslice.isNone() || vslice.isNull() || vslice.isEmptyObject() || !vslice.isObject()) { - for (size_t k = 0; k < result->size(); k++) { - (*result)[k]->append_nulls(1); + if (vslice.isNone() || vslice.isNull()) { + for (size_t k = 0; k < _flat_columns.size(); k++) { // all is null + _flat_columns[k]->append_nulls(1); } continue; } + if (vslice.isEmptyObject() || !vslice.isObject()) { + for (size_t k = 0; k < _dst_paths.size(); k++) { // remain push object + _flat_columns[k]->append_nulls(1); + } + if constexpr (HAS_REMAIN) { + _remain->append(obj); + } + continue; + } // bitset, all 1, // to mark which column exists in json, to fill null if doesn't found in json - uint32_t flat_hit = (1 << _flat_paths.size()) - 1; - vpack::ObjectIterator iter(vslice); - for (const auto& it : iter) { - std::string_view path = it.key.stringView(); - auto iter = _flat_index.find(std::string(path)); - if (iter != _flat_index.end()) { - int index = iter->second; - uint8_t type = _flat_types[index]; - auto func = JSON_BITS_FUNC.at(type); - func(&it.value, flat_jsons[index]); - // set index to 0 - flat_hit ^= (1 << index); - } - - if (flat_hit == 0) { - break; - } + uint32_t flat_hit = (1 << _dst_paths.size()) - 1; + if constexpr (HAS_REMAIN) { + vpack::Builder builder; + builder.add(vpack::Value(vpack::ValueType::Object)); + _flatten_json(vslice, _dst_root.get(), &builder, &flat_hit); + builder.close(); + _remain->append(JsonValue(builder.slice())); + } else { + _flatten_json(vslice, _dst_root.get(), nullptr, &flat_hit); } if 
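+        // e.g. with three target paths flat_hit starts as 0b111; extracting the path with
+        // index 1 clears its bit via `flat_hit ^= (1 << index)` (0b101). Any bit still set
+        // after walking the object means that key was absent in this row, so the matching
+        // flat column gets a null appended below.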
(UNLIKELY(flat_hit > 0)) { - for (size_t k = 0; k < _flat_paths.size() && flat_hit > 0; k++) { + for (size_t k = 0; k < _dst_paths.size() && flat_hit > 0; k++) { if (flat_hit & (1 << k)) { - flat_jsons[k]->append_nulls(1); + _flat_columns[k]->append_nulls(1); flat_hit ^= (1 << k); } } } - for (auto col : flat_jsons) { + for (auto& col : _flat_columns) { DCHECK_EQ(col->size(), row + 1 + base_rows); } + if constexpr (HAS_REMAIN) { + DCHECK_EQ(row + 1 + base_rows, _remain->size()); + } } +} - for (auto col : flat_jsons) { - DCHECK_EQ(col->size(), json_column->size() + base_rows); +std::vector JsonFlattener::mutable_result() { + std::vector res; + for (size_t i = 0; i < _flat_columns.size(); i++) { + res.emplace_back(_flat_columns[i]); + _flat_columns[i] = _flat_columns[i]->clone_empty(); + } + if (_has_remain) { + _remain = down_cast(_flat_columns.back().get()); } + return res; +} - for (auto& col : *result) { - down_cast(col.get())->update_has_null(); +JsonMerger::JsonMerger(const std::vector& paths, const std::vector& types, bool has_remain) + : _has_remain(has_remain) { + _src_root = std::make_shared(); + + for (size_t i = 0; i < paths.size(); i++) { + auto* leaf = JsonFlatPath::normalize_from_path(paths[i], _src_root.get()); + leaf->type = types[i]; + leaf->index = i; } } + +void dfs_exclude(JsonFlatPath* node) { + if (node->children.empty()) { + return; + } + bool all_exclude = true; + for (auto& [_, child] : node->children) { + dfs_exclude(child.get()); + all_exclude &= (child->op == JsonFlatPath::OP_EXCLUDE); + } + node->op = all_exclude ? JsonFlatPath::OP_EXCLUDE : JsonFlatPath::OP_INCLUDE; +} + +void JsonMerger::set_exclude_paths(const std::vector& exclude_paths) { + for (auto& path : exclude_paths) { + auto* leaf = JsonFlatPath::normalize_from_path(path, _src_root.get()); + leaf->op = JsonFlatPath::OP_EXCLUDE; + } + dfs_exclude(_src_root.get()); +} + +void JsonMerger::set_root_path(const std::string& base_path) { + JsonFlatPath::set_root(base_path, _src_root.get()); +} + +ColumnPtr JsonMerger::merge(const std::vector& columns) { + DCHECK_GE(columns.size(), 1); + _result = JsonColumn::create(); + _json_result = down_cast(_result.get()); + + for (auto& col : columns) { + _src_columns.emplace_back(col.get()); + } + + size_t rows = columns[0]->size(); + if (_has_remain) { + auto remain = down_cast(_src_columns.back()); + for (size_t i = 0; i < rows; i++) { + auto obj = remain->get_object(i)->to_vslice(); + vpack::Builder builder; + builder.add(vpack::Value(vpack::ValueType::Object)); + _merge_json(_src_root.get(), &obj, &builder, i, _src_root->op == JsonFlatPath::OP_INCLUDE); + builder.close(); + _json_result->append(JsonValue(builder.slice())); + } + } else { + for (size_t i = 0; i < rows; i++) { + vpack::Builder builder; + builder.add(vpack::Value(vpack::ValueType::Object)); + _merge_json(_src_root.get(), nullptr, &builder, i, _src_root->op == JsonFlatPath::OP_INCLUDE); + builder.close(); + _json_result->append(JsonValue(builder.slice())); + } + } + return _result; +} + +template +void JsonMerger::_merge_json(const JsonFlatPath* root, const vpack::Slice* remain, vpack::Builder* builder, + size_t index, bool in_tree) { + std::unordered_set check_values; + if (HAS_REMAIN) { + vpack::ObjectIterator it(*remain, false); + for (; it.valid(); it.next()) { + auto k = it.key().copyString(); + auto v = it.value(); + + auto iter = root->children.find(k); + if (iter == root->children.end()) { + if (in_tree) { + // only remain contains + builder->add(k, v); + } + continue; + } + if 
(iter->second->op == JsonFlatPath::OP_EXCLUDE) { + continue; + } + check_values.emplace(it.key().stringView()); + if (v.isObject()) { + if (iter->second->op == JsonFlatPath::OP_IGNORE) { + _merge_json(iter->second.get(), &v, builder, index, false); + } else if (iter->second->op == JsonFlatPath::OP_ROOT) { + _merge_json(iter->second.get(), &v, builder, index, true); + } else { + DCHECK(iter->second->op == JsonFlatPath::OP_INCLUDE); + builder->add(k, vpack::Value(vpack::ValueType::Object)); + _merge_json(iter->second.get(), &v, builder, index, true); + builder->close(); + } + continue; + } + // leaf node + DCHECK(iter->second->op == JsonFlatPath::OP_INCLUDE); + builder->add(k, v); + } + } + for (auto& [child_name, child] : root->children) { + if (child->op == JsonFlatPath::OP_EXCLUDE || check_values.contains(child_name)) { + continue; + } + + if (child->children.empty()) { + DCHECK(child->op == JsonFlatPath::OP_INCLUDE); + auto col = _src_columns[child->index]; + if (!col->is_null(index)) { + DCHECK(flat_json::JSON_MERGE_FUNC.contains(child->type)); + auto func = flat_json::JSON_MERGE_FUNC.at(child->type); + func(builder, child_name, col, index); + } + } else if (child->op == JsonFlatPath::OP_IGNORE) { + // don't add level + _merge_json(child.get(), nullptr, builder, index, false); + } else if (child->op == JsonFlatPath::OP_ROOT) { + _merge_json(child.get(), nullptr, builder, index, true); + } else { + builder->add(child_name, vpack::Value(vpack::ValueType::Object)); + _merge_json(child.get(), nullptr, builder, index, true); + builder->close(); + } + } +} + +HyperJsonTransformer::HyperJsonTransformer(JsonPathDeriver& deriver) + : _dst_remain(deriver.has_remain_json()), _dst_paths(deriver.flat_paths()), _dst_types(deriver.flat_types()) { + for (size_t i = 0; i < _dst_paths.size(); i++) { + _dst_columns.emplace_back(ColumnHelper::create_column(TypeDescriptor(_dst_types[i]), true)); + } + + if (_dst_remain) { + _dst_columns.emplace_back(JsonColumn::create()); + } +} + +HyperJsonTransformer::HyperJsonTransformer(const std::vector& paths, const std::vector& types, + bool has_remain) + : _dst_remain(has_remain), _dst_paths(paths), _dst_types(types) { + for (size_t i = 0; i < _dst_paths.size(); i++) { + _dst_columns.emplace_back(ColumnHelper::create_column(TypeDescriptor(types[i]), true)); + } + + if (_dst_remain) { + _dst_columns.emplace_back(JsonColumn::create()); + } +} + +void HyperJsonTransformer::init_read_task(const std::vector& paths, const std::vector& types, + bool has_remain) { + DCHECK(_src_paths.empty()); + DCHECK(_src_types.empty()); + DCHECK(!paths.empty()); + DCHECK(!types.empty()); + + _src_paths.assign(paths.begin(), paths.end()); + _src_types.assign(types.begin(), types.end()); + _merge_tasks.clear(); + _flat_tasks.clear(); + + std::vector equals; + std::vector merges; + std::unordered_set check_dst; + DCHECK(!_dst_remain); + // equals & merge + for (size_t i = 0; i < _dst_paths.size(); i++) { + equals.clear(); + merges.clear(); + for (size_t j = 0; j < _src_paths.size(); j++) { + if (_dst_paths[i] == _src_paths[j]) { + equals.emplace_back(j); // equals +#ifdef NDEBUG + break; // must only one +#endif + } else if (_src_paths[j].starts_with(_dst_paths[i] + ".")) { + merges.emplace_back(j); + } + } + + DCHECK(equals.empty() || merges.empty()); + if (!equals.empty()) { + DCHECK_EQ(equals.size(), 1); + check_dst.emplace(i); + auto& mk = _merge_tasks.emplace_back(); + mk.is_merge = false; + mk.src_index.emplace_back(equals[0]); + mk.dst_index = i; + if (_dst_types[i] != 
_src_types[equals[0]]) { + mk.need_cast = true; + SlotDescriptor source_slot(i, "mock_solt", TypeDescriptor(_src_types[equals[0]])); + ColumnRef* col_ref = _pool.add(new ColumnRef(&source_slot)); + mk.cast_expr = VectorizedCastExprFactory::from_type(TypeDescriptor(_src_types[equals[0]]), + TypeDescriptor(_dst_types[i]), col_ref, &_pool); + } + } else if (!merges.empty()) { + check_dst.emplace(i); + if (_dst_types[i] != TYPE_JSON && !is_string_type(_dst_types[i])) { + continue; + } + auto& mk = _merge_tasks.emplace_back(); + mk.is_merge = true; + mk.src_index = merges; + mk.dst_index = i; + if (_dst_types[i] != TYPE_JSON) { + // must be to string, merge result must be string + mk.need_cast = true; + SlotDescriptor source_slot(i, "mock_solt", TypeDescriptor(TYPE_JSON)); + ColumnRef* col_ref = _pool.add(new ColumnRef(&source_slot)); + mk.cast_expr = VectorizedCastExprFactory::from_type(TypeDescriptor(TYPE_JSON), + TypeDescriptor(_dst_types[i]), col_ref, &_pool); + } + } + } + + std::vector flats; + for (size_t j = 0; j < _src_paths.size(); j++) { + flats.clear(); + for (size_t i = 0; i < _dst_paths.size(); i++) { +#ifdef NDEBUG + if (!check_dst.contains(i) && _dst_paths[i].starts_with(_src_paths[j] + ".")) { +#else + if (_dst_paths[i].starts_with(_src_paths[j] + ".")) { +#endif + flats.emplace_back(i); + DCHECK(!check_dst.contains(i)); + check_dst.emplace(i); + } + } + if (!flats.empty()) { + auto& fk = _flat_tasks.emplace_back(); + fk.src_index = j; + fk.dst_index = flats; + } + } + + if (has_remain && _dst_paths.size() != check_dst.size()) { + // must from remain + flats.clear(); + for (size_t i = 0; i < _dst_paths.size(); i++) { + if (!check_dst.contains(i)) { + flats.emplace_back(i); + DCHECK(!check_dst.contains(i)); + check_dst.emplace(i); + } + } + + auto& fk = _flat_tasks.emplace_back(); + fk.src_index = _src_paths.size(); + fk.dst_index = flats; + } + + for (auto& fk : _flat_tasks) { + std::vector p; + std::vector t; + for (auto& index : fk.dst_index) { + p.emplace_back(_dst_paths[index]); + t.emplace_back(_dst_types[index]); + } + fk.flattener = std::make_unique(p, t, false); + } + + for (auto& mk : _merge_tasks) { + if (mk.is_merge) { + std::vector p; + std::vector t; + for (auto& index : mk.src_index) { + p.emplace_back(_src_paths[index]); + t.emplace_back(_src_types[index]); + } + if (has_remain) { + mk.src_index.emplace_back(paths.size()); + } + mk.merger = std::make_unique(p, t, has_remain); + mk.merger->set_root_path(_dst_paths[mk.dst_index]); + } + } +} + +void HyperJsonTransformer::init_compaction_task(JsonColumn* column) { + DCHECK(column->is_flat_json()); + _src_paths.clear(); + _src_types.clear(); + _merge_tasks.clear(); + _flat_tasks.clear(); + + _src_paths.assign(column->flat_column_paths().begin(), column->flat_column_paths().end()); + _src_types.assign(column->flat_column_types().begin(), column->flat_column_types().end()); + + std::unordered_set _src_set(_src_paths.begin(), _src_paths.end()); + + // output remain, must put merge task at first + if (_dst_remain) { + _merge_tasks.emplace_back(); + _merge_tasks.back().dst_index = _dst_paths.size(); + _merge_tasks.back().is_merge = true; + } + + for (size_t j = 0; j < _src_paths.size(); j++) { + size_t i = 0; + for (; i < _dst_paths.size(); i++) { + if (_dst_paths[i] == _src_paths[j]) { + auto& mk = _merge_tasks.emplace_back(); + mk.is_merge = false; + mk.is_merge = false; + mk.src_index.emplace_back(j); + mk.dst_index = i; + if (_dst_types[i] != _src_types[j]) { + mk.need_cast = true; + SlotDescriptor source_slot(i, 
"mock_solt", TypeDescriptor(_src_types[j])); + ColumnRef* col_ref = _pool.add(new ColumnRef(&source_slot)); + mk.cast_expr = VectorizedCastExprFactory::from_type(TypeDescriptor(_src_types[j]), + TypeDescriptor(_dst_types[i]), col_ref, &_pool); + } + break; + } + } + + if (i >= _dst_paths.size() && _dst_remain) { + _merge_tasks[0].src_index.emplace_back(j); + } + } + + std::vector all_flat_paths; // for remove from remain + if (column->has_remain()) { + for (size_t i = 0; i < _dst_paths.size(); i++) { + if (_src_set.find(_dst_paths[i]) == _src_set.end()) { + // merge to remain + if (_flat_tasks.empty()) { + _flat_tasks.emplace_back(); + _flat_tasks.back().src_index = _src_paths.size(); + } + _flat_tasks.back().dst_index.emplace_back(i); + } + } + + for (size_t i = 0; i < _flat_tasks.size(); i++) { + auto& fk = _flat_tasks[i]; + std::vector p; + std::vector t; + for (auto& index : fk.dst_index) { + p.emplace_back(all_flat_paths[index]); + p.emplace_back(_dst_paths[index]); + t.emplace_back(_dst_types[index]); + } + fk.flattener = std::make_unique(p, t, false); + } + } + + if (_dst_remain) { + auto& mk = _merge_tasks[0]; + DCHECK(mk.is_merge); + std::vector p; + std::vector t; + + for (auto& index : mk.src_index) { + p.emplace_back(_src_paths[index]); + t.emplace_back(_src_types[index]); + } + mk.merger = std::make_unique(p, t, column->has_remain()); + mk.merger->set_exclude_paths(all_flat_paths); + if (column->has_remain()) { + _merge_tasks[0].src_index.emplace_back(_src_paths.size()); + } + } + + for (size_t i = 1; i < _merge_tasks.size(); i++) { + DCHECK(!_merge_tasks[i].is_merge); + } +} + +Status HyperJsonTransformer::trans(std::vector& columns) { + { + SCOPED_RAW_TIMER(&_cast_ms); + for (auto& task : _merge_tasks) { + if (!task.is_merge) { + RETURN_IF_ERROR(_equals(task, columns)); + } + } + } + { + SCOPED_RAW_TIMER(&_merge_ms); + for (auto& task : _merge_tasks) { + if (task.is_merge) { + RETURN_IF_ERROR(_merge(task, columns)); + } + } + } + { + SCOPED_RAW_TIMER(&_flat_ms); + for (auto& task : _flat_tasks) { + _flat(task, columns); + } + } + + size_t rows = columns[0]->size(); + for (size_t i = 0; i < _dst_columns.size() - 1; i++) { + if (_dst_columns[i]->size() == 0) { + _dst_columns[i]->resize(rows); + } else { + DCHECK_EQ(rows, _dst_columns[i]->size()); + } + } + return Status::OK(); +} + +Status HyperJsonTransformer::_equals(const MergeTask& task, std::vector& columns) { + DCHECK(task.src_index.size() == 1); + if (task.need_cast) { + auto col = columns[task.src_index[0]]; + return _cast(task, col); + } + _dst_columns[task.dst_index] = columns[task.src_index[0]]; + return Status::OK(); +} + +Status HyperJsonTransformer::_cast(const MergeTask& task, ColumnPtr& col) { + DCHECK(task.need_cast); + Chunk chunk; + chunk.append_column(col, task.dst_index); + ASSIGN_OR_RETURN(auto res, task.cast_expr->evaluate_checked(nullptr, &chunk)); + res->set_delete_state(col->delete_state()); + + if (res->only_null()) { + _dst_columns[task.dst_index]->append_nulls(col->size()); + } else if (res->is_constant()) { + auto data = down_cast(res.get())->data_column(); + _dst_columns[task.dst_index]->append_value_multiple_times(*data, 0, col->size()); + } else { + _dst_columns[task.dst_index].swap(res); + } + return Status::OK(); +} + +Status HyperJsonTransformer::_merge(const MergeTask& task, std::vector& columns) { + if (task.dst_index == _dst_paths.size()) { + DCHECK(_dst_remain); + // output to remain + if (task.src_index.size() == 1 && task.src_index[0] == _src_paths.size()) { + // only use remain + 
_dst_columns[task.dst_index] = columns[task.src_index[0]]; + return Status::OK(); + } + } + + std::vector cols; + for (auto& index : task.src_index) { + cols.emplace_back(columns[index]); + } + auto result = task.merger->merge(cols); + if (task.need_cast) { + return _cast(task, result); + } else { + _dst_columns[task.dst_index] = result; + } + return Status::OK(); +} + +void HyperJsonTransformer::_flat(const FlatTask& task, std::vector& columns) { + if (task.dst_index.empty()) { + return; + } + + if (task.src_index != _src_types.size() && _src_types[task.src_index] != LogicalType::TYPE_JSON) { + // not json column, don't need to flatten + for (size_t i = 0; i < task.dst_index.size(); i++) { + _dst_columns[task.dst_index[i]]->append_nulls(columns[0]->size()); + } + return; + } + + task.flattener->flatten(columns[task.src_index].get()); + auto result = task.flattener->mutable_result(); + + for (size_t i = 0; i < task.dst_index.size(); i++) { + _dst_columns[task.dst_index[i]] = result[i]; + } +} + +void HyperJsonTransformer::reset() { + _src_paths.clear(); + _src_types.clear(); + _merge_tasks.clear(); + _flat_tasks.clear(); + _pool.clear(); + for (size_t i = 0; i < _dst_columns.size(); i++) { + _dst_columns[i] = _dst_columns[i]->clone_empty(); + } +} + +std::vector HyperJsonTransformer::mutable_result() { + std::vector res; + for (size_t i = 0; i < _dst_columns.size(); i++) { + res.emplace_back(_dst_columns[i]); + _dst_columns[i] = _dst_columns[i]->clone_empty(); + } + return res; +} + +std::vector HyperJsonTransformer::cast_paths() const { + std::vector res; + for (auto& task : _merge_tasks) { + if (task.is_merge) { + continue; + } + for (auto& index : task.src_index) { + if (index == _src_paths.size()) { + res.emplace_back("remain"); + } else { + res.emplace_back(_src_paths[index]); + } + } + } + return res; +} + +std::vector HyperJsonTransformer::merge_paths() const { + std::vector res; + for (auto& task : _merge_tasks) { + if (!task.is_merge) { + continue; + } + for (auto& index : task.src_index) { + if (index == _src_paths.size()) { + res.emplace_back("remain"); + } else { + res.emplace_back(_src_paths[index]); + } + } + } + return res; +} + +std::vector HyperJsonTransformer::flat_paths() const { + std::vector res; + for (auto& task : _flat_tasks) { + if (task.src_index == _src_paths.size()) { + res.emplace_back("remain"); + } else { + res.emplace_back(_src_paths[task.src_index]); + } + } + return res; +} + } // namespace starrocks diff --git a/be/src/util/json_flattener.h b/be/src/util/json_flattener.h index 5fcec1b51acc94..fb1f7d40f60afc 100644 --- a/be/src/util/json_flattener.h +++ b/be/src/util/json_flattener.h @@ -17,45 +17,251 @@ #pragma once +#include +#include #include +#include #include #include +#include +#include #include #include "column/vectorized_fwd.h" +#include "common/object_pool.h" #include "common/status.h" +#include "common/statusor.h" +#include "exprs/expr.h" #include "types/logical_type.h" #include "velocypack/vpack.h" namespace starrocks { namespace vpack = arangodb::velocypack; -class JsonFlattener { +class JsonFlatPath { public: - static const uint8_t JSON_NULL_TYPE_BITS = 0xFC | 1; // must be the NULL type + using OP = uint8_t; + static const OP OP_INCLUDE = 0; + static const OP OP_EXCLUDE = 1; // for compaction remove extract json + static const OP OP_IGNORE = 2; // for merge and read middle json + static const OP OP_ROOT = 3; // to mark new root + // for express flat path + int index = -1; // flat paths array index, only use for leaf, to find column + 
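+    // logical type of the flattened column at this node; stays TYPE_JSON unless a narrower type is derived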
LogicalType type = LogicalType::TYPE_JSON; + bool remain = false; + OP op = OP_INCLUDE; // merge flat json use, to mark the path is need + std::unordered_map> children; + + JsonFlatPath() = default; + JsonFlatPath(JsonFlatPath&&) = default; + JsonFlatPath(const JsonFlatPath& rhs) = default; + ~JsonFlatPath() = default; + + // return the leaf node + static JsonFlatPath* normalize_from_path(const std::string& path, JsonFlatPath* root); + // set new root, other path will set to exclude, the node must include the root path + static void set_root(const std::string& new_root_path, JsonFlatPath* node); + +private: + static std::pair _split_path(const std::string& path); +}; + +// to deriver json flanttern path +class JsonPathDeriver { public: - JsonFlattener() = default; + JsonPathDeriver() = default; + JsonPathDeriver(const std::vector& paths, const std::vector& types, bool has_remain); + + ~JsonPathDeriver() = default; + + // dervie paths + void derived(const std::vector& json_datas); + + bool has_remain_json() const { return _has_remain; } + + std::shared_ptr& flat_path_root() { return _path_root; } + + const std::vector& flat_paths() const { return _paths; } + + const std::vector& flat_types() const { return _types; } + +private: + void _derived(const Column* json_data, size_t mark_row); + + void _finalize(); + + void _derived_on_flat_json(const std::vector& json_datas); + + void _visit_json_paths(vpack::Slice value, JsonFlatPath* root, size_t mark_row); + +private: + struct JsonFlatDesc { + // json compatible type + uint8_t type = 255; // JSON_NULL_TYPE_BITS + // column path hit count, some json may be null or none, so hit use to record the actual value + // e.g: {"a": 1, "b": 2}, path "$.c" not exist, so hit is 0 + uint64_t hits = 0; + // how many rows need to be cast to a compatible type + uint16_t casts = 0; + + // for json-uint, json-uint is uint64_t, check the maximum value and downgrade to bigint + uint64_t max = 0; + + // same key may appear many times in json, so we need avoid duplicate compute hits + uint64_t last_row = -1; + uint64_t multi_times = 0; + }; + + bool _has_remain = false; + std::vector _paths; + std::vector _types; + + size_t _total_rows; + std::unordered_map _derived_maps; + std::shared_ptr _path_root; +}; + +// flattern JsonColumn to flat json A,B,C +class JsonFlattener { +public: + JsonFlattener(JsonPathDeriver& deriver); + + JsonFlattener(const std::vector& paths, const std::vector& types, bool has_remain); ~JsonFlattener() = default; - JsonFlattener(std::vector& paths); + // flatten without flat json, input must not flat json + void flatten(const Column* json_column); + + std::vector mutable_result(); + +private: + template + void _flatten(const Column* json_column, const JsonColumn* json_data); - JsonFlattener(std::vector& paths, const std::vector& types); + template + bool _flatten_json(const vpack::Slice& value, const JsonFlatPath* root, vpack::Builder* builder, + uint32_t* flat_hit); - void flatten(const Column* json_column, std::vector* result); +private: + bool _has_remain = false; + // note: paths may be less 1 than flat columns + std::vector _dst_paths; - void derived_paths(std::vector& json_datas); + std::vector _flat_columns; + JsonColumn* _remain; + std::shared_ptr _dst_root; +}; - std::vector& get_flat_paths() { return _flat_paths; } +// merge flat json A,B,C to JsonColumn +class JsonMerger { +public: + ~JsonMerger() = default; - std::vector get_flat_types(); + JsonMerger(const std::vector& paths, const std::vector& types, bool has_remain = false); - 
static uint8_t get_compatibility_type(vpack::ValueType type1, uint8_t type2); + void set_root_path(const std::string& base_path); + + void set_exclude_paths(const std::vector& exclude_paths); + + // input nullable-json, output none null json + ColumnPtr merge(const std::vector& columns); + +private: + template + void _merge_json(const JsonFlatPath* root, const vpack::Slice* remain, vpack::Builder* builder, size_t index, + bool in_tree); + +private: + bool _has_remain = false; + std::shared_ptr _src_root; + std::vector _src_columns; + + ColumnPtr _result; + JsonColumn* _json_result; +}; + +// use for read json column and compaction +// +// handle: +// flatten json (A, B, C, remain) column to flat json column to (A, B, C, D, remain) +// some flat, some merge, some extract... +class HyperJsonTransformer { +public: + HyperJsonTransformer(JsonPathDeriver& deriver); + + HyperJsonTransformer(const std::vector& paths, const std::vector& types, bool has_remain); + + ~HyperJsonTransformer() = default; + + // init for read process + void init_read_task(const std::vector& paths, const std::vector& types, bool has_remain); + + // init for compaction + void init_compaction_task(JsonColumn* column); + + Status trans(std::vector& columns); + + std::vector& result() { return _dst_columns; } + + std::vector mutable_result(); + + void reset(); + + std::vector cast_paths() const; + + std::vector merge_paths() const; + + std::vector flat_paths() const; + + // cast, merge, flat + std::tuple cost_ms() const { return {_cast_ms, _merge_ms, _flat_ms}; }; private: - std::vector _flat_paths; - std::vector _flat_types; - std::unordered_map _flat_index; + // equals or merge, dst-src: 1-N/1-1 + struct MergeTask { + // to avoid create column with find type + Expr* cast_expr; + + bool is_merge = false; + bool need_cast = false; + + int dst_index = -1; + std::vector src_index; + + std::unique_ptr deriver; + std::unique_ptr merger; + }; + + // flat, dst-src: N-1 + struct FlatTask { + std::vector dst_index; + int src_index = -1; + + std::unique_ptr flattener; + }; + + Status _equals(const MergeTask& task, std::vector& columns); + Status _cast(const MergeTask& task, ColumnPtr& columns); + Status _merge(const MergeTask& task, std::vector& columns); + void _flat(const FlatTask& task, std::vector& columns); + +private: + bool _dst_remain = false; + std::vector _dst_paths; + std::vector _dst_types; + std::vector _dst_columns; + + std::vector _src_paths; + std::vector _src_types; + std::vector _merge_tasks; + std::vector _flat_tasks; + ObjectPool _pool; + + int64_t _cast_ms = 0; + int64_t _flat_ms = 0; + int64_t _merge_ms = 0; }; + } // namespace starrocks \ No newline at end of file diff --git a/be/test/CMakeLists.txt b/be/test/CMakeLists.txt index ad7abe6a5803f6..c1d617c3a68587 100644 --- a/be/test/CMakeLists.txt +++ b/be/test/CMakeLists.txt @@ -410,6 +410,7 @@ set(EXEC_FILES ./util/filesystem_util_test.cpp ./util/frame_of_reference_coding_test.cpp ./util/json_util_test.cpp + ./util/json_flattener_test.cpp ./util/md5_test.cpp ./util/monotime_test.cpp ./util/mysql_row_buffer_test.cpp diff --git a/be/test/exprs/flat_json_functions_test.cpp b/be/test/exprs/flat_json_functions_test.cpp index ac00e22b5e48f8..d7d31599094c8c 100644 --- a/be/test/exprs/flat_json_functions_test.cpp +++ b/be/test/exprs/flat_json_functions_test.cpp @@ -68,14 +68,10 @@ TEST_P(FlatJsonQueryTestFixture2, flat_json_query) { auto flat_json = JsonColumn::create(); auto flat_json_ptr = flat_json.get(); - std::vector full_paths; - for (const auto& p : 
param_flat_path) { - full_paths.emplace_back(p); - } - flat_json_ptr->init_flat_columns(full_paths, param_flat_type); - JsonFlattener jf(param_flat_path, param_flat_type); - jf.flatten(json_col.get(), &flat_json_ptr->get_flat_fields()); + JsonFlattener jf(param_flat_path, param_flat_type, false); + jf.flatten(json_col.get()); + flat_json_ptr->set_flat_columns(param_flat_path, param_flat_type, jf.mutable_result()); Columns columns{flat_json, builder.build(true)}; @@ -171,14 +167,15 @@ TEST_P(FlatJsonQueryErrorTestFixture, json_query) { auto flat_json = JsonColumn::create(); auto flat_json_ptr = flat_json.get(); - std::vector full_paths; - for (const auto& p : param_flat_path) { - full_paths.emplace_back(p); + + std::vector param_flat_type; + for (auto _ : param_flat_path) { + param_flat_type.emplace_back(LogicalType::TYPE_JSON); } - flat_json_ptr->init_flat_columns(full_paths); - JsonFlattener jf(param_flat_path); - jf.flatten(json_col.get(), &flat_json_ptr->get_flat_fields()); + JsonFlattener jf(param_flat_path, param_flat_type, false); + jf.flatten(json_col.get()); + flat_json_ptr->set_flat_columns(param_flat_path, param_flat_type, jf.mutable_result()); Columns columns{flat_json, builder.build(true)}; @@ -227,14 +224,9 @@ TEST_P(FlatJsonExistsTestFixture2, flat_json_exists_test) { auto flat_json = JsonColumn::create(); auto* flat_json_ptr = down_cast(flat_json.get()); - std::vector full_paths; - for (const auto& p : param_flat_path) { - full_paths.emplace_back(p); - } - flat_json_ptr->init_flat_columns(full_paths, param_flat_type); - - JsonFlattener jf(param_flat_path, param_flat_type); - jf.flatten(json_col.get(), &flat_json_ptr->get_flat_fields()); + JsonFlattener jf(param_flat_path, param_flat_type, false); + jf.flatten(json_col.get()); + flat_json_ptr->set_flat_columns(param_flat_path, param_flat_type, jf.mutable_result()); Columns columns; columns.push_back(flat_json); @@ -312,14 +304,9 @@ TEST_P(FlatJsonLengthTestFixture2, flat_json_length_test) { auto flat_json = JsonColumn::create(); auto* flat_json_ptr = down_cast(flat_json.get()); - std::vector full_paths; - for (const auto& p : param_flat_path) { - full_paths.emplace_back(p); - } - flat_json_ptr->init_flat_columns(full_paths, param_flat_type); - - JsonFlattener jf(param_flat_path, param_flat_type); - jf.flatten(json_col.get(), &flat_json_ptr->get_flat_fields()); + JsonFlattener jf(param_flat_path, param_flat_type, false); + jf.flatten(json_col.get()); + flat_json_ptr->set_flat_columns(param_flat_path, param_flat_type, jf.mutable_result()); Columns columns; columns.push_back(flat_json); @@ -379,16 +366,12 @@ TEST_P(FlatJsonKeysTestFixture2, json_keys) { auto flat_json = JsonColumn::create(); auto flat_json_ptr = flat_json.get(); - std::vector full_paths; - for (const auto& p : param_flat_path) { - full_paths.emplace_back(p); - } Columns columns{flat_json, builder.build(true)}; - flat_json_ptr->init_flat_columns(full_paths, param_flat_type); - JsonFlattener jf(param_flat_path, param_flat_type); - jf.flatten(json_column.get(), &flat_json_ptr->get_flat_fields()); + JsonFlattener jf(param_flat_path, param_flat_type, false); + jf.flatten(json_column.get()); + flat_json_ptr->set_flat_columns(param_flat_path, param_flat_type, jf.mutable_result()); Status st = JsonFunctions::native_json_path_prepare(ctx.get(), FunctionContext::FunctionStateScope::FRAGMENT_LOCAL); ASSERT_OK(st); @@ -457,10 +440,9 @@ class FlatGetJsonXXXTestFixture2 : public ::testing::TestWithParam(flat_json.get()); - flat_json_ptr->init_flat_columns(flat_path, 
flat_type); - - JsonFlattener jf(flat_path, flat_type); - jf.flatten(ints.get(), &flat_json_ptr->get_flat_fields()); + JsonFlattener jf(flat_path, flat_type, false); + jf.flatten(ints.get()); + flat_json_ptr->set_flat_columns(flat_path, flat_type, jf.mutable_result()); Columns columns{flat_json, builder.build(true)}; @@ -618,13 +600,11 @@ TEST_P(FlatJsonDeriverPaths, flat_json_path_test) { ASSERT_TRUE(json2.ok()); json_column->append(&*json2); - Columns columns{json_column}; - JsonFlattener jf; - config::json_flat_internal_column_min_limit = 0; - jf.derived_paths(columns); - config::json_flat_internal_column_min_limit = 5; - std::vector path = jf.get_flat_paths(); - std::vector type = jf.get_flat_types(); + std::vector columns{json_column.get()}; + JsonPathDeriver jf; + jf.derived(columns); + std::vector path = jf.flat_paths(); + std::vector type = jf.flat_types(); ASSERT_EQ(param_flat_path, path); ASSERT_EQ(param_flat_type, type); @@ -635,15 +615,15 @@ INSTANTIATE_TEST_SUITE_P(FlatJsonPathDeriver, FlatJsonDeriverPaths, ::testing::Values( std::make_tuple(R"({ "k1": 1, "k2": 2 })", R"({ "k1": 3, "k2": 4 })", std::vector {"k1", "k2"}, std::vector {TYPE_BIGINT, TYPE_BIGINT}), std::make_tuple(R"({ "k1": "v1" })", R"({ "k1": "v33" })", std::vector {"k1"}, std::vector {TYPE_VARCHAR}), - std::make_tuple(R"({ "k1": {"k2": 1} })", R"({ "k1": 123 })", std::vector {"k1"}, std::vector {TYPE_JSON}), + std::make_tuple(R"({ "k1": {"k2": 1} })", R"({ "k1": 123 })", std::vector {}, std::vector {}), std::make_tuple(R"({ "k1": "v1" })", R"({ "k1": 1.123 })", std::vector {"k1"}, std::vector {TYPE_JSON}), - std::make_tuple(R"({ "k1": {"k2": 1} })", R"({ "k1": 1.123 })", std::vector {"k1"}, std::vector {TYPE_JSON}), + std::make_tuple(R"({ "k1": {"k2": 1} })", R"({ "k1": 1.123 })", std::vector {}, std::vector {}), std::make_tuple(R"({ "k1": [1,2,3] })", R"({ "k1": "v33" })", std::vector {"k1"}, std::vector {TYPE_JSON}), std::make_tuple(R"({ "k1": "v1", "k2": [3,4,5], "k3": 1, "k4": 1.2344 })", - R"({ "k1": "abc", "k2": [11,123,54], "k3": 23423, "k4": 1.2344 })", - std::vector {"k3", "k4", "k1", "k2"}, - std::vector {TYPE_BIGINT, TYPE_DOUBLE, TYPE_VARCHAR, TYPE_JSON}), + R"({ "k1": "abc", "k2": [11,123,54], "k3": 23423, "k4": 1.2344 })", + std::vector {"k1", "k2", "k3", "k4"}, + std::vector {TYPE_VARCHAR, TYPE_JSON, TYPE_BIGINT, TYPE_DOUBLE}), std::make_tuple(R"({ "k1": 1, "k2": "a" })", R"({ "k1": 3, "k2": null })", std::vector {"k1", "k2"}, std::vector {TYPE_BIGINT, TYPE_JSON}), std::make_tuple(R"({ "k1": 1, "k2": 2 })", R"({ "k1": 3, "k2": 4 })", std::vector {"k1", "k2"}, std::vector {TYPE_BIGINT, TYPE_BIGINT}), diff --git a/be/test/exprs/json_functions_test.cpp b/be/test/exprs/json_functions_test.cpp index e9b7f47e37a0a6..dd6613a8baa478 100644 --- a/be/test/exprs/json_functions_test.cpp +++ b/be/test/exprs/json_functions_test.cpp @@ -20,6 +20,7 @@ #include #include +#include #include "butil/time.h" #include "column/const_column.h" @@ -490,14 +491,13 @@ TEST_P(FlatJsonQueryTestFixture, json_query) { auto flat_json = JsonColumn::create(); auto flat_json_ptr = flat_json.get(); - std::vector full_paths; - for (const auto& p : param_flat_path) { - full_paths.emplace_back(p); + std::vector param_flat_type; + for (auto _ : param_flat_path) { + param_flat_type.emplace_back(LogicalType::TYPE_JSON); } - flat_json_ptr->init_flat_columns(full_paths); - - JsonFlattener jf(param_flat_path); - jf.flatten(json_col.get(), &flat_json_ptr->get_flat_fields()); + JsonFlattener jf(param_flat_path, param_flat_type, false); + 
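+    // flatten() extracts param_flat_path from json_col; the resulting flat columns are attached via set_flat_columns() below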
jf.flatten(json_col.get()); + flat_json_ptr->set_flat_columns(param_flat_path, param_flat_type, jf.mutable_result()); Columns columns{flat_json, builder.build(true)}; @@ -682,14 +682,13 @@ TEST_P(FlatJsonExistsTestFixture, flat_json_exists_test) { auto flat_json = JsonColumn::create(); auto* flat_json_ptr = down_cast(flat_json.get()); - std::vector full_paths; - for (const auto& p : param_flat_path) { - full_paths.emplace_back(p); + std::vector param_flat_type; + for (auto _ : param_flat_path) { + param_flat_type.emplace_back(LogicalType::TYPE_JSON); } - flat_json_ptr->init_flat_columns(full_paths); - - JsonFlattener jf(param_flat_path); - jf.flatten(json_col.get(), &flat_json_ptr->get_flat_fields()); + JsonFlattener jf(param_flat_path, param_flat_type, false); + jf.flatten(json_col.get()); + flat_json_ptr->set_flat_columns(param_flat_path, param_flat_type, jf.mutable_result()); Columns columns; columns.push_back(flat_json); @@ -758,14 +757,13 @@ TEST_F(JsonFunctionsTest, flat_json_invalid_path_test) { auto flat_json = JsonColumn::create(); auto* flat_json_ptr = down_cast(flat_json.get()); - std::vector full_paths; - for (const auto& p : param_flat_path) { - full_paths.emplace_back(p); + std::vector param_flat_type; + for (auto _ : param_flat_path) { + param_flat_type.emplace_back(LogicalType::TYPE_JSON); } - flat_json_ptr->init_flat_columns(full_paths); - - JsonFlattener jf(param_flat_path); - jf.flatten(json_col.get(), &flat_json_ptr->get_flat_fields()); + JsonFlattener jf(param_flat_path, param_flat_type, false); + jf.flatten(json_col.get()); + flat_json_ptr->set_flat_columns(param_flat_path, param_flat_type, jf.mutable_result()); Columns columns; columns.push_back(flat_json); @@ -806,14 +804,13 @@ TEST_F(JsonFunctionsTest, flat_json_invalid_constant_json_test) { auto flat_json = JsonColumn::create(); auto* flat_json_ptr = down_cast(flat_json.get()); - std::vector full_paths; - for (const auto& p : param_flat_path) { - full_paths.emplace_back(p); + std::vector param_flat_type; + for (auto _ : param_flat_path) { + param_flat_type.emplace_back(LogicalType::TYPE_JSON); } - flat_json_ptr->init_flat_columns(full_paths); - - JsonFlattener jf(param_flat_path); - jf.flatten(json_col.get(), &flat_json_ptr->get_flat_fields()); + JsonFlattener jf(param_flat_path, param_flat_type, false); + jf.flatten(json_col.get()); + flat_json_ptr->set_flat_columns(param_flat_path, param_flat_type, jf.mutable_result()); Columns columns; columns.push_back(ConstColumn::create(flat_json, 2)); @@ -852,14 +849,13 @@ TEST_F(JsonFunctionsTest, flat_json_variable_path_test) { auto flat_json = JsonColumn::create(); auto* flat_json_ptr = down_cast(flat_json.get()); - std::vector full_paths; - for (const auto& p : param_flat_path) { - full_paths.emplace_back(p); + std::vector param_flat_type; + for (auto _ : param_flat_path) { + param_flat_type.emplace_back(LogicalType::TYPE_JSON); } - flat_json_ptr->init_flat_columns(full_paths); - - JsonFlattener jf(param_flat_path); - jf.flatten(json_col.get(), &flat_json_ptr->get_flat_fields()); + JsonFlattener jf(param_flat_path, param_flat_type, false); + jf.flatten(json_col.get()); + flat_json_ptr->set_flat_columns(param_flat_path, param_flat_type, jf.mutable_result()); Columns columns; columns.push_back(flat_json); @@ -899,14 +895,13 @@ TEST_F(JsonFunctionsTest, flat_json_invalid_variable_path_test) { auto flat_json = JsonColumn::create(); auto* flat_json_ptr = down_cast(flat_json.get()); - std::vector full_paths; - for (const auto& p : param_flat_path) { - 
full_paths.emplace_back(p); + std::vector param_flat_type; + for (auto _ : param_flat_path) { + param_flat_type.emplace_back(LogicalType::TYPE_JSON); } - flat_json_ptr->init_flat_columns(full_paths); - - JsonFlattener jf(param_flat_path); - jf.flatten(json_col.get(), &flat_json_ptr->get_flat_fields()); + JsonFlattener jf(param_flat_path, param_flat_type, false); + jf.flatten(json_col.get()); + flat_json_ptr->set_flat_columns(param_flat_path, param_flat_type, jf.mutable_result()); Columns columns; columns.push_back(flat_json); @@ -948,14 +943,13 @@ TEST_F(JsonFunctionsTest, flat_json_invalid_null_path_test) { auto flat_json = JsonColumn::create(); auto* flat_json_ptr = down_cast(flat_json.get()); - std::vector full_paths; - for (const auto& p : param_flat_path) { - full_paths.emplace_back(p); + std::vector param_flat_type; + for (auto _ : param_flat_path) { + param_flat_type.emplace_back(LogicalType::TYPE_JSON); } - flat_json_ptr->init_flat_columns(full_paths); - - JsonFlattener jf(param_flat_path); - jf.flatten(json_col.get(), &flat_json_ptr->get_flat_fields()); + JsonFlattener jf(param_flat_path, param_flat_type, false); + jf.flatten(json_col.get()); + flat_json_ptr->set_flat_columns(param_flat_path, param_flat_type, jf.mutable_result()); Columns columns; columns.push_back(flat_json); @@ -996,14 +990,13 @@ TEST_F(JsonFunctionsTest, flat_json_constant_path_test) { auto flat_json = JsonColumn::create(); auto* flat_json_ptr = down_cast(flat_json.get()); - std::vector full_paths; - for (const auto& p : param_flat_path) { - full_paths.emplace_back(p); + std::vector param_flat_type; + for (auto _ : param_flat_path) { + param_flat_type.emplace_back(LogicalType::TYPE_JSON); } - flat_json_ptr->init_flat_columns(full_paths); - - JsonFlattener jf(param_flat_path); - jf.flatten(json_col.get(), &flat_json_ptr->get_flat_fields()); + JsonFlattener jf(param_flat_path, param_flat_type, false); + jf.flatten(json_col.get()); + flat_json_ptr->set_flat_columns(param_flat_path, param_flat_type, jf.mutable_result()); Columns columns; columns.push_back(flat_json); @@ -1306,14 +1299,13 @@ TEST_P(FlatJsonLengthTestFixture, flat_json_length_test) { auto flat_json = JsonColumn::create(); auto* flat_json_ptr = down_cast(flat_json.get()); - std::vector full_paths; - for (const auto& p : param_flat_path) { - full_paths.emplace_back(p); + std::vector param_flat_type; + for (auto _ : param_flat_path) { + param_flat_type.emplace_back(LogicalType::TYPE_JSON); } - flat_json_ptr->init_flat_columns(full_paths); - - JsonFlattener jf(param_flat_path); - jf.flatten(json_col.get(), &flat_json_ptr->get_flat_fields()); + JsonFlattener jf(param_flat_path, param_flat_type, false); + jf.flatten(json_col.get()); + flat_json_ptr->set_flat_columns(param_flat_path, param_flat_type, jf.mutable_result()); Columns columns; columns.push_back(flat_json); diff --git a/be/test/storage/rowset/flat_json_column_rw_test.cpp b/be/test/storage/rowset/flat_json_column_rw_test.cpp index c2b0566a99c047..a65af57eafb7c9 100644 --- a/be/test/storage/rowset/flat_json_column_rw_test.cpp +++ b/be/test/storage/rowset/flat_json_column_rw_test.cpp @@ -14,6 +14,9 @@ #include +#include +#include + #include "column/column_access_path.h" #include "column/json_column.h" #include "column/nullable_column.h" @@ -48,7 +51,7 @@ class FlatJsonColumnRWTest : public testing::Test { ~FlatJsonColumnRWTest() override = default; protected: - void SetUp() override {} + void SetUp() override { _meta.reset(new ColumnMetaPB()); } void TearDown() override {} @@ -56,13 +59,13 @@ class 
FlatJsonColumnRWTest : public testing::Test { return std::make_shared(fs, FileInfo{fname}, 1, _dummy_segment_schema, nullptr); } - void test_json(const std::string& case_file, ColumnPtr& write_col, ColumnPtr& read_col, ColumnAccessPath* path) { + void test_json(ColumnWriterOptions& writer_opts, const std::string& case_file, ColumnPtr& write_col, + ColumnPtr& read_col, ColumnAccessPath* path) { auto fs = std::make_shared(); ASSERT_TRUE(fs->create_dir(TEST_DIR).ok()); TabletColumn json_tablet_column = create_with_default_value(""); TypeInfoPtr type_info = get_type_info(json_tablet_column); - ColumnMetaPB meta; const std::string fname = TEST_DIR + case_file; auto segment = create_dummy_segment(fs, fname); @@ -71,8 +74,7 @@ class FlatJsonColumnRWTest : public testing::Test { { ASSIGN_OR_ABORT(auto wfile, fs->new_writable_file(fname)); - ColumnWriterOptions writer_opts; - writer_opts.meta = &meta; + writer_opts.meta = _meta.get(); writer_opts.meta->set_column_id(0); writer_opts.meta->set_unique_id(0); writer_opts.meta->set_type(TYPE_JSON); @@ -96,7 +98,7 @@ class FlatJsonColumnRWTest : public testing::Test { } LOG(INFO) << "Finish writing"; - auto res = ColumnReader::create(&meta, segment.get(), nullptr); + auto res = ColumnReader::create(_meta.get(), segment.get(), nullptr); ASSERT_TRUE(res.ok()); auto reader = std::move(res).value(); @@ -122,11 +124,74 @@ class FlatJsonColumnRWTest : public testing::Test { private: std::shared_ptr _dummy_segment_schema; + std::shared_ptr _meta; }; -TEST_F(FlatJsonColumnRWTest, testNormalFlatJson) { - config::json_flat_internal_column_min_limit = 1; +TEST_F(FlatJsonColumnRWTest, testNormalJson) { + ColumnPtr write_col = JsonColumn::create(); + auto* json_col = down_cast(write_col.get()); + + ASSIGN_OR_ABORT(auto jv1, JsonValue::parse("{\"a\": 1, \"b\": 21}")); + ASSIGN_OR_ABORT(auto jv2, JsonValue::parse("{\"a\": 2, \"b\": 22}")); + ASSIGN_OR_ABORT(auto jv3, JsonValue::parse("{\"a\": 3, \"b\": 23}")); + ASSIGN_OR_ABORT(auto jv4, JsonValue::parse("{\"a\": 4, \"b\": 24}")); + ASSIGN_OR_ABORT(auto jv5, JsonValue::parse("{\"a\": 5, \"b\": 25}")); + + json_col->append(&jv1); + json_col->append(&jv2); + json_col->append(&jv3); + json_col->append(&jv4); + json_col->append(&jv5); + + ColumnPtr read_col = JsonColumn::create(); + ColumnWriterOptions writer_opts; + writer_opts.need_flat = false; + test_json(writer_opts, "/test_flat_json_rw1.data", write_col, read_col, nullptr); + + auto* read_json = down_cast(read_col.get()); + EXPECT_FALSE(read_json->is_flat_json()); + EXPECT_EQ(5, read_json->size()); + EXPECT_EQ(0, read_json->get_flat_fields().size()); + EXPECT_EQ("{\"a\": 1, \"b\": 21}", read_json->debug_item(0)); + EXPECT_EQ("{\"a\": 4, \"b\": 24}", read_json->debug_item(3)); +} + +TEST_F(FlatJsonColumnRWTest, testNormalJsonWithPath) { + ColumnPtr write_col = JsonColumn::create(); + auto* json_col = down_cast(write_col.get()); + + ASSIGN_OR_ABORT(auto jv1, JsonValue::parse("{\"a\": 1, \"b\": 21}")); + ASSIGN_OR_ABORT(auto jv2, JsonValue::parse("{\"a\": 2, \"b\": 22}")); + ASSIGN_OR_ABORT(auto jv3, JsonValue::parse("{\"a\": 3, \"b\": 23}")); + ASSIGN_OR_ABORT(auto jv4, JsonValue::parse("{\"a\": 4, \"b\": 24}")); + ASSIGN_OR_ABORT(auto jv5, JsonValue::parse("{\"a\": 5, \"b\": 25}")); + + json_col->append(&jv1); + json_col->append(&jv2); + json_col->append(&jv3); + json_col->append(&jv4); + json_col->append(&jv5); + + ASSIGN_OR_ABORT(auto root_path, ColumnAccessPath::create(TAccessPathType::FIELD, "root", 0)); + ASSIGN_OR_ABORT(auto f1_path, 
ColumnAccessPath::create(TAccessPathType::FIELD, "a", 0)); + ASSIGN_OR_ABORT(auto f2_path, ColumnAccessPath::create(TAccessPathType::FIELD, "b", 0)); + root_path->children().emplace_back(std::move(f1_path)); + root_path->children().emplace_back(std::move(f2_path)); + + ColumnPtr read_col = JsonColumn::create(); + ColumnWriterOptions writer_opts; + writer_opts.need_flat = false; + test_json(writer_opts, "/test_flat_json_rw1.data", write_col, read_col, root_path.get()); + + auto* read_json = down_cast(read_col.get()); + EXPECT_TRUE(read_json->is_flat_json()); + EXPECT_EQ(5, read_json->size()); + EXPECT_EQ(2, read_json->get_flat_fields().size()); + EXPECT_EQ("{a: 1, b: 21}", read_json->debug_item(0)); + EXPECT_EQ("{a: 4, b: 24}", read_json->debug_item(3)); +} +TEST_F(FlatJsonColumnRWTest, testNormalFlatJsonWithPath) { ColumnPtr write_col = JsonColumn::create(); auto* json_col = down_cast(write_col.get()); @@ -149,7 +214,9 @@ TEST_F(FlatJsonColumnRWTest, testNormalFlatJson) { root_path->children().emplace_back(std::move(f2_path)); ColumnPtr read_col = JsonColumn::create(); - test_json("/test_flat_json_rw1.data", write_col, read_col, root_path.get()); + ColumnWriterOptions writer_opts; + writer_opts.need_flat = true; + test_json(writer_opts, "/test_flat_json_rw1.data", write_col, read_col, root_path.get()); auto* read_json = down_cast(read_col.get()); EXPECT_TRUE(read_json->is_flat_json()); @@ -161,9 +228,37 @@ TEST_F(FlatJsonColumnRWTest, testNormalFlatJson) { EXPECT_EQ("3", read_json->get_flat_field("a")->debug_item(2)); } -TEST_F(FlatJsonColumnRWTest, testNullFlatJson) { - config::json_flat_internal_column_min_limit = 1; +TEST_F(FlatJsonColumnRWTest, testNormalFlatJsonWithoutPath) { + ColumnPtr write_col = JsonColumn::create(); + auto* json_col = down_cast(write_col.get()); + + ASSIGN_OR_ABORT(auto jv1, JsonValue::parse("{\"a\": 1, \"b\": 21}")); + ASSIGN_OR_ABORT(auto jv2, JsonValue::parse("{\"a\": 2, \"b\": 22}")); + ASSIGN_OR_ABORT(auto jv3, JsonValue::parse("{\"a\": 3, \"b\": 23}")); + ASSIGN_OR_ABORT(auto jv4, JsonValue::parse("{\"a\": 4, \"b\": 24}")); + ASSIGN_OR_ABORT(auto jv5, JsonValue::parse("{\"a\": 5, \"b\": 25}")); + json_col->append(&jv1); + json_col->append(&jv2); + json_col->append(&jv3); + json_col->append(&jv4); + json_col->append(&jv5); + + ColumnPtr read_col = JsonColumn::create(); + ColumnWriterOptions writer_opts; + writer_opts.need_flat = true; + test_json(writer_opts, "/test_flat_json_rw1.data", write_col, read_col, nullptr); + + auto* read_json = down_cast(read_col.get()); + EXPECT_FALSE(read_json->is_flat_json()); + EXPECT_EQ(5, read_json->size()); + ASSERT_EQ(0, read_json->get_flat_fields().size()); + EXPECT_EQ("{\"a\": 1, \"b\": 21}", read_json->debug_item(0)); + EXPECT_EQ("{\"a\": 4, \"b\": 24}", read_json->debug_item(3)); +} + +TEST_F(FlatJsonColumnRWTest, testNullNormalFlatJson) { + config::json_flat_null_factor = 0.4; ColumnPtr write_col = JsonColumn::create(); auto* json_col = down_cast(write_col.get()); @@ -195,7 +290,9 @@ TEST_F(FlatJsonColumnRWTest, testNullFlatJson) { root_path->children().emplace_back(std::move(f2_path)); ColumnPtr read_col = NullableColumn::create(JsonColumn::create(), NullColumn::create()); - test_json("/test_flat_json_rw2.data", write_nl_col, read_col, root_path.get()); + ColumnWriterOptions writer_opts; + writer_opts.need_flat = true; + test_json(writer_opts, "/test_flat_json_rw2.data", write_nl_col, read_col, root_path.get()); auto* read_json = down_cast(down_cast(read_col.get())->data_column().get()); 
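+    // reading through the access path (a, b) should still yield a flat JSON column from the nullable source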
EXPECT_TRUE(read_json->is_flat_json()); @@ -204,13 +301,11 @@ TEST_F(FlatJsonColumnRWTest, testNullFlatJson) { EXPECT_EQ("{a: 5, b: 25}", read_col->debug_item(4)); } -TEST_F(FlatJsonColumnRWTest, testLimitFlatJson) { - config::json_flat_internal_column_min_limit = 5; - +TEST_F(FlatJsonColumnRWTest, tesArrayFlatJson) { ColumnPtr write_col = JsonColumn::create(); auto* json_col = down_cast(write_col.get()); - ASSIGN_OR_ABORT(auto jv1, JsonValue::parse("{\"a\": 1, \"b\": 21}")); + ASSIGN_OR_ABORT(auto jv1, JsonValue::parse(R"([{"a": 1}, {"b": 21}] )")); ASSIGN_OR_ABORT(auto jv2, JsonValue::parse("{\"a\": 2, \"b\": 22}")); ASSIGN_OR_ABORT(auto jv3, JsonValue::parse("{\"a\": 3, \"b\": 23}")); ASSIGN_OR_ABORT(auto jv4, JsonValue::parse("{\"a\": 4, \"b\": 24}")); @@ -229,24 +324,23 @@ TEST_F(FlatJsonColumnRWTest, testLimitFlatJson) { root_path->children().emplace_back(std::move(f2_path)); ColumnPtr read_col = JsonColumn::create(); - test_json("/test_flat_json_rw3.data", write_col, read_col, root_path.get()); + ColumnWriterOptions writer_opts; + writer_opts.need_flat = true; + test_json(writer_opts, "/test_flat_json_rw3.data", write_col, read_col, root_path.get()); auto* read_json = down_cast(read_col.get()); EXPECT_TRUE(read_json->is_flat_json()); EXPECT_EQ(5, read_json->size()); ASSERT_EQ(2, read_json->get_flat_fields().size()); - EXPECT_EQ("{a: 1, b: 21}", read_json->debug_item(0)); + EXPECT_EQ("{a: NULL, b: NULL}", read_json->debug_item(0)); EXPECT_EQ("{a: 4, b: 24}", read_json->debug_item(3)); - EXPECT_EQ("3", read_json->get_flat_field("a")->debug_item(2)); } -TEST_F(FlatJsonColumnRWTest, tesArrayFlatJson) { - config::json_flat_internal_column_min_limit = 5; - +TEST_F(FlatJsonColumnRWTest, testEmptyFlatObject) { ColumnPtr write_col = JsonColumn::create(); auto* json_col = down_cast(write_col.get()); - ASSIGN_OR_ABORT(auto jv1, JsonValue::parse(R"( [{"a": 1}, {"b": 21}] )")); + ASSIGN_OR_ABORT(auto jv1, JsonValue::parse(R"("" )")); ASSIGN_OR_ABORT(auto jv2, JsonValue::parse("{\"a\": 2, \"b\": 22}")); ASSIGN_OR_ABORT(auto jv3, JsonValue::parse("{\"a\": 3, \"b\": 23}")); ASSIGN_OR_ABORT(auto jv4, JsonValue::parse("{\"a\": 4, \"b\": 24}")); @@ -264,8 +358,11 @@ TEST_F(FlatJsonColumnRWTest, tesArrayFlatJson) { root_path->children().emplace_back(std::move(f1_path)); root_path->children().emplace_back(std::move(f2_path)); + ColumnWriterOptions writer_opts; + writer_opts.need_flat = true; + ColumnPtr read_col = JsonColumn::create(); - test_json("/test_flat_json_rw3.data", write_col, read_col, root_path.get()); + test_json(writer_opts, "/test_flat_json_rw4.data", write_col, read_col, root_path.get()); auto* read_json = down_cast(read_col.get()); EXPECT_TRUE(read_json->is_flat_json()); @@ -275,17 +372,90 @@ TEST_F(FlatJsonColumnRWTest, tesArrayFlatJson) { EXPECT_EQ("{a: 4, b: 24}", read_json->debug_item(3)); } -TEST_F(FlatJsonColumnRWTest, testEmptyObject) { - config::json_flat_internal_column_min_limit = 5; +TEST_F(FlatJsonColumnRWTest, testMergeRemainFlatJson) { + ColumnPtr write_col = JsonColumn::create(); + auto* json_col = down_cast(write_col.get()); + + ASSIGN_OR_ABORT(auto jv1, JsonValue::parse(R"({"a": 1, "b": 21, "c": 31})")); + ASSIGN_OR_ABORT(auto jv2, JsonValue::parse(R"({"a": 2, "b": 22, "d": 32})")); + ASSIGN_OR_ABORT(auto jv3, JsonValue::parse(R"({"a": 3, "b": 23, "e": [1,2,3]})")); + ASSIGN_OR_ABORT(auto jv4, JsonValue::parse(R"({"a": 4, "b": 24, "g": {"x": 1}})")); + ASSIGN_OR_ABORT(auto jv5, JsonValue::parse(R"({"a": 5, "b": 25})")); + + json_col->append(&jv1); + 
json_col->append(&jv2); + json_col->append(&jv3); + json_col->append(&jv4); + json_col->append(&jv5); + + ColumnPtr read_col = JsonColumn::create(); + ColumnWriterOptions writer_opts; + writer_opts.need_flat = true; + test_json(writer_opts, "/test_flat_json_rw2.data", write_col, read_col, nullptr); + + EXPECT_EQ(3, writer_opts.meta->children_columns_size()); + EXPECT_TRUE(writer_opts.meta->json_meta().is_flat()); + EXPECT_TRUE(writer_opts.meta->json_meta().has_remain()); + EXPECT_EQ("a", writer_opts.meta->children_columns(0).name()); + EXPECT_EQ("b", writer_opts.meta->children_columns(1).name()); + EXPECT_EQ("remain", writer_opts.meta->children_columns(2).name()); + + auto* read_json = down_cast(read_col.get()); + EXPECT_FALSE(read_json->is_flat_json()); + EXPECT_EQ(5, read_col->size()); + EXPECT_EQ(R"({"a": 1, "b": 21, "c": 31})", read_col->debug_item(0)); + EXPECT_EQ(R"({"a": 2, "b": 22, "d": 32})", read_col->debug_item(1)); + EXPECT_EQ(R"({"a": 3, "b": 23, "e": [1, 2, 3]})", read_col->debug_item(2)); + EXPECT_EQ(R"({"a": 4, "b": 24, "g": {"x": 1}})", read_col->debug_item(3)); + EXPECT_EQ(R"({"a": 5, "b": 25})", read_col->debug_item(4)); +} +TEST_F(FlatJsonColumnRWTest, testMergeRemainFlatJson2) { ColumnPtr write_col = JsonColumn::create(); auto* json_col = down_cast(write_col.get()); + std::vector json = { + R"({"a": 1, "b": {"b1": 22, "b2": {"b3": "abc"}, "b4": 1}, "c": 31})", + R"({"a": 2, "b": {"b1": 23, "b2": {"b3": "efg"}, "b4": [1, 2, 3]}, "d": 32})", + R"({"a": 3, "b": {"b1": 24, "b2": {"b3": "xyz"}, "b4": {"b5": 1}}, "e": [1, 2, 3]})", + R"({"a": 4, "b": {"b1": 25, "b2": {"b3": "qwe"}, "b4": {"b7": 2}}, "g": {"x": 1}})", + R"({"a": 5, "b": {"b1": 26, "b2": {"b3": "sdf"}, "b4": 23}})"}; - ASSIGN_OR_ABORT(auto jv1, JsonValue::parse(R"( "" )")); - ASSIGN_OR_ABORT(auto jv2, JsonValue::parse("{\"a\": 2, \"b\": 22}")); - ASSIGN_OR_ABORT(auto jv3, JsonValue::parse("{\"a\": 3, \"b\": 23}")); - ASSIGN_OR_ABORT(auto jv4, JsonValue::parse("{\"a\": 4, \"b\": 24}")); - ASSIGN_OR_ABORT(auto jv5, JsonValue::parse("{\"a\": 5, \"b\": 25}")); + for (auto& x : json) { + ASSIGN_OR_ABORT(auto jv, JsonValue::parse(x)); + json_col->append(jv); + } + + ColumnPtr read_col = JsonColumn::create(); + ColumnWriterOptions writer_opts; + writer_opts.need_flat = true; + test_json(writer_opts, "/test_flat_json_rw2.data", write_col, read_col, nullptr); + + EXPECT_EQ(4, writer_opts.meta->children_columns_size()); + EXPECT_TRUE(writer_opts.meta->json_meta().is_flat()); + EXPECT_TRUE(writer_opts.meta->json_meta().has_remain()); + EXPECT_EQ("a", writer_opts.meta->children_columns(0).name()); + EXPECT_EQ("b.b1", writer_opts.meta->children_columns(1).name()); + EXPECT_EQ("b.b2.b3", writer_opts.meta->children_columns(2).name()); + EXPECT_EQ("remain", writer_opts.meta->children_columns(3).name()); + + auto* read_json = down_cast(read_col.get()); + EXPECT_FALSE(read_json->is_flat_json()); + EXPECT_EQ(5, read_col->size()); + + for (size_t i = 0; i < json.size(); i++) { + EXPECT_EQ(json[i], read_col->debug_item(i)); + } +} + +TEST_F(FlatJsonColumnRWTest, testMergeMiddleRemainFlatJson) { + ColumnPtr write_col = JsonColumn::create(); + auto* json_col = down_cast(write_col.get()); + + ASSIGN_OR_ABORT(auto jv1, JsonValue::parse(R"({"a": 1, "b": 21, "c": 31})")); + ASSIGN_OR_ABORT(auto jv2, JsonValue::parse(R"({"a": 2, "b": 22, "d": 32})")); + ASSIGN_OR_ABORT(auto jv3, JsonValue::parse(R"({"a": 3, "b": 23, "e": [1,2,3]})")); + ASSIGN_OR_ABORT(auto jv4, JsonValue::parse(R"({"a": 4, "b": 24, "g": {"x": 1}})")); + 
ASSIGN_OR_ABORT(auto jv5, JsonValue::parse(R"({"a": 5, "b": 25})")); json_col->append(&jv1); json_col->append(&jv2); @@ -295,19 +465,563 @@ TEST_F(FlatJsonColumnRWTest, testEmptyObject) { ASSIGN_OR_ABORT(auto root_path, ColumnAccessPath::create(TAccessPathType::FIELD, "root", 0)); ASSIGN_OR_ABORT(auto f1_path, ColumnAccessPath::create(TAccessPathType::FIELD, "a", 0)); - ASSIGN_OR_ABORT(auto f2_path, ColumnAccessPath::create(TAccessPathType::FIELD, "b", 0)); + ASSIGN_OR_ABORT(auto f2_path, ColumnAccessPath::create(TAccessPathType::FIELD, "c", 0)); root_path->children().emplace_back(std::move(f1_path)); root_path->children().emplace_back(std::move(f2_path)); ColumnPtr read_col = JsonColumn::create(); - test_json("/test_flat_json_rw4.data", write_col, read_col, root_path.get()); + ColumnWriterOptions writer_opts; + writer_opts.need_flat = true; + test_json(writer_opts, "/test_flat_json_rw2.data", write_col, read_col, root_path.get()); + + auto* read_json = down_cast(read_col.get()); + + EXPECT_EQ(3, writer_opts.meta->children_columns_size()); + EXPECT_TRUE(writer_opts.meta->json_meta().is_flat()); + EXPECT_TRUE(writer_opts.meta->json_meta().has_remain()); + EXPECT_EQ("a", writer_opts.meta->children_columns(0).name()); + EXPECT_EQ("b", writer_opts.meta->children_columns(1).name()); + EXPECT_EQ("remain", writer_opts.meta->children_columns(2).name()); + + EXPECT_TRUE(read_json->is_flat_json()); + EXPECT_EQ(5, read_col->size()); + EXPECT_EQ("{a: 1, c: 31}", read_col->debug_item(0)); + EXPECT_EQ("{a: 2, c: NULL}", read_col->debug_item(1)); + EXPECT_EQ("{a: 3, c: NULL}", read_col->debug_item(2)); + EXPECT_EQ("{a: 4, c: NULL}", read_col->debug_item(3)); + EXPECT_EQ("{a: 5, c: NULL}", read_col->debug_item(4)); +} + +TEST_F(FlatJsonColumnRWTest, testMergeMiddleRemainFlatJson2) { + config::json_flat_null_factor = 0.4; + ColumnPtr write_col = JsonColumn::create(); + auto* json_col = down_cast(write_col.get()); + std::vector json = { + R"({"a": 1, "b": {"b1": 22, "b2": {"b3": "abc"}, "b4": 1}, "c": 31})", + R"({"a": 2, "b": {"b1": 23, "b2": {"b3": "efg"}, "b4": [1, 2, 3]}, "d": 32})", + R"({"a": 3, "b": {"b1": 24, "b2": {"b3": "xyz"}, "b4": {"b5": 1}}, "e": [1, 2, 3]})", + R"({"a": 4, "b": {"b1": 25, "b2": {"b3": "qwe"}, "b4": {"b7": 2}}, "g": {"x": 1}})", + R"({"a": 5, "b": {"b1": 26, "b2": {"b3": "sdf"}, "b4": 23}})"}; + + for (auto& x : json) { + ASSIGN_OR_ABORT(auto jv, JsonValue::parse(x)); + json_col->append(jv); + } + + ColumnPtr read_col = JsonColumn::create(); + ColumnWriterOptions writer_opts; + writer_opts.need_flat = true; + test_json(writer_opts, "/test_flat_json_rw2.data", write_col, read_col, nullptr); + + EXPECT_EQ(4, writer_opts.meta->children_columns_size()); + EXPECT_TRUE(writer_opts.meta->json_meta().is_flat()); + EXPECT_TRUE(writer_opts.meta->json_meta().has_remain()); + EXPECT_EQ("a", writer_opts.meta->children_columns(0).name()); + EXPECT_EQ("b.b1", writer_opts.meta->children_columns(1).name()); + EXPECT_EQ("b.b2.b3", writer_opts.meta->children_columns(2).name()); + EXPECT_EQ("remain", writer_opts.meta->children_columns(3).name()); + + auto* read_json = down_cast(read_col.get()); + EXPECT_FALSE(read_json->is_flat_json()); + EXPECT_EQ(5, read_col->size()); + for (size_t i = 0; i < json.size(); i++) { + EXPECT_EQ(json[i], read_col->debug_item(i)); + } +} + +TEST_F(FlatJsonColumnRWTest, testMergeMiddleRemainFlatJson3) { + config::json_flat_null_factor = 0.4; + ColumnPtr write_col = JsonColumn::create(); + auto* json_col = down_cast(write_col.get()); + std::vector json = { + R"({"a": 
1, "b": {"b1": 22, "b2": {"b3": "abc", "c1": {"c2": "a", "ce": 1},"bc": 1}, "b4": 1}})", + R"({"a": 2, "b": {"b1": 23, "b2": {"b3": "efg", "c1": {"c2": "b", "cd": 2},"bd": 2}, "b4": [1, 2, 3]}})", + R"({"a": 3, "b": {"b1": 24, "b2": {"b3": "xyz", "c1": {"c2": "c", "cf": 3},"be": 3}, "b4": {"b5": 1}}})", + R"({"a": 4, "b": {"b1": 25, "b2": {"b3": "qwe", "c1": {"c2": "d", "cg": 4},"bf": 4}, "b4": {"b7": 2}}})", + R"({"a": 5, "b": {"b1": 26, "b2": {"b3": "sdf", "c1": {"c2": "e", "ch": 5},"bg": 5}, "b4": 23}})"}; + + for (auto& x : json) { + ASSIGN_OR_ABORT(auto jv, JsonValue::parse(x)); + json_col->append(jv); + } + + ASSIGN_OR_ABORT(auto root_path, ColumnAccessPath::create(TAccessPathType::FIELD, "root", 0)); + ASSIGN_OR_ABORT(auto b_path, ColumnAccessPath::create(TAccessPathType::FIELD, "b", 0)); + ASSIGN_OR_ABORT(auto b2_path, ColumnAccessPath::create(TAccessPathType::FIELD, "b.b2", 0)); + + b_path->children().emplace_back(std::move(b2_path)); + root_path->children().emplace_back(std::move(b_path)); + + ColumnPtr read_col = JsonColumn::create(); + ColumnWriterOptions writer_opts; + writer_opts.need_flat = true; + test_json(writer_opts, "/test_flat_json_rw2.data", write_col, read_col, root_path.get()); + + EXPECT_EQ(5, writer_opts.meta->children_columns_size()); + EXPECT_TRUE(writer_opts.meta->json_meta().is_flat()); + EXPECT_TRUE(writer_opts.meta->json_meta().has_remain()); + EXPECT_EQ("a", writer_opts.meta->children_columns(0).name()); + EXPECT_EQ("b.b1", writer_opts.meta->children_columns(1).name()); + EXPECT_EQ("b.b2.b3", writer_opts.meta->children_columns(2).name()); + EXPECT_EQ("b.b2.c1.c2", writer_opts.meta->children_columns(3).name()); + EXPECT_EQ("remain", writer_opts.meta->children_columns(4).name()); auto* read_json = down_cast(read_col.get()); EXPECT_TRUE(read_json->is_flat_json()); - EXPECT_EQ(5, read_json->size()); - ASSERT_EQ(2, read_json->get_flat_fields().size()); - EXPECT_EQ("{a: NULL, b: NULL}", read_json->debug_item(0)); - EXPECT_EQ("{a: 4, b: 24}", read_json->debug_item(3)); + EXPECT_EQ(5, read_col->size()); + EXPECT_EQ(R"({b.b2: {"b3": "abc", "bc": 1, "c1": {"c2": "a", "ce": 1}}})", read_col->debug_item(0)); + EXPECT_EQ(R"({b.b2: {"b3": "efg", "bd": 2, "c1": {"c2": "b", "cd": 2}}})", read_col->debug_item(1)); + EXPECT_EQ(R"({b.b2: {"b3": "xyz", "be": 3, "c1": {"c2": "c", "cf": 3}}})", read_col->debug_item(2)); + EXPECT_EQ(R"({b.b2: {"b3": "qwe", "bf": 4, "c1": {"c2": "d", "cg": 4}}})", read_col->debug_item(3)); + EXPECT_EQ(R"({b.b2: {"b3": "sdf", "bg": 5, "c1": {"c2": "e", "ch": 5}}})", read_col->debug_item(4)); +} + +TEST_F(FlatJsonColumnRWTest, testDeepFlatJson) { + ColumnPtr write_col = JsonColumn::create(); + auto* json_col = down_cast(write_col.get()); + std::vector json = { + R"({"a": 1, "b": {"b1": 22, "b2": {"b3": "abc"}, "b4": 1}, "c": 31})", + R"({"a": 2, "b": {"b1": 23, "b2": {"b3": "efg"}, "b4": [1,2,3]}, "d": 32})", + R"({"a": 3, "b": {"b1": 24, "b2": {"b3": "xyz"}, "b4": {"b5": 1}}, "e": [1,2,3]})", + R"({"a": 4, "b": {"b1": 25, "b2": {"b3": "qwe"}, "b4": {"b7": 2}}, "g": {"x": 1}})", + R"({"a": 5, "b": {"b1": 26, "b2": {"b3": "sdf"}, "b4": 23}})"}; + + for (auto& x : json) { + ASSIGN_OR_ABORT(auto jv, JsonValue::parse(x)); + json_col->append(jv); + } + + ASSIGN_OR_ABORT(auto root_path, ColumnAccessPath::create(TAccessPathType::FIELD, "root", 0)); + ASSIGN_OR_ABORT(auto b_path, ColumnAccessPath::create(TAccessPathType::FIELD, "b", 0)); + ASSIGN_OR_ABORT(auto b2_path, ColumnAccessPath::create(TAccessPathType::FIELD, "b.b2", 0)); + ASSIGN_OR_ABORT(auto 
b3_path, ColumnAccessPath::create(TAccessPathType::FIELD, "b.b2.b3", 0)); + ASSIGN_OR_ABORT(auto b4_path, ColumnAccessPath::create(TAccessPathType::FIELD, "b.b4", 0)); + ASSIGN_OR_ABORT(auto b5_path, ColumnAccessPath::create(TAccessPathType::FIELD, "b.b4.b5", 0)); + + b4_path->children().emplace_back(std::move(b5_path)); + b2_path->children().emplace_back(std::move(b3_path)); + b_path->children().emplace_back(std::move(b4_path)); + b_path->children().emplace_back(std::move(b2_path)); + root_path->children().emplace_back(std::move(b_path)); + + ColumnPtr read_col = JsonColumn::create(); + ColumnWriterOptions writer_opts; + writer_opts.need_flat = true; + test_json(writer_opts, "/test_flat_json_rw2.data", write_col, read_col, root_path.get()); + + EXPECT_EQ(4, writer_opts.meta->children_columns_size()); + EXPECT_TRUE(writer_opts.meta->json_meta().is_flat()); + EXPECT_TRUE(writer_opts.meta->json_meta().has_remain()); + EXPECT_EQ("a", writer_opts.meta->children_columns(0).name()); + EXPECT_EQ("b.b1", writer_opts.meta->children_columns(1).name()); + EXPECT_EQ("b.b2.b3", writer_opts.meta->children_columns(2).name()); + EXPECT_EQ("remain", writer_opts.meta->children_columns(3).name()); + + auto* read_json = down_cast(read_col.get()); + EXPECT_TRUE(read_json->is_flat_json()); + EXPECT_EQ(5, read_col->size()); + EXPECT_EQ(R"({b.b4.b5: NULL, b.b2.b3: "abc"})", read_col->debug_item(0)); + EXPECT_EQ(R"({b.b4.b5: NULL, b.b2.b3: "efg"})", read_col->debug_item(1)); + EXPECT_EQ(R"({b.b4.b5: 1, b.b2.b3: "xyz"})", read_col->debug_item(2)); + EXPECT_EQ(R"({b.b4.b5: NULL, b.b2.b3: "qwe"})", read_col->debug_item(3)); + EXPECT_EQ(R"({b.b4.b5: NULL, b.b2.b3: "sdf"})", read_col->debug_item(4)); +} + +TEST_F(FlatJsonColumnRWTest, testHyperFlatJson) { + config::json_flat_null_factor = 0.4; + config::json_flat_sparsity_factor = 0.5; + + ColumnPtr write_col = JsonColumn::create(); + auto* json_col = down_cast(write_col.get()); + std::vector json = { + R"({"a": 1, "gg": "te1", "ff": {"f1": "985"}, "b": {"b1": 22, "b2": {"b3": "abc", "c1": {"c2": "a", "ce": 1},"bc": 1}, "b4": 1}})", + R"({"a": 2, "gg": "te2", "ff": {"f1": "984"}, "b": {"b1": 23, "b2": {"b3": "efg", "c1": {"c2": "b", "cd": 2},"bd": 2}, "b4": [1, 2, 3]}})", + R"({"a": 3, "gg": "te3", "ff": {"f1": "983"}, "b": {"b1": 24, "b2": {"b3": "xyz", "c1": {"c2": "c", "cf": 3},"be": 3}, "b4": {"b5": 1}}})", + R"({"a": 4, "gg": "te4", "ff": 781, "b": {"b1": 25, "b2": {"b3": "qwe", "c1": {"c2": "d", "cg": 4},"bf": 4}, "b4": {"b7": 2}}})", + R"({"a": 5, "gg": "te5", "ff": 782, "b": {"b1": 26, "b2": {"b3": "sdf", "c1": {"c2": "e", "ch": 5},"bg": 5}, "b4": 23}})"}; + + for (auto& x : json) { + ASSIGN_OR_ABORT(auto jv, JsonValue::parse(x)); + json_col->append(jv); + } + + ASSIGN_OR_ABORT(auto root_path, ColumnAccessPath::create(TAccessPathType::FIELD, "root", 0)); + ASSIGN_OR_ABORT(auto b_path, ColumnAccessPath::create(TAccessPathType::FIELD, "b", 0)); + ASSIGN_OR_ABORT(auto b2_path, ColumnAccessPath::create(TAccessPathType::FIELD, "b.b2", 0)); + ASSIGN_OR_ABORT(auto b3_path, ColumnAccessPath::create(TAccessPathType::FIELD, "b.b2.b3", 0)); + ASSIGN_OR_ABORT(auto b4_path, ColumnAccessPath::create(TAccessPathType::FIELD, "b.b4", 0)); + ASSIGN_OR_ABORT(auto b5_path, ColumnAccessPath::create(TAccessPathType::FIELD, "b.b4.b5", 0)); + ASSIGN_OR_ABORT(auto a_path, ColumnAccessPath::create(TAccessPathType::FIELD, "a", 0)); + ASSIGN_OR_ABORT(auto ff_path, ColumnAccessPath::create(TAccessPathType::FIELD, "ff", 0)); + ASSIGN_OR_ABORT(auto f1_path, 
ColumnAccessPath::create(TAccessPathType::FIELD, "ff.f1", 0)); + ASSIGN_OR_ABORT(auto gg_path, ColumnAccessPath::create(TAccessPathType::FIELD, "gg", 0)); + ASSIGN_OR_ABORT(auto g1_path, ColumnAccessPath::create(TAccessPathType::FIELD, "gg.g1", 0)); + b4_path->children().emplace_back(std::move(b5_path)); + b2_path->children().emplace_back(std::move(b3_path)); + b_path->children().emplace_back(std::move(b4_path)); + b_path->children().emplace_back(std::move(b2_path)); + ff_path->children().emplace_back(std::move(f1_path)); + gg_path->children().emplace_back(std::move(g1_path)); + root_path->children().emplace_back(std::move(b_path)); + root_path->children().emplace_back(std::move(a_path)); + root_path->children().emplace_back(std::move(ff_path)); + root_path->children().emplace_back(std::move(gg_path)); + + ColumnPtr read_col = JsonColumn::create(); + ColumnWriterOptions writer_opts; + writer_opts.need_flat = true; + test_json(writer_opts, "/test_flat_json_rw2.data", write_col, read_col, root_path.get()); + + EXPECT_EQ(7, writer_opts.meta->children_columns_size()); + EXPECT_TRUE(writer_opts.meta->json_meta().is_flat()); + EXPECT_TRUE(writer_opts.meta->json_meta().has_remain()); + EXPECT_EQ("a", writer_opts.meta->children_columns(0).name()); + EXPECT_EQ("b.b1", writer_opts.meta->children_columns(1).name()); + EXPECT_EQ("b.b2.b3", writer_opts.meta->children_columns(2).name()); + EXPECT_EQ("b.b2.c1.c2", writer_opts.meta->children_columns(3).name()); + EXPECT_EQ("ff.f1", writer_opts.meta->children_columns(4).name()); + EXPECT_EQ("gg", writer_opts.meta->children_columns(5).name()); + EXPECT_EQ("remain", writer_opts.meta->children_columns(6).name()); + + auto* read_json = down_cast(read_col.get()); + EXPECT_TRUE(read_json->is_flat_json()); + EXPECT_EQ(5, read_col->size()); + EXPECT_EQ(R"({b.b4.b5: NULL, b.b2.b3: "abc", a: 1, ff.f1: "985", gg.g1: NULL})", read_col->debug_item(0)); + EXPECT_EQ(R"({b.b4.b5: NULL, b.b2.b3: "efg", a: 2, ff.f1: "984", gg.g1: NULL})", read_col->debug_item(1)); + EXPECT_EQ(R"({b.b4.b5: 1, b.b2.b3: "xyz", a: 3, ff.f1: "983", gg.g1: NULL})", read_col->debug_item(2)); + EXPECT_EQ(R"({b.b4.b5: NULL, b.b2.b3: "qwe", a: 4, ff.f1: NULL, gg.g1: NULL})", read_col->debug_item(3)); + EXPECT_EQ(R"({b.b4.b5: NULL, b.b2.b3: "sdf", a: 5, ff.f1: NULL, gg.g1: NULL})", read_col->debug_item(4)); +} + +TEST_F(FlatJsonColumnRWTest, testMergeRemainJson) { + ColumnPtr write_col = JsonColumn::create(); + auto* json_col = down_cast(write_col.get()); + + ASSIGN_OR_ABORT(auto jv1, JsonValue::parse(R"({"a": 1, "b": 21, "c": 31})")); + ASSIGN_OR_ABORT(auto jv2, JsonValue::parse(R"({"a": 2, "b": 22, "d": 32})")); + ASSIGN_OR_ABORT(auto jv3, JsonValue::parse(R"({"a": 3, "b": 23, "e": [1,2,3]})")); + ASSIGN_OR_ABORT(auto jv4, JsonValue::parse(R"({"a": 4, "b": 24, "g": {"x": 1}})")); + ASSIGN_OR_ABORT(auto jv5, JsonValue::parse(R"({"a": 5, "b": 25})")); + + json_col->append(&jv1); + json_col->append(&jv2); + json_col->append(&jv3); + json_col->append(&jv4); + json_col->append(&jv5); + + ColumnPtr read_col = JsonColumn::create(); + ColumnWriterOptions writer_opts; + writer_opts.need_flat = false; + test_json(writer_opts, "/test_flat_json_rw2.data", write_col, read_col, nullptr); + + EXPECT_EQ(0, writer_opts.meta->children_columns_size()); + EXPECT_FALSE(writer_opts.meta->json_meta().is_flat()); + EXPECT_FALSE(writer_opts.meta->json_meta().has_remain()); + + auto* read_json = down_cast(read_col.get()); + EXPECT_FALSE(read_json->is_flat_json()); + EXPECT_EQ(5, read_col->size()); + EXPECT_EQ(R"({"a": 1, 
"b": 21, "c": 31})", read_col->debug_item(0)); + EXPECT_EQ(R"({"a": 2, "b": 22, "d": 32})", read_col->debug_item(1)); + EXPECT_EQ(R"({"a": 3, "b": 23, "e": [1, 2, 3]})", read_col->debug_item(2)); + EXPECT_EQ(R"({"a": 4, "b": 24, "g": {"x": 1}})", read_col->debug_item(3)); + EXPECT_EQ(R"({"a": 5, "b": 25})", read_col->debug_item(4)); +} + +TEST_F(FlatJsonColumnRWTest, testMergeMiddleRemainJson) { + ColumnPtr write_col = JsonColumn::create(); + auto* json_col = down_cast(write_col.get()); + + ASSIGN_OR_ABORT(auto jv1, JsonValue::parse(R"({"a": 1, "b": 21, "c": 31})")); + ASSIGN_OR_ABORT(auto jv2, JsonValue::parse(R"({"a": 2, "b": 22, "d": 32})")); + ASSIGN_OR_ABORT(auto jv3, JsonValue::parse(R"({"a": 3, "b": 23, "e": [1,2,3]})")); + ASSIGN_OR_ABORT(auto jv4, JsonValue::parse(R"({"a": 4, "b": 24, "g": {"x": 1}})")); + ASSIGN_OR_ABORT(auto jv5, JsonValue::parse(R"({"a": 5, "b": 25})")); + + json_col->append(&jv1); + json_col->append(&jv2); + json_col->append(&jv3); + json_col->append(&jv4); + json_col->append(&jv5); + + ASSIGN_OR_ABORT(auto root_path, ColumnAccessPath::create(TAccessPathType::FIELD, "root", 0)); + ASSIGN_OR_ABORT(auto f1_path, ColumnAccessPath::create(TAccessPathType::FIELD, "a", 0)); + ASSIGN_OR_ABORT(auto f2_path, ColumnAccessPath::create(TAccessPathType::FIELD, "c", 0)); + root_path->children().emplace_back(std::move(f1_path)); + root_path->children().emplace_back(std::move(f2_path)); + + ColumnPtr read_col = JsonColumn::create(); + ColumnWriterOptions writer_opts; + writer_opts.need_flat = false; + test_json(writer_opts, "/test_flat_json_rw2.data", write_col, read_col, root_path.get()); + + EXPECT_EQ(0, writer_opts.meta->children_columns_size()); + EXPECT_FALSE(writer_opts.meta->json_meta().is_flat()); + EXPECT_FALSE(writer_opts.meta->json_meta().has_remain()); + + auto* read_json = down_cast(read_col.get()); + + EXPECT_TRUE(read_json->is_flat_json()); + EXPECT_EQ(5, read_col->size()); + EXPECT_EQ("{a: 1, c: 31}", read_col->debug_item(0)); + EXPECT_EQ("{a: 2, c: NULL}", read_col->debug_item(1)); + EXPECT_EQ("{a: 3, c: NULL}", read_col->debug_item(2)); + EXPECT_EQ("{a: 4, c: NULL}", read_col->debug_item(3)); + EXPECT_EQ("{a: 5, c: NULL}", read_col->debug_item(4)); +} + +TEST_F(FlatJsonColumnRWTest, testMergeMiddleRemainJson2) { + config::json_flat_null_factor = 0.4; + ColumnPtr write_col = JsonColumn::create(); + auto* json_col = down_cast(write_col.get()); + std::vector json = { + R"({"a": 1, "b": {"b1": 22, "b2": {"b3": "abc", "c1": {"c2": "a", "ce": 1},"bc": 1}, "b4": 1}})", + R"({"a": 2, "b": {"b1": 23, "b2": {"b3": "efg", "c1": {"c2": "b", "cd": 2},"bd": 2}, "b4": [1, 2, 3]}})", + R"({"a": 3, "b": {"b1": 24, "b2": {"b3": "xyz", "c1": {"c2": "c", "cf": 3},"be": 3}, "b4": {"b5": 1}}})", + R"({"a": 4, "b": {"b1": 25, "b2": {"b3": "qwe", "c1": {"c2": "d", "cg": 4},"bf": 4}, "b4": {"b7": 2}}})", + R"({"a": 5, "b": {"b1": 26, "b2": {"b3": "sdf", "c1": {"c2": "e", "ch": 5},"bg": 5}, "b4": 23}})"}; + + for (auto& x : json) { + ASSIGN_OR_ABORT(auto jv, JsonValue::parse(x)); + json_col->append(jv); + } + + ASSIGN_OR_ABORT(auto root_path, ColumnAccessPath::create(TAccessPathType::FIELD, "root", 0)); + ASSIGN_OR_ABORT(auto b_path, ColumnAccessPath::create(TAccessPathType::FIELD, "b", 0)); + ASSIGN_OR_ABORT(auto b2_path, ColumnAccessPath::create(TAccessPathType::FIELD, "b.b2", 0)); + + b_path->children().emplace_back(std::move(b2_path)); + root_path->children().emplace_back(std::move(b_path)); + + ColumnPtr read_col = JsonColumn::create(); + ColumnWriterOptions writer_opts; + 
writer_opts.need_flat = false; + test_json(writer_opts, "/test_flat_json_rw2.data", write_col, read_col, root_path.get()); + + EXPECT_EQ(0, writer_opts.meta->children_columns_size()); + EXPECT_FALSE(writer_opts.meta->json_meta().is_flat()); + EXPECT_FALSE(writer_opts.meta->json_meta().has_remain()); + + auto* read_json = down_cast(read_col.get()); + EXPECT_TRUE(read_json->is_flat_json()); + EXPECT_EQ(5, read_col->size()); + EXPECT_EQ(R"({b.b2: {"b3": "abc", "bc": 1, "c1": {"c2": "a", "ce": 1}}})", read_col->debug_item(0)); + EXPECT_EQ(R"({b.b2: {"b3": "efg", "bd": 2, "c1": {"c2": "b", "cd": 2}}})", read_col->debug_item(1)); + EXPECT_EQ(R"({b.b2: {"b3": "xyz", "be": 3, "c1": {"c2": "c", "cf": 3}}})", read_col->debug_item(2)); + EXPECT_EQ(R"({b.b2: {"b3": "qwe", "bf": 4, "c1": {"c2": "d", "cg": 4}}})", read_col->debug_item(3)); + EXPECT_EQ(R"({b.b2: {"b3": "sdf", "bg": 5, "c1": {"c2": "e", "ch": 5}}})", read_col->debug_item(4)); +} + +TEST_F(FlatJsonColumnRWTest, testDeepJson) { + ColumnPtr write_col = JsonColumn::create(); + auto* json_col = down_cast(write_col.get()); + std::vector json = { + R"({"a": 1, "b": {"b1": 22, "b2": {"b3": "abc"}, "b4": 1}, "c": 31})", + R"({"a": 2, "b": {"b1": 23, "b2": {"b3": "efg"}, "b4": [1,2,3]}, "d": 32})", + R"({"a": 3, "b": {"b1": 24, "b2": {"b3": "xyz"}, "b4": {"b5": 1}}, "e": [1,2,3]})", + R"({"a": 4, "b": {"b1": 25, "b2": {"b3": "qwe"}, "b4": {"b7": 2}}, "g": {"x": 1}})", + R"({"a": 5, "b": {"b1": 26, "b2": {"b3": "sdf"}, "b4": 23}})"}; + + for (auto& x : json) { + ASSIGN_OR_ABORT(auto jv, JsonValue::parse(x)); + json_col->append(jv); + } + + ColumnAccessPath root; + ColumnAccessPath::insert_json_path(&root, LogicalType::TYPE_JSON, "b.b4.b5"); + ColumnAccessPath::insert_json_path(&root, LogicalType::TYPE_JSON, "b.b2.b3"); + + ColumnPtr read_col = JsonColumn::create(); + ColumnWriterOptions writer_opts; + writer_opts.need_flat = false; + test_json(writer_opts, "/test_flat_json_rw2.data", write_col, read_col, &root); + + EXPECT_EQ(0, writer_opts.meta->children_columns_size()); + EXPECT_FALSE(writer_opts.meta->json_meta().is_flat()); + EXPECT_FALSE(writer_opts.meta->json_meta().has_remain()); + + auto* read_json = down_cast(read_col.get()); + EXPECT_TRUE(read_json->is_flat_json()); + EXPECT_EQ(5, read_col->size()); + EXPECT_EQ(R"({b.b4.b5: NULL, b.b2.b3: "abc"})", read_col->debug_item(0)); + EXPECT_EQ(R"({b.b4.b5: NULL, b.b2.b3: "efg"})", read_col->debug_item(1)); + EXPECT_EQ(R"({b.b4.b5: 1, b.b2.b3: "xyz"})", read_col->debug_item(2)); + EXPECT_EQ(R"({b.b4.b5: NULL, b.b2.b3: "qwe"})", read_col->debug_item(3)); + EXPECT_EQ(R"({b.b4.b5: NULL, b.b2.b3: "sdf"})", read_col->debug_item(4)); +} + +TEST_F(FlatJsonColumnRWTest, testHyperJson) { + config::json_flat_null_factor = 0.4; + config::json_flat_sparsity_factor = 0.5; + + ColumnPtr write_col = JsonColumn::create(); + auto* json_col = down_cast(write_col.get()); + std::vector json = { + R"({"a": 1, "gg": "te1", "ff": {"f1": "985"}, "b": {"b1": 22, "b2": {"b3": "abc", "c1": {"c2": "a", "ce": 1},"bc": 1}, "b4": 1}})", + R"({"a": 2, "gg": "te2", "ff": {"f1": "984"}, "b": {"b1": 23, "b2": {"b3": "efg", "c1": {"c2": "b", "cd": 2},"bd": 2}, "b4": [1, 2, 3]}})", + R"({"a": 3, "gg": "te3", "ff": {"f1": "983"}, "b": {"b1": 24, "b2": {"b3": "xyz", "c1": {"c2": "c", "cf": 3},"be": 3}, "b4": {"b5": 1}}})", + R"({"a": 4, "gg": "te4", "ff": 781, "b": {"b1": 25, "b2": {"b3": "qwe", "c1": {"c2": "d", "cg": 4},"bf": 4}, "b4": {"b7": 2}}})", + R"({"a": 5, "gg": "te5", "ff": 782, "b": {"b1": 26, "b2": {"b3": "sdf", "c1": 
{"c2": "e", "ch": 5},"bg": 5}, "b4": 23}})"}; + + for (auto& x : json) { + ASSIGN_OR_ABORT(auto jv, JsonValue::parse(x)); + json_col->append(jv); + } + + ColumnAccessPath root; + ColumnAccessPath::insert_json_path(&root, LogicalType::TYPE_JSON, "b.b4.b5"); + ColumnAccessPath::insert_json_path(&root, LogicalType::TYPE_JSON, "b.b2.b3"); + ColumnAccessPath::insert_json_path(&root, LogicalType::TYPE_JSON, "a"); + ColumnAccessPath::insert_json_path(&root, LogicalType::TYPE_JSON, "ff.f1"); + ColumnAccessPath::insert_json_path(&root, LogicalType::TYPE_JSON, "gg.g1"); + + ColumnPtr read_col = JsonColumn::create(); + ColumnWriterOptions writer_opts; + writer_opts.need_flat = false; + test_json(writer_opts, "/test_flat_json_rw2.data", write_col, read_col, &root); + + EXPECT_EQ(0, writer_opts.meta->children_columns_size()); + EXPECT_FALSE(writer_opts.meta->json_meta().is_flat()); + EXPECT_FALSE(writer_opts.meta->json_meta().has_remain()); + + auto* read_json = down_cast(read_col.get()); + EXPECT_TRUE(read_json->is_flat_json()); + EXPECT_EQ(5, read_col->size()); + EXPECT_EQ(R"({b.b4.b5: NULL, b.b2.b3: "abc", a: 1, ff.f1: "985", gg.g1: NULL})", read_col->debug_item(0)); + EXPECT_EQ(R"({b.b4.b5: NULL, b.b2.b3: "efg", a: 2, ff.f1: "984", gg.g1: NULL})", read_col->debug_item(1)); + EXPECT_EQ(R"({b.b4.b5: 1, b.b2.b3: "xyz", a: 3, ff.f1: "983", gg.g1: NULL})", read_col->debug_item(2)); + EXPECT_EQ(R"({b.b4.b5: NULL, b.b2.b3: "qwe", a: 4, ff.f1: NULL, gg.g1: NULL})", read_col->debug_item(3)); + EXPECT_EQ(R"({b.b4.b5: NULL, b.b2.b3: "sdf", a: 5, ff.f1: NULL, gg.g1: NULL})", read_col->debug_item(4)); +} + +TEST_F(FlatJsonColumnRWTest, testHyperNoCastTypeJson) { + config::json_flat_null_factor = 0.4; + config::json_flat_sparsity_factor = 0.5; + + ColumnPtr write_col = JsonColumn::create(); + auto* json_col = down_cast(write_col.get()); + std::vector json = { + R"({"a": 1, "gg": "te1", "ff": {"f1": "985"}, "b": {"b1": 22, "b2": {"b3": "abc", "c1": {"c2": "a", "ce": 1},"bc": 1}, "b4": 1}})", + R"({"a": 2, "gg": "te2", "ff": {"f1": "984"}, "b": {"b1": 23, "b2": {"b3": "efg", "c1": {"c2": "b", "cd": 2},"bd": 2}, "b4": [1, 2, 3]}})", + R"({"a": 3, "gg": "te3", "ff": {"f1": "983"}, "b": {"b1": 24, "b2": {"b3": "xyz", "c1": {"c2": "c", "cf": 3},"be": 3}, "b4": {"b5": 1}}})", + R"({"a": 4, "gg": "te4", "ff": 781, "b": {"b1": 25, "b2": {"b3": "qwe", "c1": {"c2": "d", "cg": 4},"bf": 4}, "b4": {"b7": 2}}})", + R"({"a": 5, "gg": "te5", "ff": 782, "b": {"b1": 26, "b2": {"b3": "sdf", "c1": {"c2": "e", "ch": 5},"bg": 5}, "b4": 23}})"}; + + for (auto& x : json) { + ASSIGN_OR_ABORT(auto jv, JsonValue::parse(x)); + json_col->append(jv); + } + + ColumnAccessPath root; + ColumnAccessPath::insert_json_path(&root, LogicalType::TYPE_BIGINT, "b.b4.b5"); + ColumnAccessPath::insert_json_path(&root, LogicalType::TYPE_VARCHAR, "b.b2.b3"); + ColumnAccessPath::insert_json_path(&root, LogicalType::TYPE_BIGINT, "a"); + ColumnAccessPath::insert_json_path(&root, LogicalType::TYPE_JSON, "ff.f1"); + ColumnAccessPath::insert_json_path(&root, LogicalType::TYPE_VARCHAR, "gg.g1"); + + ColumnPtr read_col = JsonColumn::create(); + ColumnWriterOptions writer_opts; + writer_opts.need_flat = true; + test_json(writer_opts, "/test_flat_json_rw2.data", write_col, read_col, &root); + + auto* read_json = down_cast(read_col.get()); + EXPECT_TRUE(read_json->is_flat_json()); + EXPECT_EQ(5, read_col->size()); + EXPECT_EQ(R"({b.b4.b5: NULL, b.b2.b3: 'abc', a: 1, ff.f1: "985", gg.g1: NULL})", read_col->debug_item(0)); + EXPECT_EQ(R"({b.b4.b5: NULL, b.b2.b3: 'efg', 
a: 2, ff.f1: "984", gg.g1: NULL})", read_col->debug_item(1)); + EXPECT_EQ(R"({b.b4.b5: 1, b.b2.b3: 'xyz', a: 3, ff.f1: "983", gg.g1: NULL})", read_col->debug_item(2)); + EXPECT_EQ(R"({b.b4.b5: NULL, b.b2.b3: 'qwe', a: 4, ff.f1: NULL, gg.g1: NULL})", read_col->debug_item(3)); + EXPECT_EQ(R"({b.b4.b5: NULL, b.b2.b3: 'sdf', a: 5, ff.f1: NULL, gg.g1: NULL})", read_col->debug_item(4)); +} + +TEST_F(FlatJsonColumnRWTest, testHyperCastTypeJson) { + config::json_flat_null_factor = 0.4; + config::json_flat_sparsity_factor = 0.5; + + ColumnPtr write_col = JsonColumn::create(); + auto* json_col = down_cast(write_col.get()); + std::vector json = { + R"({"a": 1, "gg": "te1", "ff": {"f1": "985"}, "b": {"b1": 22, "b2": {"b3": "abc", "c1": {"c2": "a", "ce": 1},"bc": 1}, "b4": 1}})", + R"({"a": 2, "gg": "te2", "ff": {"f1": "984"}, "b": {"b1": 23, "b2": {"b3": "efg", "c1": {"c2": "b", "cd": 2},"bd": 2}, "b4": [1, 2, 3]}})", + R"({"a": 3, "gg": "te3", "ff": {"f1": "983"}, "b": {"b1": 24, "b2": {"b3": "xyz", "c1": {"c2": "c", "cf": 3},"be": 3}, "b4": {"b5": 1}}})", + R"({"a": 4, "gg": "te4", "ff": 781, "b": {"b1": 25, "b2": {"b3": "qwe", "c1": {"c2": "d", "cg": 4},"bf": 4}, "b4": {"b7": 2}}})", + R"({"a": 5, "gg": "te5", "ff": 782, "b": {"b1": 26, "b2": {"b3": "sdf", "c1": {"c2": "e", "ch": 5},"bg": 5}, "b4": 23}})"}; + + for (auto& x : json) { + ASSIGN_OR_ABORT(auto jv, JsonValue::parse(x)); + json_col->append(jv); + } + + ColumnAccessPath root; + ColumnAccessPath::insert_json_path(&root, LogicalType::TYPE_DOUBLE, "b.b4.b5"); + ColumnAccessPath::insert_json_path(&root, LogicalType::TYPE_BIGINT, "b.b2"); + ColumnAccessPath::insert_json_path(&root, LogicalType::TYPE_VARCHAR, "a"); + ColumnAccessPath::insert_json_path(&root, LogicalType::TYPE_BIGINT, "ff.f1"); + ColumnAccessPath::insert_json_path(&root, LogicalType::TYPE_JSON, "gg.g1"); + + ColumnPtr read_col = JsonColumn::create(); + ColumnWriterOptions writer_opts; + writer_opts.need_flat = true; + test_json(writer_opts, "/test_flat_json_rw2.data", write_col, read_col, &root); + + auto* read_json = down_cast(read_col.get()); + EXPECT_TRUE(read_json->is_flat_json()); + EXPECT_EQ(5, read_col->size()); + EXPECT_EQ(R"({b.b4.b5: NULL, b.b2: NULL, a: '1', ff.f1: 985, gg.g1: NULL})", read_col->debug_item(0)); + EXPECT_EQ(R"({b.b4.b5: NULL, b.b2: NULL, a: '2', ff.f1: 984, gg.g1: NULL})", read_col->debug_item(1)); + EXPECT_EQ(R"({b.b4.b5: 1, b.b2: NULL, a: '3', ff.f1: 983, gg.g1: NULL})", read_col->debug_item(2)); + EXPECT_EQ(R"({b.b4.b5: NULL, b.b2: NULL, a: '4', ff.f1: NULL, gg.g1: NULL})", read_col->debug_item(3)); + EXPECT_EQ(R"({b.b4.b5: NULL, b.b2: NULL, a: '5', ff.f1: NULL, gg.g1: NULL})", read_col->debug_item(4)); +} + +TEST_F(FlatJsonColumnRWTest, testHyperCastTypeJson2) { + config::json_flat_null_factor = 0.4; + config::json_flat_sparsity_factor = 0.5; + + ColumnPtr write_col = JsonColumn::create(); + auto* json_col = down_cast(write_col.get()); + std::vector json = { + R"({"a": 1, "gg": "te1", "ff": {"f1": "985"}, "b": {"b1": 22, "b2": {"b3": "abc", "c1": {"c2": "a", "ce": 1},"bc": 1}, "b4": 1}})", + R"({"a": 2, "gg": "te2", "ff": {"f1": "984"}, "b": {"b1": 23, "b2": {"b3": "efg", "c1": {"c2": "b", "cd": 2},"bd": 2}, "b4": [1, 2, 3]}})", + R"({"a": 3, "gg": "te3", "ff": {"f1": "983"}, "b": {"b1": 24, "b2": {"b3": "xyz", "c1": {"c2": "c", "cf": 3},"be": 3}, "b4": {"b5": 1}}})", + R"({"a": 4, "gg": "te4", "ff": 781, "b": {"b1": 25, "b2": {"b3": "qwe", "c1": {"c2": "d", "cg": 4},"bf": 4}, "b4": {"b7": 2}}})", + R"({"a": 5, "gg": "te5", "ff": 782, "b": {"b1": 
26, "b2": {"b3": "sdf", "c1": {"c2": "e", "ch": 5},"bg": 5}, "b4": 23}})"}; + + for (auto& x : json) { + ASSIGN_OR_ABORT(auto jv, JsonValue::parse(x)); + json_col->append(jv); + } + + ColumnAccessPath root; + ColumnAccessPath::insert_json_path(&root, LogicalType::TYPE_DOUBLE, "b.b4.b5"); + ColumnAccessPath::insert_json_path(&root, LogicalType::TYPE_VARCHAR, "b.b2"); + ColumnAccessPath::insert_json_path(&root, LogicalType::TYPE_VARCHAR, "a"); + ColumnAccessPath::insert_json_path(&root, LogicalType::TYPE_BIGINT, "ff.f1"); + ColumnAccessPath::insert_json_path(&root, LogicalType::TYPE_JSON, "gg.g1"); + + ColumnPtr read_col = JsonColumn::create(); + ColumnWriterOptions writer_opts; + writer_opts.need_flat = true; + test_json(writer_opts, "/test_flat_json_rw2.data", write_col, read_col, &root); + + auto* read_json = down_cast(read_col.get()); + EXPECT_TRUE(read_json->is_flat_json()); + EXPECT_EQ(5, read_col->size()); + EXPECT_EQ( + R"({b.b4.b5: NULL, b.b2: '{"b3": "abc", "bc": 1, "c1": {"c2": "a", "ce": 1}}', a: '1', ff.f1: 985, gg.g1: NULL})", + read_col->debug_item(0)); + EXPECT_EQ( + R"({b.b4.b5: NULL, b.b2: '{"b3": "efg", "bd": 2, "c1": {"c2": "b", "cd": 2}}', a: '2', ff.f1: 984, gg.g1: NULL})", + read_col->debug_item(1)); + EXPECT_EQ( + R"({b.b4.b5: 1, b.b2: '{"b3": "xyz", "be": 3, "c1": {"c2": "c", "cf": 3}}', a: '3', ff.f1: 983, gg.g1: NULL})", + read_col->debug_item(2)); + EXPECT_EQ( + R"({b.b4.b5: NULL, b.b2: '{"b3": "qwe", "bf": 4, "c1": {"c2": "d", "cg": 4}}', a: '4', ff.f1: NULL, gg.g1: NULL})", + read_col->debug_item(3)); + EXPECT_EQ( + R"({b.b4.b5: NULL, b.b2: '{"b3": "sdf", "bg": 5, "c1": {"c2": "e", "ch": 5}}', a: '5', ff.f1: NULL, gg.g1: NULL})", + read_col->debug_item(4)); } } // namespace starrocks diff --git a/be/test/util/json_flattener_test.cpp b/be/test/util/json_flattener_test.cpp new file mode 100644 index 00000000000000..7d209f8b36bf3f --- /dev/null +++ b/be/test/util/json_flattener_test.cpp @@ -0,0 +1,272 @@ +// Copyright 2021-present StarRocks, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "util/json_flattener.h" + +#include +#include +#include +#include + +#include +#include + +#include "column/const_column.h" +#include "column/json_column.h" +#include "column/nullable_column.h" +#include "column/vectorized_fwd.h" +#include "common/config.h" +#include "common/status.h" +#include "common/statusor.h" +#include "exprs/mock_vectorized_expr.h" +#include "gtest/gtest-param-test.h" +#include "gutil/casts.h" +#include "gutil/strings/strip.h" +#include "testutil/assert.h" +#include "types/logical_type.h" +#include "util/json.h" +#include "util/json_flattener.h" + +namespace starrocks { + +class JsonPathDeriverTest + : public ::testing::TestWithParam< + std::tuple, std::vector>> {}; + +TEST_P(JsonPathDeriverTest, json_path_deriver_test) { + std::unique_ptr ctx(FunctionContext::create_test_context()); + auto json_column = JsonColumn::create(); + ColumnBuilder builder(1); + + std::string param_json1 = std::get<0>(GetParam()); + std::string param_json2 = std::get<1>(GetParam()); + bool param_has_remain = std::get<2>(GetParam()); + std::vector param_flat_path = std::get<3>(GetParam()); + std::vector param_flat_type = std::get<4>(GetParam()); + + auto json = JsonValue::parse(param_json1); + ASSERT_TRUE(json.ok()); + json_column->append(&*json); + + auto json2 = JsonValue::parse(param_json2); + ASSERT_TRUE(json2.ok()); + json_column->append(&*json2); + + std::vector columns{json_column.get()}; + JsonPathDeriver jf; + jf.derived(columns); + std::vector path = jf.flat_paths(); + std::vector type = jf.flat_types(); + + ASSERT_EQ(param_has_remain, jf.has_remain_json()); + ASSERT_EQ(param_flat_path, path); + ASSERT_EQ(param_flat_type, type); +} + +// clang-format off +INSTANTIATE_TEST_SUITE_P(JsonPathDeriverCases, JsonPathDeriverTest, + ::testing::Values( + // NORMAL + std::make_tuple(R"( {"k1": 1, "k2": 2} )", R"( {"k1": 3, "k2": 4} )", false, std::vector {"k1", "k2"}, std::vector {TYPE_BIGINT, TYPE_BIGINT}), + std::make_tuple(R"( {"k1": 1, "k2": 2} )", R"( {"k1": 3} )", true, std::vector {"k1"}, std::vector {TYPE_BIGINT}), + std::make_tuple(R"( {"k1": 1, "k2": 2} )", R"( {"k1": 3, "k3": 4} )", true, std::vector {"k1"}, std::vector {TYPE_BIGINT}), + + // EMPTY + std::make_tuple(R"( {"k1": 1, "k2": {}} )", R"( {"k1": 3, "k2": {}} )", true, std::vector {"k1"}, std::vector {TYPE_BIGINT}), + std::make_tuple(R"( {} )", R"( {"k1": 3} )", true, std::vector {}, std::vector {}), + + // DEEP + std::make_tuple(R"( {"k2": {"j1": 1, "j2": 2}} )", R"( {"k2": {"j1": 3, "j2": 4}} )", false, std::vector {"k2.j1", "k2.j2"}, std::vector {TYPE_BIGINT, TYPE_BIGINT}), + std::make_tuple(R"( {"k2": {"j1": 1, "j2": 2}} )", R"( {"k2": {"j1": 3, "j3": 4}} )", true, std::vector {"k2.j1"}, std::vector {TYPE_BIGINT}), + std::make_tuple(R"( {"k2": {"j1": 1, "j2": 2}} )", R"( {"k2": {"j1": 3, "j2": {"p1": "abc"}}} )", true, std::vector {"k2.j1"}, std::vector {TYPE_BIGINT}), + std::make_tuple(R"( {"k2": {"j1": 1, "j2": {"p1": [1,2,3,4]}}} )", R"( {"k2": {"j1": 3, "j2": {"p1": "abc"}}} )", false, std::vector {"k2.j1", "k2.j2.p1"}, std::vector {TYPE_BIGINT, TYPE_JSON}) +)); +// clang-format on + +class JsonFlattenerTest : public testing::Test { +public: + JsonFlattenerTest() = default; + + ~JsonFlattenerTest() override = default; + +protected: + void SetUp() override {} + + void TearDown() override {} + + std::vector test_json(const std::vector& inputs, const std::vector& paths, + const std::vector& types, bool has_remain) { + ColumnPtr input = JsonColumn::create(); + JsonColumn* json_input = down_cast(input.get()); 
+ for (const auto& json : inputs) { + ASSIGN_OR_ABORT(auto json_value, JsonValue::parse(json)); + json_input->append(&json_value); + } + + JsonFlattener flattener(paths, types, has_remain); + flattener.flatten(json_input); + + auto result = flattener.mutable_result(); + if (has_remain) { + for (size_t i = 0; i < result.size() - 1; i++) { + auto& c = result[i]; + EXPECT_TRUE(c->is_nullable()); + auto* nullable = down_cast(c.get()); + EXPECT_EQ(input->size(), nullable->size()); + } + EXPECT_FALSE(result.back()->is_nullable()); + EXPECT_EQ(input->size(), result.back()->size()); + EXPECT_EQ(paths.size() + 1, result.size()); + } else { + for (auto& c : result) { + EXPECT_TRUE(c->is_nullable()); + auto* nullable = down_cast(c.get()); + EXPECT_EQ(input->size(), nullable->size()); + } + EXPECT_EQ(paths.size(), result.size()); + } + + return result; + } + + std::vector test_null_json(const std::vector& inputs, const std::vector& paths, + const std::vector& types, bool has_remain) { + ColumnPtr input = JsonColumn::create(); + NullColumnPtr nulls = NullColumn::create(); + JsonColumn* json_input = down_cast(input.get()); + for (const auto& json : inputs) { + if (json == "NULL") { + json_input->append_default(); + nulls->append(1); + } else { + ASSIGN_OR_ABORT(auto json_value, JsonValue::parse(json)); + json_input->append(&json_value); + nulls->append(0); + } + } + + auto nullable_input = NullableColumn::create(input, nulls); + JsonFlattener flattener(paths, types, has_remain); + flattener.flatten(nullable_input.get()); + + auto result = flattener.mutable_result(); + if (has_remain) { + for (size_t i = 0; i < result.size() - 1; i++) { + auto& c = result[i]; + EXPECT_TRUE(c->is_nullable()); + auto* nullable = down_cast(c.get()); + EXPECT_EQ(input->size(), nullable->size()); + } + EXPECT_FALSE(result.back()->is_nullable()); + EXPECT_EQ(input->size(), result.back()->size()); + EXPECT_EQ(paths.size() + 1, result.size()); + } else { + for (auto& c : result) { + EXPECT_TRUE(c->is_nullable()); + auto* nullable = down_cast(c.get()); + EXPECT_EQ(input->size(), nullable->size()); + } + EXPECT_EQ(paths.size(), result.size()); + } + return result; + } +}; + +TEST_F(JsonFlattenerTest, testNormalJson) { + std::vector json = {R"( {"k1": 1, "k2": 2} )", R"( {"k1": 3, "k2": 4} )"}; + + std::vector paths = {"k1", "k2"}; + std::vector types = {TYPE_BIGINT, TYPE_BIGINT}; + auto result = test_json(json, paths, types, false); + EXPECT_EQ("1", result[0]->debug_item(0)); + EXPECT_EQ("4", result[1]->debug_item(1)); +} + +TEST_F(JsonFlattenerTest, testCastNormalJson) { + std::vector json = {R"( {"k1": 1, "k2": 2} )", R"( {"k1": 3, "k2": [1,2,3,4]} )"}; + + std::vector paths = {"k1", "k2"}; + std::vector types = {TYPE_BIGINT, TYPE_JSON}; + auto result = test_json(json, paths, types, false); + EXPECT_EQ("1", result[0]->debug_item(0)); + EXPECT_EQ("[1, 2, 3, 4]", result[1]->debug_item(1)); +} + +TEST_F(JsonFlattenerTest, testCastJson) { + std::vector json = {R"( {"k1": 1, "k2": 2} )", R"( {"k1": 3, "k2": [1,2,3,4]} )"}; + + std::vector paths = {"k1", "k2"}; + std::vector types = {TYPE_BIGINT, TYPE_BIGINT}; + auto result = test_json(json, paths, types, false); + EXPECT_EQ("1", result[0]->debug_item(0)); + EXPECT_EQ("NULL", result[1]->debug_item(1)); +} + +TEST_F(JsonFlattenerTest, testDeepJson) { + std::vector json = {R"( {"k1": 1, "k2": 2} )", + R"( {"k1": {"c1": 123}, "k2": {"j1": "abc", "j2": 123}} )"}; + + std::vector paths = {"k1", "k2.j1"}; + std::vector types = {TYPE_BIGINT, TYPE_VARCHAR}; + auto result = test_json(json, 
paths, types, false); + EXPECT_EQ("1", result[0]->debug_item(0)); + EXPECT_EQ("NULL", result[1]->debug_item(0)); + EXPECT_EQ("NULL", result[0]->debug_item(1)); + EXPECT_EQ("'abc'", result[1]->debug_item(1)); +} + +TEST_F(JsonFlattenerTest, testDeepJson2) { + std::vector json = {R"( {"k1": 1, "k2": 2} )", + R"( {"k1": {"c1": 123}, "k2": {"j1": "abc", "j2": 123}} )"}; + + std::vector paths = {"k1", "k2.j1", "k2.j2"}; + std::vector types = {TYPE_JSON, TYPE_JSON, TYPE_BIGINT}; + auto result = test_json(json, paths, types, false); + EXPECT_EQ("1", result[0]->debug_item(0)); + EXPECT_EQ(R"({"c1": 123})", result[0]->debug_item(1)); + EXPECT_EQ("NULL", result[1]->debug_item(0)); + EXPECT_EQ("\"abc\"", result[1]->debug_item(1)); + EXPECT_EQ("NULL", result[2]->debug_item(0)); + EXPECT_EQ("123", result[2]->debug_item(1)); +} + +TEST_F(JsonFlattenerTest, testDeepJson3) { + std::vector json = {R"( {"k1": 1, "k2": 2} )", + R"( {"k1": {"c1": 123}, "k2": {"j1": "abc", "j2": 123}} )"}; + + std::vector paths = {"k1", "k2.j1", "k2"}; + std::vector types = {TYPE_JSON, TYPE_JSON, TYPE_JSON}; + auto result = test_json(json, paths, types, false); + EXPECT_EQ("1", result[0]->debug_item(0)); + EXPECT_EQ(R"({"c1": 123})", result[0]->debug_item(1)); + EXPECT_EQ("NULL", result[1]->debug_item(0)); + EXPECT_EQ("\"abc\"", result[1]->debug_item(1)); + EXPECT_EQ("NULL", result[2]->debug_item(0)); + EXPECT_EQ("NULL", result[2]->debug_item(1)); +} + +TEST_F(JsonFlattenerTest, testMiddleJson) { + std::vector json = {R"( {"k1": {"c1": {"d1": 123 }}, "k2": {"j1": "def", "j2": {"g1": [1,2,3]}}} )", + R"( {"k1": {"c1": {"d1": "abc"}}, "k2": {"j1": "abc", "j2": {"g1": 123}}} )"}; + + std::vector paths = {"k1.c1", "k2.j2"}; + std::vector types = {TYPE_JSON, TYPE_JSON}; + auto result = test_json(json, paths, types, false); + EXPECT_EQ(R"({"d1": 123})", result[0]->debug_item(0)); + EXPECT_EQ(R"({"d1": "abc"})", result[0]->debug_item(1)); + EXPECT_EQ(R"({"g1": [1, 2, 3]})", result[1]->debug_item(0)); + EXPECT_EQ(R"({"g1": 123})", result[1]->debug_item(1)); +} + +} // namespace starrocks diff --git a/gensrc/proto/segment.proto b/gensrc/proto/segment.proto index 799f0642700d76..2aaf998d74ea89 100644 --- a/gensrc/proto/segment.proto +++ b/gensrc/proto/segment.proto @@ -42,196 +42,205 @@ import "olap_common.proto"; import "types.proto"; message MetadataPairPB { - optional string key = 1; - optional bytes value = 2; + optional string key = 1; + optional bytes value = 2; } enum EncodingTypePB { - UNKNOWN_ENCODING = 0; - DEFAULT_ENCODING = 1; - PLAIN_ENCODING = 2; - PREFIX_ENCODING = 3; - RLE = 4; - DICT_ENCODING = 5; - BIT_SHUFFLE = 6; - FOR_ENCODING = 7; // Frame-Of-Reference + UNKNOWN_ENCODING = 0; + DEFAULT_ENCODING = 1; + PLAIN_ENCODING = 2; + PREFIX_ENCODING = 3; + RLE = 4; + DICT_ENCODING = 5; + BIT_SHUFFLE = 6; + FOR_ENCODING = 7; // Frame-Of-Reference } enum PageTypePB { - UNKNOWN_PAGE_TYPE = 0; - DATA_PAGE = 1; - INDEX_PAGE = 2; - DICTIONARY_PAGE = 3; - SHORT_KEY_PAGE = 4; + UNKNOWN_PAGE_TYPE = 0; + DATA_PAGE = 1; + INDEX_PAGE = 2; + DICTIONARY_PAGE = 3; + SHORT_KEY_PAGE = 4; } enum NullEncodingPB { - BITSHUFFLE_NULL = 0; - LZ4_NULL = 1; - RLE_NULL = 2; + BITSHUFFLE_NULL = 0; + LZ4_NULL = 1; + RLE_NULL = 2; } message DataPageFooterPB { - // required: ordinal of the first value - optional uint64 first_ordinal = 1; - // required: number of values, including NULLs - optional uint64 num_values = 2; - // required: size of nullmap, 0 if the page doesn't contain NULL - optional uint32 nullmap_size = 3; - // array column is made up of 
offset and element. - // every ordinal in offset column corresponds to a ordinal element. - // If there is a array column like [1, 2, 3], [4, 5, 6] - // The ordinal 2 in offset column corresponds to ordinal 4 in element column. - // - // ordinal of element column only for array column, largest array item ordinal + 1, - // used to calculate the length of last array in this page - optional uint64 corresponding_element_ordinal = 4; - // possible values: 1, 2 - // if format_version is 1, no value will be stored in this page for NULL records; - // if format_version is 2, a default value will be stored for each NULL record. - // another difference is that the format 1 use Run-Length encoding to encode the null map, - // while format 2 use the bitshuffle. - optional uint32 format_version = 20; - optional NullEncodingPB null_encoding = 21; + // required: ordinal of the first value + optional uint64 first_ordinal = 1; + // required: number of values, including NULLs + optional uint64 num_values = 2; + // required: size of nullmap, 0 if the page doesn't contain NULL + optional uint32 nullmap_size = 3; + // array column is made up of offset and element. + // every ordinal in offset column corresponds to a ordinal element. + // If there is a array column like [1, 2, 3], [4, 5, 6] + // The ordinal 2 in offset column corresponds to ordinal 4 in element column. + // + // ordinal of element column only for array column, largest array item ordinal + // + 1, used to calculate the length of last array in this page + optional uint64 corresponding_element_ordinal = 4; + // possible values: 1, 2 + // if format_version is 1, no value will be stored in this page for NULL + // records; if format_version is 2, a default value will be stored for each + // NULL record. another difference is that the format 1 use Run-Length + // encoding to encode the null map, while format 2 use the bitshuffle. + optional uint32 format_version = 20; + optional NullEncodingPB null_encoding = 21; } message IndexPageFooterPB { - // required: number of index entries in this page - optional uint32 num_entries = 1; - - enum Type { - UNKNOWN_INDEX_PAGE_TYPE = 0; - LEAF = 1; - INTERNAL = 2; - }; - // required: type of the index page - optional Type type = 2; + // required: number of index entries in this page + optional uint32 num_entries = 1; + + enum Type { + UNKNOWN_INDEX_PAGE_TYPE = 0; + LEAF = 1; + INTERNAL = 2; + }; + // required: type of the index page + optional Type type = 2; } message DictPageFooterPB { - // required: encoding for dictionary - optional EncodingTypePB encoding = 1; + // required: encoding for dictionary + optional EncodingTypePB encoding = 1; } message ShortKeyFooterPB { - // How many index item in this index. - optional uint32 num_items = 1; - // The total bytes occupied by the index key - optional uint32 key_bytes = 2; - // The total bytes occupied by the key offsets - optional uint32 offset_bytes = 3; - // Segment id which this index is belong to - optional uint32 segment_id = 4; - // number rows in each block - optional uint32 num_rows_per_block = 5; - // How many rows in this segment - optional uint32 num_segment_rows = 6; + // How many index item in this index. 
+ optional uint32 num_items = 1; + // The total bytes occupied by the index key + optional uint32 key_bytes = 2; + // The total bytes occupied by the key offsets + optional uint32 offset_bytes = 3; + // Segment id which this index is belong to + optional uint32 segment_id = 4; + // number rows in each block + optional uint32 num_rows_per_block = 5; + // How many rows in this segment + optional uint32 num_segment_rows = 6; } message PageFooterPB { - // required: indicates which of the *_footer fields is set - optional PageTypePB type = 1; - // required: page body size before compression (exclude footer and crc). - // page body is uncompressed when it's equal to page body size - optional uint32 uncompressed_size = 2; - // present only when type == DATA_PAGE - optional DataPageFooterPB data_page_footer = 7; - // present only when type == INDEX_PAGE - optional IndexPageFooterPB index_page_footer = 8; - // present only when type == DICTIONARY_PAGE - optional DictPageFooterPB dict_page_footer = 9; - // present only when type == SHORT_KEY_PAGE - optional ShortKeyFooterPB short_key_page_footer = 10; + // required: indicates which of the *_footer fields is set + optional PageTypePB type = 1; + // required: page body size before compression (exclude footer and crc). + // page body is uncompressed when it's equal to page body size + optional uint32 uncompressed_size = 2; + // present only when type == DATA_PAGE + optional DataPageFooterPB data_page_footer = 7; + // present only when type == INDEX_PAGE + optional IndexPageFooterPB index_page_footer = 8; + // present only when type == DICTIONARY_PAGE + optional DictPageFooterPB dict_page_footer = 9; + // present only when type == SHORT_KEY_PAGE + optional ShortKeyFooterPB short_key_page_footer = 10; } message ZoneMapPB { - // minimum not-null value, invalid when all values are null(has_not_null==false) - optional bytes min = 1; - // maximum not-null value, invalid when all values are null (has_not_null==false) - optional bytes max = 2; - // whether the zone has null value - optional bool has_null = 3; - // whether the zone has not-null value - optional bool has_not_null = 4; + // minimum not-null value, invalid when all values are + // null(has_not_null==false) + optional bytes min = 1; + // maximum not-null value, invalid when all values are null + // (has_not_null==false) + optional bytes max = 2; + // whether the zone has null value + optional bool has_null = 3; + // whether the zone has not-null value + optional bool has_not_null = 4; } // Metadata for JSON type column message JsonMetaPB { - // Format version - // Version 1: encode each JSON datum individually, as so called row-oriented format - // Version 2(WIP): columnar encoding for JSON - optional uint32 format_version = 1; + // Format version + // Version 1: encode each JSON datum individually, as so called row-oriented + // format Version 2(WIP): columnar encoding for JSON + optional uint32 format_version = 1; + + optional bool is_flat = 2; + + optional bool has_remain = 3; } message ColumnMetaPB { - // column id in table schema - optional uint32 column_id = 1; - // unique column id - optional uint32 unique_id = 2; - // this field is FieldType's value - optional int32 type = 3; - // var length for string type - optional int32 length = 4; - optional EncodingTypePB encoding = 5; - // compress type for column - optional CompressionTypePB compression = 6; - // if this column can be nullable - optional bool is_nullable = 7; - // metadata about all the column indexes - repeated ColumnIndexMetaPB 
indexes = 8; - // pointer to dictionary page when using DICT_ENCODING - optional PagePointerPB dict_page = 9; - repeated ColumnMetaPB children_columns = 10; - // required by array/struct/map reader to create child reader. - optional uint64 num_rows = 11; - // whether all data pages are encoded by dict encoding. - optional bool all_dict_encoded = 30; - // used to calculate reader chunk size in vertical compaction - optional uint64 total_mem_footprint = 31; - // for json column only - optional JsonMetaPB json_meta = 32; - // for json flat column only - optional bytes name = 33; - optional int32 compression_level = 34; + // column id in table schema + optional uint32 column_id = 1; + // unique column id + optional uint32 unique_id = 2; + // this field is FieldType's value + optional int32 type = 3; + // var length for string type + optional int32 length = 4; + optional EncodingTypePB encoding = 5; + // compress type for column + optional CompressionTypePB compression = 6; + // if this column can be nullable + optional bool is_nullable = 7; + // metadata about all the column indexes + repeated ColumnIndexMetaPB indexes = 8; + // pointer to dictionary page when using DICT_ENCODING + optional PagePointerPB dict_page = 9; + repeated ColumnMetaPB children_columns = 10; + // required by array/struct/map reader to create child reader. + optional uint64 num_rows = 11; + // whether all data pages are encoded by dict encoding. + optional bool all_dict_encoded = 30; + // used to calculate reader chunk size in vertical compaction + optional uint64 total_mem_footprint = 31; + // for json column only + optional JsonMetaPB json_meta = 32; + // for json flat column only + optional bytes name = 33; + optional int32 compression_level = 34; } message SegmentFooterPB { - optional uint32 version = 1 [default = 1]; // file version - repeated ColumnMetaPB columns = 2; // tablet schema - optional uint32 num_rows = 3; // number of values - optional uint64 index_footprint = 4; // Deprecated - optional uint64 data_footprint = 5; // Deprecated - optional uint64 raw_data_footprint = 6; // Deprecated - - optional CompressionTypePB compress_type = 7 [default = LZ4_FRAME]; // Deprecated - repeated MetadataPairPB file_meta_datas = 8; // Deprecated - - // Short key index's page - optional PagePointerPB short_key_index_page = 9; + optional uint32 version = 1 [default = 1]; // file version + repeated ColumnMetaPB columns = 2; // tablet schema + optional uint32 num_rows = 3; // number of values + optional uint64 index_footprint = 4; // Deprecated + optional uint64 data_footprint = 5; // Deprecated + optional uint64 raw_data_footprint = 6; // Deprecated + + optional CompressionTypePB compress_type = 7 + [default = LZ4_FRAME]; // Deprecated + repeated MetadataPairPB file_meta_datas = 8; // Deprecated + + // Short key index's page + optional PagePointerPB short_key_index_page = 9; } message BTreeMetaPB { - // required: pointer to either root index page or sole data page based on is_root_data_page - optional PagePointerPB root_page = 1; - // required: true if we only have one data page, in which case root points to that page directly - optional bool is_root_data_page = 2; + // required: pointer to either root index page or sole data page based on + // is_root_data_page + optional PagePointerPB root_page = 1; + // required: true if we only have one data page, in which case root points to + // that page directly + optional bool is_root_data_page = 2; } message IndexedColumnMetaPB { - // required: FieldType value - optional int32 
data_type = 1; - // required: encoding for this column - optional EncodingTypePB encoding = 2; - // required: total number of values in this column - optional int64 num_values = 3; - // present iff this column has ordinal index - optional BTreeMetaPB ordinal_index_meta = 4; - // present iff this column contains sorted values and has value index - optional BTreeMetaPB value_index_meta = 5; - // compression type for data and index page - optional CompressionTypePB compression = 6 [default = NO_COMPRESSION]; - // index size - optional uint64 size = 7; + // required: FieldType value + optional int32 data_type = 1; + // required: encoding for this column + optional EncodingTypePB encoding = 2; + // required: total number of values in this column + optional int64 num_values = 3; + // present iff this column has ordinal index + optional BTreeMetaPB ordinal_index_meta = 4; + // present iff this column contains sorted values and has value index + optional BTreeMetaPB value_index_meta = 5; + // compression type for data and index page + optional CompressionTypePB compression = 6 [default = NO_COMPRESSION]; + // index size + optional uint64 size = 7; } // ------------------------------------------------------------- @@ -239,63 +248,64 @@ message IndexedColumnMetaPB { // ------------------------------------------------------------- enum ColumnIndexTypePB { - UNKNOWN_INDEX_TYPE = 0; - ORDINAL_INDEX = 1; - ZONE_MAP_INDEX = 2; - BITMAP_INDEX = 3; - BLOOM_FILTER_INDEX = 4; + UNKNOWN_INDEX_TYPE = 0; + ORDINAL_INDEX = 1; + ZONE_MAP_INDEX = 2; + BITMAP_INDEX = 3; + BLOOM_FILTER_INDEX = 4; } message ColumnIndexMetaPB { - optional ColumnIndexTypePB type = 1; - optional OrdinalIndexPB ordinal_index = 7; - optional ZoneMapIndexPB zone_map_index = 8; - optional BitmapIndexPB bitmap_index = 9; - optional BloomFilterIndexPB bloom_filter_index = 10; + optional ColumnIndexTypePB type = 1; + optional OrdinalIndexPB ordinal_index = 7; + optional ZoneMapIndexPB zone_map_index = 8; + optional BitmapIndexPB bitmap_index = 9; + optional BloomFilterIndexPB bloom_filter_index = 10; } message OrdinalIndexPB { - // required: the root page can be data page if there is only one data page, - // or the only index page if there is more than one data pages. - optional BTreeMetaPB root_page = 1; + // required: the root page can be data page if there is only one data page, + // or the only index page if there is more than one data pages. + optional BTreeMetaPB root_page = 1; } message ZoneMapIndexPB { - // required: segment-level zone map - optional ZoneMapPB segment_zone_map = 1; - // required: zone map for each data page is stored in an IndexedColumn with ordinal index - optional IndexedColumnMetaPB page_zone_maps = 2; + // required: segment-level zone map + optional ZoneMapPB segment_zone_map = 1; + // required: zone map for each data page is stored in an IndexedColumn with + // ordinal index + optional IndexedColumnMetaPB page_zone_maps = 2; } message BitmapIndexPB { - enum BitmapType { - UNKNOWN_BITMAP_TYPE = 0; - ROARING_BITMAP = 1; - } - optional BitmapType bitmap_type = 1 [default = ROARING_BITMAP]; - // required: whether the index contains null key. - // if true, the last bitmap (ordinal:dict_column.num_values) in bitmap_column is - // the bitmap for null key. we don't store null key in dict_column. 
- optional bool has_null = 2; - // required: meta for ordered dictionary part - optional IndexedColumnMetaPB dict_column = 3; - // required: meta for bitmaps part - optional IndexedColumnMetaPB bitmap_column = 4; + enum BitmapType { + UNKNOWN_BITMAP_TYPE = 0; + ROARING_BITMAP = 1; + } + optional BitmapType bitmap_type = 1 [default = ROARING_BITMAP]; + // required: whether the index contains null key. + // if true, the last bitmap (ordinal:dict_column.num_values) in bitmap_column + // is the bitmap for null key. we don't store null key in dict_column. + optional bool has_null = 2; + // required: meta for ordered dictionary part + optional IndexedColumnMetaPB dict_column = 3; + // required: meta for bitmaps part + optional IndexedColumnMetaPB bitmap_column = 4; } enum HashStrategyPB { - HASH_MURMUR3_X64_64 = 0; + HASH_MURMUR3_X64_64 = 0; } enum BloomFilterAlgorithmPB { - BLOCK_BLOOM_FILTER = 0; - CLASSIC_BLOOM_FILTER = 1; + BLOCK_BLOOM_FILTER = 0; + CLASSIC_BLOOM_FILTER = 1; } message BloomFilterIndexPB { - // required - optional HashStrategyPB hash_strategy = 1; - optional BloomFilterAlgorithmPB algorithm = 2; - // required: meta for bloom filters - optional IndexedColumnMetaPB bloom_filter = 3; + // required + optional HashStrategyPB hash_strategy = 1; + optional BloomFilterAlgorithmPB algorithm = 2; + // required: meta for bloom filters + optional IndexedColumnMetaPB bloom_filter = 3; }