fix {} and select col and col not in json.
hubgeter committed Nov 26, 2024
1 parent 3ff999e commit 72b173b
Showing 6 changed files with 72 additions and 30 deletions.
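
In short: the reader now distinguishes the load path from the catalog-query path via `_is_load`. Rows that fail data-quality checks, or that contain no valid value at all, are skipped (and appended to the error file) only during loads; in query mode a missing non-nullable column now raises `DataQualityError`, while nullable columns simply come back as NULL. Missing-column defaults are also filled through each slot's `DataTypeSerDe` instead of being force-inserted as raw strings. The six files span the BE reader (`new_json_reader.cpp`/`.h`), the FE file-attribute setup, a JSON test-data file, and the `hive_json_basic_test` regression suite plus its expected output. The two new test lines

{}
{"a":5}

are the cases named in the commit title: with the fix, the new q9 query (`select col1,id from ${tb3} order by id`) returns `\N`-filled rows for both, where these lines previously fell into the load-only error handling.
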
55 changes: 31 additions & 24 deletions be/src/vec/exec/format/json/new_json_reader.cpp
@@ -491,8 +491,8 @@ Status NewJsonReader::_vhandle_simple_json(RuntimeState* /*state*/, Block& block
        bool valid = false;
        if (_next_row >= _total_rows) { // parse json and generic document
            Status st = _parse_json(is_empty_row, eof);
-            if (st.is<DATA_QUALITY_ERROR>()) {
-                continue; // continue to read next
+            if (_is_load && st.is<DATA_QUALITY_ERROR>()) {
+                continue; // continue to read next (for load, the error has already been appended to the error file)
            }
            RETURN_IF_ERROR(st);
            if (*is_empty_row) {
@@ -842,14 +842,15 @@ Status NewJsonReader::_set_column_value(rapidjson::Value& objectValue, Block& bl
                column_ptr->insert_default();
            } else {
                // not found, filling with default value
-                RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr, valid));
+                RETURN_IF_ERROR(
+                        _fill_missing_column(slot_desc, _serdes[slot_idx], column_ptr, valid));
                if (!(*valid)) {
                    return Status::OK();
                }
            }
        }
    }
-    if (!has_valid_value) {
+    if (!has_valid_value && _is_load) {
        // there is no valid value in json line but has filled with default value before
        // so remove this line in block
        string col_names;
@@ -1094,13 +1095,13 @@ Status NewJsonReader::_write_columns_by_jsonpath(rapidjson::Value& objectValue,
            has_valid_value = true;
        } else {
            // not found, filling with default value
-            RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr, valid));
+            RETURN_IF_ERROR(_fill_missing_column(slot_desc, _serdes[i], column_ptr, valid));
            if (!(*valid)) {
                return Status::OK();
            }
        }
    }
-    if (!has_valid_value) {
+    if (!has_valid_value && _is_load) {
        // there is no valid value in json line but has filled with default value before
        // so remove this line in block
        for (int i = 0; i < block.columns(); ++i) {
@@ -1250,7 +1251,7 @@ Status NewJsonReader::_simdjson_handle_simple_json(RuntimeState* /*state*/, Bloc

        // step2: get json value by json doc
        Status st = _get_json_value(&size, eof, &error, is_empty_row);
-        if (st.is<DATA_QUALITY_ERROR>()) {
+        if (_is_load && st.is<DATA_QUALITY_ERROR>()) {
            return Status::OK();
        }
        RETURN_IF_ERROR(st);
@@ -1558,7 +1559,8 @@ Status NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object* val
        _seen_columns[column_index] = true;
        has_valid_value = true;
    }
-    if (!has_valid_value) {
+
+    if (!has_valid_value && _is_load) {
        string col_names;
        for (auto* slot_desc : slot_descs) {
            col_names.append(slot_desc->col_name() + ", ");
@@ -1623,7 +1625,7 @@ Status NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object* val
            _process_skip_bitmap_mark(slot_desc, column_ptr, block, cur_row_count, valid);
            column_ptr->insert_default();
        } else {
-            RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr, valid));
+            RETURN_IF_ERROR(_fill_missing_column(slot_desc, _serdes[i], column_ptr, valid));
            if (!(*valid)) {
                return Status::OK();
            }
@@ -2021,7 +2023,7 @@ Status NewJsonReader::_simdjson_write_columns_by_jsonpath(
            has_valid_value = true;
        } else if (i >= _parsed_jsonpaths.size() || st.is<NOT_FOUND>()) {
            // not match in jsondata, filling with default value
-            RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr, valid));
+            RETURN_IF_ERROR(_fill_missing_column(slot_desc, _serdes[i], column_ptr, valid));
            if (!(*valid)) {
                return Status::OK();
            }
@@ -2086,25 +2088,30 @@ Status NewJsonReader::_get_column_default_value(
    return Status::OK();
}

-Status NewJsonReader::_fill_missing_column(SlotDescriptor* slot_desc, IColumn* column_ptr,
-                                           bool* valid) {
-    if (slot_desc->is_nullable()) {
-        auto* nullable_column = reinterpret_cast<ColumnNullable*>(column_ptr);
-        column_ptr = &nullable_column->get_nested_column();
-        auto col_value = _col_default_value_map.find(slot_desc->col_name());
-        if (col_value == _col_default_value_map.end()) {
+Status NewJsonReader::_fill_missing_column(SlotDescriptor* slot_desc, DataTypeSerDeSPtr serde,
+                                           IColumn* column_ptr, bool* valid) {
+    auto col_value = _col_default_value_map.find(slot_desc->col_name());
+    if (col_value == _col_default_value_map.end()) {
+        if (slot_desc->is_nullable()) {
+            auto* nullable_column = static_cast<ColumnNullable*>(column_ptr);
            nullable_column->insert_default();
        } else {
-            const std::string& v_str = col_value->second;
-            nullable_column->get_null_map_data().push_back(0);
-            assert_cast<ColumnString*>(column_ptr)->insert_data(v_str.c_str(), v_str.size());
+            if (_is_load) {
+                RETURN_IF_ERROR(_append_error_msg(
+                        nullptr, "The column `{}` is not nullable, but it's not found in jsondata.",
+                        slot_desc->col_name(), valid));
+            } else {
+                return Status::DataQualityError(
+                        "The column `{}` is not nullable, but it's not found in jsondata.",
+                        slot_desc->col_name());
+            }
        }
    } else {
-        RETURN_IF_ERROR(_append_error_msg(
-                nullptr, "The column `{}` is not nullable, but it's not found in jsondata.",
-                slot_desc->col_name(), valid));
+        const std::string& v_str = col_value->second;
+        Slice column_default_value {v_str};
+        RETURN_IF_ERROR(serde->deserialize_one_cell_from_json(*column_ptr, column_default_value,
+                                                              _serde_options));
    }

    *valid = true;
    return Status::OK();
}
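
The rewritten `_fill_missing_column` changes two things. When a default value exists, it is now parsed through the slot's `DataTypeSerDe`, where the old code assumed the target was a `ColumnString` and inserted the raw bytes; and when no default exists for a non-nullable slot, the load path still records a per-row error while the query path fails fast with `DataQualityError`. A minimal sketch of the serde contrast (hypothetical `col` and `v_str` names, mirroring the lines above):

    // Before: the default value's bytes were forced into a string column,
    // which mis-fills any non-string slot.
    assert_cast<ColumnString*>(col)->insert_data(v_str.c_str(), v_str.size());

    // After: the slot's own serde parses the text into the column's type, so a
    // hypothetical INT slot whose default is "42" receives the integer 42.
    Slice column_default_value {v_str};
    RETURN_IF_ERROR(serde->deserialize_one_cell_from_json(*col, column_default_value,
                                                          _serde_options));
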
4 changes: 2 additions & 2 deletions be/src/vec/exec/format/json/new_json_reader.h
@@ -200,8 +200,8 @@ class NewJsonReader : public GenericReader {
            const std::unordered_map<std::string, vectorized::VExprContextSPtr>&
                    col_default_value_ctx);

-    Status _fill_missing_column(SlotDescriptor* slot_desc, vectorized::IColumn* column_ptr,
-                                bool* valid);
+    Status _fill_missing_column(SlotDescriptor* slot_desc, DataTypeSerDeSPtr serde,
+                                vectorized::IColumn* column_ptr, bool* valid);

    // fe will add skip_bitmap_col to _file_slot_descs iff the target olap table has skip_bitmap_col
    // and the current load is a flexible partial update
@@ -4,4 +4,10 @@
{"id":4,"COL1":40,"col2":{"col2a":10,"col2b":"string4","new_col":"new_val","col2a":40},"col3":{"4":"string4"}}
{"id":5}
{"id":6,"col1":60,"col2":{"COL2a":60,"col2b":600},"col3":{"6":600}}
{"id":7,"col1":70,"col3":{"7":"string7"},"col2":{"col2b":"string7","col2a":70}}
{"id":7,"col1":70,"col3":{"7":"string7"},"col2":{"col2b":"string7","col2a":70}}




{}
{"a":5}
@@ -434,9 +434,10 @@ protected TFileAttributes getFileAttributes() throws UserException {

            fileAttributes.setJsonpaths("");
            fileAttributes.setJsonRoot("");
-            fileAttributes.setNumAsString(false);
+            fileAttributes.setNumAsString(true);
            fileAttributes.setFuzzyParse(false);
            fileAttributes.setReadJsonByLine(true);
+            fileAttributes.setStripOuterArray(false);
            fileAttributes.setHeaderType("");
        } else {
            throw new UserException(
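
On the FE side, `num_as_string` is flipped to true so JSON numbers reach the BE as text for the slot's serde to parse; this is presumably what lets the bare number `600` in the test data fill the string field `col2b` (shown as `"600"` in the expected q8 output below) without precision loss on large numerics. `strip_outer_array` is now set explicitly to false, consistent with the line-delimited reading already enabled by `setReadJsonByLine(true)`.
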
@@ -35,6 +35,8 @@ user3 [{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made v
user4 [{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength training", "metrics":{"duration":1, "calories":300}}, "shopping":{"details":"Bought groceries", "metrics":{"items":10, "cost":100}}}}]

-- !q8 --
+\N \N \N \N
+\N \N \N \N
1 10 {"col2a":10, "col2b":"string1"} {1:"string10"}
2 20 {"col2a":20, "col2b":"string2"} {2:"string2"}
3 30 {"col2a":30, "col2b":"string3"} \N
@@ -43,6 +45,17 @@ user4 [{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made v
6 60 {"col2a":60, "col2b":"600"} {6:"600"}
7 70 {"col2a":70, "col2b":"string7"} {7:"string7"}

+-- !q9 --
+\N \N
+\N \N
+\N 5
+10 1
+20 2
+30 3
+40 4
+60 6
+70 7
+
-- !q1 --
1 true 127 32767 2147483647 9223372036854775807 123.45 123456.789 123456789 1234.5678 123456.789012 123456789.012345678901 string_value binary_value 2024-03-20 2024-03-20T12:00 2024-03-20T12:00:00.123457 2024-03-20T12:00:00.123457 char_value1 char_value2 char_value3 varchar_value1 varchar_value2 varchar_value3 {"key1":"value1"} {"key1":"value1"} {"a ":"b "} {1:10} {1:100000000000} {1.1:10.1} {1.1:10.1} {1:0} {1.1:1.1} {1.23:1.23} {1.2345:1.2345} {1.23456789:1.23456789} {1.23456789:1.23456789} {1.2345678901234568:1.2345678901234568} ["string1", "string2"] [1, 2, 3] [100000000000, 200000000000] [1.1, 2.2] [1.123456789, 2.123456789] [1, 0] ["varchar1", "varchar2"] ["char1 ", "char2 "] [1.1, 2.2] [1.23, 2.34] [1.2345, 2.3456] [1.23456789, 2.34567891] [1.23456789, 2.34567891] [1.2345678901234568, 2.3456789012345679] {"s_bigint":1234567890} {"key":[{"s_int":123}]} {"struct_field":["value1", "value2"]} {"struct_field_null":null, "struct_field_null2":null} {"struct_non_nulls_after_nulls1":123, "struct_non_nulls_after_nulls2":"value"} {"struct_field1":123, "struct_field2":"value", "strict_field3":{"nested_struct_field1":123, "nested_struct_field2":"nested_value"}} {"null_key":null} [null, "value1", "value2"] ["value1", null, "value2"] ["value1", "value2", null] [null, null, null] dt1
2 false 58 12345 2147483000 \N 789.56 654321.123 987654321 5678.1234 987654.321098 987654321.098765432109 changed_string new_binary_value 2025-05-25 2025-05-25T15:30 2025-05-25T15:30:00.654322 2025-05-25T15:30:00.654322 char_new_value1 char_new_value2 char_new_value3 varchar_new_value1 varchar_new_value2 varchar_new_value3 {"key2":"value2"} {"key2":"value2"} {"x ":"y "} {2:20} {2:200000000000} {2.2:20.2} {2.2:20.2} {0:1} {2.2:2.2} {2.34:2.34} {2.3456:2.3456} {2.34567891:2.34567891} {2.34567891:2.34567891} {2.3456789012345679:2.3456789012345679} ["string3", "string4"] [4, 5, 6] [300000000000, 400000000000] [2.2, 3.3] [2.123456789, 3.123456789] [0, 1] ["varchar3", "varchar4"] ["char3 ", "char4 "] [2.2, 3.3] [2.34, 3.45] [2.3456, 3.4567] [2.34567891, 3.45678901] [2.34567891, 3.45678901] [2.3456789012345679, 3.4567890123456789] {"s_bigint":9876543210} {"key2":[{"s_int":456}]} {"struct_field":["new_value1", "new_value2"]} {"struct_field_null":null, "struct_field_null2":null} {"struct_non_nulls_after_nulls1":456, "struct_non_nulls_after_nulls2":"new_value"} {"struct_field1":456, "struct_field2":"new_value", "strict_field3":{"nested_struct_field1":456, "nested_struct_field2":"nested_value2"}} {"null_key":null} [null, "new_value1", "new_value2"] ["new_value1", null, "new_value2"] ["new_value1", "new_value2", null] [null, null, null] dt1
@@ -79,6 +92,8 @@ user3 [{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made v
user4 [{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made vegan meal", "metrics":{"time_spent":1.5, "calories":500}}, "movie":{"details":"Watched action movie", "metrics":{"duration":2, "rating":8.5}}}}, {"activity_date":"2024-08-02", "activities":{"gym":{"details":"Strength training", "metrics":{"duration":1, "calories":300}}, "shopping":{"details":"Bought groceries", "metrics":{"items":10, "cost":100}}}}]

-- !q8 --
+\N \N \N \N
+\N \N \N \N
1 10 {"col2a":10, "col2b":"string1"} {1:"string10"}
2 20 {"col2a":20, "col2b":"string2"} {2:"string2"}
3 30 {"col2a":30, "col2b":"string3"} \N
@@ -87,3 +102,14 @@ user4 [{"activity_date":"2024-08-01", "activities":{"cooking":{"details":"Made v
6 60 {"col2a":60, "col2b":"600"} {6:"600"}
7 70 {"col2a":70, "col2b":"string7"} {7:"string7"}

+-- !q9 --
+\N \N
+\N \N
+\N 5
+10 1
+20 2
+30 3
+40 4
+60 6
+70 7
+
@@ -57,8 +57,10 @@ suite("hive_json_basic_test", "p0,external,hive,external_docker,external_docker
    qt_q7 """ select user_id,activity_log from ${tb2} order by user_id"""


-    qt_q8 """ select * from ${tb3} order by id """
-
+    order_qt_q8 """ select * from ${tb3} order by id """
+
+    order_qt_q9 """ select col1,id from ${tb3} order by id """
+



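Switching q8 from `qt_` to `order_qt_` makes the harness sort the fetched rows before comparing them with the expected output; presumably this is needed because the two all-NULL rows tie under `order by id`, so the server-side ordering between them is not guaranteed.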

