Skip to content

Commit

Permalink
[Bug](join) fix columnstr64's offset overflow on serialize_value_into…
Browse files Browse the repository at this point in the history
…_arena (#46461)

### What problem does this PR solve?
```sql
mysql [test]>select /*+ LEADING(a,b) */ count(*) from d_table as a, d_table2 as b where a.k4=b.k4 and a.k1=b.k1;

+----------+
| count(*) |
+----------+
| 50000000 |
+----------+
1 row in set (4.87 sec)

mysql [test]>select /*+ LEADING(b,a) */ count(*) from d_table as a, d_table2 as b where a.k4=b.k4 and a.k1=b.k1;

+----------+
| count(*) |
+----------+
| 42949673 |
+----------+
1 row in set (21.32 sec)
```
### Release note

None

### Check List (For Author)

- Test <!-- At least one of them must be included. -->
    - [x] Regression test
    - [ ] Unit Test
    - [ ] Manual test (add detailed scripts or steps below)
    - [ ] No need to test or manual test. Explain why:
- [ ] This is a refactor/code format and no logic has been changed.
        - [ ] Previous test can cover this change.
        - [ ] No code files have been changed.
        - [ ] Other reason <!-- Add your reason?  -->

- Behavior changed:
    - [x] No.
    - [ ] Yes. <!-- Explain the behavior change -->

- Does this need documentation?
    - [x] No.
- [ ] Yes. <!-- Add document PR link here. eg:
apache/doris-website#1214 -->

### Check List (For Reviewer who merge this PR)

- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR
should merge into -->
  • Loading branch information
BiteTheDDDDt authored Jan 8, 2025
1 parent b26aabe commit b0da422
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 15 deletions.
22 changes: 10 additions & 12 deletions be/src/vec/columns/column_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -398,9 +398,8 @@ ColumnPtr ColumnStr<T>::permute(const IColumn::Permutation& perm, size_t limit)
template <typename T>
StringRef ColumnStr<T>::serialize_value_into_arena(size_t n, Arena& arena,
char const*& begin) const {
// Use uint32 instead of size_t to reduce agg key's length.
auto string_size(static_cast<uint32_t>(size_at(n)));
auto offset(static_cast<uint32_t>(offset_at(n)));
auto string_size(size_at(n));
auto offset(offset_at(n));

StringRef res;
res.size = sizeof(string_size) + string_size;
Expand All @@ -414,7 +413,7 @@ StringRef ColumnStr<T>::serialize_value_into_arena(size_t n, Arena& arena,

template <typename T>
const char* ColumnStr<T>::deserialize_and_insert_from_arena(const char* pos) {
const uint32_t string_size = unaligned_load<uint32_t>(pos);
const auto string_size = unaligned_load<uint32_t>(pos);
pos += sizeof(string_size);

const size_t old_size = chars.size();
Expand All @@ -432,7 +431,7 @@ size_t ColumnStr<T>::get_max_row_byte_size() const {
T max_size = 0;
size_t num_rows = offsets.size();
for (size_t i = 0; i < num_rows; ++i) {
max_size = std::max(max_size, size_at(i));
max_size = std::max(max_size, T(size_at(i)));
}

return max_size + sizeof(uint32_t);
Expand All @@ -442,9 +441,8 @@ template <typename T>
void ColumnStr<T>::serialize_vec(std::vector<StringRef>& keys, size_t num_rows,
size_t max_row_byte_size) const {
for (size_t i = 0; i < num_rows; ++i) {
// Use uint32 instead of size_t to reduce agg key's length.
auto offset(static_cast<uint32_t>(offset_at(i)));
auto string_size(static_cast<uint32_t>(size_at(i)));
auto offset(offset_at(i));
auto string_size(size_at(i));

auto* ptr = const_cast<char*>(keys[i].data + keys[i].size);
memcpy_fixed<uint32_t>(ptr, (char*)&string_size);
Expand All @@ -470,7 +468,7 @@ void ColumnStr<T>::serialize_vec_with_null_map(std::vector<StringRef>& keys, siz
auto offset(offset_at(i));
auto string_size(size_at(i));

memcpy_fixed<UInt32>(dest + 1, (char*)&string_size);
memcpy_fixed<uint32_t>(dest + 1, (char*)&string_size);
memcpy(dest + 1 + sizeof(string_size), &chars[offset], string_size);
keys[i].size += sizeof(string_size) + string_size + sizeof(UInt8);
} else {
Expand All @@ -487,7 +485,7 @@ void ColumnStr<T>::serialize_vec_with_null_map(std::vector<StringRef>& keys, siz
auto offset(offset_at(i));
auto string_size(size_at(i));

memcpy_fixed<UInt32>(dest + 1, (char*)&string_size);
memcpy_fixed<uint32_t>(dest + 1, (char*)&string_size);
memcpy(dest + 1 + sizeof(string_size), &chars[offset], string_size);
keys[i].size += sizeof(string_size) + string_size + sizeof(UInt8);
}
Expand All @@ -497,7 +495,7 @@ void ColumnStr<T>::serialize_vec_with_null_map(std::vector<StringRef>& keys, siz
template <typename T>
void ColumnStr<T>::deserialize_vec(std::vector<StringRef>& keys, const size_t num_rows) {
for (size_t i = 0; i != num_rows; ++i) {
auto original_ptr = keys[i].data;
const auto* original_ptr = keys[i].data;
keys[i].data = deserialize_and_insert_from_arena(original_ptr);
keys[i].size -= keys[i].data - original_ptr;
}
Expand All @@ -508,7 +506,7 @@ void ColumnStr<T>::deserialize_vec_with_null_map(std::vector<StringRef>& keys,
const size_t num_rows, const uint8_t* null_map) {
for (size_t i = 0; i != num_rows; ++i) {
if (null_map[i] == 0) {
auto original_ptr = keys[i].data;
const auto* original_ptr = keys[i].data;
keys[i].data = deserialize_and_insert_from_arena(original_ptr);
keys[i].size -= keys[i].data - original_ptr;
} else {
Expand Down
10 changes: 7 additions & 3 deletions be/src/vec/columns/column_string.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include <typeinfo>
#include <vector>

#include "common/cast_set.h"
#include "common/compiler_util.h" // IWYU pragma: keep
#include "common/exception.h"
#include "common/status.h"
Expand Down Expand Up @@ -89,8 +90,11 @@ class ColumnStr final : public COWHelper<IColumn, ColumnStr<T>> {
// Start position of i-th element.
T ALWAYS_INLINE offset_at(ssize_t i) const { return offsets[i - 1]; }

/// Size of i-th element, including terminating zero.
T ALWAYS_INLINE size_at(ssize_t i) const { return offsets[i] - offsets[i - 1]; }
// Size of i-th element, including terminating zero.
// assume that the length of a single element is less than 32-bit
uint32_t ALWAYS_INLINE size_at(ssize_t i) const {
return uint32_t(offsets[i] - offsets[i - 1]);
}

template <bool positive>
struct less;
Expand Down Expand Up @@ -357,7 +361,7 @@ class ColumnStr final : public COWHelper<IColumn, ColumnStr<T>> {

for (size_t i = start_index; i < start_index + num; i++) {
int32_t codeword = data_array[i];
auto& src = dict[codeword];
const auto& src = dict[codeword];
memcpy(chars.data() + old_size, src.data, src.size);
old_size += src.size;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !test --
50000000

-- !test --
50000000

Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

suite("str64_serialize") {

sql """ DROP TABLE IF EXISTS d_table; """
sql """ DROP TABLE IF EXISTS d_table2; """


sql """
create table d_table (
k1 int null,
k2 int not null,
k3 bigint null,
k4 varchar(100) null
)
duplicate key (k1,k2,k3)
distributed BY hash(k1) buckets 3
properties("replication_num" = "1");
"""
sql """
create table d_table2 (
k1 int null,
k2 int not null,
k3 bigint null,
k4 varchar(100) null
)
duplicate key (k1,k2,k3)
distributed BY hash(k1) buckets 3
properties("replication_num" = "1");
"""

sql """insert into d_table select 1,1,1,'1234567890abcdefghigalsdhaluihdicandejionxaoxwdeuhwenudzmwoedxneiowdxiowedjxneiowdjixoneiiexdnuiexef' from (select 1 k1) as t lateral view explode_numbers(50000000) tmp1 as e1;
"""

sql """insert into d_table2 select 1,1,1,'1234567890abcdefghigalsdhaluihdicandejionxaoxwdeuhwenudzmwoedxneiowdxiowedjxneiowdjixoneiiexdnuiexef';
"""
sql "set parallel_pipeline_task_num=1;"

qt_test "select /*+ LEADING(a,b) */ count(*) from d_table as a, d_table2 as b where a.k4=b.k4 and a.k1=b.k1;"
qt_test "select /*+ LEADING(b,a) */ count(*) from d_table as a, d_table2 as b where a.k4=b.k4 and a.k1=b.k1;"
}

0 comments on commit b0da422

Please sign in to comment.