Skip to content

Commit

Permalink
Merge pull request ClickHouse#70053 from bigo-sg/regReplace-empty-needle
Browse files Browse the repository at this point in the history
Allow empty needle in replaceRegexp*
  • Loading branch information
vitlibar authored Oct 21, 2024
2 parents 9d5410e + 57db542 commit f41d604
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 26 deletions.
62 changes: 48 additions & 14 deletions src/Functions/ReplaceRegexpImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ namespace DB

namespace ErrorCodes
{
extern const int ARGUMENT_OUT_OF_BOUND;
extern const int BAD_ARGUMENTS;
}

Expand Down Expand Up @@ -205,7 +204,11 @@ struct ReplaceRegexpImpl
size_t input_rows_count)
{
if (needle.empty())
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
{
res_data.assign(haystack_data);
res_offsets.assign(haystack_offsets);
return;
}

ColumnString::Offset res_offset = 0;
res_data.reserve(haystack_data.size());
Expand Down Expand Up @@ -240,7 +243,7 @@ struct ReplaceRegexpImpl

for (size_t i = 0; i < input_rows_count; ++i)
{
size_t from = i > 0 ? haystack_offsets[i - 1] : 0;
size_t from = haystack_offsets[i - 1];

const char * hs_data = reinterpret_cast<const char *>(haystack_data.data() + from);
const size_t hs_length = static_cast<unsigned>(haystack_offsets[i] - from - 1);
Expand Down Expand Up @@ -271,17 +274,24 @@ struct ReplaceRegexpImpl

for (size_t i = 0; i < input_rows_count; ++i)
{
size_t hs_from = i > 0 ? haystack_offsets[i - 1] : 0;
size_t hs_from = haystack_offsets[i - 1];
const char * hs_data = reinterpret_cast<const char *>(haystack_data.data() + hs_from);
const size_t hs_length = static_cast<unsigned>(haystack_offsets[i] - hs_from - 1);

size_t ndl_from = i > 0 ? needle_offsets[i - 1] : 0;
size_t ndl_from = needle_offsets[i - 1];
const char * ndl_data = reinterpret_cast<const char *>(needle_data.data() + ndl_from);
const size_t ndl_length = static_cast<unsigned>(needle_offsets[i] - ndl_from - 1);
std::string_view needle(ndl_data, ndl_length);

if (needle.empty())
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
{
res_data.insert(res_data.end(), hs_data, hs_data + hs_length);
res_data.push_back(0);

res_offset += hs_length + 1;
res_offsets[i] = res_offset;
continue;
}

re2::RE2 searcher(needle, regexp_options);
if (!searcher.ok())
Expand All @@ -308,7 +318,11 @@ struct ReplaceRegexpImpl
assert(haystack_offsets.size() == replacement_offsets.size());

if (needle.empty())
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
{
res_data.assign(haystack_data);
res_offsets.assign(haystack_offsets);
return;
}

ColumnString::Offset res_offset = 0;
res_data.reserve(haystack_data.size());
Expand All @@ -325,11 +339,11 @@ struct ReplaceRegexpImpl

for (size_t i = 0; i < input_rows_count; ++i)
{
size_t hs_from = i > 0 ? haystack_offsets[i - 1] : 0;
size_t hs_from = haystack_offsets[i - 1];
const char * hs_data = reinterpret_cast<const char *>(haystack_data.data() + hs_from);
const size_t hs_length = static_cast<unsigned>(haystack_offsets[i] - hs_from - 1);

size_t repl_from = i > 0 ? replacement_offsets[i - 1] : 0;
size_t repl_from = replacement_offsets[i - 1];
const char * repl_data = reinterpret_cast<const char *>(replacement_data.data() + repl_from);
const size_t repl_length = static_cast<unsigned>(replacement_offsets[i] - repl_from - 1);
std::string_view replacement(repl_data, repl_length);
Expand Down Expand Up @@ -364,19 +378,25 @@ struct ReplaceRegexpImpl

for (size_t i = 0; i < input_rows_count; ++i)
{
size_t hs_from = i > 0 ? haystack_offsets[i - 1] : 0;
size_t hs_from = haystack_offsets[i - 1];
const char * hs_data = reinterpret_cast<const char *>(haystack_data.data() + hs_from);
const size_t hs_length = static_cast<unsigned>(haystack_offsets[i] - hs_from - 1);

size_t ndl_from = i > 0 ? needle_offsets[i - 1] : 0;
size_t ndl_from = needle_offsets[i - 1];
const char * ndl_data = reinterpret_cast<const char *>(needle_data.data() + ndl_from);
const size_t ndl_length = static_cast<unsigned>(needle_offsets[i] - ndl_from - 1);
std::string_view needle(ndl_data, ndl_length);

if (needle.empty())
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
{
res_data.insert(res_data.end(), hs_data, hs_data + hs_length);
res_data.push_back(0);
res_offsets[i] = res_offsets[i - 1] + hs_length + 1;
res_offset = res_offsets[i];
continue;
}

size_t repl_from = i > 0 ? replacement_offsets[i - 1] : 0;
size_t repl_from = replacement_offsets[i - 1];
const char * repl_data = reinterpret_cast<const char *>(replacement_data.data() + repl_from);
const size_t repl_length = static_cast<unsigned>(replacement_offsets[i] - repl_from - 1);
std::string_view replacement(repl_data, repl_length);
Expand All @@ -403,7 +423,21 @@ struct ReplaceRegexpImpl
size_t input_rows_count)
{
if (needle.empty())
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
{
chassert(input_rows_count == haystack_data.size() / n);
/// ColumnFixedString stores its values back to back without a terminating zero byte, whereas ColumnString
/// ends every value with one. So we split haystack_data into strings of length n, append one zero byte
/// to each string and copy the result to res_data (see ColumnString.h and ColumnFixedString.h).
res_data.reserve(haystack_data.size() + input_rows_count);
res_offsets.resize(input_rows_count);
for (size_t i = 0; i < input_rows_count; ++i)
{
res_data.insert(res_data.end(), haystack_data.begin() + i * n, haystack_data.begin() + (i + 1) * n);
res_data.push_back(0);
res_offsets[i] = res_offsets[i - 1] + n + 1;
}
return;
}

ColumnString::Offset res_offset = 0;
res_data.reserve(haystack_data.size());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,15 +134,30 @@
3 Hello World not_found x Hello World
4 Hello World [eo] x Hxllo World
5 Hello World . x xello World
Check that whether an exception is thrown if the needle is empty
- should not throw an exception if the needle is empty
- non-const needle, const replacement
Hexxo Worxd
Hello World
Hexlo World
Hello World
Hexxo Worxd
Hello World
Hexlo World
Hello World
- const needle, non-const replacement
Hello World
Hello World
Hello World
Hello World
Hello World
Hello World
Hello World
Hello World
- non-const needle, non-const replacement
Hexxo Worxd
Hello World
Hexlo World
Hello World
Hexxo Worxd
Hello World
Hexlo World
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,7 @@ SELECT id, haystack, needle, replacement, replaceRegexpOne('Hello World', needle

DROP TABLE IF EXISTS test_tab;


SELECT 'Check that whether an exception is thrown if the needle is empty';
SELECT '- should not throw an exception if the needle is empty';

CREATE TABLE test_tab
(id UInt32, haystack String, needle String, replacement String)
Expand All @@ -79,22 +78,22 @@ CREATE TABLE test_tab

INSERT INTO test_tab VALUES (1, 'Hello World', 'l', 'x') (2, 'Hello World', '', 'y');

-- needle: non-const, replacement: const
SELECT '- non-const needle, const replacement';
SELECT replaceAll(haystack, needle, 'x') FROM test_tab;
SELECT replaceOne(haystack, needle, 'x') FROM test_tab;
SELECT replaceRegexpAll(haystack, needle, 'x') FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
SELECT replaceRegexpOne(haystack, needle, 'x') FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
SELECT replaceRegexpAll(haystack, needle, 'x') FROM test_tab;
SELECT replaceRegexpOne(haystack, needle, 'x') FROM test_tab;

-- needle: const, replacement: non-const
SELECT '- const needle, non-const replacement';
SELECT replaceAll(haystack, '', replacement) FROM test_tab;
SELECT replaceOne(haystack, '', replacement) FROM test_tab;
SELECT replaceRegexpAll(haystack, '', replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
SELECT replaceRegexpOne(haystack, '', replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
SELECT replaceRegexpAll(haystack, '', replacement) FROM test_tab;
SELECT replaceRegexpOne(haystack, '', replacement) FROM test_tab;

-- needle: non-const, replacement: non-const
SELECT '- non-const needle, non-const replacement';
SELECT replaceAll(haystack, needle, replacement) FROM test_tab;
SELECT replaceOne(haystack, needle, replacement) FROM test_tab;
SELECT replaceRegexpAll(haystack, needle, replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
SELECT replaceRegexpOne(haystack, needle, replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
SELECT replaceRegexpAll(haystack, needle, replacement) FROM test_tab;
SELECT replaceRegexpOne(haystack, needle, replacement) FROM test_tab;

DROP TABLE IF EXISTS test_tab;

0 comments on commit f41d604

Please sign in to comment.