Skip to content

Commit

Permalink
Add sort_and_merge_tests, update exception to be more consistent, upd…
Browse files Browse the repository at this point in the history
…ate docs to reflect new behavior
  • Loading branch information
G-D-Petrov committed Jan 15, 2025
1 parent 5319604 commit 43d141c
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 2 deletions.
4 changes: 2 additions & 2 deletions cpp/arcticdb/column_store/memory_segment_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -650,7 +650,7 @@ size_t SegmentInMemoryImpl::num_bytes() const {
// Sort overload taking a column name: looks the name up with column_index()
// and delegates to the positional sort() overload.
// NOTE(review): this is a diff rendering. The user_input::check line is the
// removed version and the schema::check line is its replacement, so only the
// latter should exist in the real file. The check presumably raises when
// column_index() yields no value — confirm against the check<> helper.
void SegmentInMemoryImpl::sort(const std::string& column_name) {
// Make sure the name -> position map is available before the lookup.
init_column_map();
auto idx = column_index(std::string_view(column_name));
user_input::check<ErrorCode::E_COLUMN_NOT_FOUND>(static_cast<bool>(idx), "Column {} not found in sort", column_name);
schema::check<ErrorCode::E_COLUMN_DOESNT_EXIST>(static_cast<bool>(idx), "Column {} not found in sort", column_name);
// idx is known to hold a value here, so value() is safe.
sort(static_cast<position_t>(idx.value()));
}

Expand All @@ -659,7 +659,7 @@ void SegmentInMemoryImpl::sort(const std::vector<std::string>& column_names) {
std::vector<position_t> positions;
for(const auto& column_name : column_names) {
auto idx = column_index(std::string_view(column_name));
user_input::check<ErrorCode::E_COLUMN_NOT_FOUND>(static_cast<bool>(idx), "Column {} not found in multi-sort", column_name);
schema::check<ErrorCode::E_COLUMN_DOESNT_EXIST>(static_cast<bool>(idx), "Column {} not found in multi-sort", column_name);
positions.emplace_back(static_cast<position_t>(*idx));
}
sort(positions);
Expand Down
2 changes: 2 additions & 0 deletions python/arcticdb/version_store/library.py
Original file line number Diff line number Diff line change
Expand Up @@ -1241,6 +1241,7 @@ def finalize_staged_data(
is not the same as the schema of the existing data
- If dynamic schema is used and different segments have the same column names but their dtypes don't have a
common type (e.g string and any numeric type)
- If a different index name is encountered in the staged data, regardless of the schema mode
See Also
--------
Expand Down Expand Up @@ -1351,6 +1352,7 @@ def sort_and_finalize_staged_data(
staged segment is not the same as the schema of the existing data
- If dynamic schema is used and different segments have the same column names but their dtypes don't have a
common type (e.g string and any numeric type)
- If a different index name is encountered in the staged data, regardless of the schema mode
See Also
--------
Expand Down
53 changes: 53 additions & 0 deletions python/tests/unit/arcticdb/version_store/test_sort_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -868,3 +868,56 @@ def test_writing_wide_segment_over_sliced_data(self, lmdb_storage, lib_name):
lib.sort_and_finalize_staged_data("sym", mode=StagedDataFinalizeMethod.WRITE)

assert_frame_equal(lib.read("sym").data, df_1)


@pytest.mark.parametrize("delete_staged_data_on_failure", [True, False])
def test_sort_and_finalize_staged_data_write_dynamic_schema_named_index(
    lmdb_library_static_dynamic, delete_staged_data_on_failure
):
    """Finalizing staged segments with mismatched index names in WRITE mode must fail.

    Two segments are staged: the first has its index named "date", the second
    has an unnamed index. ``sort_and_finalize_staged_data`` is expected to raise
    ``SchemaException`` mentioning the offending index name. Afterwards the
    staged (append) keys are either all removed or all kept, depending on
    ``delete_staged_data_on_failure``.
    """
    lib = lmdb_library_static_dynamic
    # Symbol is named after this test (WRITE mode) so it cannot be confused
    # with the sibling APPEND-mode test.
    sym = "test_sort_and_finalize_staged_data_write_dynamic_schema_named_index"
    df_0 = pd.DataFrame(
        {"col_0": [0], "col_1": [0.5]}, index=pd.date_range("2024-01-01", periods=1)
    )
    df_0.index.name = "date"
    # df_1 deliberately keeps the default (unnamed) index.
    df_1 = pd.DataFrame({"col_0": [1]}, index=pd.date_range("2024-01-02", periods=1))
    lib.write(sym, df_0, staged=True)
    lib.write(sym, df_1, staged=True)

    with pytest.raises(SchemaException) as exception_info:
        lib.sort_and_finalize_staged_data(
            sym,
            mode=StagedDataFinalizeMethod.WRITE,
            delete_staged_data_on_failure=delete_staged_data_on_failure,
        )

    # The error message must name the problematic index column.
    assert "date" in str(exception_info.value)
    # Both staged segments are dropped on failure only when requested.
    expected_key_count = 0 if delete_staged_data_on_failure else 2
    assert len(get_append_keys(lib, sym)) == expected_key_count


@pytest.mark.parametrize("delete_staged_data_on_failure", [True, False])
def test_sort_and_finalize_staged_data_append_dynamic_schema_named_index(
    lmdb_library_static_dynamic, delete_staged_data_on_failure
):
    """Appending a staged segment whose index name differs from the stored data must fail.

    A frame with its index named "date" is written normally, then a frame with
    an unnamed index is staged. Finalizing in APPEND mode is expected to raise
    ``SchemaException`` mentioning "date". The single staged key is removed
    only when ``delete_staged_data_on_failure`` is set.
    """
    library = lmdb_library_static_dynamic
    symbol = "test_sort_and_finalize_staged_data_append_dynamic_schema_named_index"

    # Already-written data: index carries the name "date".
    written_index = pd.date_range("2024-01-01", periods=1)
    written_df = pd.DataFrame({"col_0": [0], "col_1": [0.5]}, index=written_index)
    written_df.index.name = "date"
    library.write(symbol, written_df)

    # Staged data: index is left unnamed.
    staged_index = pd.date_range("2024-01-02", periods=1)
    staged_df = pd.DataFrame({"col_0": [1]}, index=staged_index)
    library.write(symbol, staged_df, staged=True)

    finalize_kwargs = {
        "mode": StagedDataFinalizeMethod.APPEND,
        "delete_staged_data_on_failure": delete_staged_data_on_failure,
    }
    with pytest.raises(SchemaException) as raised:
        library.sort_and_finalize_staged_data(symbol, **finalize_kwargs)

    # The error message must name the problematic index column.
    message = str(raised.value)
    assert "date" in message

    # One segment was staged; it survives the failure unless deletion was requested.
    if delete_staged_data_on_failure:
        remaining_keys = 0
    else:
        remaining_keys = 1
    assert len(get_append_keys(library, symbol)) == remaining_keys

0 comments on commit 43d141c

Please sign in to comment.