diff --git a/.gitignore b/.gitignore
index 029173001c..ec5461de72 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,7 +23,10 @@ dask-worker-space/
 ###################
 *.com
 *.class
+*.exp
+*.lib
 *.dll
+*.pdb
 *.exe
 *.o
 *.so
@@ -218,3 +221,6 @@ benchmarks/torch_data
 
 # API docs
 api_docs/
+/cpp/cmake-build-debug/
+/cpp/external/
+_deeplake/**/*.pyi
\ No newline at end of file
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
new file mode 100644
index 0000000000..9f54d8f382
--- /dev/null
+++ b/cpp/CMakeLists.txt
@@ -0,0 +1,35 @@
+cmake_minimum_required(VERSION 3.16)
+project(deeplake)
+
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+# Avoid warning about DOWNLOAD_EXTRACT_TIMESTAMP in CMake 3.24:
+if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
+    cmake_policy(SET CMP0135 NEW)
+endif()
+
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED True)
+option(PYTHON_EXECUTABLE "Path to python executable")
+
+if(APPLE)
+    set (CMAKE_OSX_DEPLOYMENT_TARGET 10.15)
+else()
+    #skip multi architecture build for linux
+    set (CMAKE_OSX_ARCHITECTURES)
+endif()
+
+set(DEFAULT_PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+set(PYTHON_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/../)
+include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/FindStduuid.cmake)
+include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/FindJson.cmake)
+include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/FindPybind11.cmake)
+include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/FindGoogleTest.cmake)
+include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/FindSpdlog.cmake)
+include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/FindBackward.cmake)
+
+add_subdirectory(storage)
+add_subdirectory(deeplog)
+add_subdirectory(tests)
+add_subdirectory(py_api)
+
diff --git a/cpp/cmake/FindBackward.cmake b/cpp/cmake/FindBackward.cmake
new file mode 100644
index 0000000000..7d08cf85c9
--- /dev/null
+++ b/cpp/cmake/FindBackward.cmake
@@ -0,0 +1,13 @@
+include(FetchContent)
+
+set(backward_URL https://github.com/bombela/backward-cpp/archive/refs/tags/v1.6.tar.gz)
+set(backward_URL_HASH c654d0923d43f1cea23d086729673498e4741fb2457e806cfaeaea7b20c97c10)
+set(backward_SOURCE_DIR ${DEFAULT_PARENT_DIR}/external/backward)
+
+FetchContent_Declare(
+        backward
+        URL ${backward_URL}
+        URL_HASH SHA256=${backward_URL_HASH}
+        SOURCE_DIR ${backward_SOURCE_DIR}
+)
+FetchContent_MakeAvailable(backward)
diff --git a/cpp/cmake/FindGoogleTest.cmake b/cpp/cmake/FindGoogleTest.cmake
new file mode 100644
index 0000000000..c47084af0d
--- /dev/null
+++ b/cpp/cmake/FindGoogleTest.cmake
@@ -0,0 +1,15 @@
+include(FetchContent)
+
+set(googletest_URL https://github.com/google/googletest.git)
+set(googletest_TAG v1.12.0)
+set(googletest_SOURCE_DIR ${DEFAULT_PARENT_DIR}/external/googletest)
+set(googletest_INCLUDE_DIRS ${googletest_SOURCE_DIR}/googletest/include ${googletest_SOURCE_DIR}/googlemock/include)
+
+FetchContent_Declare(googletest
+        GIT_REPOSITORY ${googletest_URL}
+        GIT_TAG ${googletest_TAG}
+        SOURCE_DIR ${googletest_SOURCE_DIR}
+)
+
+FetchContent_MakeAvailable(googletest)
+include_directories(${googletest_INCLUDE_DIRS})
\ No newline at end of file
diff --git a/cpp/cmake/FindJson.cmake b/cpp/cmake/FindJson.cmake
new file mode 100644
index 0000000000..a47a8caccb
--- /dev/null
+++ b/cpp/cmake/FindJson.cmake
@@ -0,0 +1,13 @@
+include(FetchContent)
+
+set(json_URL https://github.com/nlohmann/json/releases/download/v3.11.2/json.tar.xz)
+set(json_URL_HASH 8c4b26bf4b422252e13f332bc5e388ec0ab5c3443d24399acb675e68278d341f)
+set(json_SOURCE_DIR ${DEFAULT_PARENT_DIR}/external/json)
+
+FetchContent_Declare(
+        json
+        URL ${json_URL}
+        URL_HASH SHA256=${json_URL_HASH}
+        SOURCE_DIR ${json_SOURCE_DIR}
+)
+FetchContent_MakeAvailable(json)
diff --git a/cpp/cmake/FindPybind11.cmake b/cpp/cmake/FindPybind11.cmake
new file mode 100644
index 0000000000..2b23652568
--- /dev/null
+++ b/cpp/cmake/FindPybind11.cmake
@@ -0,0 +1,13 @@
+include(FetchContent)
+
+set(pybind11_URL https://github.com/pybind/pybind11.git)
+set(pybind11_TAG v2.11.1)
+set(pybind11_SOURCE_DIR ${DEFAULT_PARENT_DIR}/external/pybind11)
+
+FetchContent_Declare(pybind11
+        GIT_REPOSITORY ${pybind11_URL}
+        GIT_TAG ${pybind11_TAG}
+        SOURCE_DIR ${pybind11_SOURCE_DIR}
+)
+
+FetchContent_MakeAvailable(pybind11)
diff --git a/cpp/cmake/FindSpdlog.cmake b/cpp/cmake/FindSpdlog.cmake
new file mode 100644
index 0000000000..287399b1f0
--- /dev/null
+++ b/cpp/cmake/FindSpdlog.cmake
@@ -0,0 +1,13 @@
+include(FetchContent)
+
+set(spdlog_URL https://github.com/gabime/spdlog/archive/refs/tags/v1.12.0.tar.gz)
+set(spdlog_URL_HASH 4dccf2d10f410c1e2feaff89966bfc49a1abb29ef6f08246335b110e001e09a9)
+set(spdlog_SOURCE_DIR ${DEFAULT_PARENT_DIR}/external/spdlog)
+
+FetchContent_Declare(
+        spdlog
+        URL ${spdlog_URL}
+        URL_HASH SHA256=${spdlog_URL_HASH}
+        SOURCE_DIR ${spdlog_SOURCE_DIR}
+)
+FetchContent_MakeAvailable(spdlog)
diff --git a/cpp/cmake/FindStduuid.cmake b/cpp/cmake/FindStduuid.cmake
new file mode 100644
index 0000000000..ed3fec0798
--- /dev/null
+++ b/cpp/cmake/FindStduuid.cmake
@@ -0,0 +1,13 @@
+include(FetchContent)
+
+set(stduuid_URL https://github.com/mariusbancila/stduuid/archive/refs/tags/v1.2.3.zip)
+set(stduuid_URL_HASH 0f867768ce55f2d8fa361be82f87f0ea5e51438bc47ca30cd92c9fd8b014e84e)
+set(stduuid_SOURCE_DIR ${DEFAULT_PARENT_DIR}/external/stduuid)
+
+FetchContent_Declare(
+        stduuid
+        URL ${stduuid_URL}
+        URL_HASH SHA256=${stduuid_URL_HASH}
+        SOURCE_DIR ${stduuid_SOURCE_DIR}
+)
+FetchContent_MakeAvailable(stduuid)
diff --git a/cpp/deeplog/CMakeLists.txt b/cpp/deeplog/CMakeLists.txt
new file mode 100644
index 0000000000..6a2e8cca5f
--- /dev/null
+++ b/cpp/deeplog/CMakeLists.txt
@@ -0,0 +1,18 @@
+project(deeplog)
+
+include(FetchContent)
+
+find_package(Arrow REQUIRED)
+find_package(Parquet REQUIRED)
+find_package(ArrowDataset REQUIRED)
+
+file(GLOB_RECURSE SOURCES "*.cpp")
+
+add_library(deeplog ${SOURCES})
+
+target_link_libraries(deeplog PUBLIC storage)
+target_link_libraries(deeplog PUBLIC stduuid nlohmann_json::nlohmann_json)
+target_link_libraries(deeplog PUBLIC "$<IF:$<BOOL:${ARROW_BUILD_STATIC}>,Arrow::arrow_static,Arrow::arrow_shared>")
+target_link_libraries(deeplog PUBLIC "$<IF:$<BOOL:${ARROW_BUILD_STATIC}>,Parquet::parquet_static,Parquet::parquet_shared>")
+target_link_libraries(deeplog PUBLIC "$<IF:$<BOOL:${ARROW_BUILD_STATIC}>,ArrowDataset::arrow_dataset_static,ArrowDataset::arrow_dataset_shared>")
+target_link_libraries(deeplog PUBLIC spdlog::spdlog)
diff --git a/cpp/deeplog/actions/action.cpp b/cpp/deeplog/actions/action.cpp
new file mode 100644
index 0000000000..8255ec06c9
--- /dev/null
+++ b/cpp/deeplog/actions/action.cpp
@@ -0,0 +1,6 @@
+#include "action.hpp"
+
+namespace deeplog {
+
+
+};
diff --git a/cpp/deeplog/actions/action.hpp b/cpp/deeplog/actions/action.hpp
new file mode 100644
index 0000000000..9f68767a7c
--- /dev/null
+++ b/cpp/deeplog/actions/action.hpp
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <string>
+#include "deeplog_serializable.hpp"
+
+namespace deeplog {
+
+    class action : public deeplog_serializable {
+
+    public:
+        virtual nlohmann::json to_json() = 0;
+
+        virtual std::string action_name() = 0;
+
+        virtual std::shared_ptr<arrow::StructType> action_type() = 0;
+    };
+}
diff --git a/cpp/deeplog/actions/add_file_action.cpp b/cpp/deeplog/actions/add_file_action.cpp
new file mode 100644
index 0000000000..e4cc485eb4
--- /dev/null
+++ b/cpp/deeplog/actions/add_file_action.cpp
@@ -0,0 +1,46 @@
+#include "add_file_action.hpp"
+
+namespace deeplog {
+
+    std::shared_ptr<arrow::StructType> add_file_action::arrow_type = std::dynamic_pointer_cast<arrow::StructType>(
+            arrow::struct_({
+                arrow::field("path", arrow::utf8(), true),
+                arrow::field("type", arrow::utf8(), true),
+                arrow::field("size", arrow::uint64(), true),
+                arrow::field("modificationTime", arrow::uint64(), true),
+                arrow::field("dataChange", arrow::boolean(), true),
+                arrow::field("numSamples", arrow::uint64(), true),
+            }));
+
+    add_file_action::add_file_action(std::string path, std::string type, const long &size, const long &modification_time, const bool &data_change, const long &num_samples) :
+            path(std::move(path)), type(std::move(type)), size(size), modification_time(modification_time), data_change(data_change), num_samples(num_samples) {}
+
+    add_file_action::add_file_action(const std::shared_ptr<arrow::StructScalar> &value) {
+        path = from_struct<std::string>("path", value).value();
+        type = from_struct<std::string>("type", value).value();
+        size = from_struct<unsigned long>("size", value).value();
+        modification_time = from_struct<long>("modificationTime", value).value();
+        data_change = from_struct<bool>("dataChange", value).value();
+        num_samples = from_struct<unsigned long>("numSamples", value).value();
+    }
+
+    std::string add_file_action::action_name() {
+        return "add";
+    }
+
+    std::shared_ptr<arrow::StructType> add_file_action::action_type() {
+        return arrow_type;
+    }
+
+    nlohmann::json add_file_action::to_json() {
+        nlohmann::json json;
+        json["path"] = path;
+        json["type"] = type;
+        json["size"] = size;
+        json["modificationTime"] = modification_time;
+        json["dataChange"] = data_change;
+        json["numSamples"] = num_samples;
+
+        return json;
+    }
+}
\ No newline at end of file
diff --git a/cpp/deeplog/actions/add_file_action.hpp b/cpp/deeplog/actions/add_file_action.hpp
new file mode 100644
index 0000000000..0d1f439f2a
--- /dev/null
+++ b/cpp/deeplog/actions/add_file_action.hpp
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "action.hpp"
+
+namespace deeplog {
+    class add_file_action : public action {
+
+    public:
+        std::string path;
+        std::string type;
+        unsigned long size;
+        long modification_time;
+        bool data_change;
+        unsigned long num_samples;
+
+    public:
+        static std::shared_ptr<arrow::StructType> arrow_type;
+
+        add_file_action(std::string path, std::string type, const long &size, const long &modification_time, const bool &data_change, const long &num_samples);
+
+        explicit add_file_action(const std::shared_ptr<arrow::StructScalar> &struct_scalar);
+
+        nlohmann::json to_json() override;
+
+        std::string action_name() override;
+
+        std::shared_ptr<arrow::StructType> action_type() override;
+    };
+}
\ No newline at end of file
diff --git a/cpp/deeplog/actions/create_branch_action.cpp b/cpp/deeplog/actions/create_branch_action.cpp
new file mode 100644
index 0000000000..a737a097db
--- /dev/null
+++ b/cpp/deeplog/actions/create_branch_action.cpp
@@ -0,0 +1,42 @@
+#include "create_branch_action.hpp"
+
+#include <utility>
+
+namespace deeplog {
+    std::shared_ptr<arrow::StructType> create_branch_action::arrow_type = std::dynamic_pointer_cast<arrow::StructType>(
+            arrow::struct_({
+                arrow::field("id", arrow::utf8(), true),
+                arrow::field("name", arrow::utf8(), true),
+                arrow::field("fromId", arrow::utf8(), true),
+                arrow::field("fromVersion", arrow::uint64(), true),
+            }));
+
+
+    create_branch_action::create_branch_action(std::string id, std::string name, std::optional<std::string> from_id, const std::optional<unsigned long> &from_version) :
+            id(std::move(id)), name(std::move(name)), from_id(std::move(from_id)), from_version(from_version) {}
+
+    create_branch_action::create_branch_action(const std::shared_ptr<arrow::StructScalar> &value) {
+        id = from_struct<std::string>("id", value).value();
+        name = from_struct<std::string>("name", value).value();
+        from_id = from_struct<std::string>("fromId", value);
+        from_version = from_struct<unsigned long>("fromVersion", value);
+    }
+
+    std::string create_branch_action::action_name() {
+        return "branch";
+    }
+
+    std::shared_ptr<arrow::StructType> create_branch_action::action_type() {
+        return arrow_type;
+    }
+
+    nlohmann::json create_branch_action::to_json() {
+        nlohmann::json json;
+        json["id"] = id;
+        json["name"] = name;
+        json["fromId"] = to_json_value(from_id);
+        json["fromVersion"] = to_json_value(from_version);
+
+        return json;
+    }
+}
\ No newline at end of file
diff --git a/cpp/deeplog/actions/create_branch_action.hpp b/cpp/deeplog/actions/create_branch_action.hpp
new file mode 100644
index 0000000000..a1f93068d4
--- /dev/null
+++ b/cpp/deeplog/actions/create_branch_action.hpp
@@ -0,0 +1,27 @@
+#pragma once
+
+#include "action.hpp"
+
+namespace deeplog {
+    class create_branch_action : public action {
+
+    public:
+        std::string id;
+        std::string name;
+        std::optional<std::string> from_id;
+        std::optional<unsigned long> from_version;
+
+    public:
+        static std::shared_ptr<arrow::StructType> arrow_type;
+
+        create_branch_action(std::string id, std::string name, std::optional<std::string> from_id, const std::optional<unsigned long> &from_version);
+
+        explicit create_branch_action(const std::shared_ptr<arrow::StructScalar> &struct_scalar);
+
+        nlohmann::json to_json() override;
+
+        std::string action_name() override;
+
+        std::shared_ptr<arrow::StructType> action_type() override;
+    };
+}
diff --git a/cpp/deeplog/actions/create_commit_action.cpp b/cpp/deeplog/actions/create_commit_action.cpp
new file mode 100644
index 0000000000..fd1f892900
--- /dev/null
+++ b/cpp/deeplog/actions/create_commit_action.cpp
@@ -0,0 +1,44 @@
+#include "create_commit_action.hpp"
+
+namespace deeplog {
+
+    std::shared_ptr<arrow::StructType> create_commit_action::arrow_type = std::dynamic_pointer_cast<arrow::StructType>(
+            arrow::struct_({
+                arrow::field("id", arrow::utf8(), true),
+                arrow::field("branchId", arrow::utf8(), true),
+                arrow::field("branchVersion", arrow::uint64(), true),
+                arrow::field("message", arrow::utf8(), true),
+                arrow::field("commitTime", arrow::uint64(), true),
+            }));
+
+    create_commit_action::create_commit_action(std::string id, std::string branch_id, const unsigned long &branch_version, const std::optional<std::string> &message, const long &commit_time) :
+            id(std::move(id)), branch_id(std::move(branch_id)), branch_version(branch_version), message(std::move(message)), commit_time(commit_time) {}
+
+    create_commit_action::create_commit_action(const std::shared_ptr<arrow::StructScalar> &value) {
+        id = from_struct<std::string>("id", value).value();
+        branch_id = from_struct<std::string>("branchId", value).value();
+        branch_version = from_struct<unsigned long>("branchVersion", value).value();
+        message = from_struct<std::string>("message", value);
+        commit_time = from_struct<long>("commitTime", value).value();
+    }
+
+    std::string create_commit_action::action_name() {
+        return "commit";
+    }
+
+    std::shared_ptr<arrow::StructType> create_commit_action::action_type() {
+        return arrow_type;
+    }
+
+    nlohmann::json create_commit_action::to_json() {
+        nlohmann::json json;
+
+        json["id"] = id;
+        json["branchId"] = branch_id;
+        json["branchVersion"] = branch_version;
+        json["message"] = to_json_value(message);
+        json["commitTime"] = commit_time;
+
+        return json;
+    }
+}
\ No newline at end of file
diff --git a/cpp/deeplog/actions/create_commit_action.hpp b/cpp/deeplog/actions/create_commit_action.hpp
new file mode 100644
index 0000000000..9d8414d32e
--- /dev/null
+++ b/cpp/deeplog/actions/create_commit_action.hpp
@@ -0,0 +1,28 @@
+#pragma once
+
+#include "action.hpp"
+
+namespace deeplog {
+    class create_commit_action : public action {
+
+    public:
+        std::string id;
+        std::string branch_id;
+        unsigned long branch_version;
+        std::optional<std::string> message;
+        long commit_time;
+
+    public:
+        static std::shared_ptr<arrow::StructType> arrow_type;
+
+        create_commit_action(std::string id, std::string branch_id, const unsigned long &branch_version, const std::optional<std::string> &message, const long &commit_time);
+
+        explicit create_commit_action(const std::shared_ptr<arrow::StructScalar> &struct_scalar);
+
+        nlohmann::json to_json() override;
+
+        std::string action_name() override;
+
+        std::shared_ptr<arrow::StructType> action_type() override;
+    };
+}
diff --git a/cpp/deeplog/actions/create_tensor_action.cpp b/cpp/deeplog/actions/create_tensor_action.cpp
new file mode 100644
index 0000000000..1e879e4fe5
--- /dev/null
+++ b/cpp/deeplog/actions/create_tensor_action.cpp
@@ -0,0 +1,115 @@
+#include "create_tensor_action.hpp"
+
+namespace deeplog {
+
+    std::shared_ptr<arrow::StructType> create_tensor_action::arrow_type = std::dynamic_pointer_cast<arrow::StructType>(
+            arrow::struct_({
+                arrow::field("id", arrow::utf8(), true),
+                arrow::field("name", arrow::utf8(), true),
+                arrow::field("dtype", arrow::utf8(), true),
+                arrow::field("htype", arrow::utf8(), true),
+                arrow::field("length", arrow::uint64(), true),
+                arrow::field("is_link", arrow::boolean(), true),
+                arrow::field("is_sequence", arrow::boolean(), true),
+                arrow::field("hidden", arrow::boolean(), true),
+                arrow::field("chunkCompression", arrow::utf8(), true),
+                arrow::field("sampleCompression", arrow::utf8(), true),
+                arrow::field("links", arrow::map(arrow::utf8(), std::dynamic_pointer_cast<arrow::DataType>(
+                        arrow::struct_({
+                            arrow::field("extend", arrow::utf8(), true),
+                            arrow::field("flatten_sequence", arrow::boolean(), true),
+                            arrow::field("update", arrow::utf8(), true),
+
+                        }))), true),
+                arrow::field("maxChunkSize", arrow::uint64(), true),
+                arrow::field("minShape", arrow::list(arrow::uint64()), true),
+                arrow::field("maxShape", arrow::list(arrow::uint64()), true),
+                arrow::field("tilingThreshold", arrow::uint64(), true),
+                arrow::field("typestr", arrow::utf8(), true),
+                arrow::field("verify", arrow::boolean(), true),
+                arrow::field("version", arrow::utf8(), true),
+            }));
+
+    create_tensor_action::create_tensor_action(std::string id,
+                                               std::string name,
+                                               std::optional<std::string> dtype,
+                                               std::string htype,
+                                               const long &length,
+                                               const bool &is_link,
+                                               const bool &is_sequence,
+                                               const bool &hidden,
+                                               const std::optional<std::string> &chunk_compression,
+                                               const std::optional<std::string> &sample_compression,
+                                               const std::map<std::string, tensor_link> &links,
+                                               const std::optional<unsigned long> &max_chunk_size,
+                                               const std::vector<unsigned long> &min_shape,
+                                               const std::vector<unsigned long> &max_shape,
+                                               const std::optional<unsigned long> &tiling_threshold,
+                                               const std::optional<std::string> &typestr,
+                                               const bool &verify,
+                                               std::string version)
+            : id(std::move(id)), name(std::move(name)),
+              dtype(std::move(dtype)), htype(std::move(htype)),
+              length(length), is_link(is_link), is_sequence(is_sequence), hidden(hidden),
+              chunk_compression(chunk_compression), sample_compression(sample_compression),
+              links(links),
+              max_chunk_size(max_chunk_size),
+              min_shape(min_shape),
+              max_shape(max_shape),
+              tiling_threshold(tiling_threshold), typestr(typestr),
+              verify(verify), version(std::move(version)) {}
+
+    create_tensor_action::create_tensor_action(const std::shared_ptr<arrow::StructScalar> &value) {
+        id = from_struct<std::string>("id", value).value();
+        name = from_struct<std::string>("name", value).value();
+        dtype = from_struct<std::string>("dtype", value);
+        htype = from_struct<std::string>("htype", value).value();
+        length = from_struct<long>("length", value).value();
+        is_link = from_struct<bool>("is_link", value).value();
+        is_sequence = from_struct<bool>("is_sequence", value).value();
+        hidden = from_struct<bool>("hidden", value).value();
+        chunk_compression = from_struct<std::string>("chunkCompression", value);
+        sample_compression = from_struct<std::string>("sampleCompression", value);
+        links = from_struct<std::map<std::string, tensor_link>>("links", value).value();
+        max_chunk_size = from_struct<unsigned long>("maxChunkSize", value);
+        min_shape = from_arraystruct<unsigned long>("minShape", value);
+        max_shape = from_arraystruct<unsigned long>("maxShape", value);
+        tiling_threshold = from_struct<unsigned long>("tilingThreshold", value);
+        typestr = from_struct<std::string>("typestr", value);
+        verify = from_struct<bool>("verify", value).value();
+        version = from_struct<std::string>("version", value).value();
+    }
+
+    std::string create_tensor_action::action_name() {
+        return "tensor";
+    }
+
+    std::shared_ptr<arrow::StructType> create_tensor_action::action_type() {
+        return arrow_type;
+    }
+
+    nlohmann::json create_tensor_action::to_json() {
+        nlohmann::json json;
+
+        json["id"] = id;
+        json["name"] = name;
+        json["dtype"] = to_json_value(dtype);
+        json["htype"] = htype;
+        json["length"] = length;
+        json["is_link"] = is_link;
+        json["is_sequence"] = is_sequence;
+        json["hidden"] = hidden;
+        json["chunkCompression"] = to_json_value(chunk_compression);
+        json["sampleCompression"] = to_json_value(sample_compression);
+        json["links"] = to_json_value<std::map<std::string, tensor_link>>(links);
+        json["maxChunkSize"] = to_json_value(max_chunk_size);
+        json["minShape"] = min_shape;
+        json["maxShape"] = max_shape;
+        json["tilingThreshold"] = to_json_value(tiling_threshold);
+        json["typestr"] = to_json_value(typestr);
+        json["verify"] = verify;
+        json["version"] = version;
+
+        return json;
+    }
+}
diff --git a/cpp/deeplog/actions/create_tensor_action.hpp b/cpp/deeplog/actions/create_tensor_action.hpp
new file mode 100644
index 0000000000..0783f54b2c
--- /dev/null
+++ b/cpp/deeplog/actions/create_tensor_action.hpp
@@ -0,0 +1,60 @@
+#pragma once
+
+#include "action.hpp"
+#include "tensor_link.hpp"
+
+namespace deeplog {
+    class create_tensor_action : public action {
+
+    public:
+        std::string id;
+        std::string name;
+        std::optional<std::string> dtype;
+        std::string htype;
+        long length;
+        bool is_link;
+        bool is_sequence;
+        bool hidden;
+        std::optional<std::string> chunk_compression;
+        std::optional<std::string> sample_compression;
+        std::map<std::string, tensor_link> links;
+        std::optional<unsigned long> max_chunk_size;
+        std::vector<unsigned long> min_shape;
+        std::vector<unsigned long> max_shape;
+        std::optional<unsigned long> tiling_threshold;
+        std::optional<std::string> typestr;
+        bool verify;
+        std::string version;
+
+    public:
+        static std::shared_ptr<arrow::StructType> arrow_type;
+
+        create_tensor_action(std::string id,
+                             std::string name,
+                             std::optional<std::string> dtype,
+                             std::string htype,
+                             const long &length,
+                             const bool &is_link,
+                             const bool &is_sequence,
+                             const bool &hidden,
+                             const std::optional<std::string> &chunk_compression,
+                             const std::optional<std::string> &sample_compression,
+                             const std::map<std::string, tensor_link> &links,
+                             const std::optional<unsigned long> &max_chunk_size,
+                             const std::vector<unsigned long> &min_shape,
+                             const std::vector<unsigned long> &max_shape,
+                             const std::optional<unsigned long> &tiling_threshold,
+                             const std::optional<std::string> &typestr,
+                             const bool &verify,
+                             std::string version
+        );
+
+        explicit create_tensor_action(const std::shared_ptr<arrow::StructScalar> &struct_scalar);
+
+        nlohmann::json to_json() override;
+
+        std::string action_name() override;
+
+        std::shared_ptr<arrow::StructType> action_type() override;
+    };
+}
diff --git a/cpp/deeplog/actions/deeplog_serializable.cpp b/cpp/deeplog/actions/deeplog_serializable.cpp
new file mode 100644
index 0000000000..e875f872ab
--- /dev/null
+++ b/cpp/deeplog/actions/deeplog_serializable.cpp
@@ -0,0 +1,101 @@
+#include "deeplog_serializable.hpp"
+#include "tensor_link.hpp"
+#include <typeinfo>
+
+namespace deeplog {
+
+    template <typename T>
+    nlohmann::json deeplog_serializable::to_json_value(const std::optional<T> &value) const {
+        if (!value.has_value()) {
+            return nlohmann::json::value_t::null;
+        }
+
+        if constexpr (std::is_same<T, std::map<std::string, tensor_link>>::value) {
+            auto return_map = nlohmann::json::object();
+            for (auto &item : value.value()) {
+                return_map[item.first] = item.second.to_json();
+            }
+            return return_map;
+        } else {
+            return value.value();
+        }
+    }
+
+    template nlohmann::json deeplog_serializable::to_json_value(const std::optional<std::string> &value) const;
+
+    template nlohmann::json deeplog_serializable::to_json_value(const std::optional<long> &value) const;
+
+    template nlohmann::json deeplog_serializable::to_json_value(const std::optional<unsigned long> &value) const;
+
+    template nlohmann::json deeplog_serializable::to_json_value(const std::optional<bool> &value) const;
+
+    //need to figure out how to not need to specify every map template option. Same below
+    template nlohmann::json deeplog_serializable::to_json_value(const std::optional<std::map<std::string, tensor_link>> &value) const;
+
+    template <typename T>
+    std::optional<T> deeplog_serializable::from_struct(const std::string &field_name, const std::shared_ptr<arrow::StructScalar> &struct_scalar) {
+        auto scalar = struct_scalar->field(field_name).ValueOrDie();
+        if (!scalar->is_valid) {
+            return std::nullopt;
+        }
+
+        if constexpr (std::is_same<T, std::string>::value) {
+            return std::reinterpret_pointer_cast<arrow::StringScalar>(scalar)->value->ToString();
+        } else if constexpr (std::is_same<T, long>::value) {
+            return std::reinterpret_pointer_cast<arrow::Int64Scalar>(scalar)->value;
+        } else if constexpr (std::is_same<T, unsigned long>::value) {
+            return std::reinterpret_pointer_cast<arrow::UInt64Scalar>(scalar)->value;
+        } else if constexpr (std::is_same<T, int>::value) {
+            return std::reinterpret_pointer_cast<arrow::Int32Scalar>(scalar)->value;
+        } else if constexpr (std::is_same<T, bool>::value) {
+            return std::reinterpret_pointer_cast<arrow::BooleanScalar>(scalar)->value;
+        } else if constexpr (std::is_same<T, std::map<std::string, tensor_link>>::value) {
+            auto map_data = std::reinterpret_pointer_cast<arrow::MapScalar>(scalar)->value;
+            T return_map = {};
+
+            for (auto i = 0; i < map_data->length(); ++i) {
+                auto raw_scalar = map_data->GetScalar(i).ValueOrDie();
+                auto map_struct = std::dynamic_pointer_cast<arrow::StructScalar>(raw_scalar);
+                std::string key = std::dynamic_pointer_cast<arrow::StringScalar>(map_struct->field("key").ValueOrDie())->view().data();
+                tensor_link value = tensor_link(std::dynamic_pointer_cast<arrow::StructScalar>(map_struct->field("value").ValueOrDie()));
+                return_map.insert({key, value});
+            }
+
+            return return_map;
+        } else {
+            throw std::runtime_error("Unsupported struct type: " + std::string(typeid(T).name()));
+        }
+    }
+
+    template <typename T>
+    std::vector<T> deeplog_serializable::from_arraystruct(const std::string &field_name, const std::shared_ptr<arrow::StructScalar> &struct_scalar) {
+        auto list_scalar = std::reinterpret_pointer_cast<arrow::ListScalar>(struct_scalar->field(field_name).ValueOrDie());
+        if (!list_scalar->is_valid) {
+            return {};
+        }
+
+        auto array = std::reinterpret_pointer_cast<arrow::UInt64Array>(list_scalar->value);
+
+        std::vector<T> return_vector = {};
+        return_vector.reserve(array->length());
+        for (auto i = 0; i < array->length(); ++i) {
+            return_vector.push_back(array->Value(i));
+        }
+
+        return return_vector;
+    }
+
+    template std::optional<std::string> deeplog_serializable::from_struct<std::string>(const std::string &field_name, const std::shared_ptr<arrow::StructScalar> &struct_scalar);
+
+    template std::optional<long> deeplog_serializable::from_struct<long>(const std::string &field_name, const std::shared_ptr<arrow::StructScalar> &struct_scalar);
+
+    template std::optional<unsigned long> deeplog_serializable::from_struct<unsigned long>(const std::string &field_name, const std::shared_ptr<arrow::StructScalar> &struct_scalar);
+
+    template std::optional<int> deeplog_serializable::from_struct<int>(const std::string &field_name, const std::shared_ptr<arrow::StructScalar> &struct_scalar);
+
+    template std::optional<bool> deeplog_serializable::from_struct<bool>(const std::string &field_name, const std::shared_ptr<arrow::StructScalar> &struct_scalar);
+
+    template std::optional<std::map<std::string, tensor_link>> deeplog_serializable::from_struct<std::map<std::string, tensor_link>>(const std::string &field_name, const std::shared_ptr<arrow::StructScalar> &struct_scalar);
+
+    template std::vector<unsigned long> deeplog_serializable::from_arraystruct<unsigned long>(const std::string &field_name, const std::shared_ptr<arrow::StructScalar> &struct_scalar);
+};
\ No newline at end of file
diff --git a/cpp/deeplog/actions/deeplog_serializable.hpp b/cpp/deeplog/actions/deeplog_serializable.hpp
new file mode 100644
index 0000000000..568a13a596
--- /dev/null
+++ b/cpp/deeplog/actions/deeplog_serializable.hpp
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <arrow/api.h>
+#include <nlohmann/json.hpp>
+
+namespace deeplog {
+    class deeplog_serializable {
+
+    public:
+
+        template <typename T>
+        nlohmann::json to_json_value(const std::optional<T> &value) const;
+
+        template <typename T>
+        std::optional<T> from_struct(const std::string &field_name, const std::shared_ptr<arrow::StructScalar> &struct_scalar);
+
+        template <typename T>
+        std::vector<T> from_arraystruct(const std::string &field_name, const std::shared_ptr<arrow::StructScalar> &struct_scalar);
+
+    };
+}
diff --git a/cpp/deeplog/actions/metadata_action.cpp b/cpp/deeplog/actions/metadata_action.cpp
new file mode 100644
index 0000000000..0fc0edb4a4
--- /dev/null
+++ b/cpp/deeplog/actions/metadata_action.cpp
@@ -0,0 +1,54 @@
+#include "metadata_action.hpp"
+#include <nlohmann/json.hpp>
+#include <optional>
+#include <utility>
+
+namespace deeplog {
+
+    std::shared_ptr<arrow::StructType> metadata_action::arrow_type = std::dynamic_pointer_cast<arrow::StructType>(
+            arrow::struct_({
+                arrow::field("id", arrow::utf8(), true),
+                arrow::field("name", arrow::utf8(), true),
+                arrow::field("description", arrow::utf8(), true),
+                arrow::field("createdTime", arrow::int64(), true),
+            }));
+
+    deeplog::metadata_action::metadata_action(std::string id, const std::optional<std::string> &name, const std::optional<std::string> &description,
+                                              const long &created_time) :
+            id(std::move(id)), name(std::move(name)), description(std::move(description)), created_time(created_time) {}
+
+    metadata_action::metadata_action(const std::shared_ptr<arrow::StructScalar> &value) {
+        id = from_struct<std::string>("id", value).value();
+        name = from_struct<std::string>("name", value);
+        description = from_struct<std::string>("description", value);
+        created_time = from_struct<long>("createdTime", value).value();
+    }
+
+    std::string metadata_action::action_name() {
+        return "metadata";
+    }
+
+    std::shared_ptr<arrow::StructType> metadata_action::action_type() {
+        return arrow_type;
+    }
+
+    nlohmann::json deeplog::metadata_action::to_json() {
+        nlohmann::json json;
+
+        json["id"] = id;
+        json["name"] = to_json_value(name);
+        json["description"] = to_json_value(description);
+        json["createdTime"] = created_time;
+
+        return json;
+
+    }
+
+    bool metadata_action::replaces(std::shared_ptr<action> action) {
+        return action->action_name() == action_name();
+    }
+
+    std::shared_ptr<action> metadata_action::replace(std::shared_ptr<action> action) {
+        return shared_from_this();
+    }
+}
\ No newline at end of file
diff --git a/cpp/deeplog/actions/metadata_action.hpp b/cpp/deeplog/actions/metadata_action.hpp
new file mode 100644
index 0000000000..4fb7dad768
--- /dev/null
+++ b/cpp/deeplog/actions/metadata_action.hpp
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "action.hpp"
+#include "replace_action.hpp"
+#include <string>
+#include <optional>
+#include <memory>
+
+namespace deeplog {
+
+    class metadata_action : public action, public replace_action, public std::enable_shared_from_this<metadata_action> {
+    public:
+        std::string id;
+        std::optional<std::string> name;
+        std::optional<std::string> description;
+        long created_time;
+
+    public:
+        static std::shared_ptr<arrow::StructType> arrow_type;
+
+        metadata_action(std::string id, const std::optional<std::string> &name, const std::optional<std::string> &description,
+                        const long &created_time);
+
+        explicit metadata_action(const std::shared_ptr<arrow::StructScalar> &struct_scalar);
+
+        nlohmann::json to_json() override;
+
+        std::string action_name() override;
+
+        std::shared_ptr<arrow::StructType> action_type() override;
+
+        bool replaces(std::shared_ptr<action> action) override;
+
+        std::shared_ptr<action> replace(std::shared_ptr<action> action) override;
+
+    };
+}
diff --git a/cpp/deeplog/actions/protocol_action.cpp b/cpp/deeplog/actions/protocol_action.cpp
new file mode 100644
index 0000000000..634b24c3f9
--- /dev/null
+++ b/cpp/deeplog/actions/protocol_action.cpp
@@ -0,0 +1,45 @@
+#include <nlohmann/json.hpp>
+#include <arrow/api.h>
+#include "protocol_action.hpp"
+
+namespace deeplog {
+
+    std::shared_ptr<arrow::StructType> protocol_action::arrow_type = std::dynamic_pointer_cast<arrow::StructType>(
+            arrow::struct_({
+                arrow::field("minReaderVersion", arrow::int32(), true),
+                arrow::field("minWriterVersion", arrow::int32(), true),
+            }));
+
+    deeplog::protocol_action::protocol_action(const int &min_reader_version, const int &min_writer_version)
+            : min_reader_version(min_reader_version), min_writer_version(min_writer_version) {}
+
+    protocol_action::protocol_action(const std::shared_ptr<arrow::StructScalar> &value) {
+        min_reader_version = from_struct<int>("minReaderVersion", value).value();
+        min_writer_version = from_struct<int>("minWriterVersion", value).value();
+    }
+
+    std::string protocol_action::action_name() {
+        return "protocol";
+    }
+
+    std::shared_ptr<arrow::StructType> protocol_action::action_type() {
+        return arrow_type;
+    }
+
+    nlohmann::json deeplog::protocol_action::to_json() {
+        nlohmann::json json;
+
+        json["minReaderVersion"] = min_reader_version;
+        json["minWriterVersion"] = min_writer_version;
+
+        return json;
+    }
+
+    bool protocol_action::replaces(std::shared_ptr<action> action) {
+        return action->action_name() == action_name();
+    }
+
+    std::shared_ptr<action> protocol_action::replace(std::shared_ptr<action> action) {
+        return shared_from_this();
+    }
+}
\ No newline at end of file
diff --git a/cpp/deeplog/actions/protocol_action.hpp b/cpp/deeplog/actions/protocol_action.hpp
new file mode 100644
index 0000000000..7332887588
--- /dev/null
+++ b/cpp/deeplog/actions/protocol_action.hpp
@@ -0,0 +1,32 @@
+#pragma once
+
+#include "action.hpp"
+#include "replace_action.hpp"
+#include <memory>
+
+namespace deeplog {
+
+    class protocol_action : public action, public replace_action, public std::enable_shared_from_this<protocol_action> {
+    public:
+        int min_reader_version;
+        int min_writer_version;
+
+    public:
+        static std::shared_ptr<arrow::StructType> arrow_type;
+
+        protocol_action(const int &min_reader_version, const int &min_writer_version);
+
+        explicit protocol_action(const std::shared_ptr<arrow::StructScalar> &struct_scalar);
+
+        nlohmann::json to_json() override;
+
+        std::string action_name() override;
+
+        std::shared_ptr<arrow::StructType> action_type() override;
+
+        bool replaces(std::shared_ptr<action> action) override;
+
+        std::shared_ptr<action> replace(std::shared_ptr<action> action) override;
+    };
+
+}
diff --git a/cpp/deeplog/actions/remove_file_action.cpp b/cpp/deeplog/actions/remove_file_action.cpp
new file mode 100644
index 0000000000..bd1a73505f
--- /dev/null
+++ b/cpp/deeplog/actions/remove_file_action.cpp
@@ -0,0 +1,40 @@
+#include "remove_file_action.hpp"
+
+namespace deeplog {
+
+    std::shared_ptr<arrow::StructType> remove_file_action::arrow_type = std::dynamic_pointer_cast<arrow::StructType>(
+            arrow::struct_({
+                arrow::field("path", arrow::utf8(), false),
+                arrow::field("deletionTime", arrow::int64(), false),
+                arrow::field("dataChange", arrow::boolean(), false),
+                arrow::field("size", arrow::uint64(), false),
+            }));
+
+    remove_file_action::remove_file_action(std::string path, const long &size, const long &deletion_timestamp, const bool &data_change) :
+            path(std::move(path)), size(size), deletion_time(deletion_timestamp), data_change(data_change) {};
+
+    remove_file_action::remove_file_action(const std::shared_ptr<arrow::StructScalar> &value) {
+        path = from_struct<std::string>("path", value).value();
+        deletion_time = from_struct<long>("deletionTime", value).value();
+        data_change = from_struct<bool>("dataChange", value).value();
+        size = from_struct<unsigned long>("size", value).value();
+    }
+    std::string remove_file_action::action_name() {
+        return "remove";
+    }
+
+    std::shared_ptr<arrow::StructType> remove_file_action::action_type() {
+        return arrow_type;
+    }
+
+    nlohmann::json remove_file_action::to_json() {
+        nlohmann::json json;
+
+        json["path"] = path;
+        json["deletionTime"] = deletion_time;
+        json["dataChange"] = data_change;
+        json["size"] = size;
+
+        return json;
+    }
+}
diff --git a/cpp/deeplog/actions/remove_file_action.hpp b/cpp/deeplog/actions/remove_file_action.hpp
new file mode 100644
index 0000000000..2063ecfbd4
--- /dev/null
+++ b/cpp/deeplog/actions/remove_file_action.hpp
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <string>
+#include <memory>
+#include "action.hpp"
+
+namespace deeplog {
+    class remove_file_action : public action {
+    public:
+        std::string path;
+        long deletion_time;
+        bool data_change;
+        unsigned long size;
+
+    public:
+        static std::shared_ptr<arrow::StructType> arrow_type;
+
+        remove_file_action(std::string path, const long &size, const long &deletion_timestamp, const bool &data_change);
+
+        explicit remove_file_action(const std::shared_ptr<arrow::StructScalar> &struct_scalar);
+
+        nlohmann::json to_json() override;
+
+        std::string action_name() override;
+
+        std::shared_ptr<arrow::StructType> action_type() override;
+    };
+}
diff --git a/cpp/deeplog/actions/replace_action.cpp b/cpp/deeplog/actions/replace_action.cpp
new file mode 100644
index 0000000000..3fc455232b
--- /dev/null
+++ b/cpp/deeplog/actions/replace_action.cpp
@@ -0,0 +1 @@
+#include "replace_action.hpp"
diff --git a/cpp/deeplog/actions/replace_action.hpp b/cpp/deeplog/actions/replace_action.hpp
new file mode 100644
index 0000000000..04e24265e8
--- /dev/null
+++ b/cpp/deeplog/actions/replace_action.hpp
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "action.hpp"
+
+namespace deeplog {
+
+
+    class replace_action {
+    public:
+        virtual bool replaces(std::shared_ptr<::deeplog::action> action) = 0;
+
+        virtual std::shared_ptr<action> replace(std::shared_ptr<::deeplog::action> action) = 0;
+    };
+}
diff --git a/cpp/deeplog/actions/tensor_link.cpp b/cpp/deeplog/actions/tensor_link.cpp
new file mode 100644
index 0000000000..49de8d9443
--- /dev/null
+++ b/cpp/deeplog/actions/tensor_link.cpp
@@ -0,0 +1,28 @@
+#include "tensor_link.hpp"
+
+namespace deeplog {
+    std::shared_ptr<arrow::StructType> tensor_link::arrow_type = std::dynamic_pointer_cast<arrow::StructType>(
+            arrow::struct_({
+                arrow::field("extend", arrow::utf8(), true),
+                arrow::field("flatten_sequence", arrow::boolean(), true),
+                arrow::field("update", arrow::utf8(), true),
+
+            }));
+
+    tensor_link::tensor_link(const std::optional<std::string> &extend, const std::optional<bool> &flatten_sequence, const std::optional<std::string> &update)
+            : extend(extend), flatten_sequence(flatten_sequence), update(update) {}
+
+    tensor_link::tensor_link(const std::shared_ptr<arrow::StructScalar> &value) {
+        extend = from_struct<std::string>("extend", value);
+        flatten_sequence = from_struct<bool>("flatten_sequence", value);
+        update = from_struct<std::string>("update", value);
+    }
+
+    nlohmann::json tensor_link::to_json() const {
+        nlohmann::json json;
+        json["extend"] = to_json_value(extend);
+        json["flatten_sequence"] = to_json_value(flatten_sequence);
+        json["update"] = to_json_value(update);
+        return json;
+    }
+}
\ No newline at end of file
diff --git a/cpp/deeplog/actions/tensor_link.hpp b/cpp/deeplog/actions/tensor_link.hpp
new file mode 100644
index 0000000000..3db3e06e8a
--- /dev/null
+++ b/cpp/deeplog/actions/tensor_link.hpp
@@ -0,0 +1,23 @@
+#pragma once
+
+#include <string>
+#include <optional>
+#include "deeplog_serializable.hpp"
+
+namespace deeplog {
+
+    class tensor_link : public deeplog_serializable {
+    public:
+        static std::shared_ptr<arrow::StructType> arrow_type;
+
+        std::optional<std::string> extend;
+        std::optional<bool> flatten_sequence;
+        std::optional<std::string> update;
+
+        tensor_link(const std::optional<std::string> &extend, const std::optional<bool> &flatten_sequence, const std::optional<std::string> &update);
+
+        explicit tensor_link(const std::shared_ptr<arrow::StructScalar> &struct_scalar);
+
+        nlohmann::json to_json() const;
+    };
+}
diff --git a/cpp/deeplog/base_snapshot.cpp b/cpp/deeplog/base_snapshot.cpp
new file mode 100644
index 0000000000..b5dcc1ed1b
--- /dev/null
+++ b/cpp/deeplog/base_snapshot.cpp
@@ -0,0 +1,53 @@
+#include "base_snapshot.hpp"
+
+namespace deeplog {
+
+    base_snapshot::base_snapshot(const std::string &branch_id, const std::optional<unsigned long> &version, const std::shared_ptr<::deeplog::deeplog> &deeplog) :
+            branch_id(branch_id),
+            version(0),
+            deeplog(deeplog),
+            actions_() {
+        std::tie(actions_, this->version) = deeplog->get_actions(branch_id, version);
+    }
+
+    template <typename T>
+    std::vector<std::shared_ptr<T>> base_snapshot::find_actions() const {
+        static_assert(std::is_base_of<action, T>::value, "T must be a subclass of action");
+
+        std::vector<std::shared_ptr<T>> return_actions = {};
+        for (auto found: *actions_) {
+            auto casted = std::dynamic_pointer_cast<T>(found);
+            if (casted != nullptr) {
+                return_actions.push_back(casted);
+            }
+        }
+
+        return return_actions;
+
+    }
+
+    template <typename T>
+    std::shared_ptr<T> base_snapshot::find_action() const {
+        static_assert(std::is_base_of<action, T>::value, "T must be a subclass of action");
+
+        auto actions = find_actions<T>();
+
+        if (actions.empty()) {
+            return nullptr;
+        }
+        return actions.at(0);
+    }
+
+
+    template std::vector<std::shared_ptr<add_file_action>> base_snapshot::find_actions<add_file_action>() const;
+
+    template std::vector<std::shared_ptr<create_branch_action>> base_snapshot::find_actions<create_branch_action>() const;
+
+    template std::vector<std::shared_ptr<create_commit_action>> base_snapshot::find_actions<create_commit_action>() const;
+
+    template std::vector<std::shared_ptr<create_tensor_action>> base_snapshot::find_actions<create_tensor_action>() const;
+
+    template std::shared_ptr<protocol_action> base_snapshot::find_action<protocol_action>() const;
+
+    template std::shared_ptr<metadata_action> base_snapshot::find_action<metadata_action>() const;
+}
\ No newline at end of file
diff --git a/cpp/deeplog/base_snapshot.hpp b/cpp/deeplog/base_snapshot.hpp
new file mode 100644
index 0000000000..98e0224765
--- /dev/null
+++ b/cpp/deeplog/base_snapshot.hpp
@@ -0,0 +1,26 @@
+#pragma once
+
+#include "deeplog.hpp"
+
+namespace deeplog {
+    class base_snapshot {
+
+    public:
+        unsigned long version;
+        const std::string branch_id;
+        const std::shared_ptr<deeplog> deeplog;
+
+        virtual std::shared_ptr<base_snapshot> update() const = 0;
+
+    protected:
+        std::shared_ptr<std::vector<std::shared_ptr<action>>> actions_;
+
+        base_snapshot(const std::string &branch_id, const std::optional<unsigned long> &version, const std::shared_ptr<::deeplog::deeplog> &deeplog);
+
+        template <typename T>
+        std::vector<std::shared_ptr<T>> find_actions() const;
+
+        template <typename T>
+        std::shared_ptr<T> find_action() const;
+    };
+}
\ No newline at end of file
diff --git a/cpp/deeplog/deeplog.cpp b/cpp/deeplog/deeplog.cpp
new file mode 100644
index 0000000000..53f726cc2c
--- /dev/null
+++ b/cpp/deeplog/deeplog.cpp
@@ -0,0 +1,411 @@
+#include "deeplog.hpp"
+#include <filesystem>
+#include <iomanip>
+#include <sstream>
+#include <algorithm>
+#include "actions/protocol_action.hpp"
+#include "actions/metadata_action.hpp"
+#include "actions/create_branch_action.hpp"
+#include "actions/create_tensor_action.hpp"
+#include <set>
+#include <climits>
+#include <arrow/api.h>
+#include <arrow/io/api.h>
+#include <arrow/ipc/json_simple.h>
+#include <parquet/arrow/reader.h>
+#include <parquet/arrow/writer.h>
+#include "last_checkpoint.hpp"
+#include "deeplog_v3.hpp"
+#include "../storage/local_storage.hpp"
+#include "util.hpp"
+#include "json_parser.hpp"
+#include <spdlog/spdlog.h>
+
+namespace deeplog {
+
+    const std::shared_ptr<arrow::Schema> deeplog::arrow_schema = std::make_shared<arrow::Schema>(arrow::FieldVector{
+            arrow::field("protocol", protocol_action::arrow_type),
+            arrow::field("metadata", metadata_action::arrow_type),
+            arrow::field("add", add_file_action::arrow_type),
+            arrow::field("branch", create_branch_action::arrow_type),
+            arrow::field("tensor", create_tensor_action::arrow_type),
+            arrow::field("version", arrow::uint64()),
+    });
+
+    deeplog::deeplog(const std::shared_ptr<storage::storage> &storage) : storage_(storage) {};
+
+    std::shared_ptr<deeplog> deeplog::create(const std::string &path, const int &log_version) {
+        return create(std::make_shared<storage::local_storage>(storage::local_storage(path)), log_version);
+    }
+
+    std::shared_ptr<deeplog> deeplog::create(const std::shared_ptr<::storage::storage> &storage, const int &log_version) {
+        if (log_version < 3) {
+            throw std::runtime_error("Log version " + std::to_string(log_version) + " is not supported");
+        }
+        if (log_version == 3) {
+            return std::make_shared<deeplog_v3>(deeplog_v3(storage));
+        }
+
+        if (storage->file("/_deeplake_log/" + META_BRANCH_ID + "/" + zero_pad(1) + ".json").exists()
+            || storage->file("/_deeplake_log/" + META_BRANCH_ID + "/_last_checkpoint.json").exists()) {
+            throw std::runtime_error("DeepLake config already exists");
+        }
+
+        auto log = std::make_shared<deeplog>(deeplog(storage));
+        std::vector<std::shared_ptr<action>> actions;
+
+        auto protocol = std::make_shared<protocol_action>(protocol_action(4, 4));
+        auto metadata = std::make_shared<metadata_action>(metadata_action(generate_id(), std::nullopt, std::nullopt, current_timestamp()));
+
+        auto branch = std::make_shared<create_branch_action>(create_branch_action(generate_id(), "main", std::nullopt, std::nullopt));
+
+        log->commit(META_BRANCH_ID, 0, {protocol, metadata, branch});
+
+        return log;
+
+    }
+
+    std::shared_ptr<deeplog> deeplog::open(const std::string &path) {
+        spdlog::debug("Opening log at path: {}", std::filesystem::absolute(path).string());
+        return open(std::make_shared<storage::local_storage>(storage::local_storage(path)));
+    }
+
+    std::shared_ptr<deeplog> deeplog::open(const std::shared_ptr<storage::storage> &storage) {
+        if (!(storage->file("/_deeplake_log/" + META_BRANCH_ID + "/" + zero_pad(1) + ".json").exists()
+              || storage->file("/_deeplake_log/" + META_BRANCH_ID + "/_last_checkpoint.json").exists())) {
+            if (storage->file("/dataset_meta.json").exists()) {
+                return std::make_shared<deeplog_v3>(deeplog_v3(storage));
+            }
+            throw std::runtime_error("Cannot determine log format");
+        }
+
+        return std::make_shared<deeplog>(deeplog(storage));
+    }
+
+    int deeplog::log_format() const {
+        return 4;
+    }
+
+    std::string zero_pad(const unsigned long &version) {
+        std::ostringstream ss;
+        ss << std::setw(20) << std::setfill('0') << (version);
+        return ss.str();
+    }
+
+    unsigned long deeplog::version(const std::string &branch_id) const {
+        return get<1>(get_actions(branch_id, std::nullopt));
+    }
+
+    bool deeplog::commit(const std::string &branch_id,
+                         const unsigned long &base_version,
+                         const std::vector<std::shared_ptr<action>> &actions) {
+
+        auto log_dir = "/_deeplake_log/" + branch_id + "/";
+
+        auto operationFilePath = log_dir + zero_pad(base_version + 1) + ".json";
+
+        if (storage_->file(operationFilePath).exists()) {
+            spdlog::debug("Version {} file already exists", operationFilePath);
+
+            return false;
+        }
+
+        spdlog::debug("Committing {} actions to {}", actions.size(), operationFilePath);
+
+        std::stringstream buffer;
+        for (auto action: actions) {
+            nlohmann::json json;
+            json[action->action_name()] = action->to_json();
+            buffer << json;
+        }
+
+        storage_->set_bytes(operationFilePath, buffer.str());
+        return true;
+    }
+
+    arrow::Result<std::shared_ptr<arrow::Table>> deeplog::action_data(const std::string &branch_id,
+                                                                      const unsigned long &from,
+                                                                      const std::optional<unsigned long> &to) const {
+        spdlog::debug("Reading action data for branch '{}' from {} to {}", branch_id, from, to.value_or(ULONG_MAX));
+        unsigned long highest_version = 0;
+        std::vector<std::shared_ptr<arrow::Table>> all_tables = {};
+
+        const auto dir_path = "/_deeplake_log/" + branch_id;
+
+        auto last_checkpoint_path = "/_deeplake_log/" + branch_id + "/_last_checkpoint.json";
+        auto last_checkpoint_ref = storage_->file(last_checkpoint_path);
+        if (last_checkpoint_ref.exists()) {
+            auto last_checkpoint_stream = storage_->get_bytes(last_checkpoint_path);
+            nlohmann::json last_checkpoint_json = nlohmann::json::parse(last_checkpoint_stream);
+            auto checkpoint = last_checkpoint(last_checkpoint_json);
+
+            const arrow::Result<std::shared_ptr<arrow::Table>> &result = read_checkpoint(dir_path, checkpoint.version);
+            if (!result.ok()) {
+                spdlog::error("Checkpoint read failed: {}", result.status().message());
+                return result.status();
+            }
+            all_tables.push_back(result.ValueOrDie());
+            highest_version = checkpoint.version;
+        }
+
+
+        std::optional<unsigned long> next_from = from;
+        std::set<::storage::file_ref> sorted_paths = {};
+
+        if (storage_->file(dir_path).exists()) {
+            for (const auto &file_ref: storage_->list_files(dir_path)) {
+                if (file_ref.path.ends_with(".json") && !file_ref.path.ends_with("_last_checkpoint.json")) {
+                    auto found_version = file_version(file_ref.path);
+                    if (to.has_value() && found_version > to) {
+                        continue;
+                    }
+
+                    if (highest_version < found_version) {
+                        highest_version = found_version;
+                    }
+
+                    if (!next_from.has_value() || found_version >= next_from) {
+                        sorted_paths.insert(file_ref);
+                    }
+                }
+            }
+        }
+
+        ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::RecordBatchBuilder> batch_builder, arrow::RecordBatchBuilder::Make(arrow_schema, arrow::default_memory_pool(), 20));
+
+        for (const auto &json_path: sorted_paths) {
+            spdlog::debug("Reading data from {}", json_path.path);
+            auto buffer_reader = open_arrow_istream(json_path);
+
+            ARROW_RETURN_NOT_OK(json_parser::parse(buffer_reader, batch_builder));
+        }
+
+        ARROW_ASSIGN_OR_RAISE(auto json_batch, batch_builder->Flush());
+        ARROW_ASSIGN_OR_RAISE(auto json_table, arrow::Table::FromRecordBatches(arrow_schema, {json_batch}));
+        all_tables.push_back(json_table);
+
+        std::vector<std::shared_ptr<arrow::Array>> version_row;
+        for (const auto &field: arrow_schema->fields()) {
+            if (field->name() == "version") {
+                version_row.push_back(arrow::MakeArrayFromScalar(arrow::UInt64Scalar(highest_version), 1).ValueOrDie());
+            } else {
+                version_row.push_back(arrow::MakeArrayOfNull(field->type(), 1).ValueOrDie());
+            }
+        }
+
+        spdlog::debug("Finished loading data in {} to version {}", branch_id, highest_version);
+        all_tables.push_back(arrow::Table::Make(arrow_schema, version_row));
+
+        return arrow::ConcatenateTables(all_tables).ValueOrDie();
+    }
+
+    std::tuple<std::shared_ptr<std::vector<std::shared_ptr<action>>>, long> deeplog::get_actions(const std::string &branch_id,
+                                                                                                 const std::optional<unsigned long> &to) const {
+        std::vector<std::shared_ptr<action>> return_actions = {};
+
+        auto all_operations_result = action_data(branch_id, 0, to);
+        if (!all_operations_result.ok()) {
+            throw std::runtime_error("Error reading action data: " + all_operations_result.status().message());
+        }
+
+        auto all_operations = all_operations_result.ValueOrDie();
+
+        spdlog::debug("Parsing action data...");
+
+        unsigned long version = 0;
+        for (long row_id = 0; row_id < all_operations->num_rows(); ++row_id) {
+            auto field_id = 0;
+            for (const auto &field: all_operations->fields()) {
+                auto scalar = all_operations->column(field_id)->GetScalar(row_id).ValueOrDie();
+                if (scalar->is_valid) {
+                    if (field->name() == "version") {
+                        version = std::dynamic_pointer_cast<arrow::UInt64Scalar>(scalar)->value;
+                    } else {
+                        std::shared_ptr<::deeplog::action> action;
+                        auto struct_scalar = std::dynamic_pointer_cast<arrow::StructScalar>(scalar);
+                        if (field->name() == "protocol") {
+                            action = std::make_shared<::deeplog::protocol_action>(::deeplog::protocol_action(struct_scalar));
+                        } else if (field->name() == "metadata") {
+                            action = std::make_shared<::deeplog::metadata_action>(::deeplog::metadata_action(struct_scalar));
+                        } else if (field->name() == "branch") {
+                            action = std::make_shared<::deeplog::create_branch_action>(::deeplog::create_branch_action(struct_scalar));
+                        } else if (field->name() == "add") {
+                            action = std::make_shared<::deeplog::add_file_action>(::deeplog::add_file_action(struct_scalar));
+                        } else if (field->name() == "tensor") {
+                            action = std::make_shared<::deeplog::create_tensor_action>(::deeplog::create_tensor_action(struct_scalar));
+                        } else {
+                            throw std::runtime_error("Unknown action type: " + field->name());
+                        }
+
+                        auto replace_action = std::dynamic_pointer_cast<::deeplog::replace_action>(action);
+                        if (replace_action == nullptr) {
+                            return_actions.push_back(action);
+                        } else {
+                            auto matches = std::find_if(return_actions.begin(), return_actions.end(), [replace_action](std::shared_ptr<::deeplog::action> a) {
+                                return replace_action->replaces(a);
+                            });
+
+                            if (matches == return_actions.end()) {
+                                return_actions.push_back(action);
+                            } else {
+                                auto index = std::distance(return_actions.begin(), matches);
+                                auto replacement = replace_action->replace(*matches);
+                                if (replacement == nullptr) {
+                                    return_actions.erase(return_actions.begin() + index);
+                                } else {
+                                    return_actions.at(index) = replacement;
+                                }
+                            }
+                        }
+                    }
+                }
+                ++field_id;
+            }
+        }
+
+        spdlog::debug("Loaded {} actions for branch '{}' to version {}", return_actions.size(), branch_id, version);
+
+        return std::make_tuple(std::make_shared<std::vector<std::shared_ptr<action>>>(return_actions), version);
+    }
+
+    long deeplog::file_version(const std::string &path) const {
+        std::filesystem::path path_obj = path;
+        auto formatted_version = path_obj.filename().string()
+                .substr(0, path_obj.filename().string().length() - 5);
+        return std::stol(formatted_version);
+    }
+
+    void deeplog::checkpoint(const std::string &branch_id) {
+        unsigned long version_to_checkpoint = version(branch_id);
+
+        auto status = write_checkpoint(branch_id, version_to_checkpoint);
+
+        if (!status.ok()) {
+            throw std::runtime_error(status.message());
+        }
+        nlohmann::json checkpoint_json = last_checkpoint(version_to_checkpoint, 3013);
+
+        auto checkpoint_path = "/_deeplake_log/" + branch_id + "/_last_checkpoint.json";
+        storage_->set_bytes(checkpoint_path, checkpoint_json.dump());
+    }
+
+    arrow::Result<std::shared_ptr<arrow::Table>> deeplog::read_checkpoint(const std::string &dir_path, const unsigned long &version) const {
+        arrow::MemoryPool *pool = arrow::default_memory_pool();
+        auto input = open_arrow_istream(storage_->file(dir_path + "/" + zero_pad(version) + ".checkpoint.parquet"));
+
+        std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
+        ARROW_RETURN_NOT_OK(parquet::arrow::OpenFile(input, pool, &arrow_reader));
+
+        std::shared_ptr<arrow::Table> table;
+        ARROW_RETURN_NOT_OK(arrow_reader->ReadTable(&table));
+
+        return arrow::Result<std::shared_ptr<arrow::Table>>(table);
+    }
+
+    arrow::Status deeplog::write_checkpoint(const std::string &branch_id, const unsigned long &version) {
+        auto [actions, last_version] = get_actions(branch_id, version);
+
+        auto array_builders = create_arrow_builders();
+
+        for (const auto &action: *actions) {
+            std::string json = action->to_json().dump();
+
+            std::shared_ptr<arrow::Scalar> struct_scalar;
+            auto status = arrow::ipc::internal::json::ScalarFromJSON(action->action_type(), json, &struct_scalar);
+            if (!status.ok()) {
+                throw std::runtime_error("Error creating struct from json: " + status.message());
+            }
+
+            for (auto i = 0; i < arrow_schema->num_fields(); ++i) {
+                auto field = arrow_schema->field(i);
+                auto builder = array_builders.at(i);
+                if (field->name() == action->action_name()) {
+                    status = builder->AppendScalar(*std::dynamic_pointer_cast<arrow::StructScalar>(struct_scalar));
+                    if (!status.ok()) {
+                        throw std::runtime_error(status.message());
+                    }
+                } else {
+                    status = builder->AppendNull();
+                    if (!status.ok()) {
+                        throw std::runtime_error(status.message());
+                    }
+                }
+            }
+        }
+
+        for (auto i = 0; i < arrow_schema->num_fields(); ++i) {
+            auto field = arrow_schema->field(i);
+            auto builder = array_builders.at(i);
+            if (field->name() == "version") {
+                auto status = builder->AppendScalar(arrow::NumericScalar<arrow::UInt64Type>(version));
+                if (!status.ok()) {
+                    throw std::runtime_error(status.message());
+                }
+            } else {
+                auto status = builder->AppendNull();
+                if (!status.ok()) {
+                    throw std::runtime_error(status.message());
+                }
+            }
+        }
+
+
+        std::vector<std::shared_ptr<arrow::Array>> final_arrays{};
+        final_arrays.reserve(array_builders.size());
+        for (const auto &build: array_builders) {
+            final_arrays.push_back(build->Finish().ValueOrDie());
+        }
+
+        auto table = arrow::Table::Make(arrow_schema, final_arrays);
+
+        std::shared_ptr<parquet::WriterProperties> props = parquet::WriterProperties::Builder().compression(arrow::Compression::SNAPPY)->build();
+        std::shared_ptr<parquet::ArrowWriterProperties> arrow_props = parquet::ArrowWriterProperties::Builder().store_schema()->build();
+
+        std::shared_ptr<arrow::io::BufferOutputStream> outfile;
+        ARROW_ASSIGN_OR_RAISE(outfile, arrow::io::BufferOutputStream::Create());
+//
+        ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), outfile, parquet::DEFAULT_MAX_ROW_GROUP_LENGTH, props, arrow_props));
+
+        auto buffer = outfile->Finish().ValueOrDie();
+        const uint8_t *byte_data = buffer->data();
+        int64_t byte_data_size = buffer->size();
+
+        std::stringstream outstream;
+        // Copy the raw parquet bytes into a string for the storage layer
+        for (auto i = 0; i < byte_data_size; ++i) {
+            outstream << byte_data[i];
+        }
+
+        storage_->set_bytes("/_deeplake_log/" + branch_id + "/" + zero_pad(version) + ".checkpoint.parquet", outstream.str());
+
+        return arrow::Status::OK();
+    }
+
+    std::shared_ptr<arrow::io::BufferReader> deeplog::open_arrow_istream(const storage::file_ref &file) const {
+        auto file_data = storage_->get_bytes(file.path);
+        std::string file_str = std::string(file_data.begin(), file_data.end());
+
+        auto buffer = arrow::Buffer::FromString(file_str);
+
+        return arrow::Buffer::GetReader(buffer).ValueOrDie();
+    }
+
+    std::vector<std::shared_ptr<arrow::ArrayBuilder>> deeplog::create_arrow_builders() const {
+        std::vector<std::shared_ptr<arrow::ArrayBuilder>> array_builders{};
+        for (auto field: arrow_schema->fields()) {
+            if (field->name() == "version") {
+                array_builders.push_back(arrow::MakeBuilder(field->type(), arrow::default_memory_pool()).ValueOrDie());
+            } else {
+                std::vector<std::shared_ptr<arrow::ArrayBuilder>> struct_builders{};
+                for (auto struct_field: field->type()->fields()) {
+                    struct_builders.push_back(arrow::MakeBuilder(struct_field->type(), arrow::default_memory_pool()).ValueOrDie());
+                }
+
+                array_builders.push_back(std::make_shared<arrow::StructBuilder>(arrow::StructBuilder(field->type(), arrow::default_memory_pool(), struct_builders)));
+            }
+        }
+
+        return array_builders;
+    }
+} // deeplake
\ No newline at end of file
diff --git a/cpp/deeplog/deeplog.hpp b/cpp/deeplog/deeplog.hpp
new file mode 100644
index 0000000000..0754b87bf4
--- /dev/null
+++ b/cpp/deeplog/deeplog.hpp
@@ -0,0 +1,67 @@
+#pragma once
+
+#include <arrow/api.h>
+#include <arrow/io/api.h>
"actions/add_file_action.hpp" +#include "actions/protocol_action.hpp" +#include "actions/metadata_action.hpp" +#include "actions/create_branch_action.hpp" +#include "actions/create_tensor_action.hpp" +#include "actions/create_commit_action.hpp" +#include "../storage/storage.hpp" + +namespace deeplog { + + const std::string META_BRANCH_ID = "_meta"; + + std::string zero_pad(const unsigned long &version); + + class deeplog { + public: + [[nodiscard]] static std::shared_ptr create(const std::shared_ptr &storage, const int &log_version); + + [[nodiscard]] static std::shared_ptr create(const std::string &path, const int &log_version); + + [[nodiscard]] static std::shared_ptr open(const std::string &path); + + [[nodiscard]] static std::shared_ptr open(const std::shared_ptr &storage); + + virtual int log_format() const; + + unsigned long version(const std::string &branch_id) const; + + std::vector> commits(const std::string &branch_id, const std::optional &version); + + bool commit(const std::string &branch_id, + const unsigned long &base_version, + const std::vector> &actions); + + void checkpoint(const std::string &branch_id); + + arrow::Result> action_data(const std::string &branch_id, const unsigned long &from, const std::optional &to) const; + + std::tuple>>, long> get_actions(const std::string &branch_id, const std::optional &to) const; + + protected: + + //only created through open() etc. + deeplog(const std::shared_ptr &storage); + + private: + + arrow::Result> read_checkpoint(const std::string &dir_path, const unsigned long &version) const; + + arrow::Status write_checkpoint(const std::string &branch_id, const unsigned long &version); + + long file_version(const std::string &path) const; + + const static std::shared_ptr arrow_schema; + + std::shared_ptr storage_; + + std::shared_ptr open_arrow_istream(const storage::file_ref &file) const; + + std::vector> create_arrow_builders() const; + }; + +} diff --git a/cpp/deeplog/deeplog_v3.cpp b/cpp/deeplog/deeplog_v3.cpp new file mode 100644 index 0000000000..147f6fb45b --- /dev/null +++ b/cpp/deeplog/deeplog_v3.cpp @@ -0,0 +1,10 @@ +#include "deeplog_v3.hpp" + +namespace deeplog { + + deeplog_v3::deeplog_v3(const std::shared_ptr &storage) : deeplog(storage){} + + int deeplog_v3::log_format() const { + return 3; + } +} \ No newline at end of file diff --git a/cpp/deeplog/deeplog_v3.hpp b/cpp/deeplog/deeplog_v3.hpp new file mode 100644 index 0000000000..b282531b8b --- /dev/null +++ b/cpp/deeplog/deeplog_v3.hpp @@ -0,0 +1,12 @@ +#pragma once + + +#include "deeplog.hpp" + +namespace deeplog { +class deeplog_v3 : public deeplog { + public: + deeplog_v3(const std::shared_ptr &storage); + int log_format() const override; + }; +} \ No newline at end of file diff --git a/cpp/deeplog/json_parser.cpp b/cpp/deeplog/json_parser.cpp new file mode 100644 index 0000000000..c3da542e1e --- /dev/null +++ b/cpp/deeplog/json_parser.cpp @@ -0,0 +1,168 @@ +#include "json_parser.hpp" +#include "spdlog/spdlog.h" +#include + +#include + +namespace deeplog { + arrow::Status json_parser::parse(const std::shared_ptr &buffer_reader, const std::shared_ptr &batch_builder) { + ARROW_ASSIGN_OR_RAISE(auto json_reader, arrow::json::StreamingReader::Make(buffer_reader, + arrow::json::ReadOptions::Defaults(), + arrow::json::ParseOptions::Defaults())); + + for (arrow::Result> maybe_json: *json_reader) { + if (!maybe_json.ok()) { + throw std::runtime_error("Error reading JSON: " + maybe_json.status().message()); + } + std::shared_ptr json = *maybe_json; + + auto json_columns = 
+            auto json_columns = json->schema()->field_names();
+
+            for (int builder_column_id = 0; builder_column_id < batch_builder->num_fields(); ++builder_column_id) {
+                const auto builder_column_name = batch_builder->schema()->field(builder_column_id)->name();
+                if (builder_column_name == "version") {
+                    //doesn't come from json
+                    ARROW_RETURN_NOT_OK(batch_builder->GetField(builder_column_id)->AppendNulls(json->num_rows()));
+                    continue;
+                }
+
+                const auto column_builder = batch_builder->GetFieldAs<arrow::StructBuilder>(builder_column_id);
+                if (column_builder == nullptr) {
+                    throw std::runtime_error("Unexpected builder for field " + std::to_string(builder_column_id));
+                }
+
+                const auto json_column_it = std::find(json_columns.begin(), json_columns.end(), builder_column_name);
+
+                if (json_column_it == json_columns.end()) {
+                    spdlog::debug("No {} columns in JSON", builder_column_name);
+
+                    ARROW_RETURN_NOT_OK(column_builder->AppendNulls(json->num_rows()));
+                    continue;
+                }
+                const auto json_column_id = static_cast<int>(std::distance(json_columns.begin(), json_column_it));
+
+                for (int row = 0; row < json->num_rows(); ++row) {
+                    ARROW_ASSIGN_OR_RAISE(auto json_column_value_scalar, json->column(json_column_id)->GetScalar(row));
+                    if (!json_column_value_scalar->is_valid) {
+                        ARROW_RETURN_NOT_OK(column_builder->AppendNull());
+                        continue;
+                    }
+
+                    auto json_column_value = std::dynamic_pointer_cast<arrow::StructScalar>(json_column_value_scalar);
+
+                    if (json_column_value == nullptr) {
+                        throw std::runtime_error("Unexpected json data type in " + std::to_string(json_column_id) + ": " + json_column_value_scalar->ToString());
+                    }
+
+                    auto column_type = column_builder->type();
+                    for (int builder_field_id = 0; builder_field_id < column_type->num_fields(); ++builder_field_id) {
+                        auto field_name = column_type->field(builder_field_id)->name();
+                        auto field_builder = column_builder->field_builder(builder_field_id);
+
+                        std::vector<std::string> json_struct_fields = {};
+                        for (const auto &field: json_column_value->type->fields()) {
+                            json_struct_fields.push_back(field->name());
+                        }
+                        const auto json_struct_field_it = std::find(json_struct_fields.begin(), json_struct_fields.end(), field_name);
+
+                        if (json_struct_field_it == json_struct_fields.end()) {
+                            spdlog::debug("No {} records in JSON", field_name);
+
+                            ARROW_RETURN_NOT_OK(field_builder->AppendNull());
+                            continue;
+                        }
+                        const auto json_struct_field_id = static_cast<int>(std::distance(json_struct_fields.begin(), json_struct_field_it));
+
+                        auto json_field_value = json_column_value->value.at(json_struct_field_id);
+                        if (!json_field_value->is_valid) {
+                            ARROW_RETURN_NOT_OK(field_builder->AppendNull());
+                            continue;
+                        }
+
+                        auto wanted_type = field_builder->type();
+                        ARROW_ASSIGN_OR_RAISE(auto converted_value, convert(json_field_value, wanted_type));
+
+                        ARROW_RETURN_NOT_OK(field_builder->AppendScalar(*converted_value));
+                    }
+                    ARROW_RETURN_NOT_OK(column_builder->Append()); //finish scalar
+                }
+            }
+        }
+
+        return arrow::Status::OK();
+    }
+
+    arrow::Result<std::shared_ptr<arrow::Scalar>> json_parser::convert(const std::shared_ptr<arrow::Scalar> &json_field_value, const std::shared_ptr<arrow::DataType> &wanted_type) {
+
+        if (wanted_type->Equals(json_field_value->type)) {
+            return json_field_value;
+        }
+        if (!json_field_value->is_valid) {
+            return arrow::MakeNullScalar(wanted_type);
+        } else if (json_field_value->type->Equals(arrow::int64())) {
+            ARROW_ASSIGN_OR_RAISE(auto new_value, arrow::MakeScalar(wanted_type, std::dynamic_pointer_cast<arrow::Int64Scalar>(json_field_value)->value));
+            return new_value;
+        } else if (json_field_value->type->Equals(arrow::utf8())) {
std::dynamic_pointer_cast(json_field_value)->value)); + return new_value; + } else if (json_field_value->type->id() == arrow::Type::LIST) { + auto list_type = wanted_type->field(0)->type(); + ARROW_ASSIGN_OR_RAISE(auto array_builder, arrow::MakeBuilder(list_type)); + + auto json_list = std::dynamic_pointer_cast(json_field_value)->value; + for (auto i = 0; i < json_list->length(); ++i) { + ARROW_ASSIGN_OR_RAISE(auto current_value, json_list->GetScalar(i)); + ARROW_ASSIGN_OR_RAISE(auto converted, convert(current_value, list_type)); + ARROW_RETURN_NOT_OK(array_builder->AppendScalar(*converted)); + } + ARROW_ASSIGN_OR_RAISE(auto array, array_builder->Finish()); + ARROW_ASSIGN_OR_RAISE(auto new_value, arrow::MakeScalar(wanted_type, array)); + return new_value; + } else if (json_field_value->type->id() == arrow::Type::STRUCT) { + auto json_struct = std::dynamic_pointer_cast(json_field_value); + + if (wanted_type->id() == arrow::Type::MAP) { + auto wanted_type_struct = std::dynamic_pointer_cast(wanted_type->field(0)->type()); + const auto &wanted_key_type = arrow::utf8(); + auto wanted_value_type = wanted_type_struct->field(1)->type(); + + ARROW_ASSIGN_OR_RAISE(std::shared_ptr map_struct_builder, arrow::MakeBuilder(arrow::struct_({field("key", arrow::utf8(), false), field("value", wanted_value_type)}))); + + for (auto json_field: json_struct->type->fields()) { + ARROW_ASSIGN_OR_RAISE(auto converted_key, convert(arrow::MakeScalar(json_field->name()), wanted_key_type)); + ARROW_RETURN_NOT_OK(map_struct_builder->child(0)->AppendScalar(*converted_key)); + + ARROW_ASSIGN_OR_RAISE(auto original_json_field_value, json_struct->field(json_field->name())); + ARROW_ASSIGN_OR_RAISE(auto converted_value, convert(original_json_field_value, wanted_value_type)); + ARROW_RETURN_NOT_OK(map_struct_builder->child(1)->AppendScalar(*converted_value)); + + ARROW_RETURN_NOT_OK(std::dynamic_pointer_cast(map_struct_builder)->Append()); + } + + + ARROW_ASSIGN_OR_RAISE(auto map_array, map_struct_builder->Finish()); + + return std::make_shared(arrow::MapScalar(map_array)); + } else if (wanted_type->id() == arrow::Type::STRUCT) { + std::vector> struct_scalars = {}; + std::vector field_names = {}; + + for (auto i=0; inum_fields(); ++i) { + auto struct_field = wanted_type->field(i); + field_names.push_back(struct_field->name()); + + ARROW_ASSIGN_OR_RAISE(auto json_struct_field_value, json_struct->field(struct_field->name())); + ARROW_ASSIGN_OR_RAISE(auto converted_json_field_value, convert(json_struct_field_value, struct_field->type())); + + struct_scalars.push_back(converted_json_field_value); + } + + return arrow::StructScalar::Make(struct_scalars, field_names); + } else { + throw std::runtime_error("Unexpected struct mapping: " + wanted_type->name()); + } + } else { + throw std::runtime_error("Unexpected json type: " + json_field_value->type->name()); + } + } +} \ No newline at end of file diff --git a/cpp/deeplog/json_parser.hpp b/cpp/deeplog/json_parser.hpp new file mode 100644 index 0000000000..a623734be1 --- /dev/null +++ b/cpp/deeplog/json_parser.hpp @@ -0,0 +1,15 @@ +#pragma once + +#include +#include + +namespace deeplog { + class json_parser { + public: + static arrow::Status parse(const std::shared_ptr &json, const std::shared_ptr &batch_builder); + + private: + static arrow::Result> convert(const std::shared_ptr ¤t_value, const std::shared_ptr &wanted_type); + + }; +} diff --git a/cpp/deeplog/last_checkpoint.cpp b/cpp/deeplog/last_checkpoint.cpp new file mode 100644 index 0000000000..f75211f3d7 --- /dev/null +++ 
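The parser above is exercised from Arrow's JSON reader (the unit tests later in this change follow the same pattern). A minimal driver sketch; the helper name and the exact includes are illustrative, not part of this diff:

#include <arrow/api.h>
#include <arrow/io/memory.h>
#include <arrow/json/api.h>
#include "json_parser.hpp"

// Read a JSON log fragment into an Arrow table, then let json_parser map it
// onto a builder for the action schema; columns and fields absent from the
// JSON come back as nulls.
arrow::Result<std::shared_ptr<arrow::RecordBatch>> parse_json_log(
        const std::string &json_text, const std::shared_ptr<arrow::Schema> &schema) {
    auto input = std::make_shared<arrow::io::BufferReader>(arrow::Buffer::FromString(json_text));
    ARROW_ASSIGN_OR_RAISE(auto reader, arrow::json::TableReader::Make(
            arrow::default_memory_pool(), input,
            arrow::json::ReadOptions::Defaults(), arrow::json::ParseOptions::Defaults()));
    ARROW_ASSIGN_OR_RAISE(auto table, reader->Read());

    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::RecordBatchBuilder> builder,
                          arrow::RecordBatchBuilder::Make(schema, arrow::default_memory_pool(), table->num_rows()));
    ARROW_RETURN_NOT_OK(deeplog::json_parser::parse(table, builder));
    return builder->Flush();
}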
diff --git a/cpp/deeplog/last_checkpoint.cpp b/cpp/deeplog/last_checkpoint.cpp
new file mode 100644
index 0000000000..f75211f3d7
--- /dev/null
+++ b/cpp/deeplog/last_checkpoint.cpp
@@ -0,0 +1,8 @@
+#include "last_checkpoint.hpp"
+
+namespace deeplog {
+
+    last_checkpoint::last_checkpoint() : version(0), size(0) {}
+    last_checkpoint::last_checkpoint(unsigned long version, long size) : version(version), size(size) {}
+
+}
\ No newline at end of file
diff --git a/cpp/deeplog/last_checkpoint.hpp b/cpp/deeplog/last_checkpoint.hpp
new file mode 100644
index 0000000000..be29097566
--- /dev/null
+++ b/cpp/deeplog/last_checkpoint.hpp
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <nlohmann/json.hpp>
+
+namespace deeplog {
+
+    struct last_checkpoint {
+    public:
+
+        last_checkpoint();
+
+        last_checkpoint(unsigned long version, long size);
+
+        unsigned long version;
+        long size;
+    };
+
+    NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(last_checkpoint, version, size);
+}
diff --git a/cpp/deeplog/metadata_snapshot.cpp b/cpp/deeplog/metadata_snapshot.cpp
new file mode 100644
index 0000000000..1f3b01366c
--- /dev/null
+++ b/cpp/deeplog/metadata_snapshot.cpp
@@ -0,0 +1,41 @@
+#include "metadata_snapshot.hpp"
+#include <spdlog/spdlog.h>
+
+namespace deeplog {
+    metadata_snapshot::metadata_snapshot(const std::shared_ptr<::deeplog::deeplog> &deeplog) : base_snapshot(META_BRANCH_ID, std::nullopt, deeplog) {
+        spdlog::debug("Metadata snapshot created for version {} with {} actions", version, actions_->size());
+    }
+
+    metadata_snapshot::metadata_snapshot(const unsigned long &version, const std::shared_ptr<::deeplog::deeplog> &deeplog) : base_snapshot(META_BRANCH_ID, version, deeplog) {
+        spdlog::debug("Metadata snapshot created for version {} with {} actions", version, actions_->size());
+    }
+
+    std::shared_ptr<base_snapshot> metadata_snapshot::update() const {
+        return std::make_shared<metadata_snapshot>(version, deeplog);
+    }
+
+    std::shared_ptr<protocol_action> metadata_snapshot::protocol() const {
+        return find_action<protocol_action>();
+    }
+
+    std::shared_ptr<metadata_action> metadata_snapshot::metadata() const {
+        return find_action<metadata_action>();
+    }
+
+    std::vector<std::shared_ptr<create_branch_action>> metadata_snapshot::branches() const {
+        return find_actions<create_branch_action>();
+    }
+
+    std::shared_ptr<create_branch_action> metadata_snapshot::find_branch(const std::string &address) const {
+        auto all_branches = branches();
+
+        std::input_iterator auto branch = std::ranges::find_if(all_branches.begin(), all_branches.end(),
+                                                               [address](std::shared_ptr<create_branch_action> b) { return b->id == address || b->name == address; });
+        if (branch == all_branches.end()) {
+            throw std::runtime_error("Branch '" + address + "' not found");
+        }
+
+
+        return *branch;
+    }
+}
diff --git a/cpp/deeplog/metadata_snapshot.hpp b/cpp/deeplog/metadata_snapshot.hpp
new file mode 100644
index 0000000000..bff4e2d177
--- /dev/null
+++ b/cpp/deeplog/metadata_snapshot.hpp
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "deeplog.hpp"
+#include "base_snapshot.hpp"
+
+namespace deeplog {
+
+    class metadata_snapshot : public base_snapshot {
+    public:
+        metadata_snapshot(const std::shared_ptr<::deeplog::deeplog> &deeplog);
+
+        metadata_snapshot(const unsigned long &version, const std::shared_ptr<::deeplog::deeplog> &deeplog);
+
+        std::shared_ptr<base_snapshot> update() const override;
+
+        std::shared_ptr<protocol_action> protocol() const;
+
+        std::shared_ptr<metadata_action> metadata() const;
+
+        std::vector<std::shared_ptr<create_branch_action>> branches() const;
+
+        std::shared_ptr<create_branch_action> find_branch(const std::string &address) const;
+
+    };
+}
\ No newline at end of file
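The two files that follow implement writes as optimistic concurrency control: callers stage actions against a snapshot, and commit() retries against a refreshed snapshot until the write lands. A usage sketch (illustrative; the helper name is not part of this diff):

#include "optimistic_transaction.hpp"
#include "metadata_snapshot.hpp"
#include "actions/create_branch_action.hpp"
#include "util.hpp"

// Stage one branch-creation action and commit it; returns the log version
// that now contains the action.
unsigned long create_branch(const std::shared_ptr<deeplog::deeplog> &log, const std::string &name) {
    auto snapshot = std::make_shared<deeplog::metadata_snapshot>(log);
    deeplog::optimistic_transaction transaction(snapshot);
    transaction.add(std::make_shared<deeplog::create_branch_action>(
            deeplog::generate_id(), name, deeplog::META_BRANCH_ID, snapshot->version));
    return transaction.commit();
}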
diff --git a/cpp/deeplog/optimistic_transaction.cpp b/cpp/deeplog/optimistic_transaction.cpp
new file mode 100644
index 0000000000..d7946d8145
--- /dev/null
+++ b/cpp/deeplog/optimistic_transaction.cpp
@@ -0,0 +1,26 @@
+#include "optimistic_transaction.hpp"
+#include "spdlog/spdlog.h"
+#include <memory>
+
+namespace deeplog {
+    optimistic_transaction::optimistic_transaction(const std::shared_ptr<::deeplog::base_snapshot> &snapshot) : snapshot(snapshot), actions_({}) {}
+
+    void optimistic_transaction::add(const std::shared_ptr<action> &action) {
+        actions_.push_back(action);
+    }
+
+    unsigned long optimistic_transaction::commit() {
+        auto snapshot_to_commit = snapshot;
+
+        while (true) {
+            auto succeeded = snapshot_to_commit->deeplog->commit(snapshot_to_commit->branch_id, snapshot_to_commit->version, actions_);
+            if (succeeded) {
+                return snapshot_to_commit->version + 1;
+            }
+
+            spdlog::debug("Commit failed, retrying");
+            snapshot_to_commit = snapshot_to_commit->update();
+        }
+
+    }
+}
\ No newline at end of file
diff --git a/cpp/deeplog/optimistic_transaction.hpp b/cpp/deeplog/optimistic_transaction.hpp
new file mode 100644
index 0000000000..90cbd82fb7
--- /dev/null
+++ b/cpp/deeplog/optimistic_transaction.hpp
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "snapshot.hpp"
+
+namespace deeplog {
+    class optimistic_transaction {
+    public:
+        optimistic_transaction(const std::shared_ptr<base_snapshot> &snapshot);
+
+    public:
+        std::shared_ptr<base_snapshot> snapshot;
+
+    public:
+        void add(const std::shared_ptr<action> &action);
+
+        unsigned long commit();
+
+    private:
+        std::vector<std::shared_ptr<action>> actions_;
+    };
+
+}
\ No newline at end of file
diff --git a/cpp/deeplog/snapshot.cpp b/cpp/deeplog/snapshot.cpp
new file mode 100644
index 0000000000..fea0cee677
--- /dev/null
+++ b/cpp/deeplog/snapshot.cpp
@@ -0,0 +1,30 @@
+#include "snapshot.hpp"
+#include "metadata_snapshot.hpp"
+#include <memory>
+
+namespace deeplog {
+
+    snapshot::snapshot(std::string branch_id, const std::shared_ptr<::deeplog::deeplog> &deeplog) :
+            base_snapshot(branch_id, std::nullopt, deeplog) {}
+
+    snapshot::snapshot(std::string branch_id, const unsigned long &version, const std::shared_ptr<::deeplog::deeplog> &deeplog) :
+            base_snapshot(branch_id, version, deeplog) {}
+
+    std::shared_ptr<base_snapshot> snapshot::update() const {
+        return std::make_shared<snapshot>(branch_id, version, deeplog);
+    }
+
+    std::vector<std::shared_ptr<add_file_action>> snapshot::data_files() {
+        return find_actions<add_file_action>();
+    }
+
+    std::vector<std::shared_ptr<create_commit_action>> snapshot::commits() {
+        return find_actions<create_commit_action>();
+
+    }
+
+    std::vector<std::shared_ptr<create_tensor_action>> snapshot::tensors() {
+        return find_actions<create_tensor_action>();
+    }
+
+}
\ No newline at end of file
diff --git a/cpp/deeplog/snapshot.hpp b/cpp/deeplog/snapshot.hpp
new file mode 100644
index 0000000000..020c6d3337
--- /dev/null
+++ b/cpp/deeplog/snapshot.hpp
@@ -0,0 +1,27 @@
+#pragma once
+
+#include "deeplog.hpp"
+#include "actions/create_commit_action.hpp"
+#include "base_snapshot.hpp"
+
+namespace deeplog {
+
+    class deeplog;
+
+    class snapshot : public base_snapshot {
+
+    public:
+
+        snapshot(std::string branch_id, const std::shared_ptr<::deeplog::deeplog> &deeplog);
+
+        snapshot(std::string branch_id, const unsigned long &version, const std::shared_ptr<::deeplog::deeplog> &deeplog);
+
+        std::vector<std::shared_ptr<add_file_action>> data_files();
+
+        std::vector<std::shared_ptr<create_tensor_action>> tensors();
+
+        std::vector<std::shared_ptr<create_commit_action>> commits();
+
+        std::shared_ptr<base_snapshot> update() const override;
+    };
+}
\ No newline at end of file
diff --git a/cpp/deeplog/util.cpp b/cpp/deeplog/util.cpp
new file mode 100644
index 0000000000..c04d367fce
--- /dev/null
+++ b/cpp/deeplog/util.cpp
@@ -0,0 +1,22 @@
+#include "util.hpp"
+#include <uuid.h>
+
+namespace deeplog {
+    std::string generate_id() {
+        std::random_device rd;
+        auto seed_data = std::array<int, std::mt19937::state_size>{};
+        std::generate(std::begin(seed_data), std::end(seed_data), std::ref(rd));
+        std::seed_seq seq(std::begin(seed_data), std::end(seed_data));
+        std::mt19937 generator(seq);
+
+        auto val = uuids::to_string(uuids::uuid_random_generator{generator}());
+        val.erase(std::remove(val.begin(),
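/* Editor's note (illustrative): this follows stduuid's documented seeding
   recipe, seeding an mt19937 from std::random_device, then strips the dashes
   so ids are 32 hex characters, which is what the util tests assert. */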
val.end(), '-'), val.end()); + + return val; + } + + long current_timestamp() { + return std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); + } +} diff --git a/cpp/deeplog/util.hpp b/cpp/deeplog/util.hpp new file mode 100644 index 0000000000..3174140992 --- /dev/null +++ b/cpp/deeplog/util.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include + +namespace deeplog { + std::string generate_id(); + + long current_timestamp(); +} diff --git a/cpp/py_api/CMakeLists.txt b/cpp/py_api/CMakeLists.txt new file mode 100644 index 0000000000..5373c1572f --- /dev/null +++ b/cpp/py_api/CMakeLists.txt @@ -0,0 +1,21 @@ +project(py_api) + +include(FetchContent) + +file(GLOB_RECURSE SOURCES "*.cpp") + +pybind11_add_module(py_api ${SOURCES}) + +target_link_libraries(py_api PUBLIC deeplog) +set_target_properties(py_api PROPERTIES OUTPUT_NAME "_deeplake") + +add_custom_command(TARGET py_api POST_BUILD + COMMAND "${CMAKE_COMMAND}" -E copy_directory + "$" + "${PYTHON_SOURCE}/_deeplake" + COMMENT "Copying module to python source directory") + +add_custom_command(TARGET py_api POST_BUILD + WORKING_DIRECTORY "${PYTHON_SOURCE}/_deeplake" + COMMAND "${PYTHON_EXECUTABLE}" -m pybind11_stubgen _deeplake -o . + COMMENT "Generating stubs") \ No newline at end of file diff --git a/cpp/py_api/deeplake.cpp b/cpp/py_api/deeplake.cpp new file mode 100644 index 0000000000..426b79ac0f --- /dev/null +++ b/cpp/py_api/deeplake.cpp @@ -0,0 +1,58 @@ +#include "deeplake.hpp" +#include +#include + +namespace py_api { + + void deeplake::pybind(pybind11::module &module) { + pybind11::class_<::py_api::logger, std::shared_ptr<::py_api::logger >>(module, "Logger") + .def_static("set_log_level", &::py_api::logger::set_log_level) + .def_static("debug", &::py_api::logger::debug) + .def_static("info", &::py_api::logger::info) + .def_static("warn", &::py_api::logger::warn) + .def_static("err", &::py_api::logger::err) + .def_static("critical", &::py_api::logger::critical); + } + + void logger::set_log_level(const std::string &level) { + auto lc_level = level; + std::transform(lc_level.begin(), lc_level.end(), lc_level.begin(), + [](unsigned char c){ return std::tolower(c); }); + + if (lc_level == "debug") { + spdlog::set_level(spdlog::level::debug); + } else if (lc_level == "info") { + spdlog::set_level(spdlog::level::info); + } else if (lc_level == "warn") { + spdlog::set_level(spdlog::level::warn); + } else if (lc_level == "err") { + spdlog::set_level(spdlog::level::err); + } else if (lc_level == "critical") { + spdlog::set_level(spdlog::level::critical); + } else if (lc_level == "off") { + spdlog::set_level(spdlog::level::off); + } + + spdlog::debug("Set log level to {}", level); + } + + void logger::debug(const std::string &msg) { + spdlog::debug(msg); + } + + void logger::info(const std::string &msg) { + spdlog::info(msg); + } + + void logger::warn(const std::string &msg) { + spdlog::warn(msg); + } + + void logger::err(const std::string &msg) { + spdlog::error(msg); + } + + void logger::critical(const std::string &msg) { + spdlog::critical(msg); + } +} diff --git a/cpp/py_api/deeplake.hpp b/cpp/py_api/deeplake.hpp new file mode 100644 index 0000000000..05664679f9 --- /dev/null +++ b/cpp/py_api/deeplake.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include +#include + +namespace py_api { + class logger { + public: + static void set_log_level(const std::string &level); + + static void debug(const std::string &msg); + + static void info(const std::string &msg); + + static void warn(const std::string &msg); + + static 
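/* Editor's note (illustrative): these statics are bound in deeplake.cpp above
   as the Python class "Logger", so log levels can be routed into spdlog from
   Python, e.g.:

       from _deeplake._deeplake import Logger
       Logger.set_log_level("debug")
       Logger.info("message from python")

   The import path assumes the re-export added in deeplake/__init__.py. */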
void err(const std::string &msg); + + static void critical(const std::string &msg); + + }; + + class deeplake { + + public: + static void pybind(pybind11::module &); + }; +} diff --git a/cpp/py_api/deeplog/actions/actions.cpp b/cpp/py_api/deeplog/actions/actions.cpp new file mode 100644 index 0000000000..8cc2b7c8ca --- /dev/null +++ b/cpp/py_api/deeplog/actions/actions.cpp @@ -0,0 +1,196 @@ +#include "actions.hpp" +#include "../../../deeplog/actions/add_file_action.hpp" +#include "../../../deeplog/actions/create_branch_action.hpp" +#include "../../../deeplog/actions/metadata_action.hpp" +#include "../../../deeplog/actions/protocol_action.hpp" +#include "../../../deeplog/actions/remove_file_action.hpp" +#include "../../../deeplog/actions/create_commit_action.hpp" +#include "../../../deeplog/actions/create_tensor_action.hpp" +#include +#include + +namespace py_api { + + std::shared_ptr create_tensor_action(std::string id, + std::string name, + std::optional dtype, + std::string htype, + const long &length, + const bool &is_link, + const bool &is_sequence, + const bool &hidden, + const std::optional &chunk_compression, + const std::optional &sample_compression, + const std::map> &links, + const std::optional &max_chunk_size, + const std::vector &min_shape, + const std::vector &max_shape, + const std::optional &tiling_threshold, + const std::optional &typestr, + const bool &verify, + std::string version) { + + std::map links_map{}; + for (const auto &link: links) { + std::optional flatten_sequence; + std::optional extend; + std::optional update; + + if (link.second.contains("flatten_sequence")) { + flatten_sequence = link.second.at("flatten_sequence").cast(); + } + if (link.second.contains("extend")) { + extend = link.second.at("extend").cast(); + } + if (link.second.contains("update")) { + update = link.second.at("update").cast(); + } + links_map.insert({link.first, deeplog::tensor_link(extend, + flatten_sequence, + update)}); + } + + return std::make_shared(std::move(id), std::move(name), std::move(dtype), std::move(htype), length, is_link, is_sequence, hidden, + chunk_compression, sample_compression, std::move(links_map), max_chunk_size, + min_shape, max_shape, tiling_threshold, typestr, verify, std::move(version)); + } + + void actions::pybind(pybind11::module &module) { + pybind11::class_>(module, "DeepLogAction"); + + pybind11::class_>(module, "AddFileAction") + .def(pybind11::init(), + pybind11::arg("path"), pybind11::arg("type"), pybind11::arg("size"), pybind11::arg("modification_time"), pybind11::arg("data_change"), pybind11::arg("num_samples")) + .def_readonly("path", &deeplog::add_file_action::path) + .def_readonly("type", &deeplog::add_file_action::type) + .def_readonly("size", &deeplog::add_file_action::size) + .def_readonly("modification_time", &deeplog::add_file_action::modification_time) + .def_readonly("num_samples", &deeplog::add_file_action::num_samples); + + pybind11::class_>(module, "CreateBranchAction") + .def(pybind11::init(), + pybind11::arg("id"), pybind11::arg("name"), pybind11::arg("from_id"), pybind11::arg("from_version")) + .def_readonly("id", &deeplog::create_branch_action::id) + .def_readonly("name", &deeplog::create_branch_action::name) + .def_readonly("from_id", &deeplog::create_branch_action::from_id) + .def_readonly("from_version", &deeplog::create_branch_action::from_version); + + + pybind11::class_>(module, "CreateCommitAction") + .def(pybind11::init, long>(), + pybind11::arg("id"), pybind11::arg("branch_id"), pybind11::arg("branch_version"), 
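/* Editor's note (illustrative): the pybind11::arg(...) names declared here
   become Python keyword arguments, e.g.

       CreateCommitAction(id="c1", branch_id=main_id, branch_version=3,
                          message="initial commit", commit_time=now_ms)

   where message appears to be optional (std::optional in the init signature). */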
pybind11::arg("message"), pybind11::arg("commit_time")) + .def_readonly("id", &deeplog::create_commit_action::id) + .def_readonly("branch_id", &deeplog::create_commit_action::branch_id) + .def_readonly("branch_version", &deeplog::create_commit_action::branch_version) + .def_readonly("message", &deeplog::create_commit_action::message) + .def_readonly("commit_time", &deeplog::create_commit_action::commit_time); + + pybind11::class_>(module, "CreateTensorAction") + .def(pybind11::init(&create_tensor_action), + pybind11::arg("id"), + pybind11::arg("name"), + pybind11::arg("dtype"), + pybind11::arg("htype"), + pybind11::arg("length"), + pybind11::arg("is_link"), + pybind11::arg("is_sequence"), + pybind11::arg("hidden"), + pybind11::arg("chunk_compression"), + pybind11::arg("sample_compression"), + pybind11::arg("links"), + pybind11::arg("max_chunk_size"), + pybind11::arg("min_shape"), + pybind11::arg("max_shape"), + pybind11::arg("tiling_threshold"), + pybind11::arg("typestr"), + pybind11::arg("verify"), + pybind11::arg("version") + ) + .def(pybind11::init, std::string, long, bool, bool, bool, + std::optional, std::optional, std::map, + std::optional, + std::vector, + std::vector, + std::optional, + std::optional, + bool, + std::string>(), + pybind11::arg("id"), + pybind11::arg("name"), + pybind11::arg("dtype"), + pybind11::arg("htype"), + pybind11::arg("length"), + pybind11::arg("is_link"), + pybind11::arg("is_sequence"), + pybind11::arg("hidden"), + pybind11::arg("chunk_compression"), + pybind11::arg("sample_compression"), + pybind11::arg("links"), + pybind11::arg("max_chunk_size"), + pybind11::arg("min_shape"), + pybind11::arg("max_shape"), + pybind11::arg("tiling_threshold"), + pybind11::arg("typestr"), + pybind11::arg("verify"), + pybind11::arg("version") + ) + .def_readonly("id", &deeplog::create_tensor_action::id) + .def_readonly("name", &deeplog::create_tensor_action::name) + .def_readonly("dtype", &deeplog::create_tensor_action::dtype) + .def_readonly("htype", &deeplog::create_tensor_action::htype) + .def_readonly("length", &deeplog::create_tensor_action::length) + .def_readonly("is_link", &deeplog::create_tensor_action::is_link) + .def_readonly("is_sequence", &deeplog::create_tensor_action::is_sequence) + .def_readonly("hidden", &deeplog::create_tensor_action::hidden) + .def_readonly("chunk_compression", &deeplog::create_tensor_action::chunk_compression) + .def_readonly("sample_compression", &deeplog::create_tensor_action::sample_compression) + .def_readonly("links", &deeplog::create_tensor_action::links) + .def_readonly("max_chunk_size", &deeplog::create_tensor_action::max_chunk_size) + .def_readonly("min_shape", &deeplog::create_tensor_action::min_shape) + .def_readonly("max_shape", &deeplog::create_tensor_action::max_shape) + .def_readonly("tiling_threshold", &deeplog::create_tensor_action::tiling_threshold) + .def_readonly("typestr", &deeplog::create_tensor_action::typestr) + .def_readonly("verify", &deeplog::create_tensor_action::verify) + .def_readonly("version", &deeplog::create_tensor_action::version); + + pybind11::class_>(module, "MetadataAction") + .def(pybind11::init(), + pybind11::arg("id"), pybind11::arg("name"), pybind11::arg("description"), pybind11::arg("created_time")) + .def_readonly("id", &deeplog::metadata_action::id) + .def_readonly("name", &deeplog::metadata_action::name) + .def_readonly("description", &deeplog::metadata_action::description) + .def_readonly("created_time", &deeplog::metadata_action::created_time); + + pybind11::class_>(module, "ProtocolAction") 
+ .def(pybind11::init(), + pybind11::arg("min_reader_version"), pybind11::arg("min_writer_version")) + .def_readonly("min_reader_version", &deeplog::protocol_action::min_reader_version) + .def_readonly("min_writer_version", &deeplog::protocol_action::min_writer_version); + + pybind11::class_>(module, "RemoveFileAction") + .def(pybind11::init(), + pybind11::arg("path"), pybind11::arg("size"), pybind11::arg("deletion_timestamp"), pybind11::arg("data_change")) + .def_readonly("path", &deeplog::remove_file_action::path) + .def_readonly("size", &deeplog::remove_file_action::size) + .def_readonly("deletion_timestamp", &deeplog::remove_file_action::deletion_time) + .def_readonly("data_change", &deeplog::remove_file_action::data_change); + + pybind11::class_>(module, "TensorLink") + .def(pybind11::init, std::string>(), + pybind11::arg("extend"), pybind11::arg("flatten_sequence"), pybind11::arg("update")) + .def_readonly("extend", &deeplog::tensor_link::extend) + .def_readonly("flatten_sequence", &deeplog::tensor_link::flatten_sequence) + .def_readonly("update", &deeplog::tensor_link::update) + .def("__getitem__", [](const deeplog::tensor_link &link, const std::string &key) -> std::variant, std::optional> { + if (key == "extend") { + return link.extend; + } else if (key == "flatten_sequence") { + return link.flatten_sequence; + } else if (key == "update") { + return link.update; + } else { + throw std::runtime_error("Invalid key"); + } + }); + } +} diff --git a/cpp/py_api/deeplog/actions/actions.hpp b/cpp/py_api/deeplog/actions/actions.hpp new file mode 100644 index 0000000000..7ac00f0b2c --- /dev/null +++ b/cpp/py_api/deeplog/actions/actions.hpp @@ -0,0 +1,15 @@ +#pragma once + +#include +#include + +#include "../../../deeplog/actions/action.hpp" + +namespace py_api { + + class actions { + + public: + static void pybind(pybind11::module &); + }; +} diff --git a/cpp/py_api/deeplog/deeplog.cpp b/cpp/py_api/deeplog/deeplog.cpp new file mode 100644 index 0000000000..d57bc44be6 --- /dev/null +++ b/cpp/py_api/deeplog/deeplog.cpp @@ -0,0 +1,57 @@ +#include + +#include +#include "../../deeplog/deeplog.hpp" +#include "deeplog.hpp" +#include "../../deeplog/snapshot.hpp" +#include "../../deeplog/metadata_snapshot.hpp" +#include "../../deeplog/deeplog_v3.hpp" +#include "../../deeplog/optimistic_transaction.hpp" +#include "../storage/py_storage.hpp" + +namespace py_api { + void deeplog::pybind(pybind11::module &module) { + pybind11::class_<::deeplog::deeplog, std::shared_ptr<::deeplog::deeplog>>(module, "DeepLog") + .def_static("open", [](pybind11::object storage) { + return ::deeplog::deeplog::open(std::make_shared(py_storage(std::move(storage)))); + }, pybind11::arg("storage")) + .def_static("create", [](pybind11::object storage, int log_format) { + return ::deeplog::deeplog::create(std::make_shared(py_storage(std::move(storage))), log_format); + }, pybind11::arg("storage"), pybind11::arg("log_format")) + .def("log_format", &::deeplog::deeplog::log_format) + .def("version", &::deeplog::deeplog::version) + .def("checkpoint", &::deeplog::deeplog::checkpoint); + + pybind11::class_<::deeplog::deeplog_v3, ::deeplog::deeplog, std::shared_ptr<::deeplog::deeplog_v3>>(module, "DeepLogV3") + .def("log_format", &::deeplog::deeplog::log_format); + + pybind11::class_<::deeplog::snapshot, std::shared_ptr<::deeplog::snapshot>>(module, "DeepLogSnapshot") + .def(pybind11::init &>(), + pybind11::arg("branch_id"), pybind11::arg("deeplog")) + .def(pybind11::init &>(), + pybind11::arg("branch_id"), pybind11::arg("version"), 
pybind11::arg("deeplog")) + .def("data_files", &::deeplog::snapshot::data_files) + .def("commits", &::deeplog::snapshot::commits) + .def("tensors", &::deeplog::snapshot::tensors) + .def_readonly("version", &::deeplog::snapshot::version) + .def_readonly("branch_id", &::deeplog::snapshot::branch_id); + + pybind11::class_<::deeplog::metadata_snapshot, std::shared_ptr<::deeplog::metadata_snapshot>>(module, "MetadataSnapshot") + .def(pybind11::init &>(), + pybind11::arg("deeplog")) + .def(pybind11::init &>(), + pybind11::arg("version"), pybind11::arg("deeplog")) + .def("protocol", &::deeplog::metadata_snapshot::protocol) + .def("metadata", &::deeplog::metadata_snapshot::metadata) + .def("branches", &::deeplog::metadata_snapshot::branches) + .def("find_branch", &::deeplog::metadata_snapshot::find_branch) + .def_readonly("version", &::deeplog::metadata_snapshot::version); + + pybind11::class_<::deeplog::optimistic_transaction>(module, "OptimisticTransaction") + .def(pybind11::init &>(), + pybind11::arg("snapshot")) + .def("add", &::deeplog::optimistic_transaction::add, pybind11::arg("action")) + .def("commit", &::deeplog::optimistic_transaction::commit) + .def_readonly("snapshot", &::deeplog::optimistic_transaction::snapshot); + } +} diff --git a/cpp/py_api/deeplog/deeplog.hpp b/cpp/py_api/deeplog/deeplog.hpp new file mode 100644 index 0000000000..e8aef56c6d --- /dev/null +++ b/cpp/py_api/deeplog/deeplog.hpp @@ -0,0 +1,9 @@ +#pragma once + +namespace py_api { + class deeplog { + + public: + static void pybind(pybind11::module &); + }; +} diff --git a/cpp/py_api/py_api.cpp b/cpp/py_api/py_api.cpp new file mode 100644 index 0000000000..44ba6943a6 --- /dev/null +++ b/cpp/py_api/py_api.cpp @@ -0,0 +1,16 @@ +#include +#include "deeplake.hpp" +#include "deeplog/deeplog.hpp" +#include "deeplog/actions/actions.hpp" + +PYBIND11_MAKE_OPAQUE(std::optional); + +PYBIND11_MODULE(_deeplake, mod_deeplake) { + auto mod_deeplog = mod_deeplake.def_submodule("_deeplog"); + auto mod_actions = mod_deeplog.def_submodule("_actions"); + + py_api::deeplake::pybind(mod_deeplake); + py_api::deeplog::pybind(mod_deeplog); + py_api::actions::pybind(mod_actions); + +} diff --git a/cpp/py_api/storage/py_storage.cpp b/cpp/py_api/storage/py_storage.cpp new file mode 100644 index 0000000000..364da3c2dd --- /dev/null +++ b/cpp/py_api/storage/py_storage.cpp @@ -0,0 +1,63 @@ +#include "py_storage.hpp" +#include "spdlog/spdlog.h" + +#include +#include + +namespace py_api { + + std::string correct_path(const std::string &path) { + std::string result = path; + if (result.find('/') == 0) { + result = result.substr(1); + } + return result; + } + + py_storage::py_storage(pybind11::object obj) : _wrapped_storage(std::move(obj)) { + + } + + ::storage::file_ref py_storage::file(const std::string &path) const { + try { + long bytes = _wrapped_storage.attr("get_object_size")(correct_path(path)).cast(); + return {path, bytes}; + } catch (const pybind11::error_already_set &e) { + spdlog::debug("File does not exist: {}", path); +// spdlog::debug("Caught exception: {}", e.what()); + return {path, -1}; + } + + } + + std::vector<::storage::file_ref> py_storage::list_files(const std::string &base_dir) const { + std::vector<::storage::file_ref> files {}; + + for (auto file_name: _wrapped_storage.attr("__iter__")()) { + files.push_back(file(file_name.cast())); + } + + return files; + } + + std::vector py_storage::get_bytes(const std::string &path) const { + auto bytes = _wrapped_storage.attr("get_bytes")(correct_path(path)).cast(); + std::vector result 
{}; + for (auto byte: bytes) { + result.push_back(byte.cast()); + } + + spdlog::debug("Read {} bytes from path {}", result.size(), path); + + return result; + + } + + void py_storage::set_bytes(const std::string &path, const std::string &data) const { + spdlog::debug("Writing {} bytes to path {}", data.size(), path); + + _wrapped_storage.attr("set_bytes")(correct_path(path), pybind11::bytes(data)); + _wrapped_storage.attr("flush")(); + } + +} \ No newline at end of file diff --git a/cpp/py_api/storage/py_storage.hpp b/cpp/py_api/storage/py_storage.hpp new file mode 100644 index 0000000000..a5c05c8024 --- /dev/null +++ b/cpp/py_api/storage/py_storage.hpp @@ -0,0 +1,27 @@ +#pragma once + +#include +#include +#include "../../storage/storage.hpp" +#include "../../storage/file_ref.hpp" +#include "spdlog/spdlog.h" + +namespace py_api { + + class py_storage : public ::storage::storage { + + public: + py_storage(pybind11::object obj); + + ::storage::file_ref file(const std::string &path) const override; + + std::vector<::storage::file_ref> list_files(const std::string &base_dir) const override; + + std::vector get_bytes(const std::string &path) const override; + + void set_bytes(const std::string &path, const std::string &data) const override; + + private: + pybind11::object _wrapped_storage; + }; +} \ No newline at end of file diff --git a/cpp/storage/CMakeLists.txt b/cpp/storage/CMakeLists.txt new file mode 100644 index 0000000000..4c69069b6d --- /dev/null +++ b/cpp/storage/CMakeLists.txt @@ -0,0 +1,9 @@ +project(storage) + +include(FetchContent) + +file(GLOB_RECURSE SOURCES "*.cpp") + +add_library(storage ${SOURCES}) + +target_link_libraries(storage PUBLIC spdlog::spdlog) diff --git a/cpp/storage/file_ref.cpp b/cpp/storage/file_ref.cpp new file mode 100644 index 0000000000..9f564fadb4 --- /dev/null +++ b/cpp/storage/file_ref.cpp @@ -0,0 +1 @@ +#include "file_ref.hpp" diff --git a/cpp/storage/file_ref.hpp b/cpp/storage/file_ref.hpp new file mode 100644 index 0000000000..32f1856d34 --- /dev/null +++ b/cpp/storage/file_ref.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include + +namespace storage { + struct file_ref { + std::string path; + long size; + bool exists(); + + file_ref(std::string path, long size) : path(std::move(path)), size(size) {} + + bool operator < (const file_ref &other) const { + return path < other.path; + } + }; +} diff --git a/cpp/storage/local_storage.cpp b/cpp/storage/local_storage.cpp new file mode 100644 index 0000000000..a5f4faf394 --- /dev/null +++ b/cpp/storage/local_storage.cpp @@ -0,0 +1,84 @@ +#include "local_storage.hpp" + +#include +#include +#include +#include + +namespace storage { + + local_storage::local_storage(std::string path) : path_(std::filesystem::absolute(path)) { + std::filesystem::create_directories(path_); + } + + std::filesystem::path local_storage::full_path(const std::string &path) const { + auto sub_path = path; + if (sub_path.find('/') == 0) { + sub_path = path.substr(1); + } + return path_ / std::filesystem::path(sub_path); + } + + file_ref local_storage::file(const std::string &path) const { + auto file_path = full_path(path); + if (std::filesystem::exists(file_path)) { + if (std::filesystem::is_regular_file(file_path)) { + return file_ref(path, std::filesystem::file_size(file_path)); + } else { + return file_ref(path, 0); + } + } else { + return file_ref(path, -1); + } + } + + std::vector local_storage::list_files(const std::string &base_dir) const { + auto base_dir_path = full_path(base_dir); + std::vector files; + for (const auto &entry: 
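/* Editor's note: list_files returns paths with a leading '/' relative to the
   storage root (see the "/" + relative(...) below), while full_path() here and
   correct_path() in py_storage strip that prefix on the way back in, so
   callers can treat dataset paths as root-anchored. */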
std::filesystem::directory_iterator(base_dir_path)) {
+            if (entry.is_regular_file()) {
+                files.push_back(file_ref("/" + std::filesystem::relative(entry.path(), path_).string(), entry.file_size()));
+            }
+        }
+        return files;
+    }
+
+    std::vector<unsigned char> local_storage::get_bytes(const std::string &path) const {
+        auto final_path = full_path(path);
+
+        auto file = std::ifstream(final_path, std::ios::binary);
+
+        if (!file.is_open()) {
+            throw std::runtime_error("Error opening file: " + final_path.string());
+        }
+
+        file.seekg(0, std::ios::end);
+        std::streampos fileSize = file.tellg();
+        file.seekg(0, std::ios::beg);
+
+        if (fileSize <= 0) {
+            return {};
+        }
+
+        std::vector<unsigned char> return_data(static_cast<size_t>(fileSize));
+        file.read(reinterpret_cast<char *>(return_data.data()), fileSize);
+
+
+        if (!file) {
+            throw std::runtime_error("Error reading file: " + final_path.string());
+        }
+        file.close();
+
+        return return_data;
+    }
+
+    void local_storage::set_bytes(const std::string &path, const std::string &data) const {
+        auto final_path = full_path(path);
+        std::filesystem::create_directories(final_path.parent_path());
+
+        std::ofstream stream {final_path, std::ios::binary};
+        stream << data;
+        stream.close();
+    }
+
+}
\ No newline at end of file
diff --git a/cpp/storage/local_storage.hpp b/cpp/storage/local_storage.hpp
new file mode 100644
index 0000000000..611085d4f0
--- /dev/null
+++ b/cpp/storage/local_storage.hpp
@@ -0,0 +1,27 @@
+#pragma once
+
+#include "storage.hpp"
+#include <string>
+#include <vector>
+#include <filesystem>
+
+namespace storage {
+    class local_storage : public ::storage::storage {
+
+    public:
+        local_storage(std::string path);
+
+        file_ref file(const std::string &path) const override;
+
+        std::vector<file_ref> list_files(const std::string &base_dir) const override;
+
+        std::vector<unsigned char> get_bytes(const std::string &path) const override;
+
+        void set_bytes(const std::string &path, const std::string &data) const override;
+
+    private:
+        std::filesystem::path path_;
+
+        std::filesystem::path full_path(const std::string &path) const;
+    };
+}
diff --git a/cpp/storage/storage.cpp b/cpp/storage/storage.cpp
new file mode 100644
index 0000000000..34b46e67bf
--- /dev/null
+++ b/cpp/storage/storage.cpp
@@ -0,0 +1,8 @@
+#include "storage.hpp"
+
+namespace storage {
+
+    bool file_ref::exists() {
+        return size >= 0;
+    }
+}
\ No newline at end of file
diff --git a/cpp/storage/storage.hpp b/cpp/storage/storage.hpp
new file mode 100644
index 0000000000..7fa7c99154
--- /dev/null
+++ b/cpp/storage/storage.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include "file_ref.hpp"
+
+namespace storage {
+    class storage {
+    public:
+        virtual file_ref file(const std::string &path) const = 0;
+
+        virtual std::vector<file_ref> list_files(const std::string &base_dir) const = 0;
+
+        virtual std::vector<unsigned char> get_bytes(const std::string &path) const = 0;
+
+        virtual void set_bytes(const std::string &path, const std::string &data) const = 0;
+    };
+}
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
new file mode 100644
index 0000000000..785f181be2
--- /dev/null
+++ b/cpp/tests/CMakeLists.txt
@@ -0,0 +1,18 @@
+project(tests)
+
+set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+enable_testing()
+
+
+file(GLOB_RECURSE TEST_SOURCES "*.cpp")
+
+add_executable(runTests ${TEST_SOURCES} ${BACKWARD_ENABLE})
+add_backward(runTests)
+
+target_link_libraries(runTests
+        deeplog
+        GTest::gtest_main
+)
+
+include(GoogleTest)
+gtest_discover_tests(runTests)
diff --git a/cpp/tests/deeplog/actions/common_action_test.cpp b/cpp/tests/deeplog/actions/common_action_test.cpp
new file mode 100644
index
0000000000..5fb24daaac --- /dev/null +++ b/cpp/tests/deeplog/actions/common_action_test.cpp @@ -0,0 +1,176 @@ +#include +#include +#include +#include +#include +#include "../../../deeplog/actions/action.hpp" +#include "../../../deeplog/actions/add_file_action.hpp" +#include "../../../deeplog/actions/create_branch_action.hpp" +#include "../../../deeplog/actions/create_commit_action.hpp" +#include "../../../deeplog/actions/create_tensor_action.hpp" +#include "../../../deeplog/actions/metadata_action.hpp" +#include "../../../deeplog/actions/protocol_action.hpp" +#include "../../../deeplog/actions/remove_file_action.hpp" +#include "../../../deeplog/json_parser.hpp" + +enum ActionDataContentsType { + FILLED, + NULLS, + NEGATIVES, + ZEROS, +}; + +template +class ActionSerializationTest : public testing::Test { +public: + void test_data_conversions(ActionDataContentsType test_type) { + auto original_action_json = nlohmann::json::object(); + for (const auto &field: ActionType::arrow_type->fields()) { + const auto field_type = field->type(); + if (test_type == NULLS && field->nullable()) { + original_action_json[field->name()] = nlohmann::json::value_t::null; + continue; + } + + if (field_type->Equals(arrow::utf8())) { + original_action_json[field->name()] = "my/path"; + } else if (field_type->Equals(arrow::uint64())) { + if (test_type == ZEROS) { + original_action_json[field->name()] = 0; + } else { + original_action_json[field->name()] = 873731; + } + } else if (field_type->Equals(arrow::int64()) || field_type->Equals(arrow::int32())) { + if (test_type == ZEROS) { + original_action_json[field->name()] = 0; + } else if (test_type == NEGATIVES) { + original_action_json[field->name()] = -54812; + } else { + original_action_json[field->name()] = 873731; + } + } else if (field_type->Equals(arrow::boolean())) { + if (test_type == NEGATIVES) { + original_action_json[field->name()] = false; + } else { + original_action_json[field->name()] = true; + } + } else if (field_type->Equals(arrow::list(arrow::uint64()))) { + std::vector data = {1, + 5, + 13131}; + + auto auto_data1 = std::map(); + + original_action_json[field->name()] = data; + } else if (field_type->Equals(deeplog::create_tensor_action::arrow_type->GetFieldByName("links")->type())) { + auto data = nlohmann::json::object(); + + data["key1"] = deeplog::tensor_link("ext1", true, "up1").to_json(); + data["key2"] = deeplog::tensor_link("ext2", true, "up2").to_json(); + + original_action_json[field->name()] = data; + } else { + throw std::runtime_error("No test data generation configured for type " + field_type->ToString()); + } + } + + auto table_json = "{\"x\":" + original_action_json.dump() + "}"; + auto input = std::make_shared(std::make_shared(table_json)); + + auto temp_schema = std::make_shared(arrow::FieldVector{ + arrow::field("x", ActionType::arrow_type), + }); + + std::shared_ptr buffer_builder = arrow::RecordBatchBuilder::Make(temp_schema, arrow::default_memory_pool(), 1).ValueOrDie(); + ASSERT_EQ("", deeplog::json_parser::parse(input, buffer_builder).message()); + + auto batch = buffer_builder->Flush().ValueOrDie(); + auto table = arrow::Table::FromRecordBatches({batch}).ValueOrDie(); + + auto parsed_scalar = std::dynamic_pointer_cast(table->GetColumnByName("x")->chunk(0)->GetScalar(0).ValueOrDie()); + + auto from_arrow_action = ActionType(parsed_scalar); + + EXPECT_EQ(original_action_json.dump(), from_arrow_action.to_json().dump()); + } +}; + + +using ActionTypes = ::testing::Types< + deeplog::add_file_action, + 
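/* The typed suite runs each TYPED_TEST below once per action type in this
   list, so a new action gets round-trip JSON/Arrow coverage just by being
   appended here. */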
deeplog::create_branch_action, + deeplog::create_commit_action, + deeplog::create_tensor_action, + deeplog::metadata_action, + deeplog::protocol_action, + deeplog::remove_file_action +>; +TYPED_TEST_SUITE(ActionSerializationTest, ActionTypes); + + +TYPED_TEST(ActionSerializationTest, FilledData) { + this->test_data_conversions(FILLED); +} + +//TYPED_TEST(ActionSerializationTest, NullValues) { +// this->test_data_conversions(NULLS); +//} + +TYPED_TEST(ActionSerializationTest, NegativeValues) { + this->test_data_conversions(NEGATIVES); +} + +TYPED_TEST(ActionSerializationTest, ZeroValues) { + this->test_data_conversions(ZEROS); +} + +//class ActionDataHandlingTest : public ::testing::TestWithParam>> { +// +//}; + +//TEST_P(ActionDataHandlingTest, auto_test) { +// auto [test_type, handler] = GetParam(); +// +// auto original_action_json = nlohmann::json::object(); +// for (auto field: handler->arrow_type()->fields()) { +// if (test_type == NULLS && field->nullable()) { +// original_action_json[field->name()] = nlohmann::json::value_t::null; +// continue; +// } +// +// if (field->type()->Equals(arrow::utf8())) { +// original_action_json[field->name()] = "my/path"; +// } else if (field->type()->Equals(arrow::uint64())) { +// if (test_type == ZEROS) { +// original_action_json[field->name()] = 0; +// } else { +// original_action_json[field->name()] = 873731; +// } +// } else if (field->type()->Equals(arrow::boolean())) { +// if (test_type == NEGATIVES) { +// original_action_json[field->name()] = false; +// } else { +// original_action_json[field->name()] = true; +// } +// } else { +// throw std::runtime_error("Unsupported type " + field->type()->ToString()); +// } +// } +// +// auto table_json = "{\"x\":" + original_action_json.dump() + "}"; +// auto input = std::make_shared(std::make_shared(table_json)); +// +// auto parse_options = arrow::json::ParseOptions::Defaults(); +// parse_options.explicit_schema = std::make_shared(arrow::FieldVector{ +// arrow::field("x", handler->arrow_type()), +// }); +// +// auto table = arrow::json::TableReader::Make(arrow::default_memory_pool(), input, arrow::json::ReadOptions::Defaults(), +// parse_options).ValueOrDie()->Read().ValueOrDie(); +// +// auto parsed_scalar = std::dynamic_pointer_cast(table->GetColumnByName("x")->chunk(0)->GetScalar(0).ValueOrDie()); +// +// auto from_arrow_action = handler->create_action(parsed_scalar); +// +// EXPECT_EQ(original_action_json.dump(), from_arrow_action->to_json().dump()); +//} \ No newline at end of file diff --git a/cpp/tests/deeplog/base_test.cpp b/cpp/tests/deeplog/base_test.cpp new file mode 100644 index 0000000000..ddb21b617f --- /dev/null +++ b/cpp/tests/deeplog/base_test.cpp @@ -0,0 +1 @@ +#include "base_test.hpp" diff --git a/cpp/tests/deeplog/base_test.hpp b/cpp/tests/deeplog/base_test.hpp new file mode 100644 index 0000000000..d5f98f0374 --- /dev/null +++ b/cpp/tests/deeplog/base_test.hpp @@ -0,0 +1,31 @@ +#pragma once + +#include +#include + +class base_test : public ::testing::Test { +protected: + void SetUp() override { + if (std::filesystem::exists(test_dir)) { + std::filesystem::remove_all(test_dir); + } + } + + void TearDown() override { + if (std::filesystem::exists(test_dir)) { + std::filesystem::remove_all(test_dir); + } + } + + std::set list_log_files(const std::string &branch_id) { + auto files = std::set < std::string > (); + std::filesystem::path dir_path = {test_dir + "/_deeplake_log/" + branch_id + "/"}; + for (const auto &entry: std::filesystem::directory_iterator(dir_path)) { + 
files.insert(entry.path().string().substr((test_dir + "/_deeplake_log/" + branch_id + "/").size())); + } + + return files; + } + + std::string test_dir = "tmp/test"; +}; diff --git a/cpp/tests/deeplog/deeplog_test.cpp b/cpp/tests/deeplog/deeplog_test.cpp new file mode 100644 index 0000000000..86f86ee003 --- /dev/null +++ b/cpp/tests/deeplog/deeplog_test.cpp @@ -0,0 +1,271 @@ +#include +#include +#include "../../deeplog/deeplog.hpp" +#include "../../deeplog/actions/protocol_action.hpp" +#include "../../deeplog/actions/metadata_action.hpp" +#include "../../deeplog/last_checkpoint.hpp" +#include "../../deeplog/snapshot.hpp" +#include "../../deeplog/metadata_snapshot.hpp" +#include "../../deeplog/optimistic_transaction.hpp" +#include "base_test.hpp" +#include +#include +#include +#include +#include +#include +#include + +class DeeplogTest : public base_test { }; + + +TEST_F(DeeplogTest, create) { + auto log = deeplog::deeplog::create(test_dir, 4); + + ASSERT_TRUE(std::filesystem::exists({test_dir + "/_deeplake_log/"})); + ASSERT_EQ(std::set < std::string > {"00000000000000000001.json"}, list_log_files(deeplog::META_BRANCH_ID)); + + std::ifstream ifs(test_dir + "/_deeplake_log/_meta/00000000000000000001.json"); + std::ostringstream json_string_stream; + json_string_stream << ifs.rdbuf(); + auto json_string = json_string_stream.str(); + + EXPECT_FALSE(json_string.starts_with("[")); + EXPECT_TRUE(json_string.find("protocol") != std::string::npos); + EXPECT_TRUE(json_string.find("metadata") != std::string::npos); + EXPECT_TRUE(json_string.find("branch") != std::string::npos); + + auto meta_snapshot = deeplog::metadata_snapshot(log); + + EXPECT_EQ(1, meta_snapshot.branches().size()); + EXPECT_EQ("main", meta_snapshot.branches().at(0)->name); + EXPECT_EQ(4, meta_snapshot.protocol()->min_reader_version); + EXPECT_EQ(4, meta_snapshot.protocol()->min_writer_version); + + EXPECT_NE("", meta_snapshot.metadata()->id); + EXPECT_NE(0, meta_snapshot.metadata()->created_time); + EXPECT_FALSE(meta_snapshot.metadata()->name.has_value()); + EXPECT_FALSE(meta_snapshot.metadata()->description.has_value()); + + auto snapshot = deeplog::snapshot("main", 0, log); + const auto files = snapshot.data_files(); + EXPECT_EQ(0, files.size()); + + EXPECT_THROW(auto ignore = deeplog::deeplog::create(test_dir, 4), std::runtime_error) << "Should not be able to create log twice"; +} + +TEST_F(DeeplogTest, open) { + auto ignore = deeplog::deeplog::create(test_dir, 4); + + auto log = deeplog::deeplog::open(test_dir); + + EXPECT_EQ(1, log->version(deeplog::META_BRANCH_ID)); +} + +TEST_F(DeeplogTest, version) { + auto log = deeplog::deeplog::create(test_dir, 4); + EXPECT_EQ(1, log->version(deeplog::META_BRANCH_ID)); + + EXPECT_EQ(0, log->version(deeplog::metadata_snapshot(log).find_branch("main")->id)); +} + +TEST_F(DeeplogTest, find_branch) { + auto log = deeplog::deeplog::create(test_dir, 4); + + auto main_branch = deeplog::metadata_snapshot(log).find_branch("main"); + EXPECT_EQ("main", main_branch->name); + EXPECT_NE("", main_branch->id); + +} + + +TEST_F(DeeplogTest, commit_protocol) { + auto log = deeplog::deeplog::create(test_dir, 4); + + auto action = deeplog::protocol_action(5, 6); + log->commit(deeplog::META_BRANCH_ID, 1, {std::make_shared(action)}); + + EXPECT_EQ((std::set < std::string > {"00000000000000000001.json", "00000000000000000002.json"}), list_log_files(deeplog::META_BRANCH_ID)); + std::ifstream ifs(test_dir + "/_deeplake_log/" + deeplog::META_BRANCH_ID + "/00000000000000000001.json"); + std::ostringstream 
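/* Log entries are written one file per version, named as the zero-padded
   version number (00000000000000000001.json is version 1), so a lexicographic
   listing of a branch directory is already in commit order. */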
json_string_stream; + json_string_stream << ifs.rdbuf(); + auto json_string = json_string_stream.str(); + + EXPECT_NE(json_string.find("protocol"), std::string::npos); + + EXPECT_EQ(5, deeplog::metadata_snapshot(log).protocol()->min_reader_version); + EXPECT_EQ(6, deeplog::metadata_snapshot(log).protocol()->min_writer_version); +} + +TEST_F(DeeplogTest, commit_metadata) { + auto log = deeplog::deeplog::create(test_dir, 4); + + auto original_metadata = deeplog::metadata_snapshot(log).metadata(); + auto action = deeplog::metadata_action(original_metadata->id, "new name", "new desc", original_metadata->created_time); + log->commit(deeplog::META_BRANCH_ID, log->version(deeplog::META_BRANCH_ID), {std::make_shared(action)}); + + EXPECT_EQ((std::set < std::string > {"00000000000000000001.json", "00000000000000000002.json"}), list_log_files(deeplog::META_BRANCH_ID)); + std::ifstream ifs(test_dir + "/_deeplake_log/" + deeplog::META_BRANCH_ID + "/00000000000000000002.json"); + std::ostringstream json_string_stream; + json_string_stream << ifs.rdbuf(); + auto json_string = json_string_stream.str(); + + EXPECT_NE(json_string.find("metadata"), std::string::npos); + + auto new_metadata = deeplog::metadata_snapshot(log).metadata(); + EXPECT_EQ(original_metadata->id, new_metadata->id); + EXPECT_EQ(original_metadata->created_time, new_metadata->created_time); + EXPECT_EQ("new name", new_metadata->name); + EXPECT_EQ("new desc", new_metadata->description); +} + +TEST_F(DeeplogTest, commit_add_file) { + auto log = deeplog::deeplog::create(test_dir, 4); + auto main_id = deeplog::metadata_snapshot(log).find_branch("main")->id; + + auto action = deeplog::add_file_action("my/path", "chunk", 3, 45, true, 3); + log->commit(main_id, log->version(main_id), {std::make_shared(action)}); + + EXPECT_EQ((std::set < std::string > {"00000000000000000001.json"}), list_log_files(main_id)); + std::ifstream ifs(test_dir + "/_deeplake_log/" + main_id + "/00000000000000000001.json"); + std::ostringstream json_string_stream; + json_string_stream << ifs.rdbuf(); + auto json_string = json_string_stream.str(); + + EXPECT_NE(json_string.find("add"), std::string::npos); + + auto files = deeplog::snapshot(main_id, 1, log).data_files(); + + EXPECT_EQ(1, files.size()); + EXPECT_EQ("my/path", files.at(0)->path); + EXPECT_EQ(3, files.at(0)->size); + EXPECT_EQ(45, files.at(0)->modification_time); +} + +TEST_F(DeeplogTest, commit_create_branch) { + auto log = deeplog::deeplog::create(test_dir, 4); + + auto action = deeplog::create_branch_action("123", "branch1", deeplog::META_BRANCH_ID, 0); + log->commit(deeplog::META_BRANCH_ID, log->version(deeplog::META_BRANCH_ID), {std::make_shared(action)}); + + EXPECT_EQ((std::set < std::string > {"00000000000000000001.json", "00000000000000000002.json"}), list_log_files(deeplog::META_BRANCH_ID)); + std::ifstream ifs(test_dir + "/_deeplake_log/" + deeplog::META_BRANCH_ID + "/00000000000000000002.json"); + std::ostringstream json_string_stream; + json_string_stream << ifs.rdbuf(); + auto json_string = json_string_stream.str(); + + EXPECT_NE(json_string.find("branch"), std::string::npos); + + auto branches = deeplog::metadata_snapshot(log).branches(); + + EXPECT_EQ(2, branches.size()); + EXPECT_NE("", (branches).at(0)->id); + EXPECT_EQ("main", (branches).at(0)->name); + + EXPECT_EQ("123", (branches).at(1)->id); + EXPECT_EQ("branch1", (branches).at(1)->name); +} + +TEST_F(DeeplogTest, checkpoint) { + auto log = deeplog::deeplog::create(test_dir, 4); + + auto main_id = 
deeplog::metadata_snapshot(log).find_branch("main")->id; + auto original_metadata = deeplog::metadata_snapshot(log).metadata(); + for (int i = 0; i <= 3; ++i) { + auto action = deeplog::metadata_action(original_metadata->id, "name " + std::to_string(i), "desc " + std::to_string(i), original_metadata->created_time); + log->commit(deeplog::META_BRANCH_ID, log->version(deeplog::META_BRANCH_ID), {std::make_shared(action)}); + } + + for (int i = 0; i < 4; ++i) { + auto action = deeplog::add_file_action("my/path" + std::to_string(i), "chunk", 3, 45, true, 10); + log->commit(main_id, log->version(main_id), {std::make_shared(action)}); + } + + EXPECT_EQ(5, log->version(deeplog::META_BRANCH_ID)); + EXPECT_EQ(4, log->version(main_id)); + + EXPECT_EQ(5, list_log_files(deeplog::META_BRANCH_ID).size()); + EXPECT_EQ(4, list_log_files(main_id).size()); + + auto new_metadata = deeplog::metadata_snapshot(log).metadata(); + EXPECT_EQ(original_metadata->id, new_metadata->id); + EXPECT_EQ(original_metadata->created_time, new_metadata->created_time); + EXPECT_EQ("name 3", new_metadata->name); + EXPECT_EQ("desc 3", new_metadata->description); + + log->checkpoint(deeplog::META_BRANCH_ID); + EXPECT_TRUE(list_log_files(deeplog::META_BRANCH_ID).contains("00000000000000000005.checkpoint.parquet")); + EXPECT_TRUE(list_log_files(deeplog::META_BRANCH_ID).contains("_last_checkpoint.json")); + + std::ifstream ifs(test_dir + "/_deeplake_log/" + deeplog::META_BRANCH_ID + "/_last_checkpoint.json"); + deeplog::last_checkpoint checkpoint_content = nlohmann::json::parse(ifs).template get(); + EXPECT_EQ(5, checkpoint_content.version); + + + //delete json files so loads after checkpoint doesn't use it + for (auto file: list_log_files(deeplog::META_BRANCH_ID)) { + if (file != "_last_checkpoint.json" && file.ends_with(".json")) { + std::filesystem::remove(test_dir + "/_deeplake_log/" + deeplog::META_BRANCH_ID + "/" + file); + } + } + ASSERT_FALSE(list_log_files(deeplog::META_BRANCH_ID).contains("00000000000000000001.json")); + + auto new_log = deeplog::deeplog::open(test_dir); + new_metadata = deeplog::metadata_snapshot(new_log).metadata(); + EXPECT_EQ(5, new_log->version(deeplog::META_BRANCH_ID)); + EXPECT_EQ(original_metadata->id, new_metadata->id); + EXPECT_EQ("name 3", new_metadata->name); +} + +TEST_F(DeeplogTest, checkpoint_collapses_actions) { + auto log = deeplog::deeplog::create(test_dir, 4); + + auto original_metadata = deeplog::metadata_snapshot(log).metadata(); + + log->commit(deeplog::META_BRANCH_ID, log->version(deeplog::META_BRANCH_ID), {std::make_shared(deeplog::metadata_action(original_metadata->id, "first name", "first desc", original_metadata->created_time))}); + log->commit(deeplog::META_BRANCH_ID, log->version(deeplog::META_BRANCH_ID), {std::make_shared(deeplog::metadata_action(original_metadata->id, "final name", "final desc", original_metadata->created_time))}); + + log->checkpoint(deeplog::META_BRANCH_ID); + + ASSERT_TRUE(list_log_files(deeplog::META_BRANCH_ID).contains("00000000000000000003.checkpoint.parquet")); + + auto checkpoint_file = arrow::io::ReadableFile::Open(test_dir + "/_deeplake_log/" + deeplog::META_BRANCH_ID + "/00000000000000000003.checkpoint.parquet").ValueOrDie(); + std::unique_ptr arrow_reader; + EXPECT_TRUE(parquet::arrow::OpenFile(checkpoint_file, arrow::default_memory_pool(), &arrow_reader).ok()); + + std::shared_ptr table; + EXPECT_TRUE(arrow_reader->ReadTable(&table).ok()); + + auto metadata_values = arrow::compute::DropNull(table->GetColumnByName("metadata")).ValueOrDie(); + 
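// The two metadata commits above were collapsed: the checkpoint parquet keeps
// a single non-null metadata row carrying the values of the final commit
// ("final name"/"final desc"), which the assertions below verify.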
EXPECT_EQ(1, metadata_values.chunked_array()->length()); + EXPECT_EQ("final name", std::dynamic_pointer_cast(metadata_values.chunked_array()->GetScalar(0).ValueOrDie())->field("name").ValueOrDie()->ToString()); + EXPECT_EQ("final desc", std::dynamic_pointer_cast(metadata_values.chunked_array()->GetScalar(0).ValueOrDie())->field("description").ValueOrDie()->ToString()); +} + +TEST_F(DeeplogTest, manual) { + auto log = deeplog::deeplog::open("/Users/nvoxland/src/activeloopai/deeplake/tmp/write_deeplog_ds"); + auto version = log->version("a19ad90613ba4b5d930f5b25d100397b"); + + std::cout << version << std::endl; +} + +//TEST(IntTest, e2eTest) { +// auto test_dir = "../test-ds"; +//// auto log = deeplog::deeplog::create(test_dir, 4); +// auto log = deeplog::deeplog::open(test_dir); +// +// const auto ¤t_metadata = log->metadata(); +// std::cout << current_metadata.data->id << std::endl; +// +// for (auto file : log->data_files(deeplog::MAIN_BRANCH_ID, std::nullopt).data) { +// std::cout << file->path() << std::endl; +// } +// +// auto action = deeplog::metadata_action(current_metadata.data->id, "new name", "new desc", current_metadata.data->created_time); +// log->commit(deeplog::MAIN_BRANCH_ID, current_metadata.version, {&action}); +// +//// auto action = deeplog::add_file_action("path/to/file.txt", 15, deeplog::current_timestamp(), true); +//// log->commit(deeplog::MAIN_BRANCH_ID, 1, {&action}); +// +//// log->checkpoint(deeplog::MAIN_BRANCH_ID); +//} diff --git a/cpp/tests/deeplog/last_checkpoint_test.cpp b/cpp/tests/deeplog/last_checkpoint_test.cpp new file mode 100644 index 0000000000..602470cfd8 --- /dev/null +++ b/cpp/tests/deeplog/last_checkpoint_test.cpp @@ -0,0 +1,13 @@ +#include +#include +#include +#include "../../deeplog/last_checkpoint.hpp" + +TEST(LastCheckpoint, to_json) { + nlohmann::json j = deeplog::last_checkpoint(31, 1003); + EXPECT_EQ("{\"size\":1003,\"version\":31}", j.dump()); + + auto parsed = j.template get(); + EXPECT_EQ(1003, parsed.size); + EXPECT_EQ(31, parsed.version); +} \ No newline at end of file diff --git a/cpp/tests/deeplog/metadata_snapshot_test.cpp b/cpp/tests/deeplog/metadata_snapshot_test.cpp new file mode 100644 index 0000000000..41a054b95a --- /dev/null +++ b/cpp/tests/deeplog/metadata_snapshot_test.cpp @@ -0,0 +1,48 @@ +#include +#include +#include "../../deeplog/deeplog.hpp" +#include "../../deeplog/metadata_snapshot.hpp" +#include "base_test.hpp" + +class DeeplogMetadataSnapshotTest : public base_test { }; + +TEST_F(DeeplogMetadataSnapshotTest, construct) { + auto log = deeplog::deeplog::create(test_dir, 4); + + auto original_metadata = deeplog::metadata_snapshot(log).metadata(); + auto action = deeplog::metadata_action(original_metadata->id, "new name", "new desc", original_metadata->created_time); + log->commit(deeplog::META_BRANCH_ID, log->version(deeplog::META_BRANCH_ID), {std::make_shared(action)}); + + auto snapshot0 = deeplog::metadata_snapshot(0, log); + auto snapshot1 = deeplog::metadata_snapshot(1, log); + + EXPECT_EQ(0, snapshot0.version); + EXPECT_EQ(1, snapshot1.version); +} + +TEST_F(DeeplogMetadataSnapshotTest, branches) { + auto log = deeplog::deeplog::create(test_dir, 4); + + auto metadata = deeplog::metadata_snapshot(log); + + auto branches = metadata.branches(); + EXPECT_EQ(1, branches.size()); + EXPECT_EQ("main", branches.at(0)->name); + EXPECT_NE("", branches.at(0)->id); + EXPECT_FALSE(branches.at(0)->from_id.has_value()); + EXPECT_FALSE(branches.at(0)->from_version.has_value()); + + log->commit(deeplog::META_BRANCH_ID, 1, { 
+// std::make_shared(deeplog::protocol_action(5, 5)) + std::make_shared(deeplog::create_tensor_action("123", "tensor1", "text", "text", + 0, false, false, false, + std::nullopt, + std::nullopt, {}, std::nullopt, + {}, {}, std::nullopt, std::nullopt, true, "3.1")) + }); + + auto metadata2 = deeplog::metadata_snapshot(log); + EXPECT_EQ(2, metadata2.version); + branches = metadata2.branches(); + EXPECT_EQ(1, branches.size()); +} \ No newline at end of file diff --git a/cpp/tests/deeplog/optimistic_transaction_test.cpp b/cpp/tests/deeplog/optimistic_transaction_test.cpp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/cpp/tests/deeplog/snapshot_test.cpp b/cpp/tests/deeplog/snapshot_test.cpp new file mode 100644 index 0000000000..110c5aa002 --- /dev/null +++ b/cpp/tests/deeplog/snapshot_test.cpp @@ -0,0 +1,25 @@ +#include +#include +#include "../../deeplog/deeplog.hpp" +#include "../../deeplog/snapshot.hpp" +#include "../../deeplog/metadata_snapshot.hpp" +#include "base_test.hpp" + +class DeeplogSnapshotTest : public base_test { }; + +TEST_F(DeeplogSnapshotTest, construct) { + auto log = deeplog::deeplog::create(test_dir, 4); + auto main_id = deeplog::metadata_snapshot(log).find_branch("main")->id; + + auto action = deeplog::add_file_action("my/path", "chunk", 3, 45, true, 20); + log->commit(main_id, log->version(main_id), {std::make_shared(action)}); + + auto snapshot0 = deeplog::snapshot(main_id, 0, log); + auto snapshot1 = deeplog::snapshot(main_id, 1, log); + + EXPECT_EQ(0, snapshot0.version); + EXPECT_EQ(1, snapshot1.version); + + EXPECT_EQ(main_id, snapshot0.branch_id); + EXPECT_EQ(main_id, snapshot1.branch_id); +} diff --git a/cpp/tests/deeplog/util_test.cpp b/cpp/tests/deeplog/util_test.cpp new file mode 100644 index 0000000000..4d01e68bbf --- /dev/null +++ b/cpp/tests/deeplog/util_test.cpp @@ -0,0 +1,8 @@ +#include +#include "../../deeplog/util.hpp" + +TEST(UtilTest, generate_id) { + auto id = deeplog::generate_id(); + EXPECT_FALSE(id.empty()); + EXPECT_TRUE(id.find_first_of('-') == -1); +} \ No newline at end of file diff --git a/deeplake/__init__.py b/deeplake/__init__.py index 9f2ee7fdeb..63f2cefa89 100644 --- a/deeplake/__init__.py +++ b/deeplake/__init__.py @@ -33,6 +33,7 @@ from .htype import htype from .integrations import huggingface from .integrations import wandb +from _deeplake._deeplake import * compressions = list(SUPPORTED_COMPRESSIONS) htypes = sorted(list(HTYPE_CONFIGURATIONS)) diff --git a/deeplake/api/dataset.py b/deeplake/api/dataset.py index 3d941e4837..a93ac2d027 100644 --- a/deeplake/api/dataset.py +++ b/deeplake/api/dataset.py @@ -1,5 +1,7 @@ import os +from deeplake.deeplog import DeepLog + import deeplake import jwt import pathlib @@ -101,6 +103,7 @@ def init( check_integrity: bool = True, lock_enabled: Optional[bool] = True, lock_timeout: Optional[int] = 0, + log_format: int = 4, ): """Returns a :class:`~deeplake.core.dataset.Dataset` object referencing either a new or existing dataset. @@ -253,6 +256,7 @@ def init( "verbose": verbose, "lock_timeout": lock_timeout, "lock_enabled": lock_enabled, + "log_format": log_format, } if access_method == "stream": @@ -363,6 +367,7 @@ def empty( lock_enabled: Optional[bool] = True, lock_timeout: Optional[int] = 0, verbose: bool = True, + log_format: int = 4, ) -> Dataset: """Creates an empty dataset @@ -386,7 +391,7 @@ def empty( verbose (bool): If True, logs will be printed. Defaults to True. lock_timeout (int): Number of seconds to wait before throwing a LockException. 
If None, wait indefinitely lock_enabled (bool): If true, the dataset manages a write lock. NOTE: Only set to False if you are managing concurrent access externally. - + log_format (int): The log format to use for the dataset. One of 3 or 4. Defaults to 4. Returns: Dataset: Dataset created using the arguments provided. @@ -462,6 +467,7 @@ "verbose": verbose, "lock_timeout": lock_timeout, "lock_enabled": lock_enabled, + "log_format": log_format, } ret = dataset._load(dataset_kwargs, create=True) return ret @@ -713,6 +719,15 @@ def _reset_and_load(storage, access_method, dataset_kwargs, address, err): @staticmethod def _load(dataset_kwargs, access_method=None, create=False, check_integrity=True): + if create: + dataset_kwargs["storage"].set_deeplog( + DeepLog.create(dataset_kwargs["storage"], dataset_kwargs["log_format"]) + ) + else: + dataset_kwargs["storage"].set_deeplog( + DeepLog.open(dataset_kwargs["storage"]) + ) + if access_method in ("stream", None): ret = dataset_factory(**dataset_kwargs) if create: diff --git a/deeplake/core/chunk_engine.py b/deeplake/core/chunk_engine.py index 7cb0d41c9f..3a7ae53cb9 100644 --- a/deeplake/core/chunk_engine.py +++ b/deeplake/core/chunk_engine.py @@ -39,6 +39,7 @@ ) from deeplake.core.tiling.serialize import break_into_tiles from deeplake.core.polygon import Polygons +from deeplake.deeplog.adapters import get_tensor_metadata, parse_commit_id from deeplake.util.casting import get_empty_text_like_sample, intelligent_cast from deeplake.util.empty_sample import is_empty_list from deeplake.util.shape_interval import ShapeInterval @@ -85,6 +86,7 @@ get_tensor_meta_key, get_tensor_tile_encoder_key, get_tensor_info_key, + split_chunk_key, ) from deeplake.util.exceptions import ( GetChunkError, @@ -106,9 +108,15 @@ get_compression_type, ) from deeplake.core.sample import Sample + +from deeplake.deeplog.adapters import parse_commit_id +from deeplake.deeplog import DeepLogSnapshot, OptimisticTransaction +from deeplake.deeplog.actions import UpdateTensorAction, AddFileAction + from itertools import chain, repeat from collections.abc import Iterable from PIL import Image # type: ignore +import time class ChunkEngine: @@ -181,7 +189,6 @@ def __init__( self.base_storage = get_base_storage(cache) self._meta_cache = meta_cache self.version_state = version_state - self.name = version_state["tensor_names"].get(self.key) self.compression = None self.chunk_class = BaseChunk @@ -247,6 +254,8 @@ def __init__( self.start_chunk = None self.link_creds: Optional[LinkCreds] = None + self._staged_transaction: Optional[OptimisticTransaction] = None + @property def sample_compression(self): return self._sample_compression @@ -307,11 +316,22 @@ def min_chunk_size(self): @property def tensor_meta(self): commit_id = self.commit_id - if self._tensor_meta is None or self._tensor_meta_commit_id != commit_id: + if self.base_storage.deeplog.log_format() < 4 and ( + self._tensor_meta is None or self._tensor_meta_commit_id != commit_id + ): key = get_tensor_meta_key(self.key, commit_id) self._tensor_meta = self.meta_cache.get_deeplake_object(key, TensorMeta) self._tensor_meta_commit_id = commit_id self.meta_cache.register_deeplake_object(key, self._tensor_meta) + elif self.base_storage.deeplog.log_format() >= 4 and ( + self._tensor_meta is None or self._tensor_meta_commit_id != commit_id + ): + branch_id, branch_version = parse_commit_id(self.commit_id) + self._tensor_meta = get_tensor_metadata( + self.key, self.base_storage.deeplog, branch_id, branch_version + ) + 
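# With log format >= 4, tensor metadata is rebuilt from the deeplog snapshot for the current branch/version instead of being read from a serialized TensorMeta file. +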
self._tensor_meta_commit_id = commit_id + return self._tensor_meta @property @@ -336,18 +356,21 @@ def chunk_id_encoder(self) -> ChunkIdEncoder: or self._chunk_id_encoder_commit_id != commit_id ): commit_id = self.commit_id - key = get_chunk_id_encoder_key(self.key, commit_id) - if not self.chunk_id_encoder_exists: - enc = ChunkIdEncoder(dtype=np.uint64) - try: - self.meta_cache[key] = enc - except ReadOnlyModeError: - pass + if self.base_storage.deeplog.log_format() < 4: + key = get_chunk_id_encoder_key(self.key, commit_id) + if not self.chunk_id_encoder_exists: + enc = ChunkIdEncoder(dtype=np.uint64) + try: + self.meta_cache[key] = enc + except ReadOnlyModeError: + pass + else: + enc = self.meta_cache.get_deeplake_object(key, ChunkIdEncoder) + self.meta_cache.register_deeplake_object(key, enc) else: - enc = self.meta_cache.get_deeplake_object(key, ChunkIdEncoder) + enc = self.get_chunk_id_encoder() self._chunk_id_encoder = enc self._chunk_id_encoder_commit_id = commit_id - self.meta_cache.register_deeplake_object(key, enc) return self._chunk_id_encoder @property @@ -454,18 +477,21 @@ def tile_encoder(self) -> TileEncoder: """Gets the tile encoder from cache, if one is not found it creates a blank encoder.""" commit_id = self.commit_id if self._tile_encoder is None or self._tile_encoder_commit_id != commit_id: - key = get_tensor_tile_encoder_key(self.key, commit_id) - if not self.tile_encoder_exists: - enc = TileEncoder() - try: - self.meta_cache[key] = enc - except ReadOnlyModeError: - pass + if self.base_storage.deeplog.log_format() < 4: + key = get_tensor_tile_encoder_key(self.key, commit_id) + if not self.tile_encoder_exists: + enc = TileEncoder() + try: + self.meta_cache[key] = enc + except ReadOnlyModeError: + pass + else: + enc = self.meta_cache.get_deeplake_object(key, TileEncoder) + self._tile_encoder = enc + self._tile_encoder_commit_id = commit_id + self.meta_cache.register_deeplake_object(key, enc) else: - enc = self.meta_cache.get_deeplake_object(key, TileEncoder) - self._tile_encoder = enc - self._tile_encoder_commit_id = commit_id - self.meta_cache.register_deeplake_object(key, enc) + self._tile_encoder = TileEncoder() return self._tile_encoder @property @@ -568,6 +594,31 @@ def last_appended_chunk(self, allow_copy=True) -> Optional[BaseChunk]: self.active_appended_chunk = chunk return chunk + def get_chunk_id_encoder(self): + branch_id, branch_version = parse_commit_id(self.commit_id) + snapshot = DeepLogSnapshot(branch_id, branch_version, self.base_storage.deeplog) + file_filter = ( + lambda file: file.type == "chunk" + and split_chunk_key(file.path)[1] == self.key + ) + add_files = tuple(filter(file_filter, snapshot.data_files())) + + encoded = np.zeros((len(add_files), 2), dtype=np.uint64) + + chunk_ids = encoded[:, 0] + num_samples = encoded[:, 1] + + for i, file in enumerate(add_files): + chunk_ids[i] = ChunkIdEncoder.id_from_name(split_chunk_key(file.path)[2]) + num_samples[i] = file.num_samples + + if len(num_samples) > 0: + num_samples[0] -= 1 + np.cumsum(num_samples, out=num_samples) + + cid = ChunkIdEncoder(encoded, dtype=np.uint64) + return cid + def get_chunk(self, chunk_key: str, partial_chunk_bytes=0) -> BaseChunk: chunk = self.cache.get_deeplake_object( chunk_key, @@ -582,19 +633,43 @@ def get_chunk(self, chunk_key: str, partial_chunk_bytes=0) -> BaseChunk: def get_chunk_from_chunk_id( self, chunk_id, copy: bool = False, partial_chunk_bytes=0 ) -> BaseChunk: - chunk_key = None - try: - chunk_name = ChunkIdEncoder.name_from_id(chunk_id) - chunk_commit_id, 
tkey = self.get_chunk_commit(chunk_name) - chunk_key = get_chunk_key(tkey, chunk_name, chunk_commit_id) - chunk = self.get_chunk(chunk_key, partial_chunk_bytes=partial_chunk_bytes) - chunk.key = chunk_key - chunk.id = chunk_id - if copy and chunk_commit_id != self.commit_id: - chunk = self.copy_chunk_to_new_commit(chunk, chunk_name) - return chunk - except Exception as e: - raise GetChunkError(chunk_key) from e + if self.base_storage.deeplog.log_format() < 4: + chunk_key = None + try: + chunk_name = ChunkIdEncoder.name_from_id(chunk_id) + chunk_commit_id, tkey = self.get_chunk_commit(chunk_name) + chunk_key = get_chunk_key(tkey, chunk_name, chunk_commit_id) + chunk = self.get_chunk( + chunk_key, partial_chunk_bytes=partial_chunk_bytes + ) + chunk.key = chunk_key + chunk.id = chunk_id + if copy and chunk_commit_id != self.commit_id: + chunk = self.copy_chunk_to_new_commit(chunk, chunk_name) + return chunk + + except Exception as e: + raise GetChunkError(chunk_key) from e + else: + try: + chunk_name = ChunkIdEncoder.name_from_id(chunk_id) + branch_id, branch_version = parse_commit_id(self.commit_id) + chunk_key = None + # NOTE: Can this search be avoided if chunk_id is the same as chunk_key? + snapshot = DeepLogSnapshot( + branch_id, branch_version, self.base_storage.deeplog + ) + for file in snapshot.data_files(): + if ( + file.type == "chunk" + and split_chunk_key(file.path)[2] == chunk_name + ): + chunk_key = file.path + break + return self.get_chunk( + chunk_key, partial_chunk_bytes=partial_chunk_bytes + ) + except Exception as e: + raise GetChunkError(chunk_name) from e def get_video_chunk(self, chunk_id, copy: bool = False): """Returns video chunks. Chunk will contain presigned url to the video instead of data if the chunk is large.""" @@ -808,7 +883,7 @@ def _samples_to_chunks( extra_args = {"lengths": lengths} current_chunk = start_chunk updated_chunks: List[Optional[str]] = [] - if current_chunk is None: + if current_chunk is None or self.base_storage.deeplog.log_format() >= 4: current_chunk = self._create_new_chunk( register and start_chunk_row is not None ) @@ -948,6 +1023,185 @@ def _samples_to_chunks( if not register: return updated_chunks, tiles + def _samples_to_chunks_v4( + self, + samples, + update_commit_diff: bool = False, + update_tensor_meta: bool = True, + progressbar: bool = False, + register_creds: bool = True, + pg_callback: Optional[Callable] = None, + return_samples: bool = False, + ignore_errors: bool = False, + ): + lengths = None + orig_meta_length = self.tensor_meta.length + incoming_num_samples = len(samples) + enc_ids: List[Optional[str]] = [] + enc_count = [0] + chunk_sizes = [] + + if self.tensor_meta.htype == "text" and ( + self.chunk_class != SampleCompressedChunk + ): + lengths = np.zeros(len(samples), dtype=np.uint32) + for i, s in enumerate(samples): + try: + s = s.numpy() + except AttributeError: + pass + try: + if s.dtype.name[:3] == "str": + lengths[i] = len(str(s.reshape(()))) + except AttributeError: + try: + lengths[i] = s.__len__() + except AttributeError: # None + lengths[i] = 0 + except TypeError: # Numpy scalar str + lengths[i] = str(s).__len__() + extra_args = {"lengths": lengths} + current_chunk = self._create_new_chunk(False) + current_chunk._update_tensor_meta_length = False + enc_ids.append(current_chunk.id) + enc = self.chunk_id_encoder + tiles: Dict[int, Tuple[Tuple[int, ...], Tuple[int, ...]]] = {} + if update_commit_diff: + commit_diff = self.commit_diff + if progressbar: + pbar = tqdm(total=len(samples)) + if not isinstance(samples, list) and not ( 
isinstance(samples, np.ndarray) and self._numpy_extend_optimization_enabled + ): + # Note: in the future we can get rid of this conversion of sample compressed chunks too by predicting the compression ratio. + samples = list(samples) + verified_samples = [] + current_chunk_full = False + while len(samples) > 0: + if current_chunk_full: + num_samples_added = 0 + current_chunk_full = False + else: + initial_num_samples = len(samples) + num_samples_added = current_chunk.extend_if_has_space( + samples, update_tensor_meta=update_tensor_meta, ignore_errors=ignore_errors, **extra_args # type: ignore + ) # type: ignore + skipped_num_samples = initial_num_samples - len(samples) + incoming_num_samples -= skipped_num_samples + if register_creds: + self.register_new_creds(num_samples_added, samples) + if num_samples_added == 0: + chunk_sizes.append(current_chunk.nbytes) + current_chunk = self._create_new_chunk(False) + current_chunk._update_tensor_meta_length = False + enc_ids.append(current_chunk.id) + enc_count.append(0) + elif num_samples_added == PARTIAL_NUM_SAMPLES: + sample = samples[0] + if self.tensor_meta.is_link: + verified_samples.append(sample) + else: + if sample.is_first_write: + verified_samples.append(sample) + num_samples_added, samples, lengths = self._handle_tiled_sample( + enc, + True, + samples, + orig_meta_length, + incoming_num_samples, + None, + enc_count, + tiles, + lengths, + ) + if len(samples) > 0: + chunk_sizes.append(current_chunk.nbytes) + current_chunk = self._create_new_chunk(False) + current_chunk._update_tensor_meta_length = False + enc_ids.append(current_chunk.id) + enc_count.append(0) + elif num_samples_added == FAST_EXTEND_BAIL: + num_samples_added = 0 + samples = list(samples) + else: + current_chunk_full = True + verified_samples.extend(samples[:num_samples_added]) + num_samples_added, samples, lengths = self._handle_one_or_more_samples( + enc, + True, + samples, + num_samples_added, + [], + None, + current_chunk, + enc_count, + lengths, + ) + if progressbar: + pbar.update(num_samples_added) + elif pg_callback is not None: + pg_callback(num_samples_added) + + # add last chunk + chunk_sizes.append(current_chunk.nbytes) + + first_chunk_num_samples = enc_count[0] + self.tensor_meta.update_length(incoming_num_samples) + if enc_count: + enc_arr = enc._encoded + n = len(enc_arr) + if n: + enc_count[0] += enc_arr[-1, 1] + else: + enc_count[0] -= 1 + enc_last_seen = np.cumsum(enc_count, dtype=np.uint64) + arr = np.zeros((n + len(enc_ids), 2), dtype=np.uint64) + if n: + arr[:n] = enc_arr + new = arr[n:] + new[:, 0] = enc_ids + new[:, 1] = enc_last_seen + enc._encoded = arr + enc.is_dirty = True + + transaction = self.base_storage._staged_transaction + if transaction is None: + branch_id, version = parse_commit_id(self.commit_id) + snapshot = DeepLogSnapshot(branch_id, version, self.base_storage.deeplog) + transaction = OptimisticTransaction(snapshot) + self.base_storage._staged_transaction = transaction + + enc_count[0] = first_chunk_num_samples + assert len(enc_ids) == len(chunk_sizes) == len(enc_count) + for chunk_id, chunk_size, num_samples in zip(enc_ids, chunk_sizes, enc_count): + chunk_name = ChunkIdEncoder.name_from_id(chunk_id) + transaction.add( + AddFileAction( + get_chunk_key(self.key, chunk_name, self.commit_id), + "chunk", + chunk_size, + int(time.time()), + True, + num_samples, + ) + ) + transaction.add( + UpdateTensorAction( + id=self.key, name=self.name, **self.tensor_meta._action_kwargs() + ) + ) + + if update_commit_diff: + 
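+ # Mirror the pre-v4 bookkeeping: record how many samples this write added in the commit diff. +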
commit_diff.add_data(incoming_num_samples) + tenc = self.tile_encoder + tenc.entries.update(tiles) + tenc.is_dirty = True + if progressbar: + pbar.close() + + if return_samples: + return verified_samples + def _handle_one_or_more_samples( self, enc: ChunkIdEncoder, @@ -1036,16 +1290,26 @@ def _extend( samples, verified_samples = self._sanitize_samples( samples, pg_callback=pg_callback, ignore_errors=ignore_errors ) - samples = self._samples_to_chunks( - samples, - start_chunk=self.last_appended_chunk(allow_copy=False), - register=True, - progressbar=progressbar, - update_commit_diff=update_commit_diff, - pg_callback=pg_callback, - return_samples=True, - ignore_errors=ignore_errors, - ) + if self.base_storage.deeplog.log_format() < 4: + samples = self._samples_to_chunks( + samples, + start_chunk=self.last_appended_chunk(allow_copy=False), + register=True, + progressbar=progressbar, + update_commit_diff=update_commit_diff, + pg_callback=pg_callback, + return_samples=True, + ignore_errors=ignore_errors, + ) + else: + samples = self._samples_to_chunks_v4( + samples, + update_commit_diff=update_commit_diff, + progressbar=progressbar, + pg_callback=pg_callback, + return_samples=True, + ignore_errors=ignore_errors, + ) return verified_samples or samples def _extend_link_callback( diff --git a/deeplake/core/dataset/dataset.py b/deeplake/core/dataset/dataset.py index 2bc26dea03..126f090b9f 100644 --- a/deeplake/core/dataset/dataset.py +++ b/deeplake/core/dataset/dataset.py @@ -13,10 +13,12 @@ from tqdm import tqdm import deeplake +from deeplake.deeplog import DeepLogSnapshot, MetadataSnapshot from deeplake.core.index.index import IndexEntry from deeplake.core.link_creds import LinkCreds from deeplake.core.sample import Sample from deeplake.core.linked_sample import LinkedSample +from deeplake.deeplog.adapters import to_commit_id from deeplake.util.connect_dataset import connect_dataset_entry from deeplake.util.downsample import validate_downsampling from deeplake.util.version_control import ( @@ -144,6 +146,7 @@ from deeplake.util.pretty_print import summary_dataset from deeplake.core.dataset.view_entry import ViewEntry from deeplake.core.dataset.invalid_view import InvalidView +from deeplake.deeplog import DeepLog from deeplake.hooks import dataset_read from collections import defaultdict from itertools import chain @@ -310,6 +313,10 @@ def maybe_flush(self): self._flush_vc_info() self.storage.flush() + @property + def deeplog(self) -> DeepLog: + return self.storage.deeplog + @property def num_samples(self) -> int: """Returns the length of the smallest tensor. @@ -834,6 +841,7 @@ def _create_tensor( overwrite=True, **meta_kwargs, ) + self._refresh_snapshots() meta: DatasetMeta = self.meta ffw_dataset_meta(meta) meta.add_tensor(name, key, hidden=hidden) @@ -1350,6 +1358,19 @@ def _get_commit_id_for_address(self, address, version_state): ) return commit_id + def _refresh_snapshots(self): + """ + If using deeplog, update the version state. 
+ This needs to be called after any transaction commit so that subsequent reads use the new head version. + """ + if self.deeplog.log_format() > 3: + branch = self.version_state["branch"] + old_version_state = self.version_state + self.version_state = None + self._load_version_info(branch) + self.version_state.update(old_version_state) + load_meta(self) + def _load_version_info(self, address=None): """Loads data from version_control_file otherwise assume it doesn't exist and load all empty""" if self.version_state: @@ -1359,39 +1380,83 @@ address = "main" version_state = {} - try: + if self.storage.deeplog.log_format() < 4: try: - version_info = load_version_info(self.storage) + try: + version_info = load_version_info(self.storage) + except Exception as e: + version_info = rebuild_version_info(self.storage) + if version_info is None: + raise e + version_state["branch_commit_map"] = version_info["branch_commit_map"] + version_state["commit_node_map"] = version_info["commit_node_map"] + + commit_id = self._get_commit_id_for_address(address, version_state) + + version_state["commit_id"] = commit_id + version_state["commit_node"] = version_state["commit_node_map"][ + commit_id + ] + version_state["branch"] = version_state["commit_node"].branch except Exception as e: - version_info = rebuild_version_info(self.storage) - if version_info is None: - raise e - version_state["branch_commit_map"] = version_info["branch_commit_map"] - version_state["commit_node_map"] = version_info["commit_node_map"] + if isinstance(e, CheckoutError): + raise e from None + if address != "main": + raise CheckoutError( + f"Address {address} not found. Ensure the commit id / branch name is correct." + ) + branch = "main" + version_state["branch"] = branch + version_state["branch_commit_map"] = {} + version_state["commit_node_map"] = {} + # used to identify that this is the first commit so its data will not be in similar directory structure to the rest + commit_id = FIRST_COMMIT_ID + commit_node = CommitNode(branch, commit_id) + version_state["commit_id"] = commit_id + version_state["commit_node"] = commit_node + version_state["branch_commit_map"][branch] = commit_id + version_state["commit_node_map"][commit_id] = commit_node + else: + deeplog = self.storage.deeplog + metadata_snapshot = MetadataSnapshot(deeplog) + branch_id = metadata_snapshot.find_branch(address).id + snapshot = DeepLogSnapshot(branch_id, deeplog) + branch_data = metadata_snapshot.branches() + commit_data = snapshot.commits() + + branch_names = {} + branch_ids = {} + for branch in branch_data: + branch_names[branch.id] = branch.name + branch_ids[branch.name] = branch.id - commit_id = self._get_commit_id_for_address(address, version_state) + version_state["branch_commit_map"] = {} + version_state["commit_node_map"] = {} + for commit_info in commit_data: + version_state["commit_node_map"][ + to_commit_id(commit_info.branch_id, commit_info.branch_version) + ] = CommitNode( + branch_names[commit_info.branch_id], + to_commit_id(commit_info.branch_id, commit_info.branch_version), + ) + + for branch_info in branch_data: + # create head commit for branch + branch_version = self.storage.deeplog.version(branch_info.id) + head_commit_id = to_commit_id(branch_info.id, branch_version) + if head_commit_id not in version_state["commit_node_map"]: + version_state["commit_node_map"][head_commit_id] = CommitNode( + branch_names[branch_info.id], head_commit_id + ) + + version_state["branch_commit_map"][branch_info.name] = branch_info.id + + 
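# Translate the deeplog head (branch id plus integer version) into the legacy commit-id string kept in version_state. +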
commit_id = to_commit_id(snapshot.branch_id, snapshot.version) version_state["commit_id"] = commit_id version_state["commit_node"] = version_state["commit_node_map"][commit_id] version_state["branch"] = version_state["commit_node"].branch - except Exception as e: - if isinstance(e, CheckoutError): - raise e from None - if address != "main": - raise CheckoutError( - f"Address {address} not found. Ensure the commit id / branch name is correct." - ) - branch = "main" - version_state["branch"] = branch - version_state["branch_commit_map"] = {} - version_state["commit_node_map"] = {} - # used to identify that this is the first commit so its data will not be in similar directory structure to the rest - commit_id = FIRST_COMMIT_ID - commit_node = CommitNode(branch, commit_id) - version_state["commit_id"] = commit_id - version_state["commit_node"] = commit_node - version_state["branch_commit_map"][branch] = commit_id - version_state["commit_node_map"][commit_id] = commit_node + # keeps track of the full unindexed tensors version_state["full_tensors"] = {} version_state["tensor_names"] = {} @@ -1931,11 +1996,12 @@ def _populate_meta(self, address: Optional[str] = None, verbose=True): # cannot create a new dataset when in read_only mode. raise CouldNotCreateNewDatasetException(self.path) meta = DatasetMeta() - key = get_dataset_meta_key(self.version_state["commit_id"]) self.version_state["meta"] = meta - self.storage.register_deeplake_object(key, meta) - self._register_dataset() - self.flush() + if self.deeplog.log_format() < 4: + key = get_dataset_meta_key(self.version_state["commit_id"]) + self.storage.register_deeplake_object(key, meta) + self._register_dataset() + self.flush() def _register_dataset(self): if not self.__dict__["org_id"]: @@ -1996,21 +2062,22 @@ def _set_read_only(self, value: bool, err: bool): storage.next_storage.enable_readonly() self._unlock() else: - try: - locked = self._lock(err=err) - if locked: - self.storage.disable_readonly() - if ( - isinstance(storage, LRUCache) - and storage.next_storage is not None - ): - storage.next_storage.disable_readonly() - else: + if storage.deeplog.log_format() < 4: + try: + locked = self._lock(err=err) + if locked: + self.storage.disable_readonly() + if ( + isinstance(storage, LRUCache) + and storage.next_storage is not None + ): + storage.next_storage.disable_readonly() + else: + self.__dict__["_read_only"] = True + except LockedException as e: self.__dict__["_read_only"] = True - except LockedException as e: - self.__dict__["_read_only"] = True - if err: - raise e + if err: + raise e @read_only.setter @invalid_view_op @@ -2500,7 +2567,7 @@ def flush(self): self.storage.flush() def _flush_vc_info(self): - if self._vc_info_updated: + if self.deeplog.log_format() < 4 and self._vc_info_updated: save_version_info(self.version_state, self.storage) for node in self.version_state["commit_node_map"].values(): if node._info_updated: diff --git a/deeplake/core/meta/tensor_meta.py b/deeplake/core/meta/tensor_meta.py index 
529ff40df5..d22b70032f 100644 --- a/deeplake/core/meta/tensor_meta.py +++ b/deeplake/core/meta/tensor_meta.py @@ -157,11 +157,7 @@ def set_htype(self, htype: str, **kwargs): if not kwargs: kwargs = HTYPE_CONFIGURATIONS[htype] - _validate_htype_exists(htype) - _validate_htype_overwrites(htype, kwargs) - _replace_unspecified_values(htype, kwargs) - _validate_required_htype_overwrites(htype, kwargs) - _format_values(htype, kwargs) + validate_and_process_kwargs(htype, kwargs) required_meta = _required_meta_from_htype(htype) required_meta.update(kwargs) @@ -239,6 +235,26 @@ def nbytes(self): def __str__(self): return str(self.__getstate__()) + def _action_kwargs(self): + return { + "dtype": self.dtype, + "htype": self.htype, + "length": self.length, + "is_link": self.is_link, + "is_sequence": self.is_sequence, + "hidden": self.hidden, + "chunk_compression": self.chunk_compression, + "sample_compression": self.sample_compression, + "links": self.links, + "max_chunk_size": self.max_chunk_size, + "min_shape": self.min_shape, + "max_shape": self.max_shape, + "tiling_threshold": self.tiling_threshold, + "typestr": self.typestr, + "verify": self.verify, + "version": self.version, + } + def _validate_links(links: dict): if not isinstance(links, dict): @@ -418,3 +434,11 @@ def _is_dtype_supported_by_numpy(dtype: str) -> bool: return True except: return False + + +def validate_and_process_kwargs(htype, kwargs): + _validate_htype_exists(htype) + _validate_htype_overwrites(htype, kwargs) + _replace_unspecified_values(htype, kwargs) + _validate_required_htype_overwrites(htype, kwargs) + _format_values(htype, kwargs) diff --git a/deeplake/core/storage/azure.py b/deeplake/core/storage/azure.py index 00635d9f73..ae5564f984 100644 --- a/deeplake/core/storage/azure.py +++ b/deeplake/core/storage/azure.py @@ -8,6 +8,7 @@ from deeplake.client.client import DeepLakeBackendClient from deeplake.util.exceptions import PathNotEmptyException from deeplake.util.path import relpath +from deeplake.deeplog.actions import AddFileAction try: from azure.identity import DefaultAzureCredential @@ -102,6 +103,7 @@ def __setitem__(self, path, content): f"{self.root_folder}/{path}" ) blob_client.upload_blob(content, overwrite=True) + self.deeplog.commit([AddFileAction(path)]) def __getitem__(self, path): return self.get_bytes(path) diff --git a/deeplake/core/storage/gcs.py b/deeplake/core/storage/gcs.py index 394854ea75..0bb66774d3 100644 --- a/deeplake/core/storage/gcs.py +++ b/deeplake/core/storage/gcs.py @@ -32,6 +32,7 @@ PathNotEmptyException, ) from deeplake.client.client import DeepLakeBackendClient +from deeplake.deeplog.actions import AddFileAction def _remove_protocol_from_path(path: str) -> str: @@ -422,6 +423,7 @@ def __setitem__(self, key, value): elif isinstance(value, bytearray): value = bytes(value) blob.upload_from_string(value, retry=self.retry) + self.deeplog.commit([AddFileAction(key)]) def __iter__(self): """Iterating over the structure.""" diff --git a/deeplake/core/storage/google_drive.py b/deeplake/core/storage/google_drive.py index 06a0743a51..c312b1c659 100644 --- a/deeplake/core/storage/google_drive.py +++ b/deeplake/core/storage/google_drive.py @@ -6,6 +6,7 @@ import pickle from typing import Dict, Optional, Union from deeplake.util.hash import hash_inputs +from deeplake.deeplog.actions import AddFileAction import logging try: @@ -318,9 +319,11 @@ def __setitem__(self, path, content): parent_id = self.root_id file = self._create_file(basename, FILE, parent_id, content) self._set_id(path, file.get("id")) 
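+ # Record the written object in the deeplog as an add-file action, as the other storage providers do.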
+ self.deeplog.commit([AddFileAction(path)]) return self._write_to_file(id, content) + self.deeplog.commit([AddFileAction(path)]) return def __delitem__(self, path): diff --git a/deeplake/core/storage/provider.py b/deeplake/core/storage/provider.py index da00099964..7557100837 100644 --- a/deeplake/core/storage/provider.py +++ b/deeplake/core/storage/provider.py @@ -1,8 +1,9 @@ from abc import ABC, abstractmethod from collections.abc import MutableMapping -from typing import Optional, Set, Sequence, Dict +from typing import Optional, Set, Sequence, Dict, Union from deeplake.constants import BYTE_PADDING +from deeplake.deeplog import DeepLog from deeplake.util.assert_byte_indexes import assert_byte_indexes from deeplake.util.exceptions import ReadOnlyModeError from deeplake.util.keys import get_dataset_lock_key @@ -72,6 +73,20 @@ def get_bytes( assert_byte_indexes(start_byte, end_byte) return self[path][start_byte:end_byte] + def set_deeplog(self, deeplog: DeepLog): + from deeplake.util.remove_cache import get_base_storage + + storage_to_use = get_base_storage(self) + storage_to_use._deeplog = deeplog + storage_to_use._staged_transaction = None + + @property + def deeplog(self) -> Union[DeepLog, None]: + from deeplake.util.remove_cache import get_base_storage + + storage_to_use = get_base_storage(self) + return getattr(storage_to_use, "_deeplog", None) + @abstractmethod def __setitem__(self, path: str, value: bytes): """Sets the object present at the path with the value diff --git a/deeplake/core/storage/s3.py b/deeplake/core/storage/s3.py index d2fb1f0a9c..f33bee5d66 100644 --- a/deeplake/core/storage/s3.py +++ b/deeplake/core/storage/s3.py @@ -19,6 +19,7 @@ ) from deeplake.util.path import relpath from deeplake.util.warnings import always_warn +from deeplake.deeplog.actions import AddFileAction from botocore.exceptions import ( ReadTimeoutError, ConnectionError, @@ -157,6 +158,7 @@ def _set(self, path, content): Key=path, ContentType="application/octet-stream", # signifies binary data ) + self.deeplog.commit([AddFileAction(path)]) def __setitem__(self, path, content): """Sets the object present at the path with the value diff --git a/deeplake/core/tensor.py b/deeplake/core/tensor.py index 36170bb299..02b452947b 100644 --- a/deeplake/core/tensor.py +++ b/deeplake/core/tensor.py @@ -1,6 +1,7 @@ import deeplake from deeplake.core.linked_chunk_engine import LinkedChunkEngine from deeplake.core.storage.lru_cache import LRUCache +from deeplake.deeplog.adapters import get_tensor_metadata, parse_commit_id from deeplake.util.downsample import apply_partial_downsample from deeplake.util.invalid_view_op import invalid_view_op from deeplake.core.version_control.commit_chunk_map import CommitChunkMap @@ -35,6 +36,7 @@ get_sample_id_tensor_key, get_sample_info_tensor_key, get_sample_shape_tensor_key, + tensor_exists_in_log, ) from deeplake.util.modified import get_modified_indexes from deeplake.util.class_label import convert_to_text @@ -65,11 +67,20 @@ parse_mesh_to_dict, get_mesh_vertices, ) -from deeplake.util.htype import parse_complex_htype +from deeplake.deeplog.actions import CreateTensorAction +from deeplake.deeplog import DeepLogSnapshot, OptimisticTransaction, atomic +from deeplake.deeplog.adapters import parse_commit_id, to_commit_id + from deeplake.htype import ( - HTYPE_CONVERSION_LHS, - HTYPE_CONSTRAINTS, HTYPE_SUPPORTED_COMPRESSIONS, + HTYPE_CONSTRAINTS, + HTYPE_CONVERSION_LHS, +) +from deeplake.util.htype import parse_complex_htype +from deeplake.htype import htype as HTYPE +from 
deeplake.core.meta.tensor_meta import ( + validate_and_process_kwargs, + _required_meta_from_htype, ) import warnings import webbrowser @@ -107,24 +118,48 @@ def create_tensor( commit_id = version_state["commit_id"] if not overwrite and tensor_exists(key, storage, commit_id): raise TensorAlreadyExistsError(key) - - meta_key = get_tensor_meta_key(key, commit_id) - meta = TensorMeta( - htype=htype, + tensor_options = dict( sample_compression=sample_compression, chunk_compression=chunk_compression, **kwargs, ) - storage[meta_key] = meta # type: ignore - if commit_id != FIRST_COMMIT_ID: - cmap_key = get_tensor_commit_chunk_map_key(key, commit_id) - cmap = CommitChunkMap() - storage[cmap_key] = cmap # type: ignore - - diff_key = get_tensor_commit_diff_key(key, commit_id) - diff = CommitDiff(created=True) - storage[diff_key] = diff # type: ignore + if storage.deeplog.log_format() < 4: + meta_key = get_tensor_meta_key(key, commit_id) + meta = TensorMeta(htype=htype, **tensor_options) + storage[meta_key] = meta # type: ignore + + if commit_id != FIRST_COMMIT_ID: + cmap_key = get_tensor_commit_chunk_map_key(key, commit_id) + cmap = CommitChunkMap() + storage[cmap_key] = cmap # type: ignore + + diff_key = get_tensor_commit_diff_key(key, commit_id) + diff = CommitDiff(created=True) + storage[diff_key] = diff # type: ignore + else: + if htype in (None, UNSPECIFIED): + htype = HTYPE.DEFAULT + validate_and_process_kwargs(htype, tensor_options) + + required_meta = _required_meta_from_htype(htype) + info_keys = required_meta.pop("_info", []) + for info_key in info_keys: + required_meta.pop(info_key) + required_meta.update(tensor_options) + if not required_meta.get("links"): + required_meta["links"] = {} + required_meta["version"] = deeplake.__version__ + + deeplog = storage.deeplog + branch_id, branch_version = parse_commit_id(commit_id) + snapshot = DeepLogSnapshot(branch_id, branch_version, deeplog) + transaction = OptimisticTransaction(snapshot) + transaction.add(CreateTensorAction(id=key, name=key, **required_meta)) + transaction.commit() + version_state["commit_id"] = to_commit_id( + branch_id, deeplog.version(branch_id) + ) # TODO: have a function to update version_state on any update def delete_tensor(key: str, dataset): @@ -220,7 +255,7 @@ class Tensor: def __init__( self, key: str, - dataset, + dataset: "deeplake.core.dataset.Dataset", index: Optional[Index] = None, is_iteration: bool = False, chunk_engine: Optional[ChunkEngine] = None, @@ -251,13 +286,31 @@ def __init__( self.is_iteration = is_iteration commit_id = self.version_state["commit_id"] - if not self.is_iteration and not tensor_exists( - self.key, self.storage, commit_id - ): - raise TensorDoesNotExistError(self.key) + if dataset.storage.deeplog.log_format() < 4: + if not self.is_iteration and not tensor_exists( + self.key, self.storage, commit_id + ): + raise TensorDoesNotExistError(self.key) + + meta_key = get_tensor_meta_key(self.key, commit_id) + meta = self.storage.get_deeplake_object(meta_key, TensorMeta) + else: + branch_id, branch_version = parse_commit_id(commit_id) + if not self.is_iteration and not tensor_exists_in_log( + dataset.storage.deeplog, + self.key, + branch_id, + branch_version, + ): + raise TensorDoesNotExistError(self.key) + + meta = get_tensor_metadata( + self.key, + dataset.storage.deeplog, + branch_id, + branch_version, + ) - meta_key = get_tensor_meta_key(self.key, commit_id) - meta = self.storage.get_deeplake_object(meta_key, TensorMeta) if chunk_engine is not None: self.chunk_engine = chunk_engine elif 
meta.is_link: @@ -289,6 +342,7 @@ def _write_initialization(self): ].chunk_engine @invalid_view_op + @atomic def extend( self, samples: Union[np.ndarray, Sequence[InputSample], "Tensor"], @@ -396,6 +450,7 @@ def info(self, value): raise TypeError("Info must be set with type Dict") @invalid_view_op + @atomic def append(self, sample: InputSample): """Appends a single sample to the end of the tensor. Can be an array, scalar value, or the return value from :func:`deeplake.read`, which can be used to load files. See examples down below. diff --git a/deeplake/deeplog/__init__.py b/deeplake/deeplog/__init__.py new file mode 100644 index 0000000000..bd4b214aca --- /dev/null +++ b/deeplake/deeplog/__init__.py @@ -0,0 +1,2 @@ +from _deeplake._deeplake._deeplog import * +from .deeplog import * diff --git a/deeplake/deeplog/actions/__init__.py b/deeplake/deeplog/actions/__init__.py new file mode 100644 index 0000000000..7ffc34c2e3 --- /dev/null +++ b/deeplake/deeplog/actions/__init__.py @@ -0,0 +1 @@ +from .actions import * diff --git a/deeplake/deeplog/actions/actions.py b/deeplake/deeplog/actions/actions.py new file mode 100644 index 0000000000..5f03e0b62d --- /dev/null +++ b/deeplake/deeplog/actions/actions.py @@ -0,0 +1,5 @@ +from _deeplake._deeplake._deeplog._actions import * + + +class UpdateTensorAction(CreateTensorAction): + pass diff --git a/deeplake/deeplog/adapters.py b/deeplake/deeplog/adapters.py new file mode 100644 index 0000000000..8899fee16a --- /dev/null +++ b/deeplake/deeplog/adapters.py @@ -0,0 +1,47 @@ +from typing import Tuple + +from deeplake.constants import FIRST_COMMIT_ID +from deeplake.deeplog import DeepLog, DeepLogSnapshot + + +def get_tensor_metadata( + key: str, deeplog: DeepLog, branch_id: str, branch_version: int +): + from deeplake.core.meta import TensorMeta + + snapshot = DeepLogSnapshot(branch_id, branch_version, deeplog) + create_tensor = {tensor.id: tensor for tensor in snapshot.tensors()}[key] + + meta = TensorMeta() + meta.name = create_tensor.name + meta.htype = create_tensor.htype + meta.dtype = create_tensor.dtype + meta.typestr = create_tensor.typestr + meta.min_shape = create_tensor.min_shape + meta.max_shape = create_tensor.max_shape + meta.length = create_tensor.length + meta.sample_compression = create_tensor.sample_compression + meta.chunk_compression = create_tensor.chunk_compression + meta.max_chunk_size = create_tensor.max_chunk_size + meta.tiling_threshold = create_tensor.tiling_threshold + meta.hidden = create_tensor.hidden + meta.links = create_tensor.links + meta.is_sequence = create_tensor.is_sequence + meta.is_link = create_tensor.is_link + meta.verify = create_tensor.verify + + return meta + + +def parse_commit_id(commit_id: str) -> Tuple[str, int]: + branch_id, _, branch_version = commit_id.partition("-") + + if branch_id == FIRST_COMMIT_ID: + branch_id = "" + return branch_id, int(branch_version) + + +def to_commit_id(branch_id: str, branch_version: int) -> str: + if branch_id == "": + branch_id = FIRST_COMMIT_ID + return branch_id + "-" + str(branch_version) diff --git a/deeplake/deeplog/deeplog.py b/deeplake/deeplog/deeplog.py new file mode 100644 index 0000000000..83223ec791 --- /dev/null +++ b/deeplake/deeplog/deeplog.py @@ -0,0 +1,39 @@ +from typing import List, Union, Generic, TypeVar +from deeplake.deeplog.actions import ( + DeepLogAction, + AddFileAction, + CreateBranchAction, + ProtocolAction, + MetadataAction, + CreateTensorAction, + CreateCommitAction, +) +from deeplake.deeplog.adapters import to_commit_id +from functools import 
wraps +import deeplake + + +def atomic(func): + @wraps(func) + def inner(*args, **kwargs): + self = args[0] + assert isinstance(self, deeplake.Tensor) + chunk_engine = self.chunk_engine + storage = chunk_engine.base_storage + if storage.deeplog.log_format() >= 4 and storage._staged_transaction is None: + root_call = True + else: + root_call = False + result = func(*args, **kwargs) + if root_call: + chunk_engine.cache.flush() + staged_transaction = storage._staged_transaction + if staged_transaction: + staged_transaction.commit() + storage._staged_transaction = None + branch_id = staged_transaction.snapshot.branch_id + self.version_state["commit_id"] = to_commit_id( branch_id, storage.deeplog.version(branch_id) ) + return result + + return inner diff --git a/deeplake/util/keys.py b/deeplake/util/keys.py index d20900568e..d0a3c610a9 100644 --- a/deeplake/util/keys.py +++ b/deeplake/util/keys.py @@ -1,4 +1,5 @@ import posixpath +from typing import Tuple from deeplake.constants import ( CHUNKS_FOLDER, COMMIT_INFO_FILENAME, @@ -25,6 +26,7 @@ QUERIES_FILENAME, QUERIES_LOCK_FILENAME, ) +from deeplake.deeplog import DeepLog, DeepLogSnapshot from deeplake.util.exceptions import ( S3GetError, S3GetAccessError, @@ -39,6 +41,12 @@ def get_chunk_key(key: str, chunk_name: str, commit_id: str) -> str: return "/".join(("versions", commit_id, key, CHUNKS_FOLDER, f"{chunk_name}")) +def split_chunk_key(key: str) -> Tuple[str, str, str]: + key = key.strip("/") + _, commit_id, key, _, chunk_name = key.split("/") + return commit_id, key, chunk_name + + def get_dataset_meta_key(commit_id: str) -> str: # dataset meta is always relative to the `StorageProvider`'s root if commit_id == FIRST_COMMIT_ID: @@ -195,7 +203,9 @@ def dataset_exists(storage, commit_id=None) -> bool: """ try: return ( - get_dataset_meta_key(commit_id or FIRST_COMMIT_ID) in storage + "_deeplake_log/_meta/00000000000000000001.json" in storage + or "_deeplake_log/_meta/_last_checkpoint.json" in storage + or get_dataset_meta_key(commit_id or FIRST_COMMIT_ID) in storage or get_version_control_info_key() in storage ) except S3GetAccessError as err: @@ -212,6 +222,13 @@ def tensor_exists(key: str, storage, commit_id: str) -> bool: return False +def tensor_exists_in_log( + deeplog: DeepLog, tensor_id: str, branch: str, branch_version: int +) -> bool: + snapshot = DeepLogSnapshot(branch, branch_version, deeplog) + return tensor_id in [tensor.id for tensor in snapshot.tensors()] + + def get_queries_key() -> str: return QUERIES_FILENAME diff --git a/deeplake/util/storage.py b/deeplake/util/storage.py index f9877f8796..6fbae3dd19 100644 --- a/deeplake/util/storage.py +++ b/deeplake/util/storage.py @@ -1,3 +1,4 @@ +from deeplake.deeplog import DeepLog, DeepLogV3 from deeplake.util.agreement import handle_dataset_agreements from deeplake.util.cache_chain import generate_chain from deeplake.constants import LOCAL_CACHE_PREFIX, MB @@ -234,6 +235,7 @@ def get_storage_and_cache_chain( ) if storage.read_only: storage_chain.enable_readonly() + return storage, storage_chain diff --git a/deeplake/util/version_control.py b/deeplake/util/version_control.py index 3e0a08f3a0..efdc689272 100644 --- a/deeplake/util/version_control.py +++ b/deeplake/util/version_control.py @@ -21,6 +21,7 @@ from deeplake.core.version_control.commit_chunk_map import CommitChunkMap # type: ignore from deeplake.core.storage import LRUCache from deeplake.core.lock import Lock, PersistentLock +from deeplake.deeplog.adapters import parse_commit_id from deeplake.util.exceptions import ( CheckoutError, CommitError, 
@@ -49,6 +50,7 @@ from deeplake.util.path import relpath from deeplake.util.remove_cache import get_base_storage from deeplake.hooks import dataset_committed +from deeplake.deeplog import DeepLog, DeepLogSnapshot from datetime import datetime import deeplake.core.dataset @@ -1093,6 +1095,28 @@ def _get_dataset_meta_at_commit(storage, commit_id): return meta +def _get_deeplog_meta_at_commit( + deeplog: DeepLog, branch_id: str, branch_version: int +) -> DatasetMeta: + meta = DatasetMeta() + tensor_data = DeepLogSnapshot(branch_id, branch_version, deeplog).tensors() + + meta.tensors = [action.id for action in tensor_data] + meta.tensor_names = {action.name: action.id for action in tensor_data} + meta.hidden_tensors = [action.id for action in tensor_data if action.hidden] + meta.version = deeplake.__version__ + meta.default_index = [ + { + "start": None, + "step": None, + "stop": None, + } + ] + meta.groups = [] + + return meta + + def load_meta(dataset: "deeplake.core.dataset.Dataset"): """Loads the meta info for the version state.""" from deeplake.core.tensor import Tensor @@ -1102,7 +1126,15 @@ def load_meta(dataset: "deeplake.core.dataset.Dataset"): storage.clear_deeplake_objects() dataset._info = None dataset._ds_diff = None - meta = _get_dataset_meta_at_commit(storage, version_state["commit_id"]) + + deeplog = dataset.storage.deeplog + if deeplog.log_format() < 4: + meta = _get_dataset_meta_at_commit(storage, version_state["commit_id"]) + else: + branch_id, branch_version = parse_commit_id(version_state["commit_id"]) + meta = _get_deeplog_meta_at_commit( + dataset.storage.deeplog, branch_id, branch_version + ) ffw_dataset_meta(meta) version_state["meta"] = meta