From 60c34d8c884b6dab38076000822d4c9104dfb206 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Thu, 16 Apr 2020 19:43:17 -0400 Subject: [PATCH 01/39] Created pycompress directory and modified root CMakeLists --- CMakeLists.txt | 38 +++++++++++ pysuccinct/CMakeLists.txt | 28 ++++++++ pysuccinct/pycompress.cpp | 136 ++++++++++++++++++++++++++++++++++++++ pysuccinct/pycompress.py | 61 +++++++++++++++++ 4 files changed, 263 insertions(+) create mode 100644 pysuccinct/CMakeLists.txt create mode 100644 pysuccinct/pycompress.cpp create mode 100644 pysuccinct/pycompress.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 7022727..24f673e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,43 @@ cmake_minimum_required(VERSION 2.8) project(succinct CXX) +FIND_PACKAGE(PythonInterp "3.7") + +if (PYTHONINTERP_FOUND) + if (UNIX AND NOT APPLE) + if (PYTHON_VERSION_MAJOR EQUAL 3) + FIND_PACKAGE(Boost COMPONENTS python${PYTHON_VERSION_SUFFIX}) + FIND_PACKAGE(PythonInterp 3) + FIND_PACKAGE(PythonLibs 3 REQUIRED) + else() + FIND_PACKAGE(Boost COMPONENTS python) + FIND_PACKAGE(PythonInterp) + FIND_PACKAGE(PythonLibs REQUIRED) + endif() + else() + if (PYTHON_VERSION_MAJOR EQUAL 3) + FIND_PACKAGE(Boost COMPONENTS python${PYTHON_VERSION_MAJOR}${PYTHON_VERSION_MINOR}) + FIND_PACKAGE(PythonInterp 3) + FIND_PACKAGE(PythonLibs 3 REQUIRED) + else() + FIND_PACKAGE(Boost COMPONENTS python${PYTHON_VERSION_MAJOR}${PYTHON_VERSION_MINOR}) + FIND_PACKAGE(PythonInterp) + FIND_PACKAGE(PythonLibs REQUIRED) + endif() + endif() +else() + message("Python not found") +endif() + +message(STATUS "PYTHON_LIBRARIES = ${PYTHON_LIBRARIES}") +message(STATUS "PYTHON_EXECUTABLE = ${PYTHON_EXECUTABLE}") +message(STATUS "PYTHON_INCLUDE_DIRS = ${PYTHON_INCLUDE_DIRS}") +message(STATUS "Boost_LIBRARIES = ${Boost_LIBRARIES}") + +ENABLE_TESTING() +INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIRS} ${PYTHON_INCLUDE_DIRS}) +LINK_LIBRARIES(${Boost_LIBRARIES} ${PYTHON_LIBRARIES}) # Deprecated but so convenient! + include(CheckCXXCompilerFlag) CHECK_CXX_COMPILER_FLAG("-std=c++11" COMPILER_SUPPORTS_CXX11) if(COMPILER_SUPPORTS_CXX11) @@ -24,6 +61,7 @@ add_subdirectory(examples) add_subdirectory(sharded) add_subdirectory(sharded-kv) add_subdirectory(bench) +add_subdirectory(pysuccinct) # Testing framework # Build google testing framework diff --git a/pysuccinct/CMakeLists.txt b/pysuccinct/CMakeLists.txt new file mode 100644 index 0000000..cb7e1c4 --- /dev/null +++ b/pysuccinct/CMakeLists.txt @@ -0,0 +1,28 @@ +PYTHON_ADD_MODULE(pycompress pycompress.cpp) +FILE(COPY pycompress.py DESTINATION .) + +cmake_minimum_required(VERSION 2.8) +project(succinct-pysuccinct CXX) + +include(CheckCXXCompilerFlag) +CHECK_CXX_COMPILER_FLAG("-std=c++11" COMPILER_SUPPORTS_CXX11) +if(COMPILER_SUPPORTS_CXX11) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +else() + CHECK_CXX_COMPILER_FLAG("-std=c++0x" COMPILER_SUPPORTS_CXX0X) + if(COMPILER_SUPPORTS_CXX0X) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x") + else() + message(STATUS "The compiler ${CMAKE_CXX_COMPILER} has no C++11 support. Please use a different C++ compiler.") + endif() +endif() +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g") + +set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) + +file(MAKE_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + +set(INCLUDE include ../core/include) +include_directories(${INCLUDE}) + +target_link_libraries(pycompress succinct) diff --git a/pysuccinct/pycompress.cpp b/pysuccinct/pycompress.cpp new file mode 100644 index 0000000..b519f26 --- /dev/null +++ b/pysuccinct/pycompress.cpp @@ -0,0 +1,136 @@ +#include +#include + +#include "succinct_shard.h" +#include "succinct_file.h" +#include "npa/npa.h" + +/** + * Example program that takes an input file and compresses it using Succinct. + */ + +/** + * Prints usage + */ +void print_usage(char *exec) { + fprintf( + stderr, + "Usage: %s [-s sa_sampling_rate] [-i isa_sampling_rate] [-x sampling_scheme] [-n npa_sampling_rate] [-r npa_encoding_scheme] [-t input_type] [file]\n", + exec); +} + +/** + * Converts integer option to SamplingScheme + */ +SamplingScheme SamplingSchemeFromOption(int opt) { + switch (opt) { + case 0: { + fprintf(stderr, "Sampling Scheme = Flat Sample by Index\n"); + return SamplingScheme::FLAT_SAMPLE_BY_INDEX; + } + case 1: { + fprintf(stderr, "Sampling Scheme = Flat Sample by Value\n"); + return SamplingScheme::FLAT_SAMPLE_BY_VALUE; + } + case 2: { + fprintf(stderr, "Sampling Scheme = Layered Sample by Index\n"); + return SamplingScheme::LAYERED_SAMPLE_BY_INDEX; + } + case 3: { + fprintf(stderr, + "Sampling Scheme = Opportunistic Layered Sample by Index\n"); + return SamplingScheme::OPPORTUNISTIC_LAYERED_SAMPLE_BY_INDEX; + } + default: { + fprintf(stderr, "Sampling Scheme = Flat Sample by Index\n"); + return SamplingScheme::FLAT_SAMPLE_BY_INDEX; + } + } +} + +/** + * Converts integer option to NPAEncodingScheme + */ +NPA::NPAEncodingScheme EncodingSchemeFromOption(int opt) { + switch (opt) { + case 0: { + fprintf(stderr, "NPA Encoding Scheme = Elias Delta\n"); + return NPA::NPAEncodingScheme::ELIAS_DELTA_ENCODED; + } + case 1: { + fprintf(stderr, "NPA Encoding Scheme = Elias Gamma\n"); + return NPA::NPAEncodingScheme::ELIAS_GAMMA_ENCODED; + } + case 2: { + fprintf(stderr, "NPA Encoding Scheme = Wavelet Tree\n"); + return NPA::NPAEncodingScheme::WAVELET_TREE_ENCODED; + } + default: { + fprintf(stderr, "NPA Encoding Scheme = Elias Gamma\n"); + return NPA::NPAEncodingScheme::ELIAS_GAMMA_ENCODED; + } + } +} + +struct File{ + File(std::string inputpath, uint32_t sa_sampling_rate, uint32_t isa_sampling_rate, + uint32_t npa_sampling_rate, int sampling_opt, int npa_opt){ + this->inputpath = inputpath; + this->sa_sampling_rate = sa_sampling_rate; + this->isa_sampling_rate = isa_sampling_rate; + this->npa_sampling_rate = npa_sampling_rate; + this->sampling_scheme = SamplingSchemeFromOption(sampling_opt); + this->npa_encoding_scheme = EncodingSchemeFromOption(npa_opt); + } + + //File members + std::string inputpath; + uint32_t sa_sampling_rate; + uint32_t isa_sampling_rate; + uint32_t npa_sampling_rate; + SamplingScheme sampling_scheme; + NPA::NPAEncodingScheme npa_encoding_scheme; + + void compressFile(){ + // The following compresses an input file at "inputpath" in memory + // as a flat file (no structure) using the compression parameters + // passed in (sampling rates, etc.). + // Leave the arguments unspecified to use default values. + auto *fd = new SuccinctFile(inputpath, + SuccinctMode::CONSTRUCT_IN_MEMORY, + sa_sampling_rate, isa_sampling_rate, + npa_sampling_rate, sampling_scheme, + sampling_scheme, npa_encoding_scheme); + + // Serialize the compressed representation to disk at the location .succinct + fd->Serialize(inputpath + ".succinct"); + delete fd; + } + + void compressShard(){ + // The following compresses an input file at "inputpath" in memory + // as a buffer containing key-value pairs. It uses newline '\n' to + // differentiate between successive values, and assigns the line number + // as the key for the corresponding value. + auto *fd = new SuccinctShard(0, inputpath, + SuccinctMode::CONSTRUCT_IN_MEMORY, + sa_sampling_rate, isa_sampling_rate, + npa_sampling_rate, sampling_scheme, + sampling_scheme, npa_encoding_scheme); + + // Serialize the compressed representation to disk at the location .succinct + fd->Serialize(inputpath + ".succinct"); + delete fd; + } + +}; + +#include +using namespace boost::python; + +BOOST_PYTHON_MODULE(pycompress){ + class_("File", init()) + .def("compressFile", &File::compressFile) + .def("compressShard", &File::compressShard) + ; +} \ No newline at end of file diff --git a/pysuccinct/pycompress.py b/pysuccinct/pycompress.py new file mode 100644 index 0000000..8608266 --- /dev/null +++ b/pysuccinct/pycompress.py @@ -0,0 +1,61 @@ +import pycompress +import sys +import getopt + +#Loop through arguments to change default values and get input path using getopt +try: + optlist, args = getopt.getopt(sys.argv[1:], 's:i:x:n:r:t:') +except getopt.GetoptError as err: + sys.exit(2) + +#Default values +sa_sampling_rate = 32 +isa_sampling_rate = 32 +sampling_scheme = 0 +npa_sampling_rate = 128 +npa_encoding_scheme = 1 +type = "file" +inputpath = "test.txt" + +#Modify default values +for o, a in optlist: + if o == "-s": + sa_sampling_rate = int(a) + #print("sampling rate is ", sa_sampling_rate, "\n") + elif o == "-i": + isa_sampling_rate = int(a) + elif o == "-x": + sampling_scheme = int(a) + elif o == "-n": + npa_sampling_rate = int(a) + elif o == "-r": + npa_encoding_scheme = int(a) + elif o == "-t": + type = a + else: + assert False, "unhandled option" + + +#Process input path +if (len(args) != 1): + #Should have 1 argument left for input path + sys.exit(2) +else: + #Set the last unparsed element to the input path + inputpath = str(args[0]) + +#Create the file struct and compress +file = pycompress.File(inputpath, sa_sampling_rate, + isa_sampling_rate, npa_sampling_rate, + sampling_scheme, npa_encoding_scheme) + +#Compress file or shard depending on type +if (type == "file"): + #Compress file + file.compressFile() +elif (type == "kv"): + #Compress shard + file.compressShard() +else: + #Error + print("Invalid type\n") From 324fb9629ba0ac65cbd2d3e031f5d2130828bb8a Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Thu, 16 Apr 2020 19:49:34 -0400 Subject: [PATCH 02/39] Error checking in pycompress.py --- pysuccinct/pycompress.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pysuccinct/pycompress.py b/pysuccinct/pycompress.py index 8608266..b28c7cd 100644 --- a/pysuccinct/pycompress.py +++ b/pysuccinct/pycompress.py @@ -2,11 +2,16 @@ import sys import getopt + if (sys.argc < 2 or sys.argc > 12): + print("Invalid number of arguments \n") + sys.exit(2) + #Loop through arguments to change default values and get input path using getopt try: optlist, args = getopt.getopt(sys.argv[1:], 's:i:x:n:r:t:') except getopt.GetoptError as err: - sys.exit(2) + print("Get opt error\n") + sys.exit(2) #Default values sa_sampling_rate = 32 @@ -33,12 +38,14 @@ elif o == "-t": type = a else: - assert False, "unhandled option" + print("Unhandled option\n") + sys.exit(2) #Process input path if (len(args) != 1): #Should have 1 argument left for input path + print("File path not found\n") sys.exit(2) else: #Set the last unparsed element to the input path @@ -59,3 +66,4 @@ else: #Error print("Invalid type\n") + sys.exit(2) From cfe93d14b3e4481b796b9e82bdd68ace212fc2e9 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Fri, 1 May 2020 01:37:59 -0400 Subject: [PATCH 03/39] Wrapped search, count and query functions for use in pyquery_file.py --- .vscode/settings.json | 33 +++++++++++++ pysuccinct/CMakeLists.txt | 4 ++ pysuccinct/pycompress.cpp | 5 +- pysuccinct/pycompress.py | 7 +-- pysuccinct/pyquery_file.cpp | 98 +++++++++++++++++++++++++++++++++++++ pysuccinct/pyquery_file.py | 78 +++++++++++++++++++++++++++++ 6 files changed, 220 insertions(+), 5 deletions(-) create mode 100644 .vscode/settings.json create mode 100644 pysuccinct/pyquery_file.cpp create mode 100644 pysuccinct/pyquery_file.py diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..bfae82d --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,33 @@ +{ + "files.associations": { + "__config": "cpp", + "__nullptr": "cpp", + "cstddef": "cpp", + "exception": "cpp", + "initializer_list": "cpp", + "new": "cpp", + "optional": "cpp", + "stdexcept": "cpp", + "type_traits": "cpp", + "typeinfo": "cpp", + "variant": "cpp", + "algorithm": "cpp", + "cctype": "cpp", + "cmath": "cpp", + "csignal": "cpp", + "cstdarg": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "__locale": "cpp", + "ios": "cpp", + "complex": "cpp", + "__bit_reference": "cpp", + "bitset": "cpp", + "chrono": "cpp", + "map": "cpp", + "set": "cpp", + "unordered_map": "cpp" + } +} \ No newline at end of file diff --git a/pysuccinct/CMakeLists.txt b/pysuccinct/CMakeLists.txt index cb7e1c4..12c6c98 100644 --- a/pysuccinct/CMakeLists.txt +++ b/pysuccinct/CMakeLists.txt @@ -1,6 +1,9 @@ PYTHON_ADD_MODULE(pycompress pycompress.cpp) FILE(COPY pycompress.py DESTINATION .) +PYTHON_ADD_MODULE(pyquery_file pyquery_file.cpp) +FILE(COPY pyquery_file.py DESTINATION .) + cmake_minimum_required(VERSION 2.8) project(succinct-pysuccinct CXX) @@ -26,3 +29,4 @@ set(INCLUDE include ../core/include) include_directories(${INCLUDE}) target_link_libraries(pycompress succinct) +target_link_libraries(pyquery_file succinct) diff --git a/pysuccinct/pycompress.cpp b/pysuccinct/pycompress.cpp index b519f26..8ed5b8a 100644 --- a/pysuccinct/pycompress.cpp +++ b/pysuccinct/pycompress.cpp @@ -5,6 +5,9 @@ #include "succinct_file.h" #include "npa/npa.h" +#include +using namespace boost::python; + /** * Example program that takes an input file and compresses it using Succinct. */ @@ -125,8 +128,6 @@ struct File{ }; -#include -using namespace boost::python; BOOST_PYTHON_MODULE(pycompress){ class_("File", init()) diff --git a/pysuccinct/pycompress.py b/pysuccinct/pycompress.py index b28c7cd..a653267 100644 --- a/pysuccinct/pycompress.py +++ b/pysuccinct/pycompress.py @@ -2,7 +2,9 @@ import sys import getopt - if (sys.argc < 2 or sys.argc > 12): +#Argument size error checking +argc = len(sys.argv) +if (argc < 2 or argc > 12): print("Invalid number of arguments \n") sys.exit(2) @@ -20,13 +22,12 @@ npa_sampling_rate = 128 npa_encoding_scheme = 1 type = "file" -inputpath = "test.txt" +inputpath = "" #Modify default values for o, a in optlist: if o == "-s": sa_sampling_rate = int(a) - #print("sampling rate is ", sa_sampling_rate, "\n") elif o == "-i": isa_sampling_rate = int(a) elif o == "-x": diff --git a/pysuccinct/pyquery_file.cpp b/pysuccinct/pyquery_file.cpp new file mode 100644 index 0000000..8a64296 --- /dev/null +++ b/pysuccinct/pyquery_file.cpp @@ -0,0 +1,98 @@ +#include +#include +#include +#include +#include +#include + +#include "succinct_file.h" + +#include +using namespace boost::python; + +/** + * Prints usage. + */ +void print_usage(char *exec) { + fprintf(stderr, "Usage: %s [-m mode] [file]\n", exec); +} + +void print_valid_cmds() { + std::cerr + << "Command must be one of: search [query], count [query], extract [offset] [length]\n"; +} + +typedef unsigned long long int timestamp_t; + +static timestamp_t get_timestamp() { + struct timeval now{}; + gettimeofday(&now, nullptr); + + return (now.tv_usec + (time_t) now.tv_sec * 1000000); +} + +struct QueryFile { + QueryFile(std::string filename, uint32_t mode){ + this->s_file = nullptr; + if (mode == 0) { + // If mode is set to 0, compress the input file. + // Use default parameters. + std::cout << "Constructing Succinct data structures...\n"; + this->s_file = new SuccinctFile(filename); + std::cout << "Serializing Succinct data structures...\n"; + this->s_file->Serialize(filename + ".succinct"); + } else { + // If mode is set to 1, read the serialized data structures from disk. + // The serialized data structures must exist at .succinct. + std::cout << "De-serializing Succinct data structures...\n"; + this->s_file = new SuccinctFile(filename, SuccinctMode::LOAD_IN_MEMORY); + } + std::cout << "Done. Starting Succinct Shell...\n"; + print_valid_cmds(); + } + + //QueryFile members + SuccinctFile *s_file; + + //Wrapped search command + void search(std::string arg){ + std::vector results; + timestamp_t start = get_timestamp(); + s_file->Search(results, arg); + timestamp_t tot_time = get_timestamp() - start; + std::cout << "Found " << results.size() << " results in " << tot_time + << "us:\n"; + for (auto res : results) { + std::cout << res << ", "; + } + std::cout << std::endl; + } + + //Wrapped count command + void count(std::string arg){ + timestamp_t start = get_timestamp(); + int64_t count = s_file->Count(arg); + timestamp_t tot_time = get_timestamp() - start; + std::cout << "Count = " << count << "; Time taken: " << tot_time + << "us\n"; + } + + //Wrapped extract command + void extract(uint64_t offset, uint64_t length){ + timestamp_t start = get_timestamp(); + std::string result; + s_file->Extract(result, offset, length); + timestamp_t tot_time = get_timestamp() - start; + std::cout << "Extracted string = " << result << "; Time taken: " + << tot_time << "us\n"; + } +}; + +BOOST_PYTHON_MODULE(pyquery_file){ + class_("QueryFile", init()) + .def("search", &QueryFile::search) + .def("count", &QueryFile::count) + .def("extract", &QueryFile::extract) + ; + +} \ No newline at end of file diff --git a/pysuccinct/pyquery_file.py b/pysuccinct/pyquery_file.py new file mode 100644 index 0000000..a441ea6 --- /dev/null +++ b/pysuccinct/pyquery_file.py @@ -0,0 +1,78 @@ +import pyquery_file +import sys +import getopt + +def RepresentsInt(s): + try: + int(s) + return True + except ValueError: + return False + +#Argument size error checking +argc = len(sys.argv) +if (argc < 2 or argc > 12): + print("Invalid number of arguments \n") + sys.exit(2) + +#Loop through arguments to change default values and get input path using getopt +try: + optlist, args = getopt.getopt(sys.argv[1:], 'm:') +except getopt.GetoptError as err: + print("Get opt error\n") + sys.exit(2) + +#Default values +mode = 0 +filename = "" + +#Modify default values +for o, a in optlist: + if o == "-m": + mode = int(a) + else: + printf("Invalid Option") + +#Process filename +if (len(args) != 1): + #Should have 1 argument left for filename + print("File path not found\n") + sys.exit(2) +else: + #Set the last unparsed element to the filename + filename = str(args[0]) + +#Create the pyquery_file struct and run command given on next input +q = pyquery_file.QueryFile(filename, mode) +# q.ProcessCommands() + +#parse through line by line +while (True): + line = input("succinct> ") + line = line.split(" ", 1) + #Search command + if (line[0] == "search"): + if (len(line) != 2): + print("Could not parse command: ") + else: + q.search(line[1].strip()) + elif(line[0] == "count"): + if (len(line) != 2): + print("Could not parse command: "]) + else: + q.count(line[1].strip()) + elif(line[0] == "extract"): + if (len(line) == 1): + print("Could not parse command: ") + line = line[1].split(" ", 1) + offset = line[0].strip() + length = line[1].strip() + if (len(line) != 2 or RepresentsInt(offset) == False or RepresentsInt(length) == False): + print("Could not parse command: ") + else: + q.extract(int(line[0].strip()), int(line[1].strip())) + elif(line[0] == "exit"): + break + else: + print("Unsupported command") + print("Command must be one of: search [query], count [query], extract [offset] [length]") \ No newline at end of file From 9d2f2f215aa09a872d4207405c4f8ac235d16e3c Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Fri, 1 May 2020 02:29:00 -0400 Subject: [PATCH 04/39] Wrapped search, count and get functions for use in pyquery_kv.py --- pysuccinct/CMakeLists.txt | 4 ++ pysuccinct/pyquery_file.py | 2 +- pysuccinct/pyquery_kv.cpp | 102 +++++++++++++++++++++++++++++++++++++ pysuccinct/pyquery_kv.py | 73 ++++++++++++++++++++++++++ 4 files changed, 180 insertions(+), 1 deletion(-) create mode 100644 pysuccinct/pyquery_kv.cpp create mode 100644 pysuccinct/pyquery_kv.py diff --git a/pysuccinct/CMakeLists.txt b/pysuccinct/CMakeLists.txt index 12c6c98..12948b3 100644 --- a/pysuccinct/CMakeLists.txt +++ b/pysuccinct/CMakeLists.txt @@ -4,6 +4,9 @@ FILE(COPY pycompress.py DESTINATION .) PYTHON_ADD_MODULE(pyquery_file pyquery_file.cpp) FILE(COPY pyquery_file.py DESTINATION .) +PYTHON_ADD_MODULE(pyquery_kv pyquery_kv.cpp) +FILE(COPY pyquery_kv.py DESTINATION .) + cmake_minimum_required(VERSION 2.8) project(succinct-pysuccinct CXX) @@ -30,3 +33,4 @@ include_directories(${INCLUDE}) target_link_libraries(pycompress succinct) target_link_libraries(pyquery_file succinct) +target_link_libraries(pyquery_kv succinct) \ No newline at end of file diff --git a/pysuccinct/pyquery_file.py b/pysuccinct/pyquery_file.py index a441ea6..0af16a3 100644 --- a/pysuccinct/pyquery_file.py +++ b/pysuccinct/pyquery_file.py @@ -2,6 +2,7 @@ import sys import getopt +#Try catch block for non integer argument checking def RepresentsInt(s): try: int(s) @@ -44,7 +45,6 @@ def RepresentsInt(s): #Create the pyquery_file struct and run command given on next input q = pyquery_file.QueryFile(filename, mode) -# q.ProcessCommands() #parse through line by line while (True): diff --git a/pysuccinct/pyquery_kv.cpp b/pysuccinct/pyquery_kv.cpp new file mode 100644 index 0000000..af8eda3 --- /dev/null +++ b/pysuccinct/pyquery_kv.cpp @@ -0,0 +1,102 @@ +#include +#include +#include +#include +#include +#include + +#include "succinct_shard.h" + +#include +using namespace boost::python; + +/** + * Prints usage. + */ +void print_usage(char *exec) { + fprintf(stderr, "Usage: %s [-m mode] [file]\n", exec); +} + +void print_valid_cmds() { + std::cerr + << "Command must be one of: search [query], count [query], get [key]\n"; +} + +typedef unsigned long long int timestamp_t; + +static timestamp_t get_timestamp() { + struct timeval now{}; + gettimeofday(&now, nullptr); + + return (now.tv_usec + (time_t) now.tv_sec * 1000000); +} + +struct QueryKv{ + QueryKv(std::string filename, uint32_t mode){ + this->s_file = nullptr; + if (mode == 0) { + // If mode is set to 0, compress the input file. + // Use default parameters. + std::cout << "Constructing Succinct data structures...\n"; + s_file = new SuccinctShard(0, filename); + + std::cout << "Serializing Succinct data structures...\n"; + s_file->Serialize(filename + ".succinct"); + } else { + // If mode is set to 1, read the serialized data structures from disk. + // The serialized data structures must exist at .succinct. + std::cout << "De-serializing Succinct data structures...\n"; + s_file = new SuccinctShard(0, filename, SuccinctMode::LOAD_IN_MEMORY); + } + std::cout << "Done. Starting Succinct Shell...\n"; + print_valid_cmds(); + } + + //QueryKv members + SuccinctShard *s_file; + + //Wrapped search command + void search(std::string arg){ + std::set results; + timestamp_t start = get_timestamp(); + s_file->Search(results, arg); + timestamp_t tot_time = get_timestamp() - start; + std::cout << "Found " << results.size() << " results in " << tot_time + << "us:\n"; + for (auto res : results) { + std::cout << res << ", "; + } + std::cout << std::endl; + } + + //Wrapped count command + void count(std::string arg){ + timestamp_t start = get_timestamp(); + int64_t count = s_file->Count(arg); + timestamp_t tot_time = get_timestamp() - start; + std::cout << "Count = " << count << "; Time taken: " << tot_time + << "us\n"; + } + + //Wrapped get command + void get(uint64_t key){ + timestamp_t start = get_timestamp(); + std::string result; + s_file->Get(result, key); + timestamp_t tot_time = get_timestamp() - start; + std::cout << "Value = " << result << "; Time taken: " + << tot_time << "us\n"; + } + +}; + +BOOST_PYTHON_MODULE(pyquery_kv){ + class_("QueryKv", init()) + .def("search", &QueryKv::search) + .def("count", &QueryKv::count) + .def("get", &QueryKv::get) + ; + +} + + diff --git a/pysuccinct/pyquery_kv.py b/pysuccinct/pyquery_kv.py new file mode 100644 index 0000000..ee78e6f --- /dev/null +++ b/pysuccinct/pyquery_kv.py @@ -0,0 +1,73 @@ +import pyquery_kv +import sys +import getopt + +#Try catch block for non integer argument checking +def RepresentsInt(s): + try: + int(s) + return True + except ValueError: + return False + +#Argument size error checking +argc = len(sys.argv) +if (argc < 2 or argc > 12): + print("Invalid number of arguments \n") + sys.exit(2) + +#Loop through arguments to change default values and get input path using getopt +try: + optlist, args = getopt.getopt(sys.argv[1:], 'm:') +except getopt.GetoptError as err: + print("Get opt error\n") + sys.exit(2) + +#Default values +mode = 0 +filename = "" + +#Modify default values +for o, a in optlist: + if o == "-m": + mode = int(a) + else: + printf("Invalid Option") + +#Process filename +if (len(args) != 1): + #Should have 1 argument left for filename + print("File path not found\n") + sys.exit(2) +else: + #Set the last unparsed element to the filename + filename = str(args[0]) + +#Create the pyquery_file struct and run command given on next input +q = pyquery_kv.QueryKv(filename, mode) + +#parse through line by line +while (True): + line = input("succinct> ") + line = line.split(" ", 1) + if (line[0] == "search"): + if (len(line) != 2): + print("Could not parse command: ") + else: + q.search(line[1].strip()) + elif(line[0] == "count"): + if (len(line) != 2): + print("Could not parse command: ") + else: + q.count(line[1].strip()) + elif(line[0] == "get"): + key = line[1].strip() + if (len(line) != 2 or RepresentsInt(key) == False): + print("Could not parse command: ") + else: + q.get((int(key))) + elif(line[0] == "exit"): + break + else: + print("Unsupported command") + print("Command must be one of: search [query], count [query], get [key]") \ No newline at end of file From bb0a628b49200ad562e308685290351d4c9e6ce5 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Fri, 1 May 2020 03:52:43 -0400 Subject: [PATCH 05/39] Added more error checking. Wrapped search, count, get for semistructured --- .vscode/settings.json | 3 +- pysuccinct/CMakeLists.txt | 6 +- pysuccinct/pycompress.py | 8 +- pysuccinct/pyquery_file.py | 8 +- pysuccinct/pyquery_kv.py | 12 ++- pysuccinct/pyquery_semistructured.cpp | 104 ++++++++++++++++++++++++++ pysuccinct/pyquery_semistructured.py | 79 +++++++++++++++++++ 7 files changed, 207 insertions(+), 13 deletions(-) create mode 100644 pysuccinct/pyquery_semistructured.cpp create mode 100644 pysuccinct/pyquery_semistructured.py diff --git a/.vscode/settings.json b/.vscode/settings.json index bfae82d..8870183 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -28,6 +28,7 @@ "chrono": "cpp", "map": "cpp", "set": "cpp", - "unordered_map": "cpp" + "unordered_map": "cpp", + "__node_handle": "cpp" } } \ No newline at end of file diff --git a/pysuccinct/CMakeLists.txt b/pysuccinct/CMakeLists.txt index 12948b3..d7ca332 100644 --- a/pysuccinct/CMakeLists.txt +++ b/pysuccinct/CMakeLists.txt @@ -7,6 +7,9 @@ FILE(COPY pyquery_file.py DESTINATION .) PYTHON_ADD_MODULE(pyquery_kv pyquery_kv.cpp) FILE(COPY pyquery_kv.py DESTINATION .) +PYTHON_ADD_MODULE(pyquery_semistructured pyquery_semistructured.cpp) +FILE(COPY pyquery_semistructured.py DESTINATION .) + cmake_minimum_required(VERSION 2.8) project(succinct-pysuccinct CXX) @@ -33,4 +36,5 @@ include_directories(${INCLUDE}) target_link_libraries(pycompress succinct) target_link_libraries(pyquery_file succinct) -target_link_libraries(pyquery_kv succinct) \ No newline at end of file +target_link_libraries(pyquery_kv succinct) +target_link_libraries(pyquery_semistructured succinct) \ No newline at end of file diff --git a/pysuccinct/pycompress.py b/pysuccinct/pycompress.py index a653267..f8aad9c 100644 --- a/pysuccinct/pycompress.py +++ b/pysuccinct/pycompress.py @@ -5,14 +5,14 @@ #Argument size error checking argc = len(sys.argv) if (argc < 2 or argc > 12): - print("Invalid number of arguments \n") + print("Usage: %s [-m mode] [file]") sys.exit(2) #Loop through arguments to change default values and get input path using getopt try: optlist, args = getopt.getopt(sys.argv[1:], 's:i:x:n:r:t:') except getopt.GetoptError as err: - print("Get opt error\n") + print("Get opt error") sys.exit(2) #Default values @@ -39,14 +39,14 @@ elif o == "-t": type = a else: - print("Unhandled option\n") + printf("Invalid Option") sys.exit(2) #Process input path if (len(args) != 1): #Should have 1 argument left for input path - print("File path not found\n") + print("Usage: %s [-m mode] [file]") sys.exit(2) else: #Set the last unparsed element to the input path diff --git a/pysuccinct/pyquery_file.py b/pysuccinct/pyquery_file.py index 0af16a3..1c7396d 100644 --- a/pysuccinct/pyquery_file.py +++ b/pysuccinct/pyquery_file.py @@ -13,14 +13,14 @@ def RepresentsInt(s): #Argument size error checking argc = len(sys.argv) if (argc < 2 or argc > 12): - print("Invalid number of arguments \n") + print("Usage: %s [-m mode] [file]") sys.exit(2) #Loop through arguments to change default values and get input path using getopt try: optlist, args = getopt.getopt(sys.argv[1:], 'm:') except getopt.GetoptError as err: - print("Get opt error\n") + print(("Get opt error")) sys.exit(2) #Default values @@ -33,11 +33,12 @@ def RepresentsInt(s): mode = int(a) else: printf("Invalid Option") + sys.exit(2) #Process filename if (len(args) != 1): #Should have 1 argument left for filename - print("File path not found\n") + print("Usage: %s [-m mode] [file]") sys.exit(2) else: #Set the last unparsed element to the filename @@ -64,6 +65,7 @@ def RepresentsInt(s): elif(line[0] == "extract"): if (len(line) == 1): print("Could not parse command: ") + continue line = line[1].split(" ", 1) offset = line[0].strip() length = line[1].strip() diff --git a/pysuccinct/pyquery_kv.py b/pysuccinct/pyquery_kv.py index ee78e6f..8903e11 100644 --- a/pysuccinct/pyquery_kv.py +++ b/pysuccinct/pyquery_kv.py @@ -13,14 +13,14 @@ def RepresentsInt(s): #Argument size error checking argc = len(sys.argv) if (argc < 2 or argc > 12): - print("Invalid number of arguments \n") + print("Usage: %s [-m mode] [file]") sys.exit(2) #Loop through arguments to change default values and get input path using getopt try: optlist, args = getopt.getopt(sys.argv[1:], 'm:') except getopt.GetoptError as err: - print("Get opt error\n") + print("Get opt error") sys.exit(2) #Default values @@ -32,12 +32,13 @@ def RepresentsInt(s): if o == "-m": mode = int(a) else: - printf("Invalid Option") + print("Invalid Option") + sys.exit(2) #Process filename if (len(args) != 1): #Should have 1 argument left for filename - print("File path not found\n") + print("Usage: %s [-m mode] [file]") sys.exit(2) else: #Set the last unparsed element to the filename @@ -61,6 +62,9 @@ def RepresentsInt(s): else: q.count(line[1].strip()) elif(line[0] == "get"): + if (len(line) == 1): + print("Could not parse command: ") + continue key = line[1].strip() if (len(line) != 2 or RepresentsInt(key) == False): print("Could not parse command: ") diff --git a/pysuccinct/pyquery_semistructured.cpp b/pysuccinct/pyquery_semistructured.cpp new file mode 100644 index 0000000..bdc5327 --- /dev/null +++ b/pysuccinct/pyquery_semistructured.cpp @@ -0,0 +1,104 @@ +#include +#include +#include +#include +#include + +#include "succinct_semistructured_shard.h" + +#include +using namespace boost::python; + +/** + * Prints usage. + */ +void print_usage(char *exec) { + fprintf(stderr, "Usage: %s [-m mode] [file]\n", exec); +} + +void print_valid_cmds() { + std::cerr << "Command must be one of:\n" + << "\t\tsearch [attr_key] [attr_val]\n" + << "\t\tcount [attr_key] [attr_val]\n" + << "\t\tget [key] [attr_key]\n"; +} + +typedef unsigned long long int timestamp_t; + +static timestamp_t get_timestamp() { + struct timeval now{}; + gettimeofday(&now, nullptr); + + return (now.tv_usec + (time_t) now.tv_sec * 1000000); +} + +struct QuerySemistructured{ + QuerySemistructured(std::string filename, uint32_t mode){ + this->s_file = nullptr; + if (mode == 0) { + // If mode is set to 0, compress the input file. + // Use default parameters. + std::cout << "Constructing Succinct data structures...\n"; + s_file = new SuccinctSemistructuredShard(filename); + + std::cout << "Serializing Succinct data structures...\n"; + s_file->Serialize(filename + ".succinct"); + } else { + // If mode is set to 1, read the serialized data structures from disk. + // The serialized data structures must exist at .succinct. + std::cout << "De-serializing Succinct data structures...\n"; + s_file = new SuccinctSemistructuredShard(filename, + SuccinctMode::LOAD_IN_MEMORY); + } + std::cout << "Done. Starting Succinct Shell...\n"; + print_valid_cmds(); + } + + //QuerySemistructured members + SuccinctSemistructuredShard *s_file; + + //Wrapped search command + void search(const std::string attr_key, const std::string attr_val){ + std::set results; + timestamp_t start = get_timestamp(); + s_file->SearchAttribute(results, attr_key, attr_val); + timestamp_t tot_time = get_timestamp() - start; + std::cout << "Found " << results.size() << " records in " << tot_time + << "us; Matching keys:\n"; + for (auto res : results) { + std::cout << res << ", "; + } + std::cout << std::endl; + } + + //Wrapped count command + void count(const std::string attr_key, const std::string attr_val){ + timestamp_t start = get_timestamp(); + int64_t count = s_file->CountAttribute(attr_key, attr_val); + timestamp_t tot_time = get_timestamp() - start; + std::cout << "Number of matching records = " << count << "; Time taken: " + << tot_time << "us\n"; + } + + //Wrapped get command + void get(int64_t key, std::string attr_key){ + timestamp_t start = get_timestamp(); + std::string result; + s_file->Get(result, key, attr_key); + timestamp_t tot_time = get_timestamp() - start; + std::cout << "Value = " << result << "; Time taken: " << tot_time + << "us\n"; + } + +}; + +BOOST_PYTHON_MODULE(pyquery_semistructured){ + class_("QuerySemistructured", init()) + .def("search", &QuerySemistructured::search) + .def("count", &QuerySemistructured::count) + .def("get", &QuerySemistructured::get) + ; + +} + + diff --git a/pysuccinct/pyquery_semistructured.py b/pysuccinct/pyquery_semistructured.py new file mode 100644 index 0000000..0c69552 --- /dev/null +++ b/pysuccinct/pyquery_semistructured.py @@ -0,0 +1,79 @@ +import pyquery_semistructured +import sys +import getopt + +#Try catch block for non integer argument checking +def RepresentsInt(s): + try: + int(s) + return True + except ValueError: + return False + +#Argument size error checking +argc = len(sys.argv) +if (argc < 2 or argc > 12): + print("Usage: %s [-m mode] [file]") + sys.exit(2) + +#Loop through arguments to change default values and get input path using getopt +try: + optlist, args = getopt.getopt(sys.argv[1:], 'm:') +except getopt.GetoptError as err: + print("Get opt error") + sys.exit(2) + +#Default values +mode = 0 +filename = "" + +#Modify default values +for o, a in optlist: + if o == "-m": + mode = int(a) + else: + printf("Invalid Option") + sys.exit(2) + +#Process filename +if (len(args) != 1): + #Should have 1 argument left for filename + print("Usage: %s [-m mode] [file]") + sys.exit(2) +else: + #Set the last unparsed element to the filename + filename = str(args[0]) + +#Create the pyquery_file struct and run command given on next input +q = pyquery_semistructured.QuerySemistructured(filename, mode) + +#parse through line by line +while (True): + line = input("succinct> ") + line = line.split(" ") + if (line[0] == "search"): + if (len(line) != 3): + print("Could not parse command: ") + else: + q.search(line[1].strip(), line[2].strip()) + elif(line[0] == "count"): + if (len(line) != 3): + print("Could not parse command: ") + else: + q.count(line[1].strip(), line[2].strip()) + elif(line[0] == "get"): + if (len(line) == 1): + print("Could not parse command: ") + continue + line = line[1].split(" ", 1) + key = line[0].strip() + attr_key = line[1].strip() + if (len(line) != 2 or RepresentsInt(key) == False): + print("Could not parse command: ") + else: + q.get((int(key), attr_key)) + elif(line[0] == "exit"): + break + else: + print("Unsupported command") + print("Command must be one of:\n\t\tsearch [attr_key] [attr_val]\n\t\tcount [attr_key] [attr_val]\n\t\tget [key] [attr_key]") \ No newline at end of file From c94ffa448447ae83878f99b35a05da2719aa3810 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Fri, 1 May 2020 03:56:21 -0400 Subject: [PATCH 06/39] Modified line index error in pyquery_semistructured.py --- pysuccinct/pyquery_semistructured.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pysuccinct/pyquery_semistructured.py b/pysuccinct/pyquery_semistructured.py index 0c69552..466bee3 100644 --- a/pysuccinct/pyquery_semistructured.py +++ b/pysuccinct/pyquery_semistructured.py @@ -65,10 +65,9 @@ def RepresentsInt(s): if (len(line) == 1): print("Could not parse command: ") continue - line = line[1].split(" ", 1) - key = line[0].strip() - attr_key = line[1].strip() - if (len(line) != 2 or RepresentsInt(key) == False): + key = line[1].strip() + attr_key = line[2].strip() + if (len(line) != 3 or RepresentsInt(key) == False): print("Could not parse command: ") else: q.get((int(key), attr_key)) From ee199a1c558f1646b77d2ba723e9728438ad7135 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Wed, 6 May 2020 20:58:21 -0400 Subject: [PATCH 07/39] Reformatted files to google-style and removed unused code --- .gitignore | 3 ++ CMakeLists.txt | 36 ++++++------- pysuccinct/CMakeLists.txt | 30 +++-------- pysuccinct/pycompress.cpp | 72 +++++++++++-------------- pysuccinct/pycompress.py | 23 ++++---- pysuccinct/pyquery_file.cpp | 75 ++++++++------------------ pysuccinct/pyquery_file.py | 29 +++++----- pysuccinct/pyquery_kv.cpp | 75 ++++++++------------------ pysuccinct/pyquery_kv.py | 26 ++++----- pysuccinct/pyquery_semistructured.cpp | 77 ++++++++------------------- pysuccinct/pyquery_semistructured.py | 26 ++++----- 11 files changed, 175 insertions(+), 297 deletions(-) diff --git a/.gitignore b/.gitignore index ebe31c4..69d6976 100644 --- a/.gitignore +++ b/.gitignore @@ -23,6 +23,9 @@ *.lib lib/* +# IDE files +.vscode/settings.json + # Executables *.exe *.out diff --git a/CMakeLists.txt b/CMakeLists.txt index 24f673e..1746186 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,28 +1,28 @@ cmake_minimum_required(VERSION 2.8) project(succinct CXX) -FIND_PACKAGE(PythonInterp "3.7") +find_package(PythonInterp "3.7") if (PYTHONINTERP_FOUND) if (UNIX AND NOT APPLE) if (PYTHON_VERSION_MAJOR EQUAL 3) - FIND_PACKAGE(Boost COMPONENTS python${PYTHON_VERSION_SUFFIX}) - FIND_PACKAGE(PythonInterp 3) - FIND_PACKAGE(PythonLibs 3 REQUIRED) + find_package(Boost COMPONENTS python${PYTHON_VERSION_SUFFIX}) + find_package(PythonInterp 3) + find_package(PythonLibs 3 REQUIRED) else() - FIND_PACKAGE(Boost COMPONENTS python) - FIND_PACKAGE(PythonInterp) - FIND_PACKAGE(PythonLibs REQUIRED) + find_package(Boost COMPONENTS python) + find_package(PythonInterp) + find_package(PythonLibs REQUIRED) endif() else() if (PYTHON_VERSION_MAJOR EQUAL 3) - FIND_PACKAGE(Boost COMPONENTS python${PYTHON_VERSION_MAJOR}${PYTHON_VERSION_MINOR}) - FIND_PACKAGE(PythonInterp 3) - FIND_PACKAGE(PythonLibs 3 REQUIRED) + find_package(Boost COMPONENTS python${PYTHON_VERSION_MAJOR}${PYTHON_VERSION_MINOR}) + find_package(PythonInterp 3) + find_package(PythonLibs 3 REQUIRED) else() - FIND_PACKAGE(Boost COMPONENTS python${PYTHON_VERSION_MAJOR}${PYTHON_VERSION_MINOR}) - FIND_PACKAGE(PythonInterp) - FIND_PACKAGE(PythonLibs REQUIRED) + find_package(Boost COMPONENTS python${PYTHON_VERSION_MAJOR}${PYTHON_VERSION_MINOR}) + find_package(PythonInterp) + find_package(PythonLibs REQUIRED) endif() endif() else() @@ -34,16 +34,16 @@ message(STATUS "PYTHON_EXECUTABLE = ${PYTHON_EXECUTABLE}") message(STATUS "PYTHON_INCLUDE_DIRS = ${PYTHON_INCLUDE_DIRS}") message(STATUS "Boost_LIBRARIES = ${Boost_LIBRARIES}") -ENABLE_TESTING() -INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIRS} ${PYTHON_INCLUDE_DIRS}) -LINK_LIBRARIES(${Boost_LIBRARIES} ${PYTHON_LIBRARIES}) # Deprecated but so convenient! +enable_testing() +include_directories(${Boost_INCLUDE_DIRS} ${PYTHON_INCLUDE_DIRS}) +link_libraries(${Boost_LIBRARIES} ${PYTHON_LIBRARIES}) include(CheckCXXCompilerFlag) -CHECK_CXX_COMPILER_FLAG("-std=c++11" COMPILER_SUPPORTS_CXX11) +check_cxx_compiler_flag("-std=c++11" COMPILER_SUPPORTS_CXX11) if(COMPILER_SUPPORTS_CXX11) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") else() - CHECK_CXX_COMPILER_FLAG("-std=c++0x" COMPILER_SUPPORTS_CXX0X) + check_cxx_compiler_flag("-std=c++0x" COMPILER_SUPPORTS_CXX0X) if(COMPILER_SUPPORTS_CXX0X) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x") else() diff --git a/pysuccinct/CMakeLists.txt b/pysuccinct/CMakeLists.txt index d7ca332..9def46f 100644 --- a/pysuccinct/CMakeLists.txt +++ b/pysuccinct/CMakeLists.txt @@ -1,32 +1,18 @@ -PYTHON_ADD_MODULE(pycompress pycompress.cpp) -FILE(COPY pycompress.py DESTINATION .) +python_add_module(pycompress pycompress.cpp) +file(COPY pycompress.py DESTINATION .) -PYTHON_ADD_MODULE(pyquery_file pyquery_file.cpp) -FILE(COPY pyquery_file.py DESTINATION .) +python_add_module(pyquery_file pyquery_file.cpp) +file(COPY pyquery_file.py DESTINATION .) -PYTHON_ADD_MODULE(pyquery_kv pyquery_kv.cpp) -FILE(COPY pyquery_kv.py DESTINATION .) +python_add_module(pyquery_kv pyquery_kv.cpp) +file(COPY pyquery_kv.py DESTINATION .) -PYTHON_ADD_MODULE(pyquery_semistructured pyquery_semistructured.cpp) -FILE(COPY pyquery_semistructured.py DESTINATION .) +python_add_module(pyquery_semistructured pyquery_semistructured.cpp) +file(COPY pyquery_semistructured.py DESTINATION .) cmake_minimum_required(VERSION 2.8) project(succinct-pysuccinct CXX) -include(CheckCXXCompilerFlag) -CHECK_CXX_COMPILER_FLAG("-std=c++11" COMPILER_SUPPORTS_CXX11) -if(COMPILER_SUPPORTS_CXX11) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -else() - CHECK_CXX_COMPILER_FLAG("-std=c++0x" COMPILER_SUPPORTS_CXX0X) - if(COMPILER_SUPPORTS_CXX0X) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x") - else() - message(STATUS "The compiler ${CMAKE_CXX_COMPILER} has no C++11 support. Please use a different C++ compiler.") - endif() -endif() -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g") - set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) file(MAKE_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) diff --git a/pysuccinct/pycompress.cpp b/pysuccinct/pycompress.cpp index 8ed5b8a..311eb52 100644 --- a/pysuccinct/pycompress.cpp +++ b/pysuccinct/pycompress.cpp @@ -9,19 +9,9 @@ using namespace boost::python; /** - * Example program that takes an input file and compresses it using Succinct. + * Program that wraps succinct's compression functions for python use via boost */ -/** - * Prints usage - */ -void print_usage(char *exec) { - fprintf( - stderr, - "Usage: %s [-s sa_sampling_rate] [-i isa_sampling_rate] [-x sampling_scheme] [-n npa_sampling_rate] [-r npa_encoding_scheme] [-t input_type] [file]\n", - exec); -} - /** * Converts integer option to SamplingScheme */ @@ -75,63 +65,63 @@ NPA::NPAEncodingScheme EncodingSchemeFromOption(int opt) { } } -struct File{ +struct File { File(std::string inputpath, uint32_t sa_sampling_rate, uint32_t isa_sampling_rate, uint32_t npa_sampling_rate, int sampling_opt, int npa_opt){ - this->inputpath = inputpath; - this->sa_sampling_rate = sa_sampling_rate; - this->isa_sampling_rate = isa_sampling_rate; - this->npa_sampling_rate = npa_sampling_rate; - this->sampling_scheme = SamplingSchemeFromOption(sampling_opt); - this->npa_encoding_scheme = EncodingSchemeFromOption(npa_opt); + this->input_path_ = inputpath; + this->sa_sampling_rate_ = sa_sampling_rate; + this->isa_sampling_rate_ = isa_sampling_rate; + this->npa_sampling_rate_ = npa_sampling_rate; + this->sampling_scheme_ = SamplingSchemeFromOption(sampling_opt); + this->npa_encoding_scheme_ = EncodingSchemeFromOption(npa_opt); } - //File members - std::string inputpath; - uint32_t sa_sampling_rate; - uint32_t isa_sampling_rate; - uint32_t npa_sampling_rate; - SamplingScheme sampling_scheme; - NPA::NPAEncodingScheme npa_encoding_scheme; - - void compressFile(){ + void CompressFile() { // The following compresses an input file at "inputpath" in memory // as a flat file (no structure) using the compression parameters // passed in (sampling rates, etc.). // Leave the arguments unspecified to use default values. - auto *fd = new SuccinctFile(inputpath, + auto *fd = new SuccinctFile(input_path_, SuccinctMode::CONSTRUCT_IN_MEMORY, - sa_sampling_rate, isa_sampling_rate, - npa_sampling_rate, sampling_scheme, - sampling_scheme, npa_encoding_scheme); + sa_sampling_rate_, isa_sampling_rate_, + npa_sampling_rate_, sampling_scheme_, + sampling_scheme_, npa_encoding_scheme_); // Serialize the compressed representation to disk at the location .succinct - fd->Serialize(inputpath + ".succinct"); + fd->Serialize(input_path_ + ".succinct"); delete fd; } - void compressShard(){ + void CompressShard() { // The following compresses an input file at "inputpath" in memory // as a buffer containing key-value pairs. It uses newline '\n' to // differentiate between successive values, and assigns the line number // as the key for the corresponding value. - auto *fd = new SuccinctShard(0, inputpath, + auto *fd = new SuccinctShard(0, input_path_, SuccinctMode::CONSTRUCT_IN_MEMORY, - sa_sampling_rate, isa_sampling_rate, - npa_sampling_rate, sampling_scheme, - sampling_scheme, npa_encoding_scheme); + sa_sampling_rate_, isa_sampling_rate_, + npa_sampling_rate_, sampling_scheme_, + sampling_scheme_, npa_encoding_scheme_); // Serialize the compressed representation to disk at the location .succinct - fd->Serialize(inputpath + ".succinct"); + fd->Serialize(input_path_ + ".succinct"); delete fd; } -}; + //File members + std::string input_path_; + uint32_t sa_sampling_rate_; + uint32_t isa_sampling_rate_; + uint32_t npa_sampling_rate_; + SamplingScheme sampling_scheme_; + NPA::NPAEncodingScheme npa_encoding_scheme_; +}; +//Boost Python module BOOST_PYTHON_MODULE(pycompress){ class_("File", init()) - .def("compressFile", &File::compressFile) - .def("compressShard", &File::compressShard) + .def("CompressFile", &File::CompressFile) + .def("CompressShard", &File::CompressShard) ; } \ No newline at end of file diff --git a/pysuccinct/pycompress.py b/pysuccinct/pycompress.py index f8aad9c..871cb09 100644 --- a/pysuccinct/pycompress.py +++ b/pysuccinct/pycompress.py @@ -2,20 +2,20 @@ import sys import getopt -#Argument size error checking +# Argument size error checking argc = len(sys.argv) if (argc < 2 or argc > 12): print("Usage: %s [-m mode] [file]") sys.exit(2) -#Loop through arguments to change default values and get input path using getopt +# Loop through arguments to change default values and get input path using getopt try: optlist, args = getopt.getopt(sys.argv[1:], 's:i:x:n:r:t:') except getopt.GetoptError as err: print("Get opt error") sys.exit(2) -#Default values +# Default values sa_sampling_rate = 32 isa_sampling_rate = 32 sampling_scheme = 0 @@ -24,7 +24,7 @@ type = "file" inputpath = "" -#Modify default values +# Modify default values for o, a in optlist: if o == "-s": sa_sampling_rate = int(a) @@ -43,28 +43,23 @@ sys.exit(2) -#Process input path +# Process input path if (len(args) != 1): - #Should have 1 argument left for input path print("Usage: %s [-m mode] [file]") sys.exit(2) else: - #Set the last unparsed element to the input path inputpath = str(args[0]) -#Create the file struct and compress +# Create the file struct and compress file = pycompress.File(inputpath, sa_sampling_rate, isa_sampling_rate, npa_sampling_rate, sampling_scheme, npa_encoding_scheme) -#Compress file or shard depending on type +# Compress file or shard depending on type if (type == "file"): - #Compress file - file.compressFile() + file.CompressFile() elif (type == "kv"): - #Compress shard - file.compressShard() + file.CompressShard() else: - #Error print("Invalid type\n") sys.exit(2) diff --git a/pysuccinct/pyquery_file.cpp b/pysuccinct/pyquery_file.cpp index 8a64296..e36ba92 100644 --- a/pysuccinct/pyquery_file.cpp +++ b/pysuccinct/pyquery_file.cpp @@ -11,88 +11,57 @@ using namespace boost::python; /** - * Prints usage. + * Program that wraps succinct's query file functions for python use via boost */ -void print_usage(char *exec) { - fprintf(stderr, "Usage: %s [-m mode] [file]\n", exec); -} - -void print_valid_cmds() { - std::cerr - << "Command must be one of: search [query], count [query], extract [offset] [length]\n"; -} - -typedef unsigned long long int timestamp_t; - -static timestamp_t get_timestamp() { - struct timeval now{}; - gettimeofday(&now, nullptr); - - return (now.tv_usec + (time_t) now.tv_sec * 1000000); -} struct QueryFile { - QueryFile(std::string filename, uint32_t mode){ - this->s_file = nullptr; + QueryFile(std::string filename, uint32_t mode) { + this->s_file_ = nullptr; if (mode == 0) { // If mode is set to 0, compress the input file. // Use default parameters. std::cout << "Constructing Succinct data structures...\n"; - this->s_file = new SuccinctFile(filename); + this->s_file_ = new SuccinctFile(filename); std::cout << "Serializing Succinct data structures...\n"; - this->s_file->Serialize(filename + ".succinct"); + this->s_file_->Serialize(filename + ".succinct"); } else { // If mode is set to 1, read the serialized data structures from disk. // The serialized data structures must exist at .succinct. std::cout << "De-serializing Succinct data structures...\n"; - this->s_file = new SuccinctFile(filename, SuccinctMode::LOAD_IN_MEMORY); + this->s_file_ = new SuccinctFile(filename, SuccinctMode::LOAD_IN_MEMORY); } - std::cout << "Done. Starting Succinct Shell...\n"; - print_valid_cmds(); } - //QueryFile members - SuccinctFile *s_file; - //Wrapped search command - void search(std::string arg){ + std::vector Search(const std::string& arg) { std::vector results; - timestamp_t start = get_timestamp(); - s_file->Search(results, arg); - timestamp_t tot_time = get_timestamp() - start; - std::cout << "Found " << results.size() << " results in " << tot_time - << "us:\n"; - for (auto res : results) { - std::cout << res << ", "; - } - std::cout << std::endl; + s_file_->Search(results, arg); + return results; } //Wrapped count command - void count(std::string arg){ - timestamp_t start = get_timestamp(); - int64_t count = s_file->Count(arg); - timestamp_t tot_time = get_timestamp() - start; - std::cout << "Count = " << count << "; Time taken: " << tot_time - << "us\n"; + int64_t Count(const std::string& arg) { + int64_t count = s_file_->Count(arg); + return count; } //Wrapped extract command - void extract(uint64_t offset, uint64_t length){ - timestamp_t start = get_timestamp(); + std::string Extract(uint64_t offset, uint64_t length) { std::string result; - s_file->Extract(result, offset, length); - timestamp_t tot_time = get_timestamp() - start; - std::cout << "Extracted string = " << result << "; Time taken: " - << tot_time << "us\n"; + s_file_->Extract(result, offset, length); + return result; } + + //QueryFile members + SuccinctFile *s_file_; }; +//Boost Python module BOOST_PYTHON_MODULE(pyquery_file){ class_("QueryFile", init()) - .def("search", &QueryFile::search) - .def("count", &QueryFile::count) - .def("extract", &QueryFile::extract) + .def("Search", &QueryFile::Search) + .def("Count", &QueryFile::Count) + .def("Extract", &QueryFile::Extract) ; } \ No newline at end of file diff --git a/pysuccinct/pyquery_file.py b/pysuccinct/pyquery_file.py index 1c7396d..e11adec 100644 --- a/pysuccinct/pyquery_file.py +++ b/pysuccinct/pyquery_file.py @@ -2,7 +2,7 @@ import sys import getopt -#Try catch block for non integer argument checking +# Try catch block for non integer argument checking def RepresentsInt(s): try: int(s) @@ -10,24 +10,24 @@ def RepresentsInt(s): except ValueError: return False -#Argument size error checking +# Argument size error checking argc = len(sys.argv) if (argc < 2 or argc > 12): print("Usage: %s [-m mode] [file]") sys.exit(2) -#Loop through arguments to change default values and get input path using getopt +# Loop through arguments to change default values and get input path using getopt try: optlist, args = getopt.getopt(sys.argv[1:], 'm:') except getopt.GetoptError as err: print(("Get opt error")) sys.exit(2) -#Default values +# Default values mode = 0 filename = "" -#Modify default values +# Modify default values for o, a in optlist: if o == "-m": mode = int(a) @@ -35,33 +35,32 @@ def RepresentsInt(s): printf("Invalid Option") sys.exit(2) -#Process filename +# Process filename if (len(args) != 1): - #Should have 1 argument left for filename print("Usage: %s [-m mode] [file]") sys.exit(2) else: - #Set the last unparsed element to the filename filename = str(args[0]) -#Create the pyquery_file struct and run command given on next input +# Create the pyquery_file struct and run command given on next input q = pyquery_file.QueryFile(filename, mode) +print("Done. Starting Succinct Shell...") +print("Command must be one of: search [query], count [query], extract [offset] [length]") -#parse through line by line +# Parse through line by line while (True): line = input("succinct> ") line = line.split(" ", 1) - #Search command if (line[0] == "search"): if (len(line) != 2): print("Could not parse command: ") else: - q.search(line[1].strip()) + q.Search(line[1].strip()) elif(line[0] == "count"): if (len(line) != 2): - print("Could not parse command: "]) + print("Could not parse command: ") else: - q.count(line[1].strip()) + q.Count(line[1].strip()) elif(line[0] == "extract"): if (len(line) == 1): print("Could not parse command: ") @@ -72,7 +71,7 @@ def RepresentsInt(s): if (len(line) != 2 or RepresentsInt(offset) == False or RepresentsInt(length) == False): print("Could not parse command: ") else: - q.extract(int(line[0].strip()), int(line[1].strip())) + q.Extract(int(line[0].strip()), int(line[1].strip())) elif(line[0] == "exit"): break else: diff --git a/pysuccinct/pyquery_kv.cpp b/pysuccinct/pyquery_kv.cpp index af8eda3..49a22dd 100644 --- a/pysuccinct/pyquery_kv.cpp +++ b/pysuccinct/pyquery_kv.cpp @@ -11,90 +11,59 @@ using namespace boost::python; /** - * Prints usage. + * Program that wraps succinct's query kv functions for python use via boost */ -void print_usage(char *exec) { - fprintf(stderr, "Usage: %s [-m mode] [file]\n", exec); -} - -void print_valid_cmds() { - std::cerr - << "Command must be one of: search [query], count [query], get [key]\n"; -} - -typedef unsigned long long int timestamp_t; - -static timestamp_t get_timestamp() { - struct timeval now{}; - gettimeofday(&now, nullptr); - - return (now.tv_usec + (time_t) now.tv_sec * 1000000); -} struct QueryKv{ - QueryKv(std::string filename, uint32_t mode){ - this->s_file = nullptr; + QueryKv(std::string filename, uint32_t mode) { + this->s_file_ = nullptr; if (mode == 0) { // If mode is set to 0, compress the input file. // Use default parameters. std::cout << "Constructing Succinct data structures...\n"; - s_file = new SuccinctShard(0, filename); + s_file_ = new SuccinctShard(0, filename); std::cout << "Serializing Succinct data structures...\n"; - s_file->Serialize(filename + ".succinct"); + s_file_->Serialize(filename + ".succinct"); } else { // If mode is set to 1, read the serialized data structures from disk. // The serialized data structures must exist at .succinct. std::cout << "De-serializing Succinct data structures...\n"; - s_file = new SuccinctShard(0, filename, SuccinctMode::LOAD_IN_MEMORY); + s_file_ = new SuccinctShard(0, filename, SuccinctMode::LOAD_IN_MEMORY); } - std::cout << "Done. Starting Succinct Shell...\n"; - print_valid_cmds(); } - //QueryKv members - SuccinctShard *s_file; - //Wrapped search command - void search(std::string arg){ + std::set Search(const std::string &arg) { std::set results; - timestamp_t start = get_timestamp(); - s_file->Search(results, arg); - timestamp_t tot_time = get_timestamp() - start; - std::cout << "Found " << results.size() << " results in " << tot_time - << "us:\n"; - for (auto res : results) { - std::cout << res << ", "; - } - std::cout << std::endl; + s_file_->Search(results, arg); + return results; } //Wrapped count command - void count(std::string arg){ - timestamp_t start = get_timestamp(); - int64_t count = s_file->Count(arg); - timestamp_t tot_time = get_timestamp() - start; - std::cout << "Count = " << count << "; Time taken: " << tot_time - << "us\n"; + int64_t Count(const std::string &arg) { + int64_t count = s_file_->Count(arg); + return count; } //Wrapped get command - void get(uint64_t key){ - timestamp_t start = get_timestamp(); + std::string Get(uint64_t key) { std::string result; - s_file->Get(result, key); - timestamp_t tot_time = get_timestamp() - start; - std::cout << "Value = " << result << "; Time taken: " - << tot_time << "us\n"; + s_file_->Get(result, key); + return result; } + //QueryKv members + SuccinctShard *s_file_; + }; +//Boost Python module BOOST_PYTHON_MODULE(pyquery_kv){ class_("QueryKv", init()) - .def("search", &QueryKv::search) - .def("count", &QueryKv::count) - .def("get", &QueryKv::get) + .def("Search", &QueryKv::Search) + .def("Count", &QueryKv::Count) + .def("Get", &QueryKv::Get) ; } diff --git a/pysuccinct/pyquery_kv.py b/pysuccinct/pyquery_kv.py index 8903e11..b55592c 100644 --- a/pysuccinct/pyquery_kv.py +++ b/pysuccinct/pyquery_kv.py @@ -2,7 +2,7 @@ import sys import getopt -#Try catch block for non integer argument checking +# Try catch block for non integer argument checking def RepresentsInt(s): try: int(s) @@ -10,24 +10,24 @@ def RepresentsInt(s): except ValueError: return False -#Argument size error checking +# Argument size error checking argc = len(sys.argv) if (argc < 2 or argc > 12): print("Usage: %s [-m mode] [file]") sys.exit(2) -#Loop through arguments to change default values and get input path using getopt +# Loop through arguments to change default values and get input path using getopt try: optlist, args = getopt.getopt(sys.argv[1:], 'm:') except getopt.GetoptError as err: print("Get opt error") sys.exit(2) -#Default values +# Default values mode = 0 filename = "" -#Modify default values +# Modify default values for o, a in optlist: if o == "-m": mode = int(a) @@ -35,19 +35,19 @@ def RepresentsInt(s): print("Invalid Option") sys.exit(2) -#Process filename +# Process filename if (len(args) != 1): - #Should have 1 argument left for filename print("Usage: %s [-m mode] [file]") sys.exit(2) else: - #Set the last unparsed element to the filename filename = str(args[0]) -#Create the pyquery_file struct and run command given on next input +# Create the pyquery_file struct and run command given on next input q = pyquery_kv.QueryKv(filename, mode) +print("Done. Starting Succinct Shell...") +print("Command must be one of: search [query], count [query], get [key]") -#parse through line by line +# Parse through line by line while (True): line = input("succinct> ") line = line.split(" ", 1) @@ -55,12 +55,12 @@ def RepresentsInt(s): if (len(line) != 2): print("Could not parse command: ") else: - q.search(line[1].strip()) + q.Search(line[1].strip()) elif(line[0] == "count"): if (len(line) != 2): print("Could not parse command: ") else: - q.count(line[1].strip()) + q.Count(line[1].strip()) elif(line[0] == "get"): if (len(line) == 1): print("Could not parse command: ") @@ -69,7 +69,7 @@ def RepresentsInt(s): if (len(line) != 2 or RepresentsInt(key) == False): print("Could not parse command: ") else: - q.get((int(key))) + q.Get((int(key))) elif(line[0] == "exit"): break else: diff --git a/pysuccinct/pyquery_semistructured.cpp b/pysuccinct/pyquery_semistructured.cpp index bdc5327..860e89f 100644 --- a/pysuccinct/pyquery_semistructured.cpp +++ b/pysuccinct/pyquery_semistructured.cpp @@ -10,93 +10,60 @@ using namespace boost::python; /** - * Prints usage. + * Program that wraps succinct's query semistructured functions for python use via boost */ -void print_usage(char *exec) { - fprintf(stderr, "Usage: %s [-m mode] [file]\n", exec); -} - -void print_valid_cmds() { - std::cerr << "Command must be one of:\n" - << "\t\tsearch [attr_key] [attr_val]\n" - << "\t\tcount [attr_key] [attr_val]\n" - << "\t\tget [key] [attr_key]\n"; -} - -typedef unsigned long long int timestamp_t; - -static timestamp_t get_timestamp() { - struct timeval now{}; - gettimeofday(&now, nullptr); - - return (now.tv_usec + (time_t) now.tv_sec * 1000000); -} struct QuerySemistructured{ - QuerySemistructured(std::string filename, uint32_t mode){ - this->s_file = nullptr; + QuerySemistructured(std::string filename, uint32_t mode) { + this->s_file_ = nullptr; if (mode == 0) { // If mode is set to 0, compress the input file. // Use default parameters. std::cout << "Constructing Succinct data structures...\n"; - s_file = new SuccinctSemistructuredShard(filename); + s_file_ = new SuccinctSemistructuredShard(filename); std::cout << "Serializing Succinct data structures...\n"; - s_file->Serialize(filename + ".succinct"); + s_file_->Serialize(filename + ".succinct"); } else { // If mode is set to 1, read the serialized data structures from disk. // The serialized data structures must exist at .succinct. std::cout << "De-serializing Succinct data structures...\n"; - s_file = new SuccinctSemistructuredShard(filename, + s_file_ = new SuccinctSemistructuredShard(filename, SuccinctMode::LOAD_IN_MEMORY); } - std::cout << "Done. Starting Succinct Shell...\n"; - print_valid_cmds(); } - //QuerySemistructured members - SuccinctSemistructuredShard *s_file; - //Wrapped search command - void search(const std::string attr_key, const std::string attr_val){ + std::set Search(const std::string &attr_key, const std::string &attr_val) { std::set results; - timestamp_t start = get_timestamp(); - s_file->SearchAttribute(results, attr_key, attr_val); - timestamp_t tot_time = get_timestamp() - start; - std::cout << "Found " << results.size() << " records in " << tot_time - << "us; Matching keys:\n"; - for (auto res : results) { - std::cout << res << ", "; - } - std::cout << std::endl; + s_file_->SearchAttribute(results, attr_key, attr_val); + return results; } //Wrapped count command - void count(const std::string attr_key, const std::string attr_val){ - timestamp_t start = get_timestamp(); - int64_t count = s_file->CountAttribute(attr_key, attr_val); - timestamp_t tot_time = get_timestamp() - start; - std::cout << "Number of matching records = " << count << "; Time taken: " - << tot_time << "us\n"; + int64_t Count(const std::string &attr_key, const std::string &attr_val) { + int64_t count = s_file_->CountAttribute(attr_key, attr_val); + return count; } //Wrapped get command - void get(int64_t key, std::string attr_key){ - timestamp_t start = get_timestamp(); + std::string Get(int64_t key, std::string attr_key) { std::string result; - s_file->Get(result, key, attr_key); - timestamp_t tot_time = get_timestamp() - start; - std::cout << "Value = " << result << "; Time taken: " << tot_time - << "us\n"; + s_file_->Get(result, key, attr_key); + return result; } + //QuerySemistructured members + SuccinctSemistructuredShard *s_file_; + }; +//Boost Python module BOOST_PYTHON_MODULE(pyquery_semistructured){ class_("QuerySemistructured", init()) - .def("search", &QuerySemistructured::search) - .def("count", &QuerySemistructured::count) - .def("get", &QuerySemistructured::get) + .def("Search", &QuerySemistructured::Search) + .def("Count", &QuerySemistructured::Count) + .def("Get", &QuerySemistructured::Get) ; } diff --git a/pysuccinct/pyquery_semistructured.py b/pysuccinct/pyquery_semistructured.py index 466bee3..d461975 100644 --- a/pysuccinct/pyquery_semistructured.py +++ b/pysuccinct/pyquery_semistructured.py @@ -2,7 +2,7 @@ import sys import getopt -#Try catch block for non integer argument checking +# Try catch block for non integer argument checking def RepresentsInt(s): try: int(s) @@ -10,24 +10,24 @@ def RepresentsInt(s): except ValueError: return False -#Argument size error checking +# Argument size error checking argc = len(sys.argv) if (argc < 2 or argc > 12): print("Usage: %s [-m mode] [file]") sys.exit(2) -#Loop through arguments to change default values and get input path using getopt +# Loop through arguments to change default values and get input path using getopt try: optlist, args = getopt.getopt(sys.argv[1:], 'm:') except getopt.GetoptError as err: print("Get opt error") sys.exit(2) -#Default values +# Default values mode = 0 filename = "" -#Modify default values +# Modify default values for o, a in optlist: if o == "-m": mode = int(a) @@ -35,19 +35,19 @@ def RepresentsInt(s): printf("Invalid Option") sys.exit(2) -#Process filename +# Process filename if (len(args) != 1): - #Should have 1 argument left for filename print("Usage: %s [-m mode] [file]") sys.exit(2) else: - #Set the last unparsed element to the filename filename = str(args[0]) -#Create the pyquery_file struct and run command given on next input +# Create the pyquery_file struct and run command given on next input q = pyquery_semistructured.QuerySemistructured(filename, mode) +print("Done. Starting Succinct Shell...") +print("Command must be one of:\n\t\tsearch [attr_key] [attr_val]\n\t\tcount [attr_key] [attr_val]\n\t\tget [key] [attr_key]") -#parse through line by line +# Parse through line by line while (True): line = input("succinct> ") line = line.split(" ") @@ -55,12 +55,12 @@ def RepresentsInt(s): if (len(line) != 3): print("Could not parse command: ") else: - q.search(line[1].strip(), line[2].strip()) + q.Search(line[1].strip(), line[2].strip()) elif(line[0] == "count"): if (len(line) != 3): print("Could not parse command: ") else: - q.count(line[1].strip(), line[2].strip()) + q.Count(line[1].strip(), line[2].strip()) elif(line[0] == "get"): if (len(line) == 1): print("Could not parse command: ") @@ -70,7 +70,7 @@ def RepresentsInt(s): if (len(line) != 3 or RepresentsInt(key) == False): print("Could not parse command: ") else: - q.get((int(key), attr_key)) + q.Get((int(key), attr_key)) elif(line[0] == "exit"): break else: From c625f3f9ae07b38bc7298209a5dc64e2bf353cbf Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Fri, 8 May 2020 12:08:34 -0400 Subject: [PATCH 08/39] Reorganized Modules, all 3 types have query and compression functions --- .vscode/settings.json | 6 +- pysuccinct/CMakeLists.txt | 22 ++-- pysuccinct/pycompress.cpp | 127 -------------------- pysuccinct/pycompress.py | 65 ----------- pysuccinct/pyfile.cpp | 160 ++++++++++++++++++++++++++ pysuccinct/pyfile.py | 98 ++++++++++++++++ pysuccinct/pykv.cpp | 150 ++++++++++++++++++++++++ pysuccinct/pykv.py | 99 ++++++++++++++++ pysuccinct/pyquery_file.cpp | 67 ----------- pysuccinct/pyquery_file.py | 79 ------------- pysuccinct/pyquery_kv.cpp | 71 ------------ pysuccinct/pyquery_kv.py | 77 ------------- pysuccinct/pyquery_semistructured.cpp | 71 ------------ pysuccinct/pyquery_semistructured.py | 78 ------------- pysuccinct/pysemistructured.cpp | 152 ++++++++++++++++++++++++ pysuccinct/pysemistructured.py | 97 ++++++++++++++++ 16 files changed, 769 insertions(+), 650 deletions(-) delete mode 100644 pysuccinct/pycompress.cpp delete mode 100644 pysuccinct/pycompress.py create mode 100644 pysuccinct/pyfile.cpp create mode 100644 pysuccinct/pyfile.py create mode 100644 pysuccinct/pykv.cpp create mode 100644 pysuccinct/pykv.py delete mode 100644 pysuccinct/pyquery_file.cpp delete mode 100644 pysuccinct/pyquery_file.py delete mode 100644 pysuccinct/pyquery_kv.cpp delete mode 100644 pysuccinct/pyquery_kv.py delete mode 100644 pysuccinct/pyquery_semistructured.cpp delete mode 100644 pysuccinct/pyquery_semistructured.py create mode 100644 pysuccinct/pysemistructured.cpp create mode 100644 pysuccinct/pysemistructured.py diff --git a/.vscode/settings.json b/.vscode/settings.json index 8870183..66a9512 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -29,6 +29,8 @@ "map": "cpp", "set": "cpp", "unordered_map": "cpp", - "__node_handle": "cpp" - } + "__node_handle": "cpp", + "cwchar": "cpp" + }, + "python.linting.enabled": false } \ No newline at end of file diff --git a/pysuccinct/CMakeLists.txt b/pysuccinct/CMakeLists.txt index 9def46f..571090d 100644 --- a/pysuccinct/CMakeLists.txt +++ b/pysuccinct/CMakeLists.txt @@ -1,14 +1,11 @@ -python_add_module(pycompress pycompress.cpp) -file(COPY pycompress.py DESTINATION .) +python_add_module(pyfile pyfile.cpp) +file(COPY pyfile.py DESTINATION .) -python_add_module(pyquery_file pyquery_file.cpp) -file(COPY pyquery_file.py DESTINATION .) +python_add_module(pykv pykv.cpp) +file(COPY pykv.py DESTINATION .) -python_add_module(pyquery_kv pyquery_kv.cpp) -file(COPY pyquery_kv.py DESTINATION .) - -python_add_module(pyquery_semistructured pyquery_semistructured.cpp) -file(COPY pyquery_semistructured.py DESTINATION .) +python_add_module(pysemistructured pysemistructured.cpp) +file(COPY pysemistructured.py DESTINATION .) cmake_minimum_required(VERSION 2.8) project(succinct-pysuccinct CXX) @@ -20,7 +17,6 @@ file(MAKE_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) set(INCLUDE include ../core/include) include_directories(${INCLUDE}) -target_link_libraries(pycompress succinct) -target_link_libraries(pyquery_file succinct) -target_link_libraries(pyquery_kv succinct) -target_link_libraries(pyquery_semistructured succinct) \ No newline at end of file +target_link_libraries(pyfile succinct) +target_link_libraries(pykv succinct) +target_link_libraries(pysemistructured succinct) \ No newline at end of file diff --git a/pysuccinct/pycompress.cpp b/pysuccinct/pycompress.cpp deleted file mode 100644 index 311eb52..0000000 --- a/pysuccinct/pycompress.cpp +++ /dev/null @@ -1,127 +0,0 @@ -#include -#include - -#include "succinct_shard.h" -#include "succinct_file.h" -#include "npa/npa.h" - -#include -using namespace boost::python; - -/** - * Program that wraps succinct's compression functions for python use via boost - */ - -/** - * Converts integer option to SamplingScheme - */ -SamplingScheme SamplingSchemeFromOption(int opt) { - switch (opt) { - case 0: { - fprintf(stderr, "Sampling Scheme = Flat Sample by Index\n"); - return SamplingScheme::FLAT_SAMPLE_BY_INDEX; - } - case 1: { - fprintf(stderr, "Sampling Scheme = Flat Sample by Value\n"); - return SamplingScheme::FLAT_SAMPLE_BY_VALUE; - } - case 2: { - fprintf(stderr, "Sampling Scheme = Layered Sample by Index\n"); - return SamplingScheme::LAYERED_SAMPLE_BY_INDEX; - } - case 3: { - fprintf(stderr, - "Sampling Scheme = Opportunistic Layered Sample by Index\n"); - return SamplingScheme::OPPORTUNISTIC_LAYERED_SAMPLE_BY_INDEX; - } - default: { - fprintf(stderr, "Sampling Scheme = Flat Sample by Index\n"); - return SamplingScheme::FLAT_SAMPLE_BY_INDEX; - } - } -} - -/** - * Converts integer option to NPAEncodingScheme - */ -NPA::NPAEncodingScheme EncodingSchemeFromOption(int opt) { - switch (opt) { - case 0: { - fprintf(stderr, "NPA Encoding Scheme = Elias Delta\n"); - return NPA::NPAEncodingScheme::ELIAS_DELTA_ENCODED; - } - case 1: { - fprintf(stderr, "NPA Encoding Scheme = Elias Gamma\n"); - return NPA::NPAEncodingScheme::ELIAS_GAMMA_ENCODED; - } - case 2: { - fprintf(stderr, "NPA Encoding Scheme = Wavelet Tree\n"); - return NPA::NPAEncodingScheme::WAVELET_TREE_ENCODED; - } - default: { - fprintf(stderr, "NPA Encoding Scheme = Elias Gamma\n"); - return NPA::NPAEncodingScheme::ELIAS_GAMMA_ENCODED; - } - } -} - -struct File { - File(std::string inputpath, uint32_t sa_sampling_rate, uint32_t isa_sampling_rate, - uint32_t npa_sampling_rate, int sampling_opt, int npa_opt){ - this->input_path_ = inputpath; - this->sa_sampling_rate_ = sa_sampling_rate; - this->isa_sampling_rate_ = isa_sampling_rate; - this->npa_sampling_rate_ = npa_sampling_rate; - this->sampling_scheme_ = SamplingSchemeFromOption(sampling_opt); - this->npa_encoding_scheme_ = EncodingSchemeFromOption(npa_opt); - } - - void CompressFile() { - // The following compresses an input file at "inputpath" in memory - // as a flat file (no structure) using the compression parameters - // passed in (sampling rates, etc.). - // Leave the arguments unspecified to use default values. - auto *fd = new SuccinctFile(input_path_, - SuccinctMode::CONSTRUCT_IN_MEMORY, - sa_sampling_rate_, isa_sampling_rate_, - npa_sampling_rate_, sampling_scheme_, - sampling_scheme_, npa_encoding_scheme_); - - // Serialize the compressed representation to disk at the location .succinct - fd->Serialize(input_path_ + ".succinct"); - delete fd; - } - - void CompressShard() { - // The following compresses an input file at "inputpath" in memory - // as a buffer containing key-value pairs. It uses newline '\n' to - // differentiate between successive values, and assigns the line number - // as the key for the corresponding value. - auto *fd = new SuccinctShard(0, input_path_, - SuccinctMode::CONSTRUCT_IN_MEMORY, - sa_sampling_rate_, isa_sampling_rate_, - npa_sampling_rate_, sampling_scheme_, - sampling_scheme_, npa_encoding_scheme_); - - // Serialize the compressed representation to disk at the location .succinct - fd->Serialize(input_path_ + ".succinct"); - delete fd; - } - - //File members - std::string input_path_; - uint32_t sa_sampling_rate_; - uint32_t isa_sampling_rate_; - uint32_t npa_sampling_rate_; - SamplingScheme sampling_scheme_; - NPA::NPAEncodingScheme npa_encoding_scheme_; - -}; - -//Boost Python module -BOOST_PYTHON_MODULE(pycompress){ - class_("File", init()) - .def("CompressFile", &File::CompressFile) - .def("CompressShard", &File::CompressShard) - ; -} \ No newline at end of file diff --git a/pysuccinct/pycompress.py b/pysuccinct/pycompress.py deleted file mode 100644 index 871cb09..0000000 --- a/pysuccinct/pycompress.py +++ /dev/null @@ -1,65 +0,0 @@ -import pycompress -import sys -import getopt - -# Argument size error checking -argc = len(sys.argv) -if (argc < 2 or argc > 12): - print("Usage: %s [-m mode] [file]") - sys.exit(2) - -# Loop through arguments to change default values and get input path using getopt -try: - optlist, args = getopt.getopt(sys.argv[1:], 's:i:x:n:r:t:') -except getopt.GetoptError as err: - print("Get opt error") - sys.exit(2) - -# Default values -sa_sampling_rate = 32 -isa_sampling_rate = 32 -sampling_scheme = 0 -npa_sampling_rate = 128 -npa_encoding_scheme = 1 -type = "file" -inputpath = "" - -# Modify default values -for o, a in optlist: - if o == "-s": - sa_sampling_rate = int(a) - elif o == "-i": - isa_sampling_rate = int(a) - elif o == "-x": - sampling_scheme = int(a) - elif o == "-n": - npa_sampling_rate = int(a) - elif o == "-r": - npa_encoding_scheme = int(a) - elif o == "-t": - type = a - else: - printf("Invalid Option") - sys.exit(2) - - -# Process input path -if (len(args) != 1): - print("Usage: %s [-m mode] [file]") - sys.exit(2) -else: - inputpath = str(args[0]) - -# Create the file struct and compress -file = pycompress.File(inputpath, sa_sampling_rate, - isa_sampling_rate, npa_sampling_rate, - sampling_scheme, npa_encoding_scheme) - -# Compress file or shard depending on type -if (type == "file"): - file.CompressFile() -elif (type == "kv"): - file.CompressShard() -else: - print("Invalid type\n") - sys.exit(2) diff --git a/pysuccinct/pyfile.cpp b/pysuccinct/pyfile.cpp new file mode 100644 index 0000000..21d2644 --- /dev/null +++ b/pysuccinct/pyfile.cpp @@ -0,0 +1,160 @@ +#include +#include +#include +#include +#include +#include + +#include "succinct_file.h" + +#include +using namespace boost::python; + +/** + * Program that wraps succinct's query file functions for python use via boost + */ + +/** + * Converts integer option to SamplingScheme + */ +SamplingScheme SamplingSchemeFromOption(int opt) { + switch (opt) { + case 0: { + fprintf(stderr, "Sampling Scheme = Flat Sample by Index\n"); + return SamplingScheme::FLAT_SAMPLE_BY_INDEX; + } + case 1: { + fprintf(stderr, "Sampling Scheme = Flat Sample by Value\n"); + return SamplingScheme::FLAT_SAMPLE_BY_VALUE; + } + case 2: { + fprintf(stderr, "Sampling Scheme = Layered Sample by Index\n"); + return SamplingScheme::LAYERED_SAMPLE_BY_INDEX; + } + case 3: { + fprintf(stderr, + "Sampling Scheme = Opportunistic Layered Sample by Index\n"); + return SamplingScheme::OPPORTUNISTIC_LAYERED_SAMPLE_BY_INDEX; + } + default: { + fprintf(stderr, "Sampling Scheme = Flat Sample by Index\n"); + return SamplingScheme::FLAT_SAMPLE_BY_INDEX; + } + } +} + +/** + * Converts integer option to NPAEncodingScheme + */ +NPA::NPAEncodingScheme EncodingSchemeFromOption(int opt) { + switch (opt) { + case 0: { + fprintf(stderr, "NPA Encoding Scheme = Elias Delta\n"); + return NPA::NPAEncodingScheme::ELIAS_DELTA_ENCODED; + } + case 1: { + fprintf(stderr, "NPA Encoding Scheme = Elias Gamma\n"); + return NPA::NPAEncodingScheme::ELIAS_GAMMA_ENCODED; + } + case 2: { + fprintf(stderr, "NPA Encoding Scheme = Wavelet Tree\n"); + return NPA::NPAEncodingScheme::WAVELET_TREE_ENCODED; + } + default: { + fprintf(stderr, "NPA Encoding Scheme = Elias Gamma\n"); + return NPA::NPAEncodingScheme::ELIAS_GAMMA_ENCODED; + } + } +} + +/** + * Boost python function to convert vector to python list + */ +boost::python::list VectorToList(const std::vector& v) { + boost::python::object get_iter = boost::python::iterator >(); + boost::python::object iter = get_iter(v); + boost::python::list l(iter); + return l; +} + +/** + * PyFile struct that wraps query and compress functions for boost python + */ +struct PyFile { + // Constructor that loads from file + PyFile(const std::string& filename) { + s_file_ = nullptr; + // Read the serialized data structures from disk. + // The serialized data structures must exist at .succinct. + std::cout << "De-serializing Succinct data structures...\n"; + s_file_ = new SuccinctFile(filename, SuccinctMode::LOAD_IN_MEMORY); + } + + + // Constructor that compresses file given the sampling rate arguments + PyFile(const std::string& inputpath, uint32_t sa_sampling_rate, uint32_t isa_sampling_rate, + uint32_t npa_sampling_rate, int sampling_opt, int npa_opt){ + s_file_ = nullptr; + // The following compresses an input file at "inputpath" in memory + // as a flat file (no structure) using the compression parameters + // passed in (sampling rates, etc.). + // Leave the arguments unspecified to use default values. + std::cout << "Constructing Succinct data structures...\n"; + s_file_ = new SuccinctFile(inputpath, + SuccinctMode::CONSTRUCT_IN_MEMORY, + sa_sampling_rate, isa_sampling_rate, + npa_sampling_rate, SamplingSchemeFromOption(sampling_opt), + SamplingSchemeFromOption(sampling_opt), EncodingSchemeFromOption(npa_opt)); + std::cout << "Serializing Succinct data structures...\n"; + // Serialize the compressed representation to disk at the location .succinct + s_file_->Serialize(inputpath + ".succinct"); + } + + // Wrapped search command, that returns a python list + boost::python::list PySearch(const std::string& arg) { + std::vector results; + s_file_->Search(results, arg); + boost::python::list ret; + for (const auto& i :results){ + ret.append(i); + } + return ret; + } + + // Wrapped search command + std::vector Search(const std::string& arg) { + std::vector results; + s_file_->Search(results, arg); + return results; + } + + // Wrapped count command + int64_t Count(const std::string& arg) { + int64_t count = s_file_->Count(arg); + return count; + } + + // Wrapped extract command + std::string Extract(uint64_t offset, uint64_t length) { + std::string result; + s_file_->Extract(result, offset, length); + return result; + } + + //PyFile members + SuccinctFile *s_file_; +}; + +/** + * Boost Python module + */ +BOOST_PYTHON_MODULE(pyfile){ + class_("PyFile", init()) + .def(init()) + .def("PySearch", &PyFile::PySearch) + .def("Search", &PyFile::Search) + .def("Count", &PyFile::Count) + .def("Extract", &PyFile::Extract) + ; + +} \ No newline at end of file diff --git a/pysuccinct/pyfile.py b/pysuccinct/pyfile.py new file mode 100644 index 0000000..4938734 --- /dev/null +++ b/pysuccinct/pyfile.py @@ -0,0 +1,98 @@ +import pyfile +import sys +import getopt + +# Try catch block for non integer argument checking +def RepresentsInt(s): + try: + int(s) + return True + except ValueError: + return False + +# Default values +sa_sampling_rate = 32 +isa_sampling_rate = 32 +sampling_scheme = 0 +npa_sampling_rate = 128 +npa_encoding_scheme = 1 +type = "file" +inputpath = "" + +# Get user input to either load from memory or compress a new file +option = input("Usage: [load/compress] [file]\n") +option = option.split() +if (len(option) != 2): + print("Usage: [load/compress] [file]\n") + sys.exit(2) +else: + inputpath = option[1] + if (option[0] == "load"): + # Load file from memory + print("loading ", inputpath, " from file") + q = pyfile.PyFile(inputpath) + elif (option[0] == "compress"): + # Compress the file + print("Please enter the sampling rates") + option = input("Usage: [-s sa_sampling_rate] [-i isa_sampling_rate] [-x sampling_scheme] [-n npa_sampling_rate] [-r npa_encoding_scheme] [-t input_type]\n") + # Loop through arguments to get sampling rates using getopt + try: + optlist, args = getopt.getopt(option, 's:i:x:n:r:t:') + except getopt.GetoptError as err: + print("Get opt error") + sys.exit(2) + for o, a in optlist: + if o == "-s": + sa_sampling_rate = int(a) + elif o == "-i": + isa_sampling_rate = int(a) + elif o == "-x": + sampling_scheme = int(a) + elif o == "-n": + npa_sampling_rate = int(a) + elif o == "-r": + npa_encoding_scheme = int(a) + elif o == "-t": + type = a + else: + printf("Invalid Option") + sys.exit(2) + q = pyfile.PyFile(inputpath, sa_sampling_rate, + isa_sampling_rate, npa_sampling_rate, + sampling_scheme, npa_encoding_scheme) + else: + print("Usage: [load/compress] [file]\n") + sys.exit(2) + +# Parse through line by line +while (True): + line = input("succinct> ") + line = line.split(" ", 1) + if (line[0] == "search"): + if (len(line) != 2): + print("Could not parse command: ") + continue + else: + print(q.PySearch(line[1].strip())) + elif(line[0] == "count"): + if (len(line) != 2): + print("Could not parse command: ") + continue + else: + print(q.Count(line[1].strip())) + elif(line[0] == "extract"): + if (len(line) == 1): + print("Could not parse command: ") + continue + line = line[1].split(" ", 1) + offset = line[0].strip() + length = line[1].strip() + if (len(line) != 2 or RepresentsInt(offset) == False or RepresentsInt(length) == False): + print("Could not parse command: ") + else: + print(q.Extract(int(line[0].strip()), int(line[1].strip()))) + elif(line[0] == "exit"): + break + else: + print("Unsupported command") + print("Command must be one of: search [query], count [query], extract [offset] [length]") \ No newline at end of file diff --git a/pysuccinct/pykv.cpp b/pysuccinct/pykv.cpp new file mode 100644 index 0000000..7225699 --- /dev/null +++ b/pysuccinct/pykv.cpp @@ -0,0 +1,150 @@ +#include +#include +#include +#include +#include +#include + +#include "succinct_shard.h" + +#include +using namespace boost::python; + +/** + * Program that wraps succinct's query kv functions for python use via boost + */ + +/** + * Converts integer option to SamplingScheme + */ +SamplingScheme SamplingSchemeFromOption(int opt) { + switch (opt) { + case 0: { + fprintf(stderr, "Sampling Scheme = Flat Sample by Index\n"); + return SamplingScheme::FLAT_SAMPLE_BY_INDEX; + } + case 1: { + fprintf(stderr, "Sampling Scheme = Flat Sample by Value\n"); + return SamplingScheme::FLAT_SAMPLE_BY_VALUE; + } + case 2: { + fprintf(stderr, "Sampling Scheme = Layered Sample by Index\n"); + return SamplingScheme::LAYERED_SAMPLE_BY_INDEX; + } + case 3: { + fprintf(stderr, + "Sampling Scheme = Opportunistic Layered Sample by Index\n"); + return SamplingScheme::OPPORTUNISTIC_LAYERED_SAMPLE_BY_INDEX; + } + default: { + fprintf(stderr, "Sampling Scheme = Flat Sample by Index\n"); + return SamplingScheme::FLAT_SAMPLE_BY_INDEX; + } + } +} + +/** + * Converts integer option to NPAEncodingScheme + */ +NPA::NPAEncodingScheme EncodingSchemeFromOption(int opt) { + switch (opt) { + case 0: { + fprintf(stderr, "NPA Encoding Scheme = Elias Delta\n"); + return NPA::NPAEncodingScheme::ELIAS_DELTA_ENCODED; + } + case 1: { + fprintf(stderr, "NPA Encoding Scheme = Elias Gamma\n"); + return NPA::NPAEncodingScheme::ELIAS_GAMMA_ENCODED; + } + case 2: { + fprintf(stderr, "NPA Encoding Scheme = Wavelet Tree\n"); + return NPA::NPAEncodingScheme::WAVELET_TREE_ENCODED; + } + default: { + fprintf(stderr, "NPA Encoding Scheme = Elias Gamma\n"); + return NPA::NPAEncodingScheme::ELIAS_GAMMA_ENCODED; + } + } +} + +/** + * PyKv struct that wraps query and compress functions for boost python + */ +struct PyKv{ + // Constructor that loads from kv + PyKv(const std::string& filename) { + s_file_ = nullptr; + // Read the serialized data structures from disk. + // The serialized data structures must exist at .succinct. + std::cout << "De-serializing Succinct data structures...\n"; + s_file_ = new SuccinctShard(0, filename, SuccinctMode::LOAD_IN_MEMORY); + } + + // Constructor that compresses kv given the sampling rate arguments + PyKv(const std::string& inputpath, uint32_t sa_sampling_rate, uint32_t isa_sampling_rate, + uint32_t npa_sampling_rate, int sampling_opt, int npa_opt){ + s_file_ = nullptr; + // The following compresses an input file at "inputpath" in memory + // as a buffer containing key-value pairs. It uses newline '\n' to + // differentiate between successive values, and assigns the line number + // as the key for the corresponding value. + std::cout << "Constructing Succinct data structures...\n"; + s_file_ = new SuccinctShard(0, inputpath, + SuccinctMode::CONSTRUCT_IN_MEMORY, + sa_sampling_rate, isa_sampling_rate, + npa_sampling_rate, SamplingSchemeFromOption(sampling_opt), + SamplingSchemeFromOption(sampling_opt), EncodingSchemeFromOption(npa_opt)); + std::cout << "Serializing Succinct data structures...\n"; + // Serialize the compressed representation to disk at the location .succinct + s_file_->Serialize(inputpath + ".succinct"); + } + + // Wrapped search command, that returns a python list + boost::python::list PySearch(const std::string& arg) { + std::set results; + s_file_->Search(results, arg); + boost::python::list ret; + for (const auto& i :results){ + ret.append(i); + } + return ret; + } + + //Wrapped search command + std::set Search(const std::string &arg) { + std::set results; + s_file_->Search(results, arg); + return results; + } + + //Wrapped count command + int64_t Count(const std::string &arg) { + int64_t count = s_file_->Count(arg); + return count; + } + + //Wrapped get command + std::string Get(uint64_t key) { + std::string result; + s_file_->Get(result, key); + return result; + } + + //PyKv members + SuccinctShard *s_file_; + +}; + +//Boost Python module +BOOST_PYTHON_MODULE(pykv){ + class_("PyKv", init()) + .def(init()) + .def("PySearch", &PyKv::PySearch) + .def("Search", &PyKv::Search) + .def("Count", &PyKv::Count) + .def("Get", &PyKv::Get) + ; + +} + + diff --git a/pysuccinct/pykv.py b/pysuccinct/pykv.py new file mode 100644 index 0000000..e8c7612 --- /dev/null +++ b/pysuccinct/pykv.py @@ -0,0 +1,99 @@ +import pykv +import sys +import getopt + +# Try catch block for non integer argument checking +def RepresentsInt(s): + try: + int(s) + return True + except ValueError: + return False + +# Default values +sa_sampling_rate = 32 +isa_sampling_rate = 32 +sampling_scheme = 0 +npa_sampling_rate = 128 +npa_encoding_scheme = 1 +type = "file" +inputpath = "" + +# Get user input to either load from memory or compress a new file +option = input("Usage: [load/compress] [file]\n") +option = option.split() +if (len(option) != 2): + print("Usage: [load/compress] [file]\n") + sys.exit(2) +else: + inputpath = option[1] + if (option[0] == "load"): + # Load file from memory + print("loading ", inputpath, " from file") + q = pykv.PyKv(inputpath) + elif (option[0] == "compress"): + # Compress the file + print("Please enter the sampling rates") + option = input("Usage: [-s sa_sampling_rate] [-i isa_sampling_rate] [-x sampling_scheme] [-n npa_sampling_rate] [-r npa_encoding_scheme] [-t input_type]\n") + # Loop through arguments to get sampling rates using getopt + try: + optlist, args = getopt.getopt(option, 's:i:x:n:r:t:') + except getopt.GetoptError as err: + print("Get opt error") + sys.exit(2) + for o, a in optlist: + if o == "-s": + sa_sampling_rate = int(a) + elif o == "-i": + isa_sampling_rate = int(a) + elif o == "-x": + sampling_scheme = int(a) + elif o == "-n": + npa_sampling_rate = int(a) + elif o == "-r": + npa_encoding_scheme = int(a) + elif o == "-t": + type = a + else: + printf("Invalid Option") + sys.exit(2) + q = pykv.PyKv(inputpath, sa_sampling_rate, + isa_sampling_rate, npa_sampling_rate, + sampling_scheme, npa_encoding_scheme) + else: + print("Usage: [load/compress] [file]\n") + sys.exit(2) + +print("Command must be one of: search [query], count [query], get [key]") + + +# Parse through line by line +while (True): + line = input("succinct> ") + line = line.split(" ", 1) + if (line[0] == "search"): + if (len(line) != 2): + print("Could not parse command: ") + continue + else: + print(q.PySearch(line[1].strip())) + elif(line[0] == "count"): + if (len(line) != 2): + print("Could not parse command: ") + continue + else: + print(q.Count(line[1].strip())) + elif(line[0] == "get"): + if (len(line) == 1): + print("Could not parse command: ") + continue + key = line[1].strip() + if (len(line) != 2 or RepresentsInt(key) == False): + print("Could not parse command: ") + else: + print(q.Get((int(key)))) + elif(line[0] == "exit"): + break + else: + print("Unsupported command") + print("Command must be one of: search [query], count [query], get [key]") \ No newline at end of file diff --git a/pysuccinct/pyquery_file.cpp b/pysuccinct/pyquery_file.cpp deleted file mode 100644 index e36ba92..0000000 --- a/pysuccinct/pyquery_file.cpp +++ /dev/null @@ -1,67 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "succinct_file.h" - -#include -using namespace boost::python; - -/** - * Program that wraps succinct's query file functions for python use via boost - */ - -struct QueryFile { - QueryFile(std::string filename, uint32_t mode) { - this->s_file_ = nullptr; - if (mode == 0) { - // If mode is set to 0, compress the input file. - // Use default parameters. - std::cout << "Constructing Succinct data structures...\n"; - this->s_file_ = new SuccinctFile(filename); - std::cout << "Serializing Succinct data structures...\n"; - this->s_file_->Serialize(filename + ".succinct"); - } else { - // If mode is set to 1, read the serialized data structures from disk. - // The serialized data structures must exist at .succinct. - std::cout << "De-serializing Succinct data structures...\n"; - this->s_file_ = new SuccinctFile(filename, SuccinctMode::LOAD_IN_MEMORY); - } - } - - //Wrapped search command - std::vector Search(const std::string& arg) { - std::vector results; - s_file_->Search(results, arg); - return results; - } - - //Wrapped count command - int64_t Count(const std::string& arg) { - int64_t count = s_file_->Count(arg); - return count; - } - - //Wrapped extract command - std::string Extract(uint64_t offset, uint64_t length) { - std::string result; - s_file_->Extract(result, offset, length); - return result; - } - - //QueryFile members - SuccinctFile *s_file_; -}; - -//Boost Python module -BOOST_PYTHON_MODULE(pyquery_file){ - class_("QueryFile", init()) - .def("Search", &QueryFile::Search) - .def("Count", &QueryFile::Count) - .def("Extract", &QueryFile::Extract) - ; - -} \ No newline at end of file diff --git a/pysuccinct/pyquery_file.py b/pysuccinct/pyquery_file.py deleted file mode 100644 index e11adec..0000000 --- a/pysuccinct/pyquery_file.py +++ /dev/null @@ -1,79 +0,0 @@ -import pyquery_file -import sys -import getopt - -# Try catch block for non integer argument checking -def RepresentsInt(s): - try: - int(s) - return True - except ValueError: - return False - -# Argument size error checking -argc = len(sys.argv) -if (argc < 2 or argc > 12): - print("Usage: %s [-m mode] [file]") - sys.exit(2) - -# Loop through arguments to change default values and get input path using getopt -try: - optlist, args = getopt.getopt(sys.argv[1:], 'm:') -except getopt.GetoptError as err: - print(("Get opt error")) - sys.exit(2) - -# Default values -mode = 0 -filename = "" - -# Modify default values -for o, a in optlist: - if o == "-m": - mode = int(a) - else: - printf("Invalid Option") - sys.exit(2) - -# Process filename -if (len(args) != 1): - print("Usage: %s [-m mode] [file]") - sys.exit(2) -else: - filename = str(args[0]) - -# Create the pyquery_file struct and run command given on next input -q = pyquery_file.QueryFile(filename, mode) -print("Done. Starting Succinct Shell...") -print("Command must be one of: search [query], count [query], extract [offset] [length]") - -# Parse through line by line -while (True): - line = input("succinct> ") - line = line.split(" ", 1) - if (line[0] == "search"): - if (len(line) != 2): - print("Could not parse command: ") - else: - q.Search(line[1].strip()) - elif(line[0] == "count"): - if (len(line) != 2): - print("Could not parse command: ") - else: - q.Count(line[1].strip()) - elif(line[0] == "extract"): - if (len(line) == 1): - print("Could not parse command: ") - continue - line = line[1].split(" ", 1) - offset = line[0].strip() - length = line[1].strip() - if (len(line) != 2 or RepresentsInt(offset) == False or RepresentsInt(length) == False): - print("Could not parse command: ") - else: - q.Extract(int(line[0].strip()), int(line[1].strip())) - elif(line[0] == "exit"): - break - else: - print("Unsupported command") - print("Command must be one of: search [query], count [query], extract [offset] [length]") \ No newline at end of file diff --git a/pysuccinct/pyquery_kv.cpp b/pysuccinct/pyquery_kv.cpp deleted file mode 100644 index 49a22dd..0000000 --- a/pysuccinct/pyquery_kv.cpp +++ /dev/null @@ -1,71 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "succinct_shard.h" - -#include -using namespace boost::python; - -/** - * Program that wraps succinct's query kv functions for python use via boost - */ - -struct QueryKv{ - QueryKv(std::string filename, uint32_t mode) { - this->s_file_ = nullptr; - if (mode == 0) { - // If mode is set to 0, compress the input file. - // Use default parameters. - std::cout << "Constructing Succinct data structures...\n"; - s_file_ = new SuccinctShard(0, filename); - - std::cout << "Serializing Succinct data structures...\n"; - s_file_->Serialize(filename + ".succinct"); - } else { - // If mode is set to 1, read the serialized data structures from disk. - // The serialized data structures must exist at .succinct. - std::cout << "De-serializing Succinct data structures...\n"; - s_file_ = new SuccinctShard(0, filename, SuccinctMode::LOAD_IN_MEMORY); - } - } - - //Wrapped search command - std::set Search(const std::string &arg) { - std::set results; - s_file_->Search(results, arg); - return results; - } - - //Wrapped count command - int64_t Count(const std::string &arg) { - int64_t count = s_file_->Count(arg); - return count; - } - - //Wrapped get command - std::string Get(uint64_t key) { - std::string result; - s_file_->Get(result, key); - return result; - } - - //QueryKv members - SuccinctShard *s_file_; - -}; - -//Boost Python module -BOOST_PYTHON_MODULE(pyquery_kv){ - class_("QueryKv", init()) - .def("Search", &QueryKv::Search) - .def("Count", &QueryKv::Count) - .def("Get", &QueryKv::Get) - ; - -} - - diff --git a/pysuccinct/pyquery_kv.py b/pysuccinct/pyquery_kv.py deleted file mode 100644 index b55592c..0000000 --- a/pysuccinct/pyquery_kv.py +++ /dev/null @@ -1,77 +0,0 @@ -import pyquery_kv -import sys -import getopt - -# Try catch block for non integer argument checking -def RepresentsInt(s): - try: - int(s) - return True - except ValueError: - return False - -# Argument size error checking -argc = len(sys.argv) -if (argc < 2 or argc > 12): - print("Usage: %s [-m mode] [file]") - sys.exit(2) - -# Loop through arguments to change default values and get input path using getopt -try: - optlist, args = getopt.getopt(sys.argv[1:], 'm:') -except getopt.GetoptError as err: - print("Get opt error") - sys.exit(2) - -# Default values -mode = 0 -filename = "" - -# Modify default values -for o, a in optlist: - if o == "-m": - mode = int(a) - else: - print("Invalid Option") - sys.exit(2) - -# Process filename -if (len(args) != 1): - print("Usage: %s [-m mode] [file]") - sys.exit(2) -else: - filename = str(args[0]) - -# Create the pyquery_file struct and run command given on next input -q = pyquery_kv.QueryKv(filename, mode) -print("Done. Starting Succinct Shell...") -print("Command must be one of: search [query], count [query], get [key]") - -# Parse through line by line -while (True): - line = input("succinct> ") - line = line.split(" ", 1) - if (line[0] == "search"): - if (len(line) != 2): - print("Could not parse command: ") - else: - q.Search(line[1].strip()) - elif(line[0] == "count"): - if (len(line) != 2): - print("Could not parse command: ") - else: - q.Count(line[1].strip()) - elif(line[0] == "get"): - if (len(line) == 1): - print("Could not parse command: ") - continue - key = line[1].strip() - if (len(line) != 2 or RepresentsInt(key) == False): - print("Could not parse command: ") - else: - q.Get((int(key))) - elif(line[0] == "exit"): - break - else: - print("Unsupported command") - print("Command must be one of: search [query], count [query], get [key]") \ No newline at end of file diff --git a/pysuccinct/pyquery_semistructured.cpp b/pysuccinct/pyquery_semistructured.cpp deleted file mode 100644 index 860e89f..0000000 --- a/pysuccinct/pyquery_semistructured.cpp +++ /dev/null @@ -1,71 +0,0 @@ -#include -#include -#include -#include -#include - -#include "succinct_semistructured_shard.h" - -#include -using namespace boost::python; - -/** - * Program that wraps succinct's query semistructured functions for python use via boost - */ - -struct QuerySemistructured{ - QuerySemistructured(std::string filename, uint32_t mode) { - this->s_file_ = nullptr; - if (mode == 0) { - // If mode is set to 0, compress the input file. - // Use default parameters. - std::cout << "Constructing Succinct data structures...\n"; - s_file_ = new SuccinctSemistructuredShard(filename); - - std::cout << "Serializing Succinct data structures...\n"; - s_file_->Serialize(filename + ".succinct"); - } else { - // If mode is set to 1, read the serialized data structures from disk. - // The serialized data structures must exist at .succinct. - std::cout << "De-serializing Succinct data structures...\n"; - s_file_ = new SuccinctSemistructuredShard(filename, - SuccinctMode::LOAD_IN_MEMORY); - } - } - - //Wrapped search command - std::set Search(const std::string &attr_key, const std::string &attr_val) { - std::set results; - s_file_->SearchAttribute(results, attr_key, attr_val); - return results; - } - - //Wrapped count command - int64_t Count(const std::string &attr_key, const std::string &attr_val) { - int64_t count = s_file_->CountAttribute(attr_key, attr_val); - return count; - } - - //Wrapped get command - std::string Get(int64_t key, std::string attr_key) { - std::string result; - s_file_->Get(result, key, attr_key); - return result; - } - - //QuerySemistructured members - SuccinctSemistructuredShard *s_file_; - -}; - -//Boost Python module -BOOST_PYTHON_MODULE(pyquery_semistructured){ - class_("QuerySemistructured", init()) - .def("Search", &QuerySemistructured::Search) - .def("Count", &QuerySemistructured::Count) - .def("Get", &QuerySemistructured::Get) - ; - -} - - diff --git a/pysuccinct/pyquery_semistructured.py b/pysuccinct/pyquery_semistructured.py deleted file mode 100644 index d461975..0000000 --- a/pysuccinct/pyquery_semistructured.py +++ /dev/null @@ -1,78 +0,0 @@ -import pyquery_semistructured -import sys -import getopt - -# Try catch block for non integer argument checking -def RepresentsInt(s): - try: - int(s) - return True - except ValueError: - return False - -# Argument size error checking -argc = len(sys.argv) -if (argc < 2 or argc > 12): - print("Usage: %s [-m mode] [file]") - sys.exit(2) - -# Loop through arguments to change default values and get input path using getopt -try: - optlist, args = getopt.getopt(sys.argv[1:], 'm:') -except getopt.GetoptError as err: - print("Get opt error") - sys.exit(2) - -# Default values -mode = 0 -filename = "" - -# Modify default values -for o, a in optlist: - if o == "-m": - mode = int(a) - else: - printf("Invalid Option") - sys.exit(2) - -# Process filename -if (len(args) != 1): - print("Usage: %s [-m mode] [file]") - sys.exit(2) -else: - filename = str(args[0]) - -# Create the pyquery_file struct and run command given on next input -q = pyquery_semistructured.QuerySemistructured(filename, mode) -print("Done. Starting Succinct Shell...") -print("Command must be one of:\n\t\tsearch [attr_key] [attr_val]\n\t\tcount [attr_key] [attr_val]\n\t\tget [key] [attr_key]") - -# Parse through line by line -while (True): - line = input("succinct> ") - line = line.split(" ") - if (line[0] == "search"): - if (len(line) != 3): - print("Could not parse command: ") - else: - q.Search(line[1].strip(), line[2].strip()) - elif(line[0] == "count"): - if (len(line) != 3): - print("Could not parse command: ") - else: - q.Count(line[1].strip(), line[2].strip()) - elif(line[0] == "get"): - if (len(line) == 1): - print("Could not parse command: ") - continue - key = line[1].strip() - attr_key = line[2].strip() - if (len(line) != 3 or RepresentsInt(key) == False): - print("Could not parse command: ") - else: - q.Get((int(key), attr_key)) - elif(line[0] == "exit"): - break - else: - print("Unsupported command") - print("Command must be one of:\n\t\tsearch [attr_key] [attr_val]\n\t\tcount [attr_key] [attr_val]\n\t\tget [key] [attr_key]") \ No newline at end of file diff --git a/pysuccinct/pysemistructured.cpp b/pysuccinct/pysemistructured.cpp new file mode 100644 index 0000000..63b0b8d --- /dev/null +++ b/pysuccinct/pysemistructured.cpp @@ -0,0 +1,152 @@ +#include +#include +#include +#include +#include + +#include "succinct_semistructured_shard.h" + +#include +using namespace boost::python; + +/** + * Program that wraps succinct's query semistructured functions for python use via boost + */ + +/** + * Converts integer option to SamplingScheme + */ +SamplingScheme SamplingSchemeFromOption(int opt) { + switch (opt) { + case 0: { + fprintf(stderr, "Sampling Scheme = Flat Sample by Index\n"); + return SamplingScheme::FLAT_SAMPLE_BY_INDEX; + } + case 1: { + fprintf(stderr, "Sampling Scheme = Flat Sample by Value\n"); + return SamplingScheme::FLAT_SAMPLE_BY_VALUE; + } + case 2: { + fprintf(stderr, "Sampling Scheme = Layered Sample by Index\n"); + return SamplingScheme::LAYERED_SAMPLE_BY_INDEX; + } + case 3: { + fprintf(stderr, + "Sampling Scheme = Opportunistic Layered Sample by Index\n"); + return SamplingScheme::OPPORTUNISTIC_LAYERED_SAMPLE_BY_INDEX; + } + default: { + fprintf(stderr, "Sampling Scheme = Flat Sample by Index\n"); + return SamplingScheme::FLAT_SAMPLE_BY_INDEX; + } + } +} + +/** + * Converts integer option to NPAEncodingScheme + */ +NPA::NPAEncodingScheme EncodingSchemeFromOption(int opt) { + switch (opt) { + case 0: { + fprintf(stderr, "NPA Encoding Scheme = Elias Delta\n"); + return NPA::NPAEncodingScheme::ELIAS_DELTA_ENCODED; + } + case 1: { + fprintf(stderr, "NPA Encoding Scheme = Elias Gamma\n"); + return NPA::NPAEncodingScheme::ELIAS_GAMMA_ENCODED; + } + case 2: { + fprintf(stderr, "NPA Encoding Scheme = Wavelet Tree\n"); + return NPA::NPAEncodingScheme::WAVELET_TREE_ENCODED; + } + default: { + fprintf(stderr, "NPA Encoding Scheme = Elias Gamma\n"); + return NPA::NPAEncodingScheme::ELIAS_GAMMA_ENCODED; + } + } +} + +/** + * PySemistructured struct that wraps query and compress functions for boost python + */ +struct PySemistructured{ + // Constructor that loads from semistructured + PySemistructured(std::string filename) { + s_file_ = nullptr; + // If mode is set to 1, read the serialized data structures from disk. + // The serialized data structures must exist at .succinct. + std::cout << "De-serializing Succinct data structures...\n"; + s_file_ = new SuccinctSemistructuredShard(filename, + SuccinctMode::LOAD_IN_MEMORY); + } + + // Constructor that compresses semistructured given the sampling rate arguments + PySemistructured(const std::string& inputpath, uint32_t sa_sampling_rate, uint32_t isa_sampling_rate, + uint32_t npa_sampling_rate, int sampling_opt, int npa_opt){ + s_file_ = nullptr; + // The following compresses an input file at "inputpath" in memory + // as a buffer containing key-value pairs. It uses newline '\n' to + // differentiate between successive values, and assigns the line number + // as the key for the corresponding value. + std::cout << "Constructing Succinct data structures...\n"; + s_file_ = new SuccinctSemistructuredShard(inputpath, + SuccinctMode::CONSTRUCT_IN_MEMORY); + // s_file_ = new SuccinctSemistructuredShard(0, inputpath, + // SuccinctMode::CONSTRUCT_IN_MEMORY, + // sa_sampling_rate, isa_sampling_rate, + // npa_sampling_rate, SamplingSchemeFromOption(sampling_opt), + // SamplingSchemeFromOption(sampling_opt), EncodingSchemeFromOption(npa_opt)); + std::cout << "Serializing Succinct data structures...\n"; + // Serialize the compressed representation to disk at the location .succinct + s_file_->Serialize(inputpath + ".succinct"); + } + + // Wrapped search command, that returns a python list + boost::python::list PySearch(const std::string &attr_key, const std::string &attr_val) { + std::set results; + s_file_->SearchAttribute(results, attr_key, attr_val); + boost::python::list ret; + for (const auto& i :results){ + ret.append(i); + } + return ret; + } + + //Wrapped search command + std::set Search(const std::string &attr_key, const std::string &attr_val) { + std::set results; + s_file_->SearchAttribute(results, attr_key, attr_val); + return results; + } + + //Wrapped count command + int64_t Count(const std::string &attr_key, const std::string &attr_val) { + int64_t count = s_file_->CountAttribute(attr_key, attr_val); + return count; + } + + //Wrapped get command + std::string Get(int64_t key, std::string attr_key) { + std::string result; + s_file_->Get(result, key, attr_key); + return result; + } + + //PySemistructured members + SuccinctSemistructuredShard *s_file_; + +}; + +//Boost Python module +BOOST_PYTHON_MODULE(pysemistructured){ + class_("PySemistructured", init()) + .def(init()) + .def("PySearch", &PySemistructured::PySearch) + .def("Search", &PySemistructured::Search) + .def("Count", &PySemistructured::Count) + .def("Get", &PySemistructured::Get) + ; + +} + + diff --git a/pysuccinct/pysemistructured.py b/pysuccinct/pysemistructured.py new file mode 100644 index 0000000..cc7d3b7 --- /dev/null +++ b/pysuccinct/pysemistructured.py @@ -0,0 +1,97 @@ +import pysemistructured +import sys +import getopt + +# Try catch block for non integer argument checking +def RepresentsInt(s): + try: + int(s) + return True + except ValueError: + return False + +# Default values +sa_sampling_rate = 32 +isa_sampling_rate = 32 +sampling_scheme = 0 +npa_sampling_rate = 128 +npa_encoding_scheme = 1 +type = "file" +inputpath = "" + +# Get user input to either load from memory or compress a new file +option = input("Usage: [load/compress] [file]\n") +option = option.split() +if (len(option) != 2): + print("Usage: [load/compress] [file]\n") + sys.exit(2) +else: + inputpath = option[1] + if (option[0] == "load"): + # Load file from memory + print("loading ", inputpath, " from file") + q = pysemistructured.PySemistructured(inputpath) + elif (option[0] == "compress"): + # Compress the file + print("Please enter the sampling rates") + option = input("Usage: [-s sa_sampling_rate] [-i isa_sampling_rate] [-x sampling_scheme] [-n npa_sampling_rate] [-r npa_encoding_scheme] [-t input_type]\n") + # Loop through arguments to get sampling rates using getopt + try: + optlist, args = getopt.getopt(option, 's:i:x:n:r:t:') + except getopt.GetoptError as err: + print("Get opt error") + sys.exit(2) + for o, a in optlist: + if o == "-s": + sa_sampling_rate = int(a) + elif o == "-i": + isa_sampling_rate = int(a) + elif o == "-x": + sampling_scheme = int(a) + elif o == "-n": + npa_sampling_rate = int(a) + elif o == "-r": + npa_encoding_scheme = int(a) + elif o == "-t": + type = a + else: + printf("Invalid Option") + sys.exit(2) + q = pysemistructured.PySemistructured(inputpath, sa_sampling_rate, + isa_sampling_rate, npa_sampling_rate, + sampling_scheme, npa_encoding_scheme) + else: + print("Usage: [load/compress] [file]\n") + sys.exit(2) + +print("Command must be one of:\n\t\tsearch [attr_key] [attr_val]\n\t\tcount [attr_key] [attr_val]\n\t\tget [key] [attr_key]") + +# Parse through line by line +while (True): + line = input("succinct> ") + line = line.split(" ") + if (line[0] == "search"): + if (len(line) != 3): + print("Could not parse command: ") + else: + print(q.PySearch(line[1].strip(), line[2].strip())) + elif(line[0] == "count"): + if (len(line) != 3): + print("Could not parse command: ") + else: + print(q.Count(line[1].strip(), line[2].strip())) + elif(line[0] == "get"): + if (len(line) == 1): + print("Could not parse command: ") + continue + key = line[1].strip() + attr_key = line[2].strip() + if (len(line) != 3 or RepresentsInt(key) == False): + print("Could not parse command: ") + else: + print(q.Get((int(key), attr_key))) + elif(line[0] == "exit"): + break + else: + print("Unsupported command") + print("Command must be one of:\n\t\tsearch [attr_key] [attr_val]\n\t\tcount [attr_key] [attr_val]\n\t\tget [key] [attr_key]") \ No newline at end of file From edb55089ecfab98515fb8ba2e0ce69afafbf7626 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Fri, 8 May 2020 14:22:16 -0400 Subject: [PATCH 09/39] Moved .py files to examples --- examples/CMakeLists.txt | 13 +++++++++++++ {pysuccinct => examples/src}/pyfile.cpp | 0 {pysuccinct => examples/src}/pykv.cpp | 2 -- {pysuccinct => examples/src}/pysemistructured.cpp | 0 pysuccinct/CMakeLists.txt | 15 +-------------- 5 files changed, 14 insertions(+), 16 deletions(-) rename {pysuccinct => examples/src}/pyfile.cpp (100%) rename {pysuccinct => examples/src}/pykv.cpp (99%) rename {pysuccinct => examples/src}/pysemistructured.cpp (100%) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index f5d0b4d..fbf3eeb 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -26,7 +26,20 @@ add_executable(query_file src/query_file.cc) add_executable(query_kv src/query_kv.cc) add_executable(query_semistructured src/query_semistructured.cc) +python_add_module(pyfile src/pyfile.cpp) +file(COPY ../pysuccinct/pyfile.py DESTINATION .) + +python_add_module(pykv src/pykv.cpp) +file(COPY ../pysuccinct/pykv.py DESTINATION .) + +python_add_module(pysemistructured src/pysemistructured.cpp) +file(COPY ../pysuccinct/pysemistructured.py DESTINATION .) + target_link_libraries(compress succinct) target_link_libraries(query_file succinct) target_link_libraries(query_kv succinct) target_link_libraries(query_semistructured succinct) + +target_link_libraries(pyfile succinct) +target_link_libraries(pykv succinct) +target_link_libraries(pysemistructured succinct) diff --git a/pysuccinct/pyfile.cpp b/examples/src/pyfile.cpp similarity index 100% rename from pysuccinct/pyfile.cpp rename to examples/src/pyfile.cpp diff --git a/pysuccinct/pykv.cpp b/examples/src/pykv.cpp similarity index 99% rename from pysuccinct/pykv.cpp rename to examples/src/pykv.cpp index 7225699..bcea4aa 100644 --- a/pysuccinct/pykv.cpp +++ b/examples/src/pykv.cpp @@ -146,5 +146,3 @@ BOOST_PYTHON_MODULE(pykv){ ; } - - diff --git a/pysuccinct/pysemistructured.cpp b/examples/src/pysemistructured.cpp similarity index 100% rename from pysuccinct/pysemistructured.cpp rename to examples/src/pysemistructured.cpp diff --git a/pysuccinct/CMakeLists.txt b/pysuccinct/CMakeLists.txt index 571090d..d95b240 100644 --- a/pysuccinct/CMakeLists.txt +++ b/pysuccinct/CMakeLists.txt @@ -1,12 +1,3 @@ -python_add_module(pyfile pyfile.cpp) -file(COPY pyfile.py DESTINATION .) - -python_add_module(pykv pykv.cpp) -file(COPY pykv.py DESTINATION .) - -python_add_module(pysemistructured pysemistructured.cpp) -file(COPY pysemistructured.py DESTINATION .) - cmake_minimum_required(VERSION 2.8) project(succinct-pysuccinct CXX) @@ -15,8 +6,4 @@ set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) file(MAKE_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) set(INCLUDE include ../core/include) -include_directories(${INCLUDE}) - -target_link_libraries(pyfile succinct) -target_link_libraries(pykv succinct) -target_link_libraries(pysemistructured succinct) \ No newline at end of file +include_directories(${INCLUDE}) \ No newline at end of file From 4ded923ae2958904281babf7465ee954d52813b6 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Mon, 11 May 2020 19:36:28 -0400 Subject: [PATCH 10/39] Cmake dependency Error --- examples/CMakeLists.txt | 25 ++++++++----------- {pysuccinct => examples/pysuccinct}/pyfile.py | 2 +- {pysuccinct => examples/pysuccinct}/pykv.py | 2 +- .../pysuccinct}/pysemistructured.py | 2 +- pysuccinct/CMakeLists.txt | 10 +++++++- {examples/src => pysuccinct}/pyfile.cpp | 12 +++------ {examples/src => pysuccinct}/pykv.cpp | 11 ++------ .../src => pysuccinct}/pysemistructured.cpp | 10 +------- 8 files changed, 28 insertions(+), 46 deletions(-) rename {pysuccinct => examples/pysuccinct}/pyfile.py (98%) rename {pysuccinct => examples/pysuccinct}/pykv.py (98%) rename {pysuccinct => examples/pysuccinct}/pysemistructured.py (98%) rename {examples/src => pysuccinct}/pyfile.cpp (94%) rename {examples/src => pysuccinct}/pykv.cpp (94%) rename {examples/src => pysuccinct}/pysemistructured.cpp (92%) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index fbf3eeb..2dc6dd6 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 2.8) -project(succinct-examples CXX) +project(succinct-examples) include(CheckCXXCompilerFlag) CHECK_CXX_COMPILER_FLAG("-std=c++11" COMPILER_SUPPORTS_CXX11) @@ -21,25 +21,20 @@ file(MAKE_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) set(INCLUDE include ../core/include) include_directories(${INCLUDE}) + +file(COPY pysuccinct/pyfile.py DESTINATION ${EXECUTABLE_OUTPUT_PATH}) +add_dependencies(pysuccinct/pyfile.py pysuccinct) +file(COPY pysuccinct/pykv.py DESTINATION ${EXECUTABLE_OUTPUT_PATH}) +add_dependencies(pysuccinct/pykv.py pysuccinct) +file(COPY pysuccinct/pysemistructured.py DESTINATION ${EXECUTABLE_OUTPUT_PATH}) +add_dependencies(pysuccinct/pysemistructured.py pysuccinct) + add_executable(compress src/compress.cc) add_executable(query_file src/query_file.cc) add_executable(query_kv src/query_kv.cc) add_executable(query_semistructured src/query_semistructured.cc) -python_add_module(pyfile src/pyfile.cpp) -file(COPY ../pysuccinct/pyfile.py DESTINATION .) - -python_add_module(pykv src/pykv.cpp) -file(COPY ../pysuccinct/pykv.py DESTINATION .) - -python_add_module(pysemistructured src/pysemistructured.cpp) -file(COPY ../pysuccinct/pysemistructured.py DESTINATION .) - target_link_libraries(compress succinct) target_link_libraries(query_file succinct) target_link_libraries(query_kv succinct) -target_link_libraries(query_semistructured succinct) - -target_link_libraries(pyfile succinct) -target_link_libraries(pykv succinct) -target_link_libraries(pysemistructured succinct) +target_link_libraries(query_semistructured succinct) \ No newline at end of file diff --git a/pysuccinct/pyfile.py b/examples/pysuccinct/pyfile.py similarity index 98% rename from pysuccinct/pyfile.py rename to examples/pysuccinct/pyfile.py index 4938734..9cc2899 100644 --- a/pysuccinct/pyfile.py +++ b/examples/pysuccinct/pyfile.py @@ -73,7 +73,7 @@ def RepresentsInt(s): print("Could not parse command: ") continue else: - print(q.PySearch(line[1].strip())) + print(q.Search(line[1].strip())) elif(line[0] == "count"): if (len(line) != 2): print("Could not parse command: ") diff --git a/pysuccinct/pykv.py b/examples/pysuccinct/pykv.py similarity index 98% rename from pysuccinct/pykv.py rename to examples/pysuccinct/pykv.py index e8c7612..9220202 100644 --- a/pysuccinct/pykv.py +++ b/examples/pysuccinct/pykv.py @@ -76,7 +76,7 @@ def RepresentsInt(s): print("Could not parse command: ") continue else: - print(q.PySearch(line[1].strip())) + print(q.Search(line[1].strip())) elif(line[0] == "count"): if (len(line) != 2): print("Could not parse command: ") diff --git a/pysuccinct/pysemistructured.py b/examples/pysuccinct/pysemistructured.py similarity index 98% rename from pysuccinct/pysemistructured.py rename to examples/pysuccinct/pysemistructured.py index cc7d3b7..f7d77d3 100644 --- a/pysuccinct/pysemistructured.py +++ b/examples/pysuccinct/pysemistructured.py @@ -74,7 +74,7 @@ def RepresentsInt(s): if (len(line) != 3): print("Could not parse command: ") else: - print(q.PySearch(line[1].strip(), line[2].strip())) + print(q.Search(line[1].strip(), line[2].strip())) elif(line[0] == "count"): if (len(line) != 3): print("Could not parse command: ") diff --git a/pysuccinct/CMakeLists.txt b/pysuccinct/CMakeLists.txt index d95b240..b5a64c4 100644 --- a/pysuccinct/CMakeLists.txt +++ b/pysuccinct/CMakeLists.txt @@ -6,4 +6,12 @@ set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) file(MAKE_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) set(INCLUDE include ../core/include) -include_directories(${INCLUDE}) \ No newline at end of file +include_directories(${INCLUDE}) + +python_add_module(pyfile pyfile.cpp) +python_add_module(pykv pykv.cpp) +python_add_module(pysemistructured pysemistructured.cpp) + +target_link_libraries(pyfile succinct) +target_link_libraries(pykv succinct) +target_link_libraries(pysemistructured succinct) \ No newline at end of file diff --git a/examples/src/pyfile.cpp b/pysuccinct/pyfile.cpp similarity index 94% rename from examples/src/pyfile.cpp rename to pysuccinct/pyfile.cpp index 21d2644..97a446b 100644 --- a/examples/src/pyfile.cpp +++ b/pysuccinct/pyfile.cpp @@ -111,7 +111,7 @@ struct PyFile { } // Wrapped search command, that returns a python list - boost::python::list PySearch(const std::string& arg) { + boost::python::list Search(const std::string& arg) { std::vector results; s_file_->Search(results, arg); boost::python::list ret; @@ -121,13 +121,6 @@ struct PyFile { return ret; } - // Wrapped search command - std::vector Search(const std::string& arg) { - std::vector results; - s_file_->Search(results, arg); - return results; - } - // Wrapped count command int64_t Count(const std::string& arg) { int64_t count = s_file_->Count(arg); @@ -145,13 +138,14 @@ struct PyFile { SuccinctFile *s_file_; }; + + /** * Boost Python module */ BOOST_PYTHON_MODULE(pyfile){ class_("PyFile", init()) .def(init()) - .def("PySearch", &PyFile::PySearch) .def("Search", &PyFile::Search) .def("Count", &PyFile::Count) .def("Extract", &PyFile::Extract) diff --git a/examples/src/pykv.cpp b/pysuccinct/pykv.cpp similarity index 94% rename from examples/src/pykv.cpp rename to pysuccinct/pykv.cpp index bcea4aa..a2bcd00 100644 --- a/examples/src/pykv.cpp +++ b/pysuccinct/pykv.cpp @@ -100,7 +100,7 @@ struct PyKv{ } // Wrapped search command, that returns a python list - boost::python::list PySearch(const std::string& arg) { + boost::python::list Search(const std::string& arg) { std::set results; s_file_->Search(results, arg); boost::python::list ret; @@ -109,14 +109,7 @@ struct PyKv{ } return ret; } - - //Wrapped search command - std::set Search(const std::string &arg) { - std::set results; - s_file_->Search(results, arg); - return results; - } - + //Wrapped count command int64_t Count(const std::string &arg) { int64_t count = s_file_->Count(arg); diff --git a/examples/src/pysemistructured.cpp b/pysuccinct/pysemistructured.cpp similarity index 92% rename from examples/src/pysemistructured.cpp rename to pysuccinct/pysemistructured.cpp index 63b0b8d..6368b7b 100644 --- a/examples/src/pysemistructured.cpp +++ b/pysuccinct/pysemistructured.cpp @@ -102,7 +102,7 @@ struct PySemistructured{ } // Wrapped search command, that returns a python list - boost::python::list PySearch(const std::string &attr_key, const std::string &attr_val) { + boost::python::list Search(const std::string &attr_key, const std::string &attr_val) { std::set results; s_file_->SearchAttribute(results, attr_key, attr_val); boost::python::list ret; @@ -111,13 +111,6 @@ struct PySemistructured{ } return ret; } - - //Wrapped search command - std::set Search(const std::string &attr_key, const std::string &attr_val) { - std::set results; - s_file_->SearchAttribute(results, attr_key, attr_val); - return results; - } //Wrapped count command int64_t Count(const std::string &attr_key, const std::string &attr_val) { @@ -141,7 +134,6 @@ struct PySemistructured{ BOOST_PYTHON_MODULE(pysemistructured){ class_("PySemistructured", init()) .def(init()) - .def("PySearch", &PySemistructured::PySearch) .def("Search", &PySemistructured::Search) .def("Count", &PySemistructured::Count) .def("Get", &PySemistructured::Get) From 392dba761b21d89c2ac33b94dc903cfe58ba48f2 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Wed, 13 May 2020 19:40:56 -0400 Subject: [PATCH 11/39] Renaming, modified semistructured constructor --- examples/CMakeLists.txt | 12 +++--- examples/pysuccinct/pyfile.py | 10 +++-- examples/pysuccinct/pykv.py | 7 ++- examples/pysuccinct/pysemistructured.py | 6 +-- pysuccinct/CMakeLists.txt | 12 +++--- pysuccinct/{pyfile.cpp => file.cpp} | 20 ++++----- pysuccinct/{pykv.cpp => kvstore.cpp} | 23 +++++----- ...structured.cpp => semistructuredstore.cpp} | 43 +++++++++++-------- 8 files changed, 71 insertions(+), 62 deletions(-) rename pysuccinct/{pyfile.cpp => file.cpp} (91%) rename pysuccinct/{pykv.cpp => kvstore.cpp} (90%) rename pysuccinct/{pysemistructured.cpp => semistructuredstore.cpp} (76%) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 2dc6dd6..4d151ea 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -22,18 +22,18 @@ file(MAKE_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) set(INCLUDE include ../core/include) include_directories(${INCLUDE}) -file(COPY pysuccinct/pyfile.py DESTINATION ${EXECUTABLE_OUTPUT_PATH}) -add_dependencies(pysuccinct/pyfile.py pysuccinct) -file(COPY pysuccinct/pykv.py DESTINATION ${EXECUTABLE_OUTPUT_PATH}) -add_dependencies(pysuccinct/pykv.py pysuccinct) -file(COPY pysuccinct/pysemistructured.py DESTINATION ${EXECUTABLE_OUTPUT_PATH}) -add_dependencies(pysuccinct/pysemistructured.py pysuccinct) +set(INCLUDE include ../pysuccinct) +include_directories(${INCLUDE}) add_executable(compress src/compress.cc) add_executable(query_file src/query_file.cc) add_executable(query_kv src/query_kv.cc) add_executable(query_semistructured src/query_semistructured.cc) +file(COPY pysuccinct/pyfile.py DESTINATION ${EXECUTABLE_OUTPUT_PATH}) +file(COPY pysuccinct/pykv.py DESTINATION ${EXECUTABLE_OUTPUT_PATH}) +file(COPY pysuccinct/pysemistructured.py DESTINATION ${EXECUTABLE_OUTPUT_PATH}) + target_link_libraries(compress succinct) target_link_libraries(query_file succinct) target_link_libraries(query_kv succinct) diff --git a/examples/pysuccinct/pyfile.py b/examples/pysuccinct/pyfile.py index 9cc2899..04ce608 100644 --- a/examples/pysuccinct/pyfile.py +++ b/examples/pysuccinct/pyfile.py @@ -1,4 +1,4 @@ -import pyfile +import file import sys import getopt @@ -30,7 +30,7 @@ def RepresentsInt(s): if (option[0] == "load"): # Load file from memory print("loading ", inputpath, " from file") - q = pyfile.PyFile(inputpath) + q = file.File(inputpath) elif (option[0] == "compress"): # Compress the file print("Please enter the sampling rates") @@ -57,13 +57,15 @@ def RepresentsInt(s): else: printf("Invalid Option") sys.exit(2) - q = pyfile.PyFile(inputpath, sa_sampling_rate, + q = file.File(inputpath, sa_sampling_rate, isa_sampling_rate, npa_sampling_rate, sampling_scheme, npa_encoding_scheme) else: print("Usage: [load/compress] [file]\n") sys.exit(2) +print("Command must be one of: search [query], count [query], extract [offset] [length]") + # Parse through line by line while (True): line = input("succinct> ") @@ -84,7 +86,7 @@ def RepresentsInt(s): if (len(line) == 1): print("Could not parse command: ") continue - line = line[1].split(" ", 1) + line = line[1].split() offset = line[0].strip() length = line[1].strip() if (len(line) != 2 or RepresentsInt(offset) == False or RepresentsInt(length) == False): diff --git a/examples/pysuccinct/pykv.py b/examples/pysuccinct/pykv.py index 9220202..50e68f0 100644 --- a/examples/pysuccinct/pykv.py +++ b/examples/pysuccinct/pykv.py @@ -1,4 +1,4 @@ -import pykv +import kvstore import sys import getopt @@ -30,7 +30,7 @@ def RepresentsInt(s): if (option[0] == "load"): # Load file from memory print("loading ", inputpath, " from file") - q = pykv.PyKv(inputpath) + q = kvstore.Kvstore(inputpath) elif (option[0] == "compress"): # Compress the file print("Please enter the sampling rates") @@ -57,7 +57,7 @@ def RepresentsInt(s): else: printf("Invalid Option") sys.exit(2) - q = pykv.PyKv(inputpath, sa_sampling_rate, + q = kvstore.Kvstore(inputpath, sa_sampling_rate, isa_sampling_rate, npa_sampling_rate, sampling_scheme, npa_encoding_scheme) else: @@ -66,7 +66,6 @@ def RepresentsInt(s): print("Command must be one of: search [query], count [query], get [key]") - # Parse through line by line while (True): line = input("succinct> ") diff --git a/examples/pysuccinct/pysemistructured.py b/examples/pysuccinct/pysemistructured.py index f7d77d3..fd4867f 100644 --- a/examples/pysuccinct/pysemistructured.py +++ b/examples/pysuccinct/pysemistructured.py @@ -1,4 +1,4 @@ -import pysemistructured +import semistructuredstore import sys import getopt @@ -30,7 +30,7 @@ def RepresentsInt(s): if (option[0] == "load"): # Load file from memory print("loading ", inputpath, " from file") - q = pysemistructured.PySemistructured(inputpath) + q = semistructuredstore.Semistructuredstore(inputpath) elif (option[0] == "compress"): # Compress the file print("Please enter the sampling rates") @@ -57,7 +57,7 @@ def RepresentsInt(s): else: printf("Invalid Option") sys.exit(2) - q = pysemistructured.PySemistructured(inputpath, sa_sampling_rate, + q = semistructuredstore.Semistructuredstore(inputpath, sa_sampling_rate, isa_sampling_rate, npa_sampling_rate, sampling_scheme, npa_encoding_scheme) else: diff --git a/pysuccinct/CMakeLists.txt b/pysuccinct/CMakeLists.txt index b5a64c4..783edcb 100644 --- a/pysuccinct/CMakeLists.txt +++ b/pysuccinct/CMakeLists.txt @@ -8,10 +8,10 @@ file(MAKE_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) set(INCLUDE include ../core/include) include_directories(${INCLUDE}) -python_add_module(pyfile pyfile.cpp) -python_add_module(pykv pykv.cpp) -python_add_module(pysemistructured pysemistructured.cpp) +python_add_module(file file.cpp) +python_add_module(kvstore kvstore.cpp) +python_add_module(semistructuredstore semistructuredstore.cpp) -target_link_libraries(pyfile succinct) -target_link_libraries(pykv succinct) -target_link_libraries(pysemistructured succinct) \ No newline at end of file +target_link_libraries(file succinct) +target_link_libraries(kvstore succinct) +target_link_libraries(semistructuredstore succinct) \ No newline at end of file diff --git a/pysuccinct/pyfile.cpp b/pysuccinct/file.cpp similarity index 91% rename from pysuccinct/pyfile.cpp rename to pysuccinct/file.cpp index 97a446b..ab2fe96 100644 --- a/pysuccinct/pyfile.cpp +++ b/pysuccinct/file.cpp @@ -78,11 +78,11 @@ boost::python::list VectorToList(const std::vector& v) { } /** - * PyFile struct that wraps query and compress functions for boost python + * file struct that wraps query and compress functions for boost python */ -struct PyFile { +struct File { // Constructor that loads from file - PyFile(const std::string& filename) { + File(const std::string& filename) { s_file_ = nullptr; // Read the serialized data structures from disk. // The serialized data structures must exist at .succinct. @@ -92,7 +92,7 @@ struct PyFile { // Constructor that compresses file given the sampling rate arguments - PyFile(const std::string& inputpath, uint32_t sa_sampling_rate, uint32_t isa_sampling_rate, + File(const std::string& inputpath, uint32_t sa_sampling_rate, uint32_t isa_sampling_rate, uint32_t npa_sampling_rate, int sampling_opt, int npa_opt){ s_file_ = nullptr; // The following compresses an input file at "inputpath" in memory @@ -134,7 +134,7 @@ struct PyFile { return result; } - //PyFile members + //File members SuccinctFile *s_file_; }; @@ -143,12 +143,12 @@ struct PyFile { /** * Boost Python module */ -BOOST_PYTHON_MODULE(pyfile){ - class_("PyFile", init()) +BOOST_PYTHON_MODULE(file){ + class_("File", init()) .def(init()) - .def("Search", &PyFile::Search) - .def("Count", &PyFile::Count) - .def("Extract", &PyFile::Extract) + .def("Search", &File::Search) + .def("Count", &File::Count) + .def("Extract", &File::Extract) ; } \ No newline at end of file diff --git a/pysuccinct/pykv.cpp b/pysuccinct/kvstore.cpp similarity index 90% rename from pysuccinct/pykv.cpp rename to pysuccinct/kvstore.cpp index a2bcd00..05f14f1 100644 --- a/pysuccinct/pykv.cpp +++ b/pysuccinct/kvstore.cpp @@ -68,11 +68,11 @@ NPA::NPAEncodingScheme EncodingSchemeFromOption(int opt) { } /** - * PyKv struct that wraps query and compress functions for boost python + * kvstore struct that wraps query and compress functions for boost python */ -struct PyKv{ +struct Kvstore{ // Constructor that loads from kv - PyKv(const std::string& filename) { + Kvstore(const std::string& filename) { s_file_ = nullptr; // Read the serialized data structures from disk. // The serialized data structures must exist at .succinct. @@ -81,7 +81,7 @@ struct PyKv{ } // Constructor that compresses kv given the sampling rate arguments - PyKv(const std::string& inputpath, uint32_t sa_sampling_rate, uint32_t isa_sampling_rate, + Kvstore(const std::string& inputpath, uint32_t sa_sampling_rate, uint32_t isa_sampling_rate, uint32_t npa_sampling_rate, int sampling_opt, int npa_opt){ s_file_ = nullptr; // The following compresses an input file at "inputpath" in memory @@ -109,7 +109,7 @@ struct PyKv{ } return ret; } - + //Wrapped count command int64_t Count(const std::string &arg) { int64_t count = s_file_->Count(arg); @@ -123,19 +123,18 @@ struct PyKv{ return result; } - //PyKv members + //Kvstore members SuccinctShard *s_file_; }; //Boost Python module -BOOST_PYTHON_MODULE(pykv){ - class_("PyKv", init()) +BOOST_PYTHON_MODULE(kvstore){ + class_("Kvstore", init()) .def(init()) - .def("PySearch", &PyKv::PySearch) - .def("Search", &PyKv::Search) - .def("Count", &PyKv::Count) - .def("Get", &PyKv::Get) + .def("Search", &Kvstore::Search) + .def("Count", &Kvstore::Count) + .def("Get", &Kvstore::Get) ; } diff --git a/pysuccinct/pysemistructured.cpp b/pysuccinct/semistructuredstore.cpp similarity index 76% rename from pysuccinct/pysemistructured.cpp rename to pysuccinct/semistructuredstore.cpp index 6368b7b..e66c40b 100644 --- a/pysuccinct/pysemistructured.cpp +++ b/pysuccinct/semistructuredstore.cpp @@ -67,11 +67,11 @@ NPA::NPAEncodingScheme EncodingSchemeFromOption(int opt) { } /** - * PySemistructured struct that wraps query and compress functions for boost python + * Semistructuredstore struct that wraps query and compress functions for boost python */ -struct PySemistructured{ +struct Semistructuredstore{ // Constructor that loads from semistructured - PySemistructured(std::string filename) { + Semistructuredstore(std::string filename) { s_file_ = nullptr; // If mode is set to 1, read the serialized data structures from disk. // The serialized data structures must exist at .succinct. @@ -81,7 +81,7 @@ struct PySemistructured{ } // Constructor that compresses semistructured given the sampling rate arguments - PySemistructured(const std::string& inputpath, uint32_t sa_sampling_rate, uint32_t isa_sampling_rate, + Semistructuredstore(const std::string& inputpath, uint32_t sa_sampling_rate, uint32_t isa_sampling_rate, uint32_t npa_sampling_rate, int sampling_opt, int npa_opt){ s_file_ = nullptr; // The following compresses an input file at "inputpath" in memory @@ -90,12 +90,10 @@ struct PySemistructured{ // as the key for the corresponding value. std::cout << "Constructing Succinct data structures...\n"; s_file_ = new SuccinctSemistructuredShard(inputpath, - SuccinctMode::CONSTRUCT_IN_MEMORY); - // s_file_ = new SuccinctSemistructuredShard(0, inputpath, - // SuccinctMode::CONSTRUCT_IN_MEMORY, - // sa_sampling_rate, isa_sampling_rate, - // npa_sampling_rate, SamplingSchemeFromOption(sampling_opt), - // SamplingSchemeFromOption(sampling_opt), EncodingSchemeFromOption(npa_opt)); + SuccinctMode::CONSTRUCT_IN_MEMORY, + sa_sampling_rate, isa_sampling_rate, + npa_sampling_rate, 3, SamplingSchemeFromOption(sampling_opt), + SamplingSchemeFromOption(sampling_opt), EncodingSchemeFromOption(npa_opt)); std::cout << "Serializing Succinct data structures...\n"; // Serialize the compressed representation to disk at the location .succinct s_file_->Serialize(inputpath + ".succinct"); @@ -119,24 +117,35 @@ struct PySemistructured{ } //Wrapped get command - std::string Get(int64_t key, std::string attr_key) { + std::string Get(int64_t key) { + std::string result; + s_file_->Get(result, key); + return result; + } + + //Wrapped get command + std::string Get(int64_t key, std::string &attr_key) { std::string result; s_file_->Get(result, key, attr_key); return result; } - //PySemistructured members + //Semistructuredstore members SuccinctSemistructuredShard *s_file_; }; //Boost Python module -BOOST_PYTHON_MODULE(pysemistructured){ - class_("PySemistructured", init()) +BOOST_PYTHON_MODULE(semistructuredstore){ + std::string (Semistructuredstore::*g1)(int64_t) = &Semistructuredstore::Get; + std::string (Semistructuredstore::*g2)(int64_t, std::string&) = &Semistructuredstore::Get; + + class_("Semistructuredstore", init()) .def(init()) - .def("Search", &PySemistructured::Search) - .def("Count", &PySemistructured::Count) - .def("Get", &PySemistructured::Get) + .def("Search", &Semistructuredstore::Search) + .def("Count", &Semistructuredstore::Count) + .def("Get", g1) + .def("Get", g2) ; } From 78dbe341b5494393b4ef23d272e3760ec821a0c6 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Tue, 19 May 2020 18:17:08 -0400 Subject: [PATCH 12/39] Added lambda folder --- lambda/test.py | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 lambda/test.py diff --git a/lambda/test.py b/lambda/test.py new file mode 100644 index 0000000..ade1d13 --- /dev/null +++ b/lambda/test.py @@ -0,0 +1,2 @@ +def call_compress (event = None, context = None): + print("hello world") \ No newline at end of file From bb073d2ab1798470c90369ac7dca5ad98afe222e Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Wed, 20 May 2020 23:02:27 -0400 Subject: [PATCH 13/39] added lambda test folder --- examples/CMakeLists.txt | 5 ++--- examples/lambda/lambdatest.py | 13 +++++++++++++ lambda/test.py | 2 -- 3 files changed, 15 insertions(+), 5 deletions(-) create mode 100644 examples/lambda/lambdatest.py delete mode 100644 lambda/test.py diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 4d151ea..3956dfc 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -22,9 +22,6 @@ file(MAKE_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) set(INCLUDE include ../core/include) include_directories(${INCLUDE}) -set(INCLUDE include ../pysuccinct) -include_directories(${INCLUDE}) - add_executable(compress src/compress.cc) add_executable(query_file src/query_file.cc) add_executable(query_kv src/query_kv.cc) @@ -34,6 +31,8 @@ file(COPY pysuccinct/pyfile.py DESTINATION ${EXECUTABLE_OUTPUT_PATH}) file(COPY pysuccinct/pykv.py DESTINATION ${EXECUTABLE_OUTPUT_PATH}) file(COPY pysuccinct/pysemistructured.py DESTINATION ${EXECUTABLE_OUTPUT_PATH}) +file(COPY lambda/lambdatest.py DESTINATION ${EXECUTABLE_OUTPUT_PATH}) + target_link_libraries(compress succinct) target_link_libraries(query_file succinct) target_link_libraries(query_kv succinct) diff --git a/examples/lambda/lambdatest.py b/examples/lambda/lambdatest.py new file mode 100644 index 0000000..98f8c5c --- /dev/null +++ b/examples/lambda/lambdatest.py @@ -0,0 +1,13 @@ +import file +import kvstore +import semistructuredstore + +# compress file +def call_compress (event, context): + # q = file.File(inputpath, 32, 32, 128, 0, 1) + print("success") + +def call_query (event, context): + print("success") + +print(call_compress(None, None)) \ No newline at end of file diff --git a/lambda/test.py b/lambda/test.py deleted file mode 100644 index ade1d13..0000000 --- a/lambda/test.py +++ /dev/null @@ -1,2 +0,0 @@ -def call_compress (event = None, context = None): - print("hello world") \ No newline at end of file From 635381d4e74cd49d3490f68bbd0912ece1cf8eee Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Thu, 21 May 2020 16:52:42 -0400 Subject: [PATCH 14/39] changed succinct_semi_shard.h to meet ec2 C++ standard move constructor --- .vscode/settings.json | 4 +++- core/include/succinct_semistructured_shard.h | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 66a9512..21bd06b 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -30,7 +30,9 @@ "set": "cpp", "unordered_map": "cpp", "__node_handle": "cpp", - "cwchar": "cpp" + "cwchar": "cpp", + "iosfwd": "cpp", + "fstream": "cpp" }, "python.linting.enabled": false } \ No newline at end of file diff --git a/core/include/succinct_semistructured_shard.h b/core/include/succinct_semistructured_shard.h index a13311f..8b820dc 100644 --- a/core/include/succinct_semistructured_shard.h +++ b/core/include/succinct_semistructured_shard.h @@ -174,8 +174,8 @@ class SuccinctSemistructuredShard : public SuccinctShard { std::string Format(const std::string &filename, char delim = ',') { std::string outf = filename + ".tmp.formatted"; - std::ifstream infile = std::ifstream(filename); - std::ofstream formatted = std::ofstream(outf); + std::ifstream infile(filename); + std::ofstream formatted(outf); std::string line; int64_t line_no = 0; while (std::getline(infile, line)) { From 64a02a63b1e47f64b6ae08c1d38962f957522eb3 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Thu, 21 May 2020 22:44:47 -0400 Subject: [PATCH 15/39] added -ldl flag to cmakelists --- examples/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 3956dfc..3d5c068 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -13,7 +13,7 @@ else() message(STATUS "The compiler ${CMAKE_CXX_COMPILER} has no C++11 support. Please use a different C++ compiler.") endif() endif() -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g -ldl") set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) @@ -27,6 +27,8 @@ add_executable(query_file src/query_file.cc) add_executable(query_kv src/query_kv.cc) add_executable(query_semistructured src/query_semistructured.cc) +file(COPY lambda/lambdatest.py DESTINATION ${EXECUTABLE_OUTPUT_PATH}) + file(COPY pysuccinct/pyfile.py DESTINATION ${EXECUTABLE_OUTPUT_PATH}) file(COPY pysuccinct/pykv.py DESTINATION ${EXECUTABLE_OUTPUT_PATH}) file(COPY pysuccinct/pysemistructured.py DESTINATION ${EXECUTABLE_OUTPUT_PATH}) From 9d21bcd7fe5537218c5090d8601b5ced33a0cd8a Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Thu, 21 May 2020 22:47:35 -0400 Subject: [PATCH 16/39] added lutil to flags --- examples/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 3d5c068..29f05d0 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -13,7 +13,7 @@ else() message(STATUS "The compiler ${CMAKE_CXX_COMPILER} has no C++11 support. Please use a different C++ compiler.") endif() endif() -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g -ldl") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g -ldl -lutil") set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) From 6551859d020358385c62826ec8b756b5cf8862ca Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Thu, 21 May 2020 23:01:34 -0400 Subject: [PATCH 17/39] Include path to python 3.7 in cmake file --- examples/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 29f05d0..18b8fba 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -13,7 +13,7 @@ else() message(STATUS "The compiler ${CMAKE_CXX_COMPILER} has no C++11 support. Please use a different C++ compiler.") endif() endif() -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g -ldl -lutil") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g -ldl -lutil -I/usr/local/bin/python3.7") set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) From bcb57bdb00a7014acaa85b9d8fe72a04794799c1 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Thu, 21 May 2020 23:07:57 -0400 Subject: [PATCH 18/39] added -lboost_python3 flag --- examples/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 18b8fba..4ddbcbf 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -13,7 +13,7 @@ else() message(STATUS "The compiler ${CMAKE_CXX_COMPILER} has no C++11 support. Please use a different C++ compiler.") endif() endif() -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g -ldl -lutil -I/usr/local/bin/python3.7") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g -ldl -lutil -lboost_python3") set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) From 1f3a1b785e109a4f38ca472a4acec79a3f73a6d0 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Thu, 21 May 2020 23:24:28 -0400 Subject: [PATCH 19/39] removed flag from cmake lists boost_python3 dir --- examples/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 4ddbcbf..29f05d0 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -13,7 +13,7 @@ else() message(STATUS "The compiler ${CMAKE_CXX_COMPILER} has no C++11 support. Please use a different C++ compiler.") endif() endif() -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g -ldl -lutil -lboost_python3") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g -ldl -lutil") set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) From 1389ab2ad5bcc86d0c0f67cb3183818a1b86ace9 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Mon, 25 May 2020 14:30:02 -0400 Subject: [PATCH 20/39] lambda ec 3.6 python version --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1746186..a53df59 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 2.8) project(succinct CXX) -find_package(PythonInterp "3.7") +find_package(PythonInterp "3.6") if (PYTHONINTERP_FOUND) if (UNIX AND NOT APPLE) From a54ea40b9bc5e7c89f0a8939d095b77804804247 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Tue, 26 May 2020 21:32:31 -0400 Subject: [PATCH 21/39] 3.7 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a53df59..1746186 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 2.8) project(succinct CXX) -find_package(PythonInterp "3.6") +find_package(PythonInterp "3.7") if (PYTHONINTERP_FOUND) if (UNIX AND NOT APPLE) From 55058eadaa75a0a18e6d8a0fa534417df46ce1f6 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Thu, 28 May 2020 22:06:26 -0400 Subject: [PATCH 22/39] Lambda cmake, removed thrift directories --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1746186..318aef9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,9 +58,9 @@ set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake-modules) # Builds all of Succinct's sub-modules add_subdirectory(core) add_subdirectory(examples) -add_subdirectory(sharded) -add_subdirectory(sharded-kv) -add_subdirectory(bench) +# add_subdirectory(sharded) +# add_subdirectory(sharded-kv) +# add_subdirectory(bench) add_subdirectory(pysuccinct) # Testing framework From 792e42287220ee09e108e5a56a9df49af41729dc Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Wed, 10 Jun 2020 23:52:48 -0400 Subject: [PATCH 23/39] Modified succinct to compress all metadata into a metadata_succinct file --- CMakeLists.txt | 2 +- core/include/npa/delta_encoded_npa.h | 5 +- core/include/npa/npa.h | 2 +- core/include/npa/wavelet_tree_encoded_npa.h | 2 +- .../include/sampledarray/flat_sampled_array.h | 5 +- .../sampledarray/layered_sampled_array.h | 5 +- core/include/sampledarray/sampled_array.h | 2 +- core/src/npa/wavelet_tree_encoded_npa.cc | 6 +-- core/src/succinct_core.cc | 50 ++++++++----------- 9 files changed, 34 insertions(+), 45 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 318aef9..afb01f9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -71,4 +71,4 @@ add_subdirectory(external/gtest-1.7.0) add_subdirectory(test) enable_testing() -add_test(CoreTest ${TESTS_PATH}/core_test ${PROJECT_SOURCE_DIR}/test/data) +add_test(CoreTest ${TESTS_PATH}/core_test ${PROJECT_SOURCE_DIR}/test/data) \ No newline at end of file diff --git a/core/include/npa/delta_encoded_npa.h b/core/include/npa/delta_encoded_npa.h index 12b0a8e..3769b53 100644 --- a/core/include/npa/delta_encoded_npa.h +++ b/core/include/npa/delta_encoded_npa.h @@ -256,9 +256,8 @@ class DeltaEncodedNPA : public NPA { return in_size; } - virtual size_t MemoryMap(std::string filename) { - uint8_t *data, *data_beg; - data = data_beg = (uint8_t *) SuccinctUtils::MemoryMap(filename); + virtual size_t MemoryMap(uint8_t* data) { + uint8_t *data_beg = data; encoding_scheme_ = (NPAEncodingScheme) (*((uint64_t *) data)); data += sizeof(uint64_t); diff --git a/core/include/npa/npa.h b/core/include/npa/npa.h index f8bee72..3d5ab2f 100644 --- a/core/include/npa/npa.h +++ b/core/include/npa/npa.h @@ -65,7 +65,7 @@ class NPA { virtual size_t Deserialize(std::istream& in) = 0; - virtual size_t MemoryMap(std::string filename) = 0; + virtual size_t MemoryMap(uint8_t* data) = 0; virtual size_t StorageSize() = 0; diff --git a/core/include/npa/wavelet_tree_encoded_npa.h b/core/include/npa/wavelet_tree_encoded_npa.h index d436252..cdaaf29 100644 --- a/core/include/npa/wavelet_tree_encoded_npa.h +++ b/core/include/npa/wavelet_tree_encoded_npa.h @@ -41,7 +41,7 @@ class WaveletTreeEncodedNPA : public NPA { virtual size_t Deserialize(std::istream& in); // Memory map the wavelet tree encoded NPA - virtual size_t MemoryMap(std::string filename); + virtual size_t MemoryMap(uint8_t* data); virtual size_t StorageSize(); diff --git a/core/include/sampledarray/flat_sampled_array.h b/core/include/sampledarray/flat_sampled_array.h index 8994254..bc13152 100644 --- a/core/include/sampledarray/flat_sampled_array.h +++ b/core/include/sampledarray/flat_sampled_array.h @@ -91,9 +91,8 @@ class FlatSampledArray : public SampledArray { return in_size; } - virtual size_t MemoryMap(std::string filename) { - uint8_t *data_buf, *data_beg; - data_buf = data_beg = (uint8_t *) SuccinctUtils::MemoryMap(filename); + virtual size_t MemoryMap(uint8_t* data) { + uint8_t *data_buf, *data_beg = data; data_size_ = *((uint64_t *) data_buf); data_buf += sizeof(uint64_t); diff --git a/core/include/sampledarray/layered_sampled_array.h b/core/include/sampledarray/layered_sampled_array.h index abe36ba..3e4c4da 100644 --- a/core/include/sampledarray/layered_sampled_array.h +++ b/core/include/sampledarray/layered_sampled_array.h @@ -244,9 +244,8 @@ class LayeredSampledArray : public SampledArray { return in_size; } - virtual size_t MemoryMap(std::string filename) { - uint8_t *data, *data_beg; - data = data_beg = (uint8_t *) SuccinctUtils::MemoryMap(filename); + virtual size_t MemoryMap(uint8_t* data) { + uint8_t *data_beg = data; layer_map_ = *((uint64_t *) data); data += sizeof(uint64_t); diff --git a/core/include/sampledarray/sampled_array.h b/core/include/sampledarray/sampled_array.h index f035a10..df18ca6 100644 --- a/core/include/sampledarray/sampled_array.h +++ b/core/include/sampledarray/sampled_array.h @@ -19,7 +19,7 @@ class SampledArray { virtual size_t Serialize(std::ostream& out) = 0; virtual size_t Deserialize(std::istream& in) = 0; - virtual size_t MemoryMap(std::string filename) = 0; + virtual size_t MemoryMap(uint8_t* data) = 0; SamplingScheme GetSamplingScheme() { return sampling_scheme_; diff --git a/core/src/npa/wavelet_tree_encoded_npa.cc b/core/src/npa/wavelet_tree_encoded_npa.cc index 02cf134..d7e12d1 100644 --- a/core/src/npa/wavelet_tree_encoded_npa.cc +++ b/core/src/npa/wavelet_tree_encoded_npa.cc @@ -509,11 +509,9 @@ size_t WaveletTreeEncodedNPA::Deserialize(std::istream& in) { return in_size; } -size_t WaveletTreeEncodedNPA::MemoryMap(std::string filename) { - uint8_t *data, *data_beg; - data = data_beg = (uint8_t *) SuccinctUtils::MemoryMap(filename); +size_t WaveletTreeEncodedNPA::MemoryMap(uint8_t* data) { + uint8_t *data_beg = data; - encoding_scheme_ = (NPAEncodingScheme) (*((uint64_t *) data)); data += sizeof(uint64_t); npa_size_ = *((uint64_t *) data); data += sizeof(uint64_t); diff --git a/core/src/succinct_core.cc b/core/src/succinct_core.cc index 8dcfb09..7e9e5a0 100644 --- a/core/src/succinct_core.cc +++ b/core/src/succinct_core.cc @@ -338,10 +338,10 @@ size_t SuccinctCore::Serialize(const std::string &path) { return 0; } } - std::ofstream out(path + "/metadata"); - std::ofstream sa_out(path + "/sa"); - std::ofstream isa_out(path + "/isa"); - std::ofstream npa_out(path + "/npa"); + std::ofstream out(path + "/succinct_metadata"); + // std::ofstream sa_out(path + "/sa"); + // std::ofstream isa_out(path + "/isa"); + // std::ofstream npa_out(path + "/npa"); // Output size of input file out.write(reinterpret_cast(&(input_size_)), sizeof(uint64_t)); @@ -366,20 +366,17 @@ size_t SuccinctCore::Serialize(const std::string &path) { out.write(reinterpret_cast(&alphabet_[i]), sizeof(char)); } - out_size += sa_->Serialize(sa_out); - out_size += isa_->Serialize(isa_out); + out_size += sa_->Serialize(out); + out_size += isa_->Serialize(out); if (sa_->GetSamplingScheme() == SamplingScheme::FLAT_SAMPLE_BY_VALUE) { assert(isa_->GetSamplingScheme() == SamplingScheme::FLAT_SAMPLE_BY_VALUE); out_size += SerializeDictionary(((SampledByValueSA *) sa_)->GetSampledPositions(), out); } - out_size += npa_->Serialize(npa_out); + out_size += npa_->Serialize(out); out.close(); - sa_out.close(); - isa_out.close(); - npa_out.close(); return out_size; } @@ -389,10 +386,10 @@ size_t SuccinctCore::Deserialize(const std::string &path) { struct stat st{}; assert(stat(path.c_str(), &st) == 0 && S_ISDIR(st.st_mode)); - std::ifstream in(path + "/metadata"); - std::ifstream sa_in(path + "/sa"); - std::ifstream isa_in(path + "/isa"); - std::ifstream npa_in(path + "/npa"); + std::ifstream in(path + "/succinct_metadata"); + // std::ifstream sa_in(path + "/sa"); + // std::ifstream isa_in(path + "/isa"); + // std::ifstream npa_in(path + "/npa"); size_t in_size = 0; @@ -431,8 +428,8 @@ size_t SuccinctCore::Deserialize(const std::string &path) { } // Deserialize SA, ISA - in_size += sa_->Deserialize(sa_in); - in_size += isa_->Deserialize(isa_in); + in_size += sa_->Deserialize(in); + in_size += isa_->Deserialize(in); // Deserialize bitmap marking positions of sampled values if the sampling scheme // is sample by value. @@ -446,19 +443,16 @@ size_t SuccinctCore::Deserialize(const std::string &path) { // Deserialize NPA based on the NPA encoding scheme. switch (npa_->GetEncodingScheme()) { - case NPA::NPAEncodingScheme::ELIAS_DELTA_ENCODED:in_size += ((EliasDeltaEncodedNPA *) npa_)->Deserialize(npa_in); + case NPA::NPAEncodingScheme::ELIAS_DELTA_ENCODED:in_size += ((EliasDeltaEncodedNPA *) npa_)->Deserialize(in); break; - case NPA::NPAEncodingScheme::ELIAS_GAMMA_ENCODED:in_size += ((EliasGammaEncodedNPA *) npa_)->Deserialize(npa_in); + case NPA::NPAEncodingScheme::ELIAS_GAMMA_ENCODED:in_size += ((EliasGammaEncodedNPA *) npa_)->Deserialize(in); break; - case NPA::NPAEncodingScheme::WAVELET_TREE_ENCODED:in_size += ((WaveletTreeEncodedNPA *) npa_)->Deserialize(npa_in); + case NPA::NPAEncodingScheme::WAVELET_TREE_ENCODED:in_size += ((WaveletTreeEncodedNPA *) npa_)->Deserialize(in); break; default:assert(0); } in.close(); - sa_in.close(); - isa_in.close(); - npa_in.close(); return in_size; } @@ -469,7 +463,7 @@ size_t SuccinctCore::MemoryMap(const std::string &path) { assert(stat(path.c_str(), &st) == 0 && S_ISDIR(st.st_mode)); uint8_t *data_beg, *data; - data = data_beg = (uint8_t *) SuccinctUtils::MemoryMap(path + "/metadata"); + data = data_beg = (uint8_t *) SuccinctUtils::MemoryMap(path + "/succinct_metadata"); input_size_ = *((uint64_t *) data); data += sizeof(uint64_t); @@ -501,8 +495,8 @@ size_t SuccinctCore::MemoryMap(const std::string &path) { data += (sizeof(char) * (alphabet_size_ + 1)); // Memory map SA and ISA - data += sa_->MemoryMap(path + "/sa"); - data += isa_->MemoryMap(path + "/isa"); + data += sa_->MemoryMap(data); + data += isa_->MemoryMap(data); // Memory map bitmap marking positions of sampled values if the sampling scheme // is sample by value. @@ -516,12 +510,12 @@ size_t SuccinctCore::MemoryMap(const std::string &path) { // Memory map NPA based on the NPA encoding scheme. switch (npa_->GetEncodingScheme()) { - case NPA::NPAEncodingScheme::ELIAS_DELTA_ENCODED:data += ((EliasDeltaEncodedNPA *) npa_)->MemoryMap(path + "/npa"); + case NPA::NPAEncodingScheme::ELIAS_DELTA_ENCODED:data += ((EliasDeltaEncodedNPA *) npa_)->MemoryMap(data); break; - case NPA::NPAEncodingScheme::ELIAS_GAMMA_ENCODED:data += ((EliasGammaEncodedNPA *) npa_)->MemoryMap(path + "/npa"); + case NPA::NPAEncodingScheme::ELIAS_GAMMA_ENCODED:data += ((EliasGammaEncodedNPA *) npa_)->MemoryMap(data); break; case NPA::NPAEncodingScheme::WAVELET_TREE_ENCODED: - data += ((WaveletTreeEncodedNPA *) npa_)->MemoryMap(path + "/npa"); + data += ((WaveletTreeEncodedNPA *) npa_)->MemoryMap(data); break; default:assert(0); } From a13d64f782fdba4dfa0f1b58171f72941a315f3b Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Thu, 11 Jun 2020 14:17:09 -0400 Subject: [PATCH 24/39] renamed compressed .succinct file and modified lambda test --- core/src/succinct_core.cc | 6 ++-- examples/lambda/lambdatest.py | 56 +++++++++++++++++++++++++++++++---- 2 files changed, 54 insertions(+), 8 deletions(-) diff --git a/core/src/succinct_core.cc b/core/src/succinct_core.cc index 7e9e5a0..c05b406 100644 --- a/core/src/succinct_core.cc +++ b/core/src/succinct_core.cc @@ -338,7 +338,7 @@ size_t SuccinctCore::Serialize(const std::string &path) { return 0; } } - std::ofstream out(path + "/succinct_metadata"); + std::ofstream out(path + "/" + path.substr(0, path.size()-9) + ".succinct.metadata"); // std::ofstream sa_out(path + "/sa"); // std::ofstream isa_out(path + "/isa"); // std::ofstream npa_out(path + "/npa"); @@ -386,7 +386,7 @@ size_t SuccinctCore::Deserialize(const std::string &path) { struct stat st{}; assert(stat(path.c_str(), &st) == 0 && S_ISDIR(st.st_mode)); - std::ifstream in(path + "/succinct_metadata"); + std::ifstream in(path + "/" + path.substr(0, path.size()-9) + ".succinct.metadata"); // std::ifstream sa_in(path + "/sa"); // std::ifstream isa_in(path + "/isa"); // std::ifstream npa_in(path + "/npa"); @@ -463,7 +463,7 @@ size_t SuccinctCore::MemoryMap(const std::string &path) { assert(stat(path.c_str(), &st) == 0 && S_ISDIR(st.st_mode)); uint8_t *data_beg, *data; - data = data_beg = (uint8_t *) SuccinctUtils::MemoryMap(path + "/succinct_metadata"); + data = data_beg = (uint8_t *) SuccinctUtils::MemoryMap(path + "/" + path.substr(0, path.size()-9) + ".succinct.metadata"); input_size_ = *((uint64_t *) data); data += sizeof(uint64_t); diff --git a/examples/lambda/lambdatest.py b/examples/lambda/lambdatest.py index 98f8c5c..294eb4e 100644 --- a/examples/lambda/lambdatest.py +++ b/examples/lambda/lambdatest.py @@ -1,13 +1,59 @@ import file import kvstore import semistructuredstore +import os +from os import path +import boto3 +import tempfile + +# Change the filenames to succinctdi_sa, succinctdir_isa .... and upload to s3 +def uploadDirectory(path,bucketname, name): + s3 = boto3.client("s3") + for root,dirs,files in os.walk(path): + for f in files: + s3.upload_file(os.path.join(root,f), bucketname, f) # compress file def call_compress (event, context): - # q = file.File(inputpath, 32, 32, 128, 0, 1) - print("success") + # # Upload a file to the bucket + # s3 = boto3.resource("s3") + # os.chdir("/tmp") + # f = open("test.txt","w") + # f.write("hello this is a test") + # f.close() + # s3.meta.client.upload_file("/tmp/test.txt", "succinct-datasets", "test.txt") -def call_query (event, context): - print("success") + # Download a file from a bucket and compress + s3 = boto3.client("s3") + s3.download_file("succinct-datasets",event['key1'], "/tmp/" + event['key1']) + os.chdir("/tmp") + + # with open(event['key1'], 'r') as f: + # print(f.read()) + + q = file.File(event['key1'], 32, 32, 128, 0, 1) + + # for f in os.listdir("/tmp"): + # print(f) -print(call_compress(None, None)) \ No newline at end of file + uploadDirectory("/tmp/" + event['key1'] + ".succinct", "succinct-datasets", event['key1']) + + # out = os.path.isfile("/tmp/" + event['key1'] + ".succinct") + # print(out) + + # Remove compressed .succinct file contents from s3 + # s3 = boto3.resource("s3") + # obj = s3.Object("succinct-datasets", event['key1'] + ".succinct.metadata") + # obj.delete() + print("File compression and upload is complete") + +def call_query (event, context): + # Download .succinct file from bucket and compress + s3 = boto3.client("s3") + os.chdir("/tmp") + os.mkdir(event['key1']) + s3.download_file("succinct-datasets", event['key1'] + ".metadata", "/tmp/" + event['key1'] + "/" + event['key1'] + ".metadata") + # for f in os.listdir("/tmp"): + # print(f) + q = file.File(event['key1']) + print("File deserialization and querying is complete") \ No newline at end of file From 8b6266b4bb927cc7e1472270bf4f20bd34c7a40c Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Tue, 16 Jun 2020 16:54:38 -0400 Subject: [PATCH 25/39] Started creating code for compressing files via content --- .vscode/settings.json | 8 +++++++- core/src/succinct_core.cc | 20 +++++++++++++++++--- pysuccinct/file.cpp | 19 +++++++++++++++++-- 3 files changed, 41 insertions(+), 6 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 21bd06b..997d542 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -32,7 +32,13 @@ "__node_handle": "cpp", "cwchar": "cpp", "iosfwd": "cpp", - "fstream": "cpp" + "fstream": "cpp", + "cwctype": "cpp", + "array": "cpp", + "iterator": "cpp", + "string": "cpp", + "string_view": "cpp", + "vector": "cpp" }, "python.linting.enabled": false } \ No newline at end of file diff --git a/core/src/succinct_core.cc b/core/src/succinct_core.cc index c05b406..4d0214b 100644 --- a/core/src/succinct_core.cc +++ b/core/src/succinct_core.cc @@ -1,5 +1,14 @@ #include "succinct_core.h" +#include +#include +#include +#include +#include +#include +#include +#include + SuccinctCore::SuccinctCore(const std::string &filename, SuccinctMode s_mode, uint32_t sa_sampling_rate, uint32_t isa_sampling_rate, @@ -42,6 +51,7 @@ SuccinctCore::SuccinctCore(const std::string &filename, SuccinctMode s_mode, MemoryMap(filename); break; } + } } @@ -326,6 +336,10 @@ char SuccinctCore::CharAt(uint64_t i) { return alphabet_[LookupC(LookupISA(i))]; } +void SuccinctCore::SerializeS3(const uint8_t *data, const std::string &out_path){ + //TODO +} + size_t SuccinctCore::Serialize(const std::string &path) { size_t out_size = 0; typedef std::map >::iterator iterator_t; @@ -338,7 +352,7 @@ size_t SuccinctCore::Serialize(const std::string &path) { return 0; } } - std::ofstream out(path + "/" + path.substr(0, path.size()-9) + ".succinct.metadata"); + std::ofstream out(path + "/" + path); // std::ofstream sa_out(path + "/sa"); // std::ofstream isa_out(path + "/isa"); // std::ofstream npa_out(path + "/npa"); @@ -386,7 +400,7 @@ size_t SuccinctCore::Deserialize(const std::string &path) { struct stat st{}; assert(stat(path.c_str(), &st) == 0 && S_ISDIR(st.st_mode)); - std::ifstream in(path + "/" + path.substr(0, path.size()-9) + ".succinct.metadata"); + std::ifstream in(path + "/" + path); // std::ifstream sa_in(path + "/sa"); // std::ifstream isa_in(path + "/isa"); // std::ifstream npa_in(path + "/npa"); @@ -463,7 +477,7 @@ size_t SuccinctCore::MemoryMap(const std::string &path) { assert(stat(path.c_str(), &st) == 0 && S_ISDIR(st.st_mode)); uint8_t *data_beg, *data; - data = data_beg = (uint8_t *) SuccinctUtils::MemoryMap(path + "/" + path.substr(0, path.size()-9) + ".succinct.metadata"); + data = data_beg = (uint8_t *) SuccinctUtils::MemoryMap(path + "/" + path); input_size_ = *((uint64_t *) data); data += sizeof(uint64_t); diff --git a/pysuccinct/file.cpp b/pysuccinct/file.cpp index ab2fe96..012a3a6 100644 --- a/pysuccinct/file.cpp +++ b/pysuccinct/file.cpp @@ -6,6 +6,7 @@ #include #include "succinct_file.h" +#include "succinct_core.h" #include using namespace boost::python; @@ -110,6 +111,21 @@ struct File { s_file_->Serialize(inputpath + ".succinct"); } + //Constructor that takes input in a memory buffer and compresses + File(uint8_t *input, size_t input_size, uint32_t sa_sampling_rate, uint32_t isa_sampling_rate, + int32_t npa_sampling_rate, uint32_t context_len, int sampling_opt, int npa_opt, uint32_t sampling_range){ + s_file_ = nullptr; + // Compresses a the data from "input" in memory + std::cout << "Constructing Succinct data structures...\n"; + SuccinctCore* obj; + obj->Construct(input, input_size, sa_sampling_rate, isa_sampling_rate, + npa_sampling_rate, context_len, SamplingSchemeFromOption(sampling_opt), + SamplingSchemeFromOption(sampling_opt), EncodingSchemeFromOption(npa_opt), sampling_range); + s_file_ = (SuccinctFile *) obj; + std::cout << "Serializing Succinct data structures...\n"; + } + + // Wrapped search command, that returns a python list boost::python::list Search(const std::string& arg) { std::vector results; @@ -138,14 +154,13 @@ struct File { SuccinctFile *s_file_; }; - - /** * Boost Python module */ BOOST_PYTHON_MODULE(file){ class_("File", init()) .def(init()) + .def(init()) .def("Search", &File::Search) .def("Count", &File::Count) .def("Extract", &File::Extract) From d512c53ae9d9c7f48ffd38b07711ed607d1a07c2 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Tue, 16 Jun 2020 17:03:52 -0400 Subject: [PATCH 26/39] Revert C++ serialize approach --- core/include/succinct_core.h | 2 +- core/src/succinct_core.cc | 13 ------------- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/core/include/succinct_core.h b/core/include/succinct_core.h index b1b4671..d1de2f3 100644 --- a/core/include/succinct_core.h +++ b/core/include/succinct_core.h @@ -76,7 +76,7 @@ class SuccinctCore : public SuccinctBase { // Get the character at index i char CharAt(uint64_t i); - + // Serialize succinct data structures virtual size_t Serialize(const std::string& filename); diff --git a/core/src/succinct_core.cc b/core/src/succinct_core.cc index 4d0214b..70d135d 100644 --- a/core/src/succinct_core.cc +++ b/core/src/succinct_core.cc @@ -1,14 +1,5 @@ #include "succinct_core.h" -#include -#include -#include -#include -#include -#include -#include -#include - SuccinctCore::SuccinctCore(const std::string &filename, SuccinctMode s_mode, uint32_t sa_sampling_rate, uint32_t isa_sampling_rate, @@ -336,10 +327,6 @@ char SuccinctCore::CharAt(uint64_t i) { return alphabet_[LookupC(LookupISA(i))]; } -void SuccinctCore::SerializeS3(const uint8_t *data, const std::string &out_path){ - //TODO -} - size_t SuccinctCore::Serialize(const std::string &path) { size_t out_size = 0; typedef std::map >::iterator iterator_t; From b30cc820ac1a1f854eac19f4120673a515637d9d Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Fri, 19 Jun 2020 02:00:20 -0400 Subject: [PATCH 27/39] Created File constructor to take in input and added pyfile_input example --- .vscode/launch.json | 116 ++++++++++++++++++++++++++++ .vscode/settings.json | 12 ++- core/include/succinct_core.h | 22 +++++- core/src/succinct_core.cc | 98 +++++++++++++++++++++-- examples/CMakeLists.txt | 1 + examples/lambda/lambdatest.py | 17 ++-- examples/pysuccinct/pyfile_input.py | 92 ++++++++++++++++++++++ pysuccinct/file.cpp | 44 ++++++++--- 8 files changed, 376 insertions(+), 26 deletions(-) create mode 100644 .vscode/launch.json create mode 100644 examples/pysuccinct/pyfile_input.py diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..84c9bb2 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,116 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Current File", + "type": "python", + "request": "launch", + "program": "${file}" + }, + { + "name": "Python: Attach", + "type": "python", + "request": "attach", + "localRoot": "${workspaceFolder}", + "remoteRoot": "${workspaceFolder}", + "port": 3000, + "secret": "my_secret", + "host": "localhost" + }, + { + "name": "Python: Terminal (integrated)", + "type": "python", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal" + }, + { + "name": "Python: Terminal (external)", + "type": "python", + "request": "launch", + "program": "${file}", + "console": "externalTerminal" + }, + { + "name": "Python: Django", + "type": "python", + "request": "launch", + "program": "${workspaceFolder}/manage.py", + "args": [ + "runserver", + "--noreload", + "--nothreading" + ], + "debugOptions": [ + "RedirectOutput", + "Django" + ] + }, + { + "name": "Python: Flask (0.11.x or later)", + "type": "python", + "request": "launch", + "module": "flask", + "env": { + "FLASK_APP": "app.py" + }, + "args": [ + "run", + "--no-debugger", + "--no-reload" + ] + }, + { + "name": "Python: Module", + "type": "python", + "request": "launch", + "module": "module.name" + }, + { + "name": "Python: Pyramid", + "type": "python", + "request": "launch", + "args": [ + "${workspaceFolder}/development.ini" + ], + "debugOptions": [ + "RedirectOutput", + "Pyramid" + ] + }, + { + "name": "Python: Watson", + "type": "python", + "request": "launch", + "program": "${workspaceFolder}/console.py", + "args": [ + "dev", + "runserver", + "--noreload=True" + ] + }, + { + "name": "Python: All debug Options", + "type": "python", + "request": "launch", + "pythonPath": "${config:python.pythonPath}", + "program": "${file}", + "module": "module.name", + "env": { + "VAR1": "1", + "VAR2": "2" + }, + "envFile": "${workspaceFolder}/.env", + "args": [ + "arg1", + "arg2" + ], + "debugOptions": [ + "RedirectOutput" + ] + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json index 997d542..090e946 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -38,7 +38,17 @@ "iterator": "cpp", "string": "cpp", "string_view": "cpp", - "vector": "cpp" + "vector": "cpp", + "__split_buffer": "cpp", + "__tree": "cpp", + "deque": "cpp", + "list": "cpp", + "queue": "cpp", + "stack": "cpp", + "utility": "cpp", + "sstream": "cpp", + "strstream": "cpp", + "__string": "cpp" }, "python.linting.enabled": false } \ No newline at end of file diff --git a/core/include/succinct_core.h b/core/include/succinct_core.h index d1de2f3..b6ead50 100644 --- a/core/include/succinct_core.h +++ b/core/include/succinct_core.h @@ -25,10 +25,11 @@ #include "utils/divsufsortxx_utility.h" typedef enum { - CONSTRUCT_IN_MEMORY = 0, - CONSTRUCT_MEMORY_MAPPED = 1, - LOAD_IN_MEMORY = 2, - LOAD_MEMORY_MAPPED = 3 + CONSTRUCT_FROM_CONTENT = 0, + CONSTRUCT_IN_MEMORY = 1, + CONSTRUCT_MEMORY_MAPPED = 2, + LOAD_IN_MEMORY = 3, + LOAD_MEMORY_MAPPED = 4 } SuccinctMode; class SuccinctCore : public SuccinctBase { @@ -76,6 +77,9 @@ class SuccinctCore : public SuccinctBase { // Get the character at index i char CharAt(uint64_t i); + + // Serialize succinct data structures + virtual size_t SerializeFromContent(std::ostream &path); // Serialize succinct data structures virtual size_t Serialize(const std::string& filename); @@ -122,6 +126,16 @@ class SuccinctCore : public SuccinctBase { SamplingScheme isa_sampling_scheme, NPA::NPAEncodingScheme npa_encoding_scheme, uint32_t sampling_range); + + // Constructs the core data structures from an input string + void ConstructFromContent(const std::string &input, + uint32_t sa_sampling_rate, + uint32_t isa_sampling_rate, + uint32_t npa_sampling_rate, uint32_t context_len, + SamplingScheme sa_sampling_scheme, + SamplingScheme isa_sampling_scheme, + NPA::NPAEncodingScheme npa_encoding_scheme, + uint32_t sampling_range); // Constructs the core data structures void Construct(const std::string& filename, uint32_t sa_sampling_rate, diff --git a/core/src/succinct_core.cc b/core/src/succinct_core.cc index 70d135d..8e0f317 100644 --- a/core/src/succinct_core.cc +++ b/core/src/succinct_core.cc @@ -1,4 +1,6 @@ #include "succinct_core.h" +#include +#include SuccinctCore::SuccinctCore(const std::string &filename, SuccinctMode s_mode, uint32_t sa_sampling_rate, @@ -17,6 +19,12 @@ SuccinctCore::SuccinctCore(const std::string &filename, SuccinctMode s_mode, this->alphabet_size_ = 0; this->input_size_ = 0; switch (s_mode) { + case SuccinctMode::CONSTRUCT_FROM_CONTENT: { + ConstructFromContent(filename, sa_sampling_rate, isa_sampling_rate, + npa_sampling_rate, context_len, sa_sampling_scheme, + isa_sampling_scheme, npa_encoding_scheme, sampling_range); + break; + } case SuccinctMode::CONSTRUCT_IN_MEMORY: { Construct(filename, sa_sampling_rate, isa_sampling_rate, npa_sampling_rate, context_len, sa_sampling_scheme, @@ -115,6 +123,28 @@ void SuccinctCore::Allocate(uint32_t sa_sampling_rate, assert(isa_ != nullptr); } +void SuccinctCore::ConstructFromContent(const std::string &input, + uint32_t sa_sampling_rate, + uint32_t isa_sampling_rate, + uint32_t npa_sampling_rate, uint32_t context_len, + SamplingScheme sa_sampling_scheme, + SamplingScheme isa_sampling_scheme, + NPA::NPAEncodingScheme npa_encoding_scheme, + uint32_t sampling_range) { + + // Convert input into uint8_t * + uint64_t fsize = input.length(); + auto *data = (uint8_t *) s_allocator.s_malloc(fsize + 1); + for (int i = 0; i < fsize; i++ ){ + data[i] = input[i]; + } + data[fsize] = 1; + + Construct(data, fsize + 1, sa_sampling_rate, isa_sampling_rate, + npa_sampling_rate, context_len, sa_sampling_scheme, + isa_sampling_scheme, npa_encoding_scheme, sampling_range); +} + void SuccinctCore::Construct(const std::string &filename, uint32_t sa_sampling_rate, uint32_t isa_sampling_rate, @@ -327,6 +357,64 @@ char SuccinctCore::CharAt(uint64_t i) { return alphabet_[LookupC(LookupISA(i))]; } +// Serialize function that returns string +size_t SuccinctCore::SerializeFromContent(std::ostream &path) { + size_t out_size = 0; + typedef std::map >::iterator iterator_t; + struct stat st{}; + // mode_t create_mode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; + // if (stat(path.c_str(), &st) != 0) { + // if (mkdir(path.c_str(), create_mode) != 0) { + // fprintf(stderr, "Failed to create path '%s'\n", path.c_str()); + // fprintf(stderr, "Terminating the serialization process.\n"); + // return 0; + // } + // } + // std::ofstream out(path + "/" + path); + // std::ofstream sa_out(path + "/sa"); + // std::ofstream isa_out(path + "/isa"); + // std::ofstream npa_out(path + "/npa"); + + // Output size of input file + path.write(reinterpret_cast(&(input_size_)), sizeof(uint64_t)); + out_size += sizeof(uint64_t); + + // Output cmap size + uint64_t cmap_size = alphabet_map_.size(); + path.write(reinterpret_cast(&(cmap_size)), sizeof(uint64_t)); + out_size += sizeof(uint64_t); + for (auto &it : alphabet_map_) { + path.write(reinterpret_cast(&(it.first)), sizeof(char)); + out_size += sizeof(char); + path.write(reinterpret_cast(&(it.second.first)), sizeof(uint64_t)); + out_size += sizeof(uint64_t); + path.write(reinterpret_cast(&(it.second.second)), sizeof(uint32_t)); + out_size += sizeof(uint32_t); + } + + path.write(reinterpret_cast(&alphabet_size_), sizeof(uint32_t)); + out_size += sizeof(uint32_t); + for (uint32_t i = 0; i < alphabet_size_ + 1; i++) { + path.write(reinterpret_cast(&alphabet_[i]), sizeof(char)); + } + + out_size += sa_->Serialize(path); + out_size += isa_->Serialize(path); + + if (sa_->GetSamplingScheme() == SamplingScheme::FLAT_SAMPLE_BY_VALUE) { + assert(isa_->GetSamplingScheme() == SamplingScheme::FLAT_SAMPLE_BY_VALUE); + out_size += SerializeDictionary(((SampledByValueSA *) sa_)->GetSampledPositions(), path); + } + + out_size += npa_->Serialize(path); + + return out_size; + // std::stringstream ss; + // ss << path.rdbuf(); + + // return reinterpret_cast(&((ss.str())[0])); +} + size_t SuccinctCore::Serialize(const std::string &path) { size_t out_size = 0; typedef std::map >::iterator iterator_t; @@ -339,7 +427,7 @@ size_t SuccinctCore::Serialize(const std::string &path) { return 0; } } - std::ofstream out(path + "/" + path); + std::ofstream out(path); // std::ofstream sa_out(path + "/sa"); // std::ofstream isa_out(path + "/isa"); // std::ofstream npa_out(path + "/npa"); @@ -384,10 +472,10 @@ size_t SuccinctCore::Serialize(const std::string &path) { size_t SuccinctCore::Deserialize(const std::string &path) { // Check if directory exists - struct stat st{}; - assert(stat(path.c_str(), &st) == 0 && S_ISDIR(st.st_mode)); + //struct stat st{}; + //assert(stat(path.c_str(), &st) == 0 && S_ISDIR(st.st_mode)); - std::ifstream in(path + "/" + path); + std::ifstream in(path); // std::ifstream sa_in(path + "/sa"); // std::ifstream isa_in(path + "/isa"); // std::ifstream npa_in(path + "/npa"); @@ -464,7 +552,7 @@ size_t SuccinctCore::MemoryMap(const std::string &path) { assert(stat(path.c_str(), &st) == 0 && S_ISDIR(st.st_mode)); uint8_t *data_beg, *data; - data = data_beg = (uint8_t *) SuccinctUtils::MemoryMap(path + "/" + path); + data = data_beg = (uint8_t *) SuccinctUtils::MemoryMap(path); input_size_ = *((uint64_t *) data); data += sizeof(uint64_t); diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 29f05d0..64b0d38 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -29,6 +29,7 @@ add_executable(query_semistructured src/query_semistructured.cc) file(COPY lambda/lambdatest.py DESTINATION ${EXECUTABLE_OUTPUT_PATH}) +file(COPY pysuccinct/pyfile_input.py DESTINATION ${EXECUTABLE_OUTPUT_PATH}) file(COPY pysuccinct/pyfile.py DESTINATION ${EXECUTABLE_OUTPUT_PATH}) file(COPY pysuccinct/pykv.py DESTINATION ${EXECUTABLE_OUTPUT_PATH}) file(COPY pysuccinct/pysemistructured.py DESTINATION ${EXECUTABLE_OUTPUT_PATH}) diff --git a/examples/lambda/lambdatest.py b/examples/lambda/lambdatest.py index 294eb4e..beccf8a 100644 --- a/examples/lambda/lambdatest.py +++ b/examples/lambda/lambdatest.py @@ -23,20 +23,25 @@ def call_compress (event, context): # f.close() # s3.meta.client.upload_file("/tmp/test.txt", "succinct-datasets", "test.txt") - # Download a file from a bucket and compress + # Get file content from S3 and save as "input" s3 = boto3.client("s3") - s3.download_file("succinct-datasets",event['key1'], "/tmp/" + event['key1']) - os.chdir("/tmp") + obj = client.get_object(Bucket='succinct-datasets', Key=event['key1']) + input = obj.get()['Body'].read().decode('utf-8') # with open(event['key1'], 'r') as f: # print(f.read()) - - q = file.File(event['key1'], 32, 32, 128, 0, 1) + + # Compress the input using file module + q = file.File(input, 32, 32, 128, 0, 1) + content = (q.GetContent().tobytes()) + + # Upload content back onto S3 in .succinct file + client.put_object(Body=content, Bucket='succinct-datasets', Key=event['key1'] + ".succinct") # for f in os.listdir("/tmp"): # print(f) - uploadDirectory("/tmp/" + event['key1'] + ".succinct", "succinct-datasets", event['key1']) + # uploadDirectory("/tmp/" + event['key1'] + ".succinct", "succinct-datasets", event['key1']) # out = os.path.isfile("/tmp/" + event['key1'] + ".succinct") # print(out) diff --git a/examples/pysuccinct/pyfile_input.py b/examples/pysuccinct/pyfile_input.py new file mode 100644 index 0000000..1e3ff8d --- /dev/null +++ b/examples/pysuccinct/pyfile_input.py @@ -0,0 +1,92 @@ +import file +import sys +import getopt +import os + +# Try catch block for non integer argument checking +def RepresentsInt(s): + try: + int(s) + return True + except ValueError: + return False + +# Default values +sa_sampling_rate = 32 +isa_sampling_rate = 32 +sampling_scheme = 0 +npa_sampling_rate = 128 +npa_encoding_scheme = 1 +type = "file" +inputpath = "" + +# Get user input to either load from memory or compress a new file +option = input("Usage: [load/compress] [file]\n") +option = option.split() +if (len(option) != 2): + print("Usage: [load/compress] [file]\n") + sys.exit(2) +else: + inputpath = option[1] + if (option[0] == "load"): + # Load file from memory + print("loading ", inputpath, " from file") + q = file.File(inputpath) + elif (option[0] == "compress"): + # Compress the file + print("Please enter the sampling rates") + option = input("Usage: [-s sa_sampling_rate] [-i isa_sampling_rate] [-x sampling_scheme] [-n npa_sampling_rate] [-r npa_encoding_scheme] [-t input_type]\n") + # Loop through arguments to get sampling rates using getopt + try: + optlist, args = getopt.getopt(option, 's:i:x:n:r:t:') + except getopt.GetoptError as err: + print("Get opt error") + sys.exit(2) + for o, a in optlist: + if o == "-s": + sa_sampling_rate = int(a) + elif o == "-i": + isa_sampling_rate = int(a) + elif o == "-x": + sampling_scheme = int(a) + elif o == "-n": + npa_sampling_rate = int(a) + elif o == "-r": + npa_encoding_scheme = int(a) + elif o == "-t": + type = a + else: + printf("Invalid Option") + sys.exit(2) + + # # Get input size + # FILE *f = fopen(filename.c_str(), "r") + # fseek(f, 0, SEEK_END) + # uint64_t fsize = ftell(f) + # fseek(f, 0, SEEK_SET) + + # # Read input from file + # auto *data = (uint8_t *) s_allocator.s_malloc(fsize + 1) + # fread(data, fsize, 1, f) + # fclose(f) + # data[fsize] = 1 + # uint8_t *, size_t, uint32_t, uint32_t, int32_t, uint32_t, int, int, uint32_t> + + # context_len = 3 or sampling_range = 1024 + + f = open(inputpath, "r") + input = f.read() + + q = file.File(0, input, sa_sampling_rate, + isa_sampling_rate, npa_sampling_rate, + sampling_scheme, npa_encoding_scheme) + + content = (q.GetContent().tobytes()) + # print(str(content,'ISO-8859-1')) + text_file = open(inputpath + ".succinct", "wb") + text_file.write(content) + + else: + print("Usage: [load/compress] [file]\n") + sys.exit(2) + diff --git a/pysuccinct/file.cpp b/pysuccinct/file.cpp index 012a3a6..208c89e 100644 --- a/pysuccinct/file.cpp +++ b/pysuccinct/file.cpp @@ -112,19 +112,40 @@ struct File { } //Constructor that takes input in a memory buffer and compresses - File(uint8_t *input, size_t input_size, uint32_t sa_sampling_rate, uint32_t isa_sampling_rate, - int32_t npa_sampling_rate, uint32_t context_len, int sampling_opt, int npa_opt, uint32_t sampling_range){ + File(int from_content, const std::string& input, uint32_t sa_sampling_rate, uint32_t isa_sampling_rate, + uint32_t npa_sampling_rate, int sampling_opt, int npa_opt){ s_file_ = nullptr; // Compresses a the data from "input" in memory std::cout << "Constructing Succinct data structures...\n"; - SuccinctCore* obj; - obj->Construct(input, input_size, sa_sampling_rate, isa_sampling_rate, - npa_sampling_rate, context_len, SamplingSchemeFromOption(sampling_opt), - SamplingSchemeFromOption(sampling_opt), EncodingSchemeFromOption(npa_opt), sampling_range); - s_file_ = (SuccinctFile *) obj; - std::cout << "Serializing Succinct data structures...\n"; + s_file_ = new SuccinctFile(input, + SuccinctMode::CONSTRUCT_FROM_CONTENT, + sa_sampling_rate, isa_sampling_rate, + npa_sampling_rate, SamplingSchemeFromOption(sampling_opt), + SamplingSchemeFromOption(sampling_opt), EncodingSchemeFromOption(npa_opt)); + std::cout << "Serializing Succinct data structures...\n"; + std::stringstream path; + size_t size = s_file_->SerializeFromContent(path); + + std::string pathstring = path.str(); + //unsigned char buffer[pathstring.length()]; + file_content_length_ = pathstring.length(); + file_content_ = new unsigned char[file_content_length_ ](); + memcpy(file_content_, pathstring.data(), file_content_length_); + //file_content_ = buffer; + // std::cout << "string is: " << pathstring << "\n"; + // std::cout << "file_content_ is \n"; + // for (int i = 0; i < pathstring.length(); i++){ + // std::cout << file_content_[i]; + // } + + //boost::python::object memoryView(boost::python::handle<>(PyMemoryView_FromMemory(path, size, PyBUF_READ))); + } + + //Return the serialized content as a string + PyObject* GetContent(){ + PyObject* pymemview = PyMemoryView_FromMemory((char*) file_content_, file_content_length_ , PyBUF_READ); + return pymemview; } - // Wrapped search command, that returns a python list boost::python::list Search(const std::string& arg) { @@ -152,6 +173,8 @@ struct File { //File members SuccinctFile *s_file_; + unsigned char *file_content_; + int file_content_length_; }; /** @@ -160,7 +183,8 @@ struct File { BOOST_PYTHON_MODULE(file){ class_("File", init()) .def(init()) - .def(init()) + .def(init()) + .def("GetContent", &File::GetContent) .def("Search", &File::Search) .def("Count", &File::Count) .def("Extract", &File::Extract) From 3f8f7e28eaa5dce48e1f5138a746a572291f0431 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Wed, 24 Jun 2020 00:01:17 -0400 Subject: [PATCH 28/39] Changed code to construct in memory arrays instead of files on disk (1) --- core/include/npa/delta_encoded_npa.h | 88 +++- core/include/npa/elias_gamma_encoded_npa.h | 6 + core/include/succinct_core.h | 3 +- core/include/utils/array_input.h | 37 ++ core/include/utils/succinct_utils.h | 2 + core/src/npa/elias_gamma_encoded_npa.cc | 13 + core/src/succinct_core.cc | 442 ++++++++++++++------- 7 files changed, 447 insertions(+), 144 deletions(-) create mode 100644 core/include/utils/array_input.h diff --git a/core/include/npa/delta_encoded_npa.h b/core/include/npa/delta_encoded_npa.h index 3769b53..da101cb 100644 --- a/core/include/npa/delta_encoded_npa.h +++ b/core/include/npa/delta_encoded_npa.h @@ -6,6 +6,7 @@ #include "utils/succinct_utils.h" #include "utils/definitions.h" #include "utils/array_stream.h" +#include "utils/array_input.h" #include "utils/thread_pool.h" #include "npa.h" @@ -160,6 +161,53 @@ class DeltaEncodedNPA : public NPA { remove(npa_file.c_str()); } + // Encode DeltaEncodedNPA based on the delta encoding scheme IN MEMORY + void EncodeInMem(int64_t *lISA, std::vector& col_offsets) { + + // Initialize Auxiliary NPA structures + col_offsets_ = col_offsets; + + // Get all NPA values + int64_t *lNPA = new int64_t[npa_size_](); + uint64_t first_idx, cur_idx, nxt_idx, num_elements_per_chunk; + std::thread constructor_thread[8]; + + ArrayInput isa_array(lISA); + first_idx = isa_array.Get(); + num_elements_per_chunk = SuccinctUtils::NumBlocks(npa_size_, 8); + for (uint8_t i = 0; i < 8; i++) { + uint64_t remaining_elements = + (i * num_elements_per_chunk >= npa_size_) ? + 0 : npa_size_ - i * num_elements_per_chunk; + uint64_t num_elements = SuccinctUtils::Min(remaining_elements, + num_elements_per_chunk); + constructor_thread[i] = std::thread(&DeltaEncodedNPA::ConstructNPAChunkInMem, lNPA, + lISA, i * num_elements_per_chunk, + num_elements, (i == 7) ? first_idx : -1ULL); + } + + for (uint8_t i = 0; i < 8; i++) { + constructor_thread[i].join(); + } + + //isa_stream.CloseAndRemove(); + + //SuccinctUtils::WriteToFile(lNPA, npa_size_, npa_file); + + del_npa_ = new DeltaEncodedVector[sigma_size_]; + ThreadPool pool(8); + for (uint64_t i = 0; i < col_offsets_.size(); i++) { + uint64_t start_offset = col_offsets_[i]; + uint64_t end_offset = (i < col_offsets_.size() - 1) ? col_offsets_[i + 1] : npa_size_; + pool.Enqueue([&, start_offset, end_offset, i] { + EncodeNPAChunkInMem(&(this->del_npa_[i]), lNPA, start_offset, end_offset); + }); + } + pool.ShutDown(); + delete[] lNPA; + // remove(npa_file.c_str()); + } + // Access element at index i virtual uint64_t operator[](uint64_t i) { // Get column id @@ -243,7 +291,6 @@ class DeltaEncodedNPA : public NPA { // Read sampling rate in.read(reinterpret_cast(&(sampling_rate_)), sizeof(uint32_t)); in_size += sizeof(uint32_t); - // Read coloffsets in_size += SuccinctBase::DeserializeVector(col_offsets_, in); @@ -286,6 +333,8 @@ class DeltaEncodedNPA : public NPA { DeltaEncodedVector *del_npa_; private: + + //On-disk ConstructNPAChunk function static void ConstructNPAChunk(int64_t *lNPA, std::string isa_file, uint64_t start_pos, uint64_t n_elems, int64_t first_idx) { @@ -306,6 +355,7 @@ class DeltaEncodedNPA : public NPA { isa_stream.Close(); } + //On-disk ConstructNPAChunk function void EncodeNPAChunk(DeltaEncodedVector *dv, std::string npa_file, uint64_t start_offset, uint64_t end_offset) { ArrayStream npa_stream(npa_file, start_offset); @@ -318,6 +368,42 @@ class DeltaEncodedNPA : public NPA { npa_stream.Close(); } + //In memory ConstructNPAChunk function + static void ConstructNPAChunkInMem(int64_t *lNPA, int64_t *lISA, + uint64_t start_pos, uint64_t n_elems, + int64_t first_idx) { + // ISA Stream is configured to start reading from correct position + uint64_t cur_idx, nxt_idx; + ArrayInput isa_array(lISA, start_pos); + cur_idx = isa_array.Get(); + + for (uint64_t i = 0; i < n_elems; i++) { + nxt_idx = isa_array.Get(); + lNPA[cur_idx] = nxt_idx; + cur_idx = nxt_idx; + } + + if (first_idx > 0) { + lNPA[cur_idx] = first_idx; + } + } + + //In memory EncodeNPAChunk function + void EncodeNPAChunkInMem(DeltaEncodedVector *dv, int64_t *lNPA, uint64_t start_offset, + uint64_t end_offset) { + ArrayInput npa_array(lNPA, start_offset); + std::vector column; + for (uint64_t j = start_offset; j < end_offset; j++) { + column.push_back(npa_array.Get()); + } + assert(column.size() > 0); + CreateDeltaEncodedVector(dv, column); + } + + + + + }; #endif diff --git a/core/include/npa/elias_gamma_encoded_npa.h b/core/include/npa/elias_gamma_encoded_npa.h index 79b6f44..086b718 100644 --- a/core/include/npa/elias_gamma_encoded_npa.h +++ b/core/include/npa/elias_gamma_encoded_npa.h @@ -15,6 +15,12 @@ class EliasGammaEncodedNPA : public DeltaEncodedNPA { std::string& isa_file, std::vector& col_offsets, std::string npa_file, SuccinctAllocator &s_allocator); + + EliasGammaEncodedNPA(uint64_t npa_size, uint64_t sigma_size, + uint32_t context_len, uint32_t sampling_rate, + int64_t* lISA, + std::vector& col_offsets, + SuccinctAllocator &s_allocator); EliasGammaEncodedNPA(uint32_t context_len, uint32_t sampling_rate, SuccinctAllocator &s_allocator); diff --git a/core/include/succinct_core.h b/core/include/succinct_core.h index b6ead50..4e7688f 100644 --- a/core/include/succinct_core.h +++ b/core/include/succinct_core.h @@ -20,6 +20,7 @@ #include "sampledarray/sampled_by_value_isa.h" #include "sampledarray/sampled_by_value_sa.h" #include "succinct_base.h" +#include "utils/array_input.h" #include "utils/array_stream.h" #include "utils/divsufsortxx.h" #include "utils/divsufsortxx_utility.h" @@ -146,7 +147,7 @@ class SuccinctCore : public SuccinctBase { uint32_t sampling_range); // Constructs the core data structures - void Construct(uint8_t* input, size_t input_size, uint32_t sa_sampling_rate, + void Construct(bool in_mem, uint8_t* input, size_t input_size, uint32_t sa_sampling_rate, uint32_t isa_sampling_rate, uint32_t npa_sampling_rate, uint32_t context_len, SamplingScheme sa_sampling_scheme, SamplingScheme isa_sampling_scheme, diff --git a/core/include/utils/array_input.h b/core/include/utils/array_input.h new file mode 100644 index 0000000..c514217 --- /dev/null +++ b/core/include/utils/array_input.h @@ -0,0 +1,37 @@ +#ifndef CORE_ARRAY_INPUT_H_ +#define CORE_ARRAY_INPUT_H_ + +#include +#include +#include + +#include "utils/succinct_utils.h" + +class ArrayInput { + public: + ArrayInput(int64_t *array, uint64_t start_idx = 0) { + current_idx_ = start_idx; + array_ = array; + } + + uint64_t Get() { + uint64_t val = array_[current_idx_]; + current_idx_++; + return val; + } + + uint64_t GetCurrentIndex() { + return current_idx_; + } + + void Reset() { + current_idx_ = 0; + } + + private: + int64_t *array_; + uint64_t current_idx_; + +}; + +#endif // CORE_ARRAY_STREAM_H_ diff --git a/core/include/utils/succinct_utils.h b/core/include/utils/succinct_utils.h index a76569a..33f65a9 100644 --- a/core/include/utils/succinct_utils.h +++ b/core/include/utils/succinct_utils.h @@ -91,6 +91,8 @@ class SuccinctUtils { return data; } + + // Writes an integer array to file template static void WriteToFile(T* data, size_t size, std::string outfile) { diff --git a/core/src/npa/elias_gamma_encoded_npa.cc b/core/src/npa/elias_gamma_encoded_npa.cc index ca5e0c2..a1884e7 100644 --- a/core/src/npa/elias_gamma_encoded_npa.cc +++ b/core/src/npa/elias_gamma_encoded_npa.cc @@ -14,6 +14,19 @@ EliasGammaEncodedNPA::EliasGammaEncodedNPA(uint64_t npa_size, Encode(isa_file, col_offsets, npa_file); } +EliasGammaEncodedNPA::EliasGammaEncodedNPA(uint64_t npa_size, + uint64_t sigma_size, + uint32_t context_len, + uint32_t sampling_rate, + int64_t* lISA, + std::vector& col_offsets, + SuccinctAllocator &s_allocator) + : DeltaEncodedNPA(npa_size, sigma_size, context_len, sampling_rate, + NPAEncodingScheme::ELIAS_GAMMA_ENCODED, s_allocator) { + InitPrefixSum(); + EncodeInMem(lISA, col_offsets); +} + EliasGammaEncodedNPA::EliasGammaEncodedNPA(uint32_t context_len, uint32_t sampling_rate, SuccinctAllocator &s_allocator) diff --git a/core/src/succinct_core.cc b/core/src/succinct_core.cc index 8e0f317..8f7c21a 100644 --- a/core/src/succinct_core.cc +++ b/core/src/succinct_core.cc @@ -140,7 +140,8 @@ void SuccinctCore::ConstructFromContent(const std::string &input, } data[fsize] = 1; - Construct(data, fsize + 1, sa_sampling_rate, isa_sampling_rate, + //Construct in memory + Construct(1, data, fsize + 1, sa_sampling_rate, isa_sampling_rate, npa_sampling_rate, context_len, sa_sampling_scheme, isa_sampling_scheme, npa_encoding_scheme, sampling_range); } @@ -165,13 +166,14 @@ void SuccinctCore::Construct(const std::string &filename, fclose(f); data[fsize] = 1; - Construct(data, fsize + 1, sa_sampling_rate, isa_sampling_rate, + //Construct in file + Construct(0, data, fsize + 1, sa_sampling_rate, isa_sampling_rate, npa_sampling_rate, context_len, sa_sampling_scheme, isa_sampling_scheme, npa_encoding_scheme, sampling_range); } /* Primary Construct function */ -void SuccinctCore::Construct(uint8_t *input, size_t input_size, +void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, uint32_t sa_sampling_rate, uint32_t isa_sampling_rate, uint32_t npa_sampling_rate, uint32_t context_len, @@ -180,156 +182,312 @@ void SuccinctCore::Construct(uint8_t *input, size_t input_size, NPA::NPAEncodingScheme npa_encoding_scheme, uint32_t sampling_range) { - std::string sa_file = ".tmp.sa"; - std::string isa_file = ".tmp.isa"; - std::string npa_file = ".tmp.npa"; - - // Save metadata - input_size_ = input_size; - uint32_t bits = SuccinctUtils::IntegerLog2(input_size_ + 1); - - // Construct Suffix Array - auto *lSA = (int64_t *) s_allocator.s_calloc(sizeof(int64_t), input_size_); - divsufsortxx::constructSA(input, (input + input_size_), lSA, - lSA + input_size_, 256); - - // Write Suffix Array to file - SuccinctUtils::WriteToFile(lSA, input_size_, sa_file); - - ArrayStream sa_stream(sa_file); - s_allocator.s_free(lSA); - - // Allocate space for Inverse Suffix Array - auto *lISA = (int64_t *) s_allocator.s_calloc(sizeof(int64_t), - input_size_); - - // Auxiliary Data Structures for NPA - std::vector col_offsets; - uint64_t cur_sa, prv_sa; - - prv_sa = cur_sa = sa_stream.Get(); - lISA[cur_sa] = 0; - alphabet_size_ = 1; - alphabet_map_[input[cur_sa]] = std::pair(0, 0); - col_offsets.push_back(0); - for (uint64_t i = 1; i < input_size_; i++) { - cur_sa = sa_stream.Get(); - lISA[cur_sa] = i; - if (input[cur_sa] != input[prv_sa]) { - alphabet_map_[input[cur_sa]] = std::pair( - i, alphabet_size_++); - col_offsets.push_back(i); + if (in_mem == 0){ + //USE FILES ON DISK TO STORE ARRAYS + + std::string sa_file = ".tmp.sa"; + std::string isa_file = ".tmp.isa"; + std::string npa_file = ".tmp.npa"; + + // Save metadata + input_size_ = input_size; + uint32_t bits = SuccinctUtils::IntegerLog2(input_size_ + 1); + + // Construct Suffix Array + auto *lSA = (int64_t *) s_allocator.s_calloc(sizeof(int64_t), input_size_); + divsufsortxx::constructSA(input, (input + input_size_), lSA, + lSA + input_size_, 256); + + // Write Suffix Array to file + SuccinctUtils::WriteToFile(lSA, input_size_, sa_file); + + ArrayStream sa_stream(sa_file); + s_allocator.s_free(lSA); + + // Allocate space for Inverse Suffix Array + auto *lISA = (int64_t *) s_allocator.s_calloc(sizeof(int64_t), + input_size_); + + // Auxiliary Data Structures for NPA + std::vector col_offsets; + uint64_t cur_sa, prv_sa; + + prv_sa = cur_sa = sa_stream.Get(); + lISA[cur_sa] = 0; + alphabet_size_ = 1; + alphabet_map_[input[cur_sa]] = std::pair(0, 0); + col_offsets.push_back(0); + for (uint64_t i = 1; i < input_size_; i++) { + cur_sa = sa_stream.Get(); + lISA[cur_sa] = i; + if (input[cur_sa] != input[prv_sa]) { + alphabet_map_[input[cur_sa]] = std::pair( + i, alphabet_size_++); + col_offsets.push_back(i); + } + prv_sa = cur_sa; } - prv_sa = cur_sa; - } - - alphabet_map_[(char) 0] = std::pair(input_size_, - alphabet_size_); - assert(sa_stream.GetCurrentIndex() == input_size_); - sa_stream.Reset(); - alphabet_ = new char[alphabet_size_ + 1]; - for (auto alphabet_entry : alphabet_map_) { - alphabet_[alphabet_entry.second.second] = alphabet_entry.first; - } + alphabet_map_[(char) 0] = std::pair(input_size_, + alphabet_size_); + assert(sa_stream.GetCurrentIndex() == input_size_); + sa_stream.Reset(); - // Write Inverse Suffix Array to file - SuccinctUtils::WriteToFile(lISA, input_size_, isa_file); - s_allocator.s_free(lISA); - ArrayStream isa_stream(isa_file); - - // Compact input data (if needed) - Bitmap *data_bitmap = nullptr; - if (npa_encoding_scheme == NPA::NPAEncodingScheme::WAVELET_TREE_ENCODED) { - data_bitmap = new Bitmap; - int sigma_bits = SuccinctUtils::IntegerLog2(alphabet_size_ + 1); - InitBitmap(&data_bitmap, input_size_ * sigma_bits, s_allocator); - for (uint64_t i = 0; i < input_size_; i++) { - SetBitmapArray(&data_bitmap, i, alphabet_map_[input[i]].second, - sigma_bits); + alphabet_ = new char[alphabet_size_ + 1]; + for (auto alphabet_entry : alphabet_map_) { + alphabet_[alphabet_entry.second.second] = alphabet_entry.first; } - } - s_allocator.s_free(input); - switch (npa_encoding_scheme) { - case NPA::NPAEncodingScheme::ELIAS_GAMMA_ENCODED: { - npa_ = new EliasGammaEncodedNPA(input_size_, alphabet_size_, context_len, - npa_sampling_rate, isa_file, col_offsets, - npa_file, s_allocator); - break; + // Write Inverse Suffix Array to file + SuccinctUtils::WriteToFile(lISA, input_size_, isa_file); + s_allocator.s_free(lISA); + ArrayStream isa_stream(isa_file); + + // Compact input data (if needed) + Bitmap *data_bitmap = nullptr; + if (npa_encoding_scheme == NPA::NPAEncodingScheme::WAVELET_TREE_ENCODED) { + data_bitmap = new Bitmap; + int sigma_bits = SuccinctUtils::IntegerLog2(alphabet_size_ + 1); + InitBitmap(&data_bitmap, input_size_ * sigma_bits, s_allocator); + for (uint64_t i = 0; i < input_size_; i++) { + SetBitmapArray(&data_bitmap, i, alphabet_map_[input[i]].second, + sigma_bits); + } } - case NPA::NPAEncodingScheme::ELIAS_DELTA_ENCODED: { - npa_ = new EliasDeltaEncodedNPA(input_size_, alphabet_size_, context_len, - npa_sampling_rate, isa_file, col_offsets, - npa_file, s_allocator); - return; + s_allocator.s_free(input); + + switch (npa_encoding_scheme) { + case NPA::NPAEncodingScheme::ELIAS_GAMMA_ENCODED: { + npa_ = new EliasGammaEncodedNPA(input_size_, alphabet_size_, context_len, + npa_sampling_rate, isa_file, col_offsets, + npa_file, s_allocator); + break; + } + case NPA::NPAEncodingScheme::ELIAS_DELTA_ENCODED: { + npa_ = new EliasDeltaEncodedNPA(input_size_, alphabet_size_, context_len, + npa_sampling_rate, isa_file, col_offsets, + npa_file, s_allocator); + return; + } + case NPA::NPAEncodingScheme::WAVELET_TREE_ENCODED: { + isa_stream.CloseAndRemove(); + Bitmap *compactSA = ReadAsBitmap(input_size_, bits, s_allocator, sa_file); + Bitmap *compactISA = ReadAsBitmap(input_size_, bits, s_allocator, + isa_file); + npa_ = new WaveletTreeEncodedNPA(input_size_, alphabet_size_, context_len, + npa_sampling_rate, data_bitmap, + compactSA, compactISA, s_allocator); + DestroyBitmap(&data_bitmap, s_allocator); + break; + } + default:npa_ = nullptr; } - case NPA::NPAEncodingScheme::WAVELET_TREE_ENCODED: { - isa_stream.CloseAndRemove(); - Bitmap *compactSA = ReadAsBitmap(input_size_, bits, s_allocator, sa_file); - Bitmap *compactISA = ReadAsBitmap(input_size_, bits, s_allocator, - isa_file); - npa_ = new WaveletTreeEncodedNPA(input_size_, alphabet_size_, context_len, - npa_sampling_rate, data_bitmap, - compactSA, compactISA, s_allocator); - DestroyBitmap(&data_bitmap, s_allocator); - break; + assert(npa_ != nullptr); + + switch (sa_sampling_scheme) { + case SamplingScheme::FLAT_SAMPLE_BY_INDEX: + sa_ = new SampledByIndexSA(sa_sampling_rate, npa_, sa_stream, input_size_, + s_allocator); + break; + case SamplingScheme::FLAT_SAMPLE_BY_VALUE: + sa_ = new SampledByValueSA(sa_sampling_rate, npa_, sa_stream, input_size_, + s_allocator); + break; + case SamplingScheme::LAYERED_SAMPLE_BY_INDEX: + sa_ = new LayeredSampledSA(sa_sampling_rate, + sa_sampling_rate * sampling_range, npa_, + sa_stream, input_size_, s_allocator); + break; + case SamplingScheme::OPPORTUNISTIC_LAYERED_SAMPLE_BY_INDEX: + sa_ = new OpportunisticLayeredSampledSA(sa_sampling_rate, + sa_sampling_rate * sampling_range, + npa_, sa_stream, input_size_, + s_allocator); + break; + default:sa_ = nullptr; + } + sa_stream.Reset(); + assert(sa_ != nullptr); + + switch (isa_sampling_scheme) { + case SamplingScheme::FLAT_SAMPLE_BY_INDEX: + isa_ = new SampledByIndexISA(isa_sampling_rate, npa_, sa_stream, + input_size_, s_allocator); + break; + case SamplingScheme::FLAT_SAMPLE_BY_VALUE:assert(sa_->GetSamplingScheme() == SamplingScheme::FLAT_SAMPLE_BY_VALUE); + isa_ = new SampledByValueISA( + sa_sampling_rate, npa_, sa_stream, input_size_, + ((SampledByValueSA *) sa_)->GetSampledPositions(), s_allocator); + break; + case SamplingScheme::LAYERED_SAMPLE_BY_INDEX: + isa_ = new LayeredSampledISA(isa_sampling_rate, + isa_sampling_rate * sampling_range, npa_, + sa_stream, input_size_, s_allocator); + break; + case SamplingScheme::OPPORTUNISTIC_LAYERED_SAMPLE_BY_INDEX: + isa_ = new OpportunisticLayeredSampledISA( + isa_sampling_rate, isa_sampling_rate * sampling_range, npa_, + sa_stream, input_size_, s_allocator); + break; + default:isa_ = nullptr; + } + sa_stream.Reset(); + assert(isa_ != nullptr); + + sa_stream.CloseAndRemove(); + } else { + //USE MEMORY TO STORE ARRAYS + + // Save metadata + input_size_ = input_size; + uint32_t bits = SuccinctUtils::IntegerLog2(input_size_ + 1); + + // Construct Suffix Array + auto *lSA = (int64_t *) s_allocator.s_calloc(sizeof(int64_t), input_size_); + divsufsortxx::constructSA(input, (input + input_size_), lSA, + lSA + input_size_, 256); + + // Write Suffix Array to file + // SuccinctUtils::WriteToFile(lSA, input_size_, sa_file); + + ArrayInput sa_array(lSA); + + // Allocate space for Inverse Suffix Array + auto *lISA = (int64_t *) s_allocator.s_calloc(sizeof(int64_t), + input_size_); + + // Auxiliary Data Structures for NPA + std::vector col_offsets; + uint64_t cur_sa, prv_sa; + + prv_sa = cur_sa = sa_array.Get(); + lISA[cur_sa] = 0; + alphabet_size_ = 1; + alphabet_map_[input[cur_sa]] = std::pair(0, 0); + col_offsets.push_back(0); + for (uint64_t i = 1; i < input_size_; i++) { + cur_sa = sa_array.Get(); + lISA[cur_sa] = i; + if (input[cur_sa] != input[prv_sa]) { + alphabet_map_[input[cur_sa]] = std::pair( + i, alphabet_size_++); + col_offsets.push_back(i); + } + prv_sa = cur_sa; } - default:npa_ = nullptr; - } - assert(npa_ != nullptr); - switch (sa_sampling_scheme) { - case SamplingScheme::FLAT_SAMPLE_BY_INDEX: - sa_ = new SampledByIndexSA(sa_sampling_rate, npa_, sa_stream, input_size_, - s_allocator); - break; - case SamplingScheme::FLAT_SAMPLE_BY_VALUE: - sa_ = new SampledByValueSA(sa_sampling_rate, npa_, sa_stream, input_size_, - s_allocator); - break; - case SamplingScheme::LAYERED_SAMPLE_BY_INDEX: - sa_ = new LayeredSampledSA(sa_sampling_rate, - sa_sampling_rate * sampling_range, npa_, - sa_stream, input_size_, s_allocator); - break; - case SamplingScheme::OPPORTUNISTIC_LAYERED_SAMPLE_BY_INDEX: - sa_ = new OpportunisticLayeredSampledSA(sa_sampling_rate, - sa_sampling_rate * sampling_range, - npa_, sa_stream, input_size_, - s_allocator); - break; - default:sa_ = nullptr; - } - sa_stream.Reset(); - assert(sa_ != nullptr); + alphabet_map_[(char) 0] = std::pair(input_size_, + alphabet_size_); + assert(sa_array.GetCurrentIndex() == input_size_); + sa_array.Reset(); - switch (isa_sampling_scheme) { - case SamplingScheme::FLAT_SAMPLE_BY_INDEX: - isa_ = new SampledByIndexISA(isa_sampling_rate, npa_, sa_stream, - input_size_, s_allocator); - break; - case SamplingScheme::FLAT_SAMPLE_BY_VALUE:assert(sa_->GetSamplingScheme() == SamplingScheme::FLAT_SAMPLE_BY_VALUE); - isa_ = new SampledByValueISA( - sa_sampling_rate, npa_, sa_stream, input_size_, - ((SampledByValueSA *) sa_)->GetSampledPositions(), s_allocator); - break; - case SamplingScheme::LAYERED_SAMPLE_BY_INDEX: - isa_ = new LayeredSampledISA(isa_sampling_rate, - isa_sampling_rate * sampling_range, npa_, - sa_stream, input_size_, s_allocator); - break; - case SamplingScheme::OPPORTUNISTIC_LAYERED_SAMPLE_BY_INDEX: - isa_ = new OpportunisticLayeredSampledISA( - isa_sampling_rate, isa_sampling_rate * sampling_range, npa_, - sa_stream, input_size_, s_allocator); - break; - default:isa_ = nullptr; - } - sa_stream.Reset(); - assert(isa_ != nullptr); + alphabet_ = new char[alphabet_size_ + 1]; + for (auto alphabet_entry : alphabet_map_) { + alphabet_[alphabet_entry.second.second] = alphabet_entry.first; + } + + // Write Inverse Suffix Array to file + //SuccinctUtils::WriteToFile(lISA, input_size_, isa_file); + ArrayInput isa_array(lISA); + //ArrayStream isa_stream(isa_file); + + // Compact input data (if needed) + Bitmap *data_bitmap = nullptr; + if (npa_encoding_scheme == NPA::NPAEncodingScheme::WAVELET_TREE_ENCODED) { + data_bitmap = new Bitmap; + int sigma_bits = SuccinctUtils::IntegerLog2(alphabet_size_ + 1); + InitBitmap(&data_bitmap, input_size_ * sigma_bits, s_allocator); + for (uint64_t i = 0; i < input_size_; i++) { + SetBitmapArray(&data_bitmap, i, alphabet_map_[input[i]].second, + sigma_bits); + } + } + s_allocator.s_free(input); + + switch (npa_encoding_scheme) { + case NPA::NPAEncodingScheme::ELIAS_GAMMA_ENCODED: { + npa_ = new EliasGammaEncodedNPA(input_size_, alphabet_size_, context_len, + npa_sampling_rate, lISA, col_offsets, + s_allocator); + break; + } + // case NPA::NPAEncodingScheme::ELIAS_DELTA_ENCODED: { + // npa_ = new EliasDeltaEncodedNPA(input_size_, alphabet_size_, context_len, + // npa_sampling_rate, isa_file, col_offsets, + // npa_file, s_allocator); + // return; + // } + // case NPA::NPAEncodingScheme::WAVELET_TREE_ENCODED: { + // //isa_stream.CloseAndRemove(); + // Bitmap *compactSA = ReadAsBitmap(input_size_, bits, s_allocator, sa_file); + // Bitmap *compactISA = ReadAsBitmap(input_size_, bits, s_allocator, + // isa_file); + // npa_ = new WaveletTreeEncodedNPA(input_size_, alphabet_size_, context_len, + // npa_sampling_rate, data_bitmap, + // compactSA, compactISA, s_allocator); + // DestroyBitmap(&data_bitmap, s_allocator); + // break; + // } + // default:npa_ = nullptr; + } + // assert(npa_ != nullptr); + + // switch (sa_sampling_scheme) { + // case SamplingScheme::FLAT_SAMPLE_BY_INDEX: + // sa_ = new SampledByIndexSA(sa_sampling_rate, npa_, sa_stream, input_size_, + // s_allocator); + // break; + // case SamplingScheme::FLAT_SAMPLE_BY_VALUE: + // sa_ = new SampledByValueSA(sa_sampling_rate, npa_, sa_stream, input_size_, + // s_allocator); + // break; + // case SamplingScheme::LAYERED_SAMPLE_BY_INDEX: + // sa_ = new LayeredSampledSA(sa_sampling_rate, + // sa_sampling_rate * sampling_range, npa_, + // sa_stream, input_size_, s_allocator); + // break; + // case SamplingScheme::OPPORTUNISTIC_LAYERED_SAMPLE_BY_INDEX: + // sa_ = new OpportunisticLayeredSampledSA(sa_sampling_rate, + // sa_sampling_rate * sampling_range, + // npa_, sa_stream, input_size_, + // s_allocator); + // break; + // default:sa_ = nullptr; + // } + // sa_stream.Reset(); + // assert(sa_ != nullptr); + + // switch (isa_sampling_scheme) { + // case SamplingScheme::FLAT_SAMPLE_BY_INDEX: + // isa_ = new SampledByIndexISA(isa_sampling_rate, npa_, sa_stream, + // input_size_, s_allocator); + // break; + // case SamplingScheme::FLAT_SAMPLE_BY_VALUE:assert(sa_->GetSamplingScheme() == SamplingScheme::FLAT_SAMPLE_BY_VALUE); + // isa_ = new SampledByValueISA( + // sa_sampling_rate, npa_, sa_stream, input_size_, + // ((SampledByValueSA *) sa_)->GetSampledPositions(), s_allocator); + // break; + // case SamplingScheme::LAYERED_SAMPLE_BY_INDEX: + // isa_ = new LayeredSampledISA(isa_sampling_rate, + // isa_sampling_rate * sampling_range, npa_, + // sa_stream, input_size_, s_allocator); + // break; + // case SamplingScheme::OPPORTUNISTIC_LAYERED_SAMPLE_BY_INDEX: + // isa_ = new OpportunisticLayeredSampledISA( + // isa_sampling_rate, isa_sampling_rate * sampling_range, npa_, + // sa_stream, input_size_, s_allocator); + // break; + // default:isa_ = nullptr; + // } + // sa_stream.Reset(); + // assert(isa_ != nullptr); + + //sa_stream.CloseAndRemove(); + + s_allocator.s_free(lISA); + s_allocator.s_free(lSA); + } - sa_stream.CloseAndRemove(); } /* Lookup functions for each of the core data structures */ From 9b15eaf322b25ac034028e0ce16e24231e17656b Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Wed, 24 Jun 2020 20:44:29 -0400 Subject: [PATCH 29/39] Changed code to construct in memory arrays instead of files on disk (2) --- core/include/npa/elias_delta_encoded_npa.h | 6 + .../sampledarray/layered_sampled_isa.h | 5 + .../include/sampledarray/layered_sampled_sa.h | 6 + .../opportunistic_layered_sampled_isa.h | 6 + .../opportunistic_layered_sampled_sa.h | 6 + core/include/sampledarray/sampled_array.h | 1 + .../sampledarray/sampled_by_index_isa.h | 4 + .../sampledarray/sampled_by_index_sa.h | 4 + .../sampledarray/sampled_by_value_isa.h | 6 + .../sampledarray/sampled_by_value_sa.h | 5 + core/include/succinct_core.h | 16 +++ core/src/npa/elias_delta_encoded_npa.cc | 12 ++ core/src/sampledarray/layered_sampled_isa.cc | 22 ++++ core/src/sampledarray/layered_sampled_sa.cc | 22 ++++ .../opportunistic_layered_sampled_isa.cc | 21 +++ .../opportunistic_layered_sampled_sa.cc | 22 ++++ core/src/sampledarray/sampled_by_index_isa.cc | 29 +++++ core/src/sampledarray/sampled_by_index_sa.cc | 30 +++++ core/src/sampledarray/sampled_by_value_isa.cc | 34 +++++ core/src/sampledarray/sampled_by_value_sa.cc | 40 ++++++ core/src/succinct_core.cc | 122 +++++++++--------- pysuccinct/file.cpp | 2 +- 22 files changed, 362 insertions(+), 59 deletions(-) diff --git a/core/include/npa/elias_delta_encoded_npa.h b/core/include/npa/elias_delta_encoded_npa.h index 344e077..2a5094c 100644 --- a/core/include/npa/elias_delta_encoded_npa.h +++ b/core/include/npa/elias_delta_encoded_npa.h @@ -21,6 +21,12 @@ class EliasDeltaEncodedNPA : public DeltaEncodedNPA { std::vector& col_offsets, std::string npa_file, SuccinctAllocator &s_allocator); + EliasDeltaEncodedNPA(uint64_t npa_size, uint64_t sigma_size, + uint32_t context_len, uint32_t sampling_rate, + int64_t* lISA, + std::vector& col_offsets, + SuccinctAllocator &s_allocator); + EliasDeltaEncodedNPA(uint32_t context_len, uint32_t sampling_rate, SuccinctAllocator &s_allocator); diff --git a/core/include/sampledarray/layered_sampled_isa.h b/core/include/sampledarray/layered_sampled_isa.h index 1a98f66..ad36871 100644 --- a/core/include/sampledarray/layered_sampled_isa.h +++ b/core/include/sampledarray/layered_sampled_isa.h @@ -9,6 +9,10 @@ class LayeredSampledISA : public LayeredSampledArray { NPA *npa, ArrayStream& sa_stream, uint64_t sa_n, SuccinctAllocator &s_allocator); + LayeredSampledISA(uint32_t target_sampling_rate, uint32_t base_sampling_rate, + NPA *npa, ArrayInput& sa_array, uint64_t sa_n, + SuccinctAllocator &s_allocator); + LayeredSampledISA(uint32_t target_sampling_rate, uint32_t base_sampling_rate, NPA *npa, SuccinctAllocator &s_allocator); @@ -19,6 +23,7 @@ class LayeredSampledISA : public LayeredSampledArray { protected: void SampleLayered(ArrayStream& sa_stream, uint64_t n); + void SampleLayeredInMem(ArrayInput& sa_array, uint64_t n); NPA *npa; }; diff --git a/core/include/sampledarray/layered_sampled_sa.h b/core/include/sampledarray/layered_sampled_sa.h index 382f644..a511501 100644 --- a/core/include/sampledarray/layered_sampled_sa.h +++ b/core/include/sampledarray/layered_sampled_sa.h @@ -9,6 +9,10 @@ class LayeredSampledSA : public LayeredSampledArray { NPA *npa, ArrayStream& sa_stream, uint64_t sa_n, SuccinctAllocator &s_allocator); + LayeredSampledSA(uint32_t target_sampling_rate, uint32_t base_sampling_rate, + NPA *npa, ArrayInput& sa_array, uint64_t sa_n, + SuccinctAllocator &s_allocator); + LayeredSampledSA(uint32_t target_sampling_rate, uint32_t base_sampling_rate, NPA *npa, SuccinctAllocator &s_allocator); @@ -20,6 +24,8 @@ class LayeredSampledSA : public LayeredSampledArray { protected: void SampleLayered(ArrayStream& sa_stream, uint64_t n); + void SampleLayeredInMem(ArrayInput& sa_array, uint64_t n); + NPA *npa; }; diff --git a/core/include/sampledarray/opportunistic_layered_sampled_isa.h b/core/include/sampledarray/opportunistic_layered_sampled_isa.h index d3f8606..615707d 100644 --- a/core/include/sampledarray/opportunistic_layered_sampled_isa.h +++ b/core/include/sampledarray/opportunistic_layered_sampled_isa.h @@ -10,6 +10,11 @@ class OpportunisticLayeredSampledISA : public OpportunisticLayeredSampledArray { ArrayStream& sa_stream, uint64_t sa_n, SuccinctAllocator &s_allocator); + OpportunisticLayeredSampledISA(uint32_t target_sampling_rate, + uint32_t base_sampling_rate, NPA *npa, + ArrayInput& sa_array, uint64_t sa_n, + SuccinctAllocator &s_allocator); + OpportunisticLayeredSampledISA(uint32_t target_sampling_rate, uint32_t base_sampling_rate, NPA *npa, SuccinctAllocator &s_allocator); @@ -19,6 +24,7 @@ class OpportunisticLayeredSampledISA : public OpportunisticLayeredSampledArray { protected: void SampleLayered(ArrayStream& sa_stream, uint64_t n); + void SampleLayeredInMem(ArrayInput& sa_array, uint64_t n); NPA *npa_; }; diff --git a/core/include/sampledarray/opportunistic_layered_sampled_sa.h b/core/include/sampledarray/opportunistic_layered_sampled_sa.h index db4b0cb..ec9b3fc 100644 --- a/core/include/sampledarray/opportunistic_layered_sampled_sa.h +++ b/core/include/sampledarray/opportunistic_layered_sampled_sa.h @@ -10,6 +10,11 @@ class OpportunisticLayeredSampledSA : public OpportunisticLayeredSampledArray { ArrayStream& sa_stream, uint64_t sa_n, SuccinctAllocator &s_allocator); + OpportunisticLayeredSampledSA(uint32_t target_sampling_rate, + uint32_t base_sampling_rate, NPA *npa, + ArrayInput& sa_array, uint64_t sa_n, + SuccinctAllocator &s_allocator); + OpportunisticLayeredSampledSA(uint32_t target_sampling_rate, uint32_t base_sampling_rate, NPA *npa, SuccinctAllocator &s_allocator); @@ -21,6 +26,7 @@ class OpportunisticLayeredSampledSA : public OpportunisticLayeredSampledArray { NPA *npa; void SampleLayered(ArrayStream& sa_stream, uint64_t n); + void SampleLayeredInMem(ArrayInput& sa_array, uint64_t n); }; #endif diff --git a/core/include/sampledarray/sampled_array.h b/core/include/sampledarray/sampled_array.h index df18ca6..a56d92b 100644 --- a/core/include/sampledarray/sampled_array.h +++ b/core/include/sampledarray/sampled_array.h @@ -3,6 +3,7 @@ #include "sampling_scheme.h" #include "utils/array_stream.h" +#include "utils/array_input.h" class SampledArray { public: diff --git a/core/include/sampledarray/sampled_by_index_isa.h b/core/include/sampledarray/sampled_by_index_isa.h index a40ce49..99fa8fa 100644 --- a/core/include/sampledarray/sampled_by_index_isa.h +++ b/core/include/sampledarray/sampled_by_index_isa.h @@ -8,6 +8,9 @@ class SampledByIndexISA : public FlatSampledArray { // Constructor SampledByIndexISA(uint32_t sampling_rate, NPA *npa, ArrayStream& sa_stream, uint64_t sa_n, SuccinctAllocator &s_allocator); + + SampledByIndexISA(uint32_t sampling_rate, NPA *npa, ArrayInput& sa_array, + uint64_t sa_n, SuccinctAllocator &s_allocator); SampledByIndexISA(uint32_t sampling_rate, NPA *npa, SuccinctAllocator &s_allocator); @@ -18,6 +21,7 @@ class SampledByIndexISA : public FlatSampledArray { protected: // Sample by index for ISA using original SA virtual void Sample(ArrayStream& original, uint64_t n); + virtual void SampleInMem(ArrayInput& original, uint64_t n); }; #endif diff --git a/core/include/sampledarray/sampled_by_index_sa.h b/core/include/sampledarray/sampled_by_index_sa.h index f7ab941..ab38fa9 100644 --- a/core/include/sampledarray/sampled_by_index_sa.h +++ b/core/include/sampledarray/sampled_by_index_sa.h @@ -9,6 +9,9 @@ class SampledByIndexSA : public FlatSampledArray { SampledByIndexSA(uint32_t sampling_rate, NPA *npa, ArrayStream& sa_stream, uint64_t sa_n, SuccinctAllocator &s_allocator); + SampledByIndexSA(uint32_t sampling_rate, NPA *npa, ArrayInput& sa_array, + uint64_t sa_n, SuccinctAllocator &s_allocator); + SampledByIndexSA(uint32_t sampling_rate, NPA *npa, SuccinctAllocator &s_allocator); @@ -18,6 +21,7 @@ class SampledByIndexSA : public FlatSampledArray { protected: // Sample original SA by index virtual void Sample(ArrayStream& sa_stream, uint64_t n); + virtual void SampleInMem(ArrayInput& sa_array, uint64_t n); }; #endif diff --git a/core/include/sampledarray/sampled_by_value_isa.h b/core/include/sampledarray/sampled_by_value_isa.h index 24b0cd3..729cd13 100644 --- a/core/include/sampledarray/sampled_by_value_isa.h +++ b/core/include/sampledarray/sampled_by_value_isa.h @@ -10,6 +10,10 @@ class SampledByValueISA : public FlatSampledArray { uint64_t sa_n, Dictionary *d_bpos, SuccinctAllocator &s_allocator); + SampledByValueISA(uint32_t sampling_rate, NPA *npa, ArrayInput& sa_array, + uint64_t sa_n, Dictionary *d_bpos, + SuccinctAllocator &s_allocator); + SampledByValueISA(uint32_t sampling_rate, NPA *npa, SuccinctAllocator &s_allocator); @@ -26,6 +30,8 @@ class SampledByValueISA : public FlatSampledArray { // Sample by value for ISA using original SA virtual void Sample(ArrayStream& sa_stream, uint64_t n); + virtual void SampleInMem(ArrayInput& sa_array, uint64_t n); + private: Dictionary *sampled_positions_; }; diff --git a/core/include/sampledarray/sampled_by_value_sa.h b/core/include/sampledarray/sampled_by_value_sa.h index 4a21e88..9d0f500 100644 --- a/core/include/sampledarray/sampled_by_value_sa.h +++ b/core/include/sampledarray/sampled_by_value_sa.h @@ -8,6 +8,9 @@ class SampledByValueSA : public FlatSampledArray { // Constructor SampledByValueSA(uint32_t sampling_rate, NPA *npa, ArrayStream& sa_stream, uint64_t sa_n, SuccinctAllocator &s_allocator); + + SampledByValueSA(uint32_t sampling_rate, NPA *npa, ArrayInput& sa_array, + uint64_t sa_n, SuccinctAllocator &s_allocator); SampledByValueSA(uint32_t sampling_rate, NPA *npa, SuccinctAllocator &s_allocator); @@ -25,6 +28,8 @@ class SampledByValueSA : public FlatSampledArray { // Sample original SA by value virtual void Sample(ArrayStream& original, uint64_t n); + virtual void SampleInMem(ArrayInput& original, uint64_t n); + // Check if index is sampled bool IsSampled(uint64_t i); diff --git a/core/include/succinct_core.h b/core/include/succinct_core.h index 4e7688f..ef304ee 100644 --- a/core/include/succinct_core.h +++ b/core/include/succinct_core.h @@ -3,6 +3,7 @@ #include #include +#include #include "npa/elias_delta_encoded_npa.h" #include "npa/elias_gamma_encoded_npa.h" @@ -217,6 +218,21 @@ class SuccinctCore : public SuccinctBase { // } // return false; // } + static Bitmap* ArrayToBitmap(size_t size, uint8_t bits, + SuccinctAllocator& s_allocator, + int64_t* array_input) { + Bitmap* B = new Bitmap; + InitBitmap(&B, size * bits, s_allocator); + //std::ifstream in(infile); + for (uint64_t i = 0; i < size; i++) { + uint64_t val; + + //in.read(reinterpret_cast(&val), size * sizeof(uint64_t)); + SetBitmapArray(&B, i, val, bits); + } + return B; + } + static Bitmap* ReadAsBitmap(size_t size, uint8_t bits, SuccinctAllocator& s_allocator, diff --git a/core/src/npa/elias_delta_encoded_npa.cc b/core/src/npa/elias_delta_encoded_npa.cc index 5835368..f1bf9fe 100644 --- a/core/src/npa/elias_delta_encoded_npa.cc +++ b/core/src/npa/elias_delta_encoded_npa.cc @@ -13,6 +13,18 @@ EliasDeltaEncodedNPA::EliasDeltaEncodedNPA(uint64_t npa_size, Encode(isa_file, col_offsets, npa_file); } +EliasDeltaEncodedNPA::EliasDeltaEncodedNPA(uint64_t npa_size, + uint64_t sigma_size, + uint32_t context_len, + uint32_t sampling_rate, + int64_t* lISA, + std::vector& col_offsets, + SuccinctAllocator &s_allocator) + : DeltaEncodedNPA(npa_size, sigma_size, context_len, sampling_rate, + NPAEncodingScheme::ELIAS_DELTA_ENCODED, s_allocator) { + EncodeInMem(lISA, col_offsets); +} + EliasDeltaEncodedNPA::EliasDeltaEncodedNPA(uint32_t context_len, uint32_t sampling_rates, SuccinctAllocator &s_allocator) diff --git a/core/src/sampledarray/layered_sampled_isa.cc b/core/src/sampledarray/layered_sampled_isa.cc index 5b536ab..283ff01 100644 --- a/core/src/sampledarray/layered_sampled_isa.cc +++ b/core/src/sampledarray/layered_sampled_isa.cc @@ -10,6 +10,16 @@ LayeredSampledISA::LayeredSampledISA(uint32_t target_sampling_rate, SampleLayered(sa_stream, sa_n); } +LayeredSampledISA::LayeredSampledISA(uint32_t target_sampling_rate, + uint32_t base_sampling_rate, NPA *npa, + ArrayInput& sa_array, uint64_t sa_n, + SuccinctAllocator &s_allocator) + : LayeredSampledArray(target_sampling_rate, base_sampling_rate, sa_n, + s_allocator) { + this->npa = npa; + SampleLayeredInMem(sa_array, sa_n); +} + LayeredSampledISA::LayeredSampledISA(uint32_t target_sampling_rate, uint32_t base_sampling_rate, NPA *npa, SuccinctAllocator &s_allocator) @@ -17,6 +27,18 @@ LayeredSampledISA::LayeredSampledISA(uint32_t target_sampling_rate, this->npa = npa; } +void LayeredSampledISA::SampleLayeredInMem(ArrayInput& sa_array, uint64_t n) { + for (uint64_t i = 0; i < n; i++) { + uint64_t sa_val = sa_array.Get(); + if (sa_val % target_sampling_rate_ == 0) { + Layer l; + GetLayer(&l, sa_val); + bitmap_t *data = layer_data_[l.layer_id]; + SuccinctBase::SetBitmapArray(&data, l.layer_idx, i, data_bits_); + } + } +} + void LayeredSampledISA::SampleLayered(ArrayStream& sa_stream, uint64_t n) { for (uint64_t i = 0; i < n; i++) { uint64_t sa_val = sa_stream.Get(); diff --git a/core/src/sampledarray/layered_sampled_sa.cc b/core/src/sampledarray/layered_sampled_sa.cc index 1c1222e..4468d66 100644 --- a/core/src/sampledarray/layered_sampled_sa.cc +++ b/core/src/sampledarray/layered_sampled_sa.cc @@ -10,6 +10,16 @@ LayeredSampledSA::LayeredSampledSA(uint32_t target_sampling_rate, SampleLayered(sa_stream, sa_n); } +LayeredSampledSA::LayeredSampledSA(uint32_t target_sampling_rate, + uint32_t base_sampling_rate, NPA *npa, + ArrayInput& sa_array, uint64_t sa_n, + SuccinctAllocator &s_allocator) + : LayeredSampledArray(target_sampling_rate, base_sampling_rate, sa_n, + s_allocator) { + this->npa = npa; + SampleLayeredInMem(sa_array, sa_n); +} + LayeredSampledSA::LayeredSampledSA(uint32_t target_sampling_rate, uint32_t base_sampling_rate, NPA *npa, SuccinctAllocator &s_allocator) @@ -17,6 +27,18 @@ LayeredSampledSA::LayeredSampledSA(uint32_t target_sampling_rate, this->npa = npa; } +void LayeredSampledSA::SampleLayeredInMem(ArrayInput& sa_array, uint64_t n) { + for (uint64_t i = 0; i < n; i++) { + uint64_t sa_val = sa_array.Get(); + if (i % target_sampling_rate_ == 0) { + Layer l; + GetLayer(&l, i); + bitmap_t *data = layer_data_[l.layer_id]; + SuccinctBase::SetBitmapArray(&data, l.layer_idx, sa_val, data_bits_); + } + } +} + void LayeredSampledSA::SampleLayered(ArrayStream& sa_stream, uint64_t n) { for (uint64_t i = 0; i < n; i++) { uint64_t sa_val = sa_stream.Get(); diff --git a/core/src/sampledarray/opportunistic_layered_sampled_isa.cc b/core/src/sampledarray/opportunistic_layered_sampled_isa.cc index 0ddcbd0..fc72854 100644 --- a/core/src/sampledarray/opportunistic_layered_sampled_isa.cc +++ b/core/src/sampledarray/opportunistic_layered_sampled_isa.cc @@ -9,6 +9,15 @@ OpportunisticLayeredSampledISA::OpportunisticLayeredSampledISA( SampleLayered(sa_stream, sa_n); } +OpportunisticLayeredSampledISA::OpportunisticLayeredSampledISA( + uint32_t target_sampling_rate, uint32_t base_sampling_rate, NPA *npa, + ArrayInput& sa_array, uint64_t sa_n, SuccinctAllocator &s_allocator) + : OpportunisticLayeredSampledArray(target_sampling_rate, base_sampling_rate, + sa_n, s_allocator) { + this->npa_ = npa; + SampleLayeredInMem(sa_array, sa_n); +} + OpportunisticLayeredSampledISA::OpportunisticLayeredSampledISA( uint32_t target_sampling_rate, uint32_t base_sampling_rate, NPA *npa, SuccinctAllocator &s_allocator) @@ -17,6 +26,18 @@ OpportunisticLayeredSampledISA::OpportunisticLayeredSampledISA( this->npa_ = npa; } +void OpportunisticLayeredSampledISA::SampleLayeredInMem(ArrayInput& sa_array, uint64_t n) { + for (uint64_t i = 0; i < n; i++) { + uint64_t sa_val = sa_array.Get(); + if (sa_val % target_sampling_rate_ == 0) { + Layer l; + GetLayer(&l, sa_val); + bitmap_t *data = layer_data_[l.layer_id]; + SuccinctBase::SetBitmapArray(&data, l.layer_idx, i, data_bits_); + } + } +} + void OpportunisticLayeredSampledISA::SampleLayered(ArrayStream& sa_stream, uint64_t n) { for (uint64_t i = 0; i < n; i++) { uint64_t sa_val = sa_stream.Get(); diff --git a/core/src/sampledarray/opportunistic_layered_sampled_sa.cc b/core/src/sampledarray/opportunistic_layered_sampled_sa.cc index 550f490..bd64f98 100644 --- a/core/src/sampledarray/opportunistic_layered_sampled_sa.cc +++ b/core/src/sampledarray/opportunistic_layered_sampled_sa.cc @@ -9,6 +9,15 @@ OpportunisticLayeredSampledSA::OpportunisticLayeredSampledSA( SampleLayered(sa_stream, sa_n); } +OpportunisticLayeredSampledSA::OpportunisticLayeredSampledSA( + uint32_t target_sampling_rate, uint32_t base_sampling_rate, NPA *npa, + ArrayInput& sa_array, uint64_t sa_n, SuccinctAllocator &s_allocator) + : OpportunisticLayeredSampledArray(target_sampling_rate, base_sampling_rate, + sa_n, s_allocator) { + this->npa = npa; + SampleLayeredInMem(sa_array, sa_n); +} + OpportunisticLayeredSampledSA::OpportunisticLayeredSampledSA( uint32_t target_sampling_rate, uint32_t base_sampling_rate, NPA *npa, SuccinctAllocator &s_allocator) @@ -30,6 +39,19 @@ void OpportunisticLayeredSampledSA::SampleLayered(ArrayStream& sa_stream, } } +void OpportunisticLayeredSampledSA::SampleLayeredInMem(ArrayInput& sa_array, + uint64_t n) { + for (uint64_t i = 0; i < n; i++) { + uint64_t sa_val = sa_array.Get(); + if (i % target_sampling_rate_ == 0) { + Layer l; + GetLayer(&l, i); + bitmap_t *data = layer_data_[l.layer_id]; + SuccinctBase::SetBitmapArray(&data, l.layer_idx, sa_val, data_bits_); + } + } +} + uint64_t OpportunisticLayeredSampledSA::operator[](uint64_t i) { assert(i < original_size_); diff --git a/core/src/sampledarray/sampled_by_index_isa.cc b/core/src/sampledarray/sampled_by_index_isa.cc index c5ea02a..8c3d7e5 100644 --- a/core/src/sampledarray/sampled_by_index_isa.cc +++ b/core/src/sampledarray/sampled_by_index_isa.cc @@ -11,6 +11,17 @@ SampledByIndexISA::SampledByIndexISA(uint32_t sampling_rate, NPA *npa, } +SampledByIndexISA::SampledByIndexISA(uint32_t sampling_rate, NPA *npa, + ArrayInput& sa_array, uint64_t sa_n, + SuccinctAllocator &s_allocator) + : FlatSampledArray(sampling_rate, SamplingScheme::FLAT_SAMPLE_BY_INDEX, npa, + s_allocator) { + + this->original_size_ = sa_n; + SampleInMem(sa_array, sa_n); + +} + SampledByIndexISA::SampledByIndexISA(uint32_t sampling_rate, NPA *npa, SuccinctAllocator &s_allocator) : FlatSampledArray(sampling_rate, SamplingScheme::FLAT_SAMPLE_BY_INDEX, npa, @@ -41,6 +52,24 @@ void SampledByIndexISA::Sample(ArrayStream& sa_stream, uint64_t n) { } } +void SampledByIndexISA::SampleInMem(ArrayInput& sa_array, uint64_t n) { + + data_bits_ = SuccinctUtils::IntegerLog2(n + 1); + data_size_ = (n / sampling_rate_) + 1; + + data_ = new bitmap_t; + SuccinctBase::InitBitmap(&data_, data_size_ * data_bits_, + succinct_allocator_); + + for (uint64_t i = 0; i < n; i++) { + uint64_t sa_val = sa_array.Get(); + if (sa_val % sampling_rate_ == 0) { + SuccinctBase::SetBitmapArray(&data_, (sa_val / sampling_rate_), i, + data_bits_); + } + } +} + uint64_t SampledByIndexISA::operator [](uint64_t i) { assert(i < original_size_); diff --git a/core/src/sampledarray/sampled_by_index_sa.cc b/core/src/sampledarray/sampled_by_index_sa.cc index f3da3da..4ec4f64 100644 --- a/core/src/sampledarray/sampled_by_index_sa.cc +++ b/core/src/sampledarray/sampled_by_index_sa.cc @@ -11,6 +11,17 @@ SampledByIndexSA::SampledByIndexSA(uint32_t sampling_rate, NPA *npa, } +SampledByIndexSA::SampledByIndexSA(uint32_t sampling_rate, NPA *npa, + ArrayInput& sa_array, uint64_t sa_n, + SuccinctAllocator &s_allocator) + : FlatSampledArray(sampling_rate, SamplingScheme::FLAT_SAMPLE_BY_INDEX, npa, + s_allocator) { + + this->original_size_ = sa_n; + SampleInMem(sa_array, sa_n); + +} + SampledByIndexSA::SampledByIndexSA(uint32_t sampling_rate, NPA *npa, SuccinctAllocator &s_allocator) : FlatSampledArray(sampling_rate, SamplingScheme::FLAT_SAMPLE_BY_INDEX, npa, @@ -23,6 +34,25 @@ SampledByIndexSA::SampledByIndexSA(uint32_t sampling_rate, NPA *npa, } +void SampledByIndexSA::SampleInMem(ArrayInput& sa_array, uint64_t n) { + + data_bits_ = SuccinctUtils::IntegerLog2(n + 1); + data_size_ = (n / sampling_rate_) + 1; + + data_ = new bitmap_t; + SuccinctBase::InitBitmap(&data_, data_size_ * data_bits_, + succinct_allocator_); + + for (uint64_t i = 0; i < n; i++) { + uint64_t sa_val = sa_array.Get(); + if (i % sampling_rate_ == 0) { + SuccinctBase::SetBitmapArray(&data_, (i / sampling_rate_), sa_val, + data_bits_); + } + } +} + + void SampledByIndexSA::Sample(ArrayStream& sa_stream, uint64_t n) { data_bits_ = SuccinctUtils::IntegerLog2(n + 1); diff --git a/core/src/sampledarray/sampled_by_value_isa.cc b/core/src/sampledarray/sampled_by_value_isa.cc index 050c39c..2567909 100644 --- a/core/src/sampledarray/sampled_by_value_isa.cc +++ b/core/src/sampledarray/sampled_by_value_isa.cc @@ -14,6 +14,20 @@ SampledByValueISA::SampledByValueISA(uint32_t sampling_rate, NPA *npa, Sample(sa_stream, sa_n); } +SampledByValueISA::SampledByValueISA(uint32_t sampling_rate, NPA *npa, + ArrayInput& sa_input, uint64_t sa_n, + Dictionary *d_bpos, + SuccinctAllocator &s_allocator) + : FlatSampledArray(sampling_rate, SamplingScheme::FLAT_SAMPLE_BY_VALUE, npa, + s_allocator) { + + assert(ISPOWOF2(sampling_rate)); + + this->sampled_positions_ = d_bpos; + this->original_size_ = sa_n; + SampleInMem(sa_input, sa_n); +} + SampledByValueISA::SampledByValueISA(uint32_t sampling_rate, NPA *npa, SuccinctAllocator &s_allocator) : FlatSampledArray(sampling_rate, SamplingScheme::FLAT_SAMPLE_BY_VALUE, npa, @@ -29,6 +43,26 @@ SampledByValueISA::SampledByValueISA(uint32_t sampling_rate, NPA *npa, } +void SampledByValueISA::SampleInMem(ArrayInput& sa_array, uint64_t n) { + data_size_ = (n / sampling_rate_) + 1; + data_bits_ = SuccinctUtils::IntegerLog2(data_size_ + 1); + uint64_t sa_val, pos = 0; + uint32_t orig_bits = SuccinctUtils::IntegerLog2(n + 1); + + data_ = new bitmap_t; + SuccinctBase::InitBitmap(&data_, data_size_ * data_bits_, + succinct_allocator_); + + for (uint64_t i = 0; i < n; i++) { + sa_val = sa_array.Get(); + if (sa_val % sampling_rate_ == 0) { + SuccinctBase::SetBitmapArray(&data_, sa_val / sampling_rate_, pos++, + data_bits_); + } + } + +} + void SampledByValueISA::Sample(ArrayStream& sa_stream, uint64_t n) { data_size_ = (n / sampling_rate_) + 1; data_bits_ = SuccinctUtils::IntegerLog2(data_size_ + 1); diff --git a/core/src/sampledarray/sampled_by_value_sa.cc b/core/src/sampledarray/sampled_by_value_sa.cc index 1484d98..37698d1 100644 --- a/core/src/sampledarray/sampled_by_value_sa.cc +++ b/core/src/sampledarray/sampled_by_value_sa.cc @@ -13,6 +13,19 @@ SampledByValueSA::SampledByValueSA(uint32_t sampling_rate, NPA *npa, Sample(sa_stream, sa_n); } +SampledByValueSA::SampledByValueSA(uint32_t sampling_rate, NPA *npa, + ArrayInput& sa_array, uint64_t sa_n, + SuccinctAllocator &s_allocator) + : FlatSampledArray(sampling_rate, SamplingScheme::FLAT_SAMPLE_BY_VALUE, npa, + s_allocator) { + + assert(ISPOWOF2(sampling_rate)); + + this->sampled_positions_ = NULL; + this->original_size_ = sa_n; + SampleInMem(sa_array, sa_n); +} + SampledByValueSA::SampledByValueSA(uint32_t sampling_rate, NPA *npa, SuccinctAllocator &s_allocator) : FlatSampledArray(sampling_rate, SamplingScheme::FLAT_SAMPLE_BY_VALUE, npa, @@ -28,6 +41,33 @@ SampledByValueSA::SampledByValueSA(uint32_t sampling_rate, NPA *npa, } +void SampledByValueSA::SampleInMem(ArrayInput& sa_array, uint64_t n) { + data_size_ = (n / sampling_rate_) + 1; + data_bits_ = SuccinctUtils::IntegerLog2(data_size_ + 1); + uint64_t sa_val, pos = 0; + uint32_t orig_bits = SuccinctUtils::IntegerLog2(n + 1); + + bitmap_t *BPos = new bitmap_t; + data_ = new bitmap_t; + SuccinctBase::InitBitmap(&data_, data_size_ * data_bits_, + succinct_allocator_); + + SuccinctBase::InitBitmap(&BPos, n, succinct_allocator_); + + for (uint64_t i = 0; i < n; i++) { + sa_val = sa_array.Get(); + if (sa_val % sampling_rate_ == 0) { + SuccinctBase::SetBitmapArray(&data_, pos++, sa_val / sampling_rate_, + data_bits_); + SETBITVAL(BPos, i); + } + } + + sampled_positions_ = new Dictionary; + SuccinctBase::CreateDictionary(BPos, sampled_positions_, succinct_allocator_); + SuccinctBase::DestroyBitmap(&BPos, succinct_allocator_); +} + void SampledByValueSA::Sample(ArrayStream& sa_stream, uint64_t n) { data_size_ = (n / sampling_rate_) + 1; data_bits_ = SuccinctUtils::IntegerLog2(data_size_ + 1); diff --git a/core/src/succinct_core.cc b/core/src/succinct_core.cc index 8f7c21a..fc22578 100644 --- a/core/src/succinct_core.cc +++ b/core/src/succinct_core.cc @@ -140,6 +140,8 @@ void SuccinctCore::ConstructFromContent(const std::string &input, } data[fsize] = 1; + std::cout << "ConstructFromContent Line 143" << std::endl; + //Construct in memory Construct(1, data, fsize + 1, sa_sampling_rate, isa_sampling_rate, npa_sampling_rate, context_len, sa_sampling_scheme, @@ -183,6 +185,7 @@ void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, uint32_t sampling_range) { if (in_mem == 0){ + std::cout << "Storing data on disk" << std::endl; //USE FILES ON DISK TO STORE ARRAYS std::string sa_file = ".tmp.sa"; @@ -336,6 +339,7 @@ void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, sa_stream.CloseAndRemove(); } else { + std::cout << "Storing data in memory" << std::endl; //USE MEMORY TO STORE ARRAYS // Save metadata @@ -411,12 +415,14 @@ void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, s_allocator); break; } - // case NPA::NPAEncodingScheme::ELIAS_DELTA_ENCODED: { - // npa_ = new EliasDeltaEncodedNPA(input_size_, alphabet_size_, context_len, - // npa_sampling_rate, isa_file, col_offsets, - // npa_file, s_allocator); - // return; - // } + case NPA::NPAEncodingScheme::ELIAS_DELTA_ENCODED: { + npa_ = new EliasDeltaEncodedNPA(input_size_, alphabet_size_, context_len, + npa_sampling_rate, lISA, col_offsets, + s_allocator); + return; + } + // We'll deal with bitmap later + // case NPA::NPAEncodingScheme::WAVELET_TREE_ENCODED: { // //isa_stream.CloseAndRemove(); // Bitmap *compactSA = ReadAsBitmap(input_size_, bits, s_allocator, sa_file); @@ -428,59 +434,59 @@ void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, // DestroyBitmap(&data_bitmap, s_allocator); // break; // } - // default:npa_ = nullptr; + default:npa_ = nullptr; + } + assert(npa_ != nullptr); + + switch (sa_sampling_scheme) { + case SamplingScheme::FLAT_SAMPLE_BY_INDEX: + sa_ = new SampledByIndexSA(sa_sampling_rate, npa_, sa_array, input_size_, + s_allocator); + break; + case SamplingScheme::FLAT_SAMPLE_BY_VALUE: + sa_ = new SampledByValueSA(sa_sampling_rate, npa_, sa_array, input_size_, + s_allocator); + break; + case SamplingScheme::LAYERED_SAMPLE_BY_INDEX: + sa_ = new LayeredSampledSA(sa_sampling_rate, + sa_sampling_rate * sampling_range, npa_, + sa_array, input_size_, s_allocator); + break; + case SamplingScheme::OPPORTUNISTIC_LAYERED_SAMPLE_BY_INDEX: + sa_ = new OpportunisticLayeredSampledSA(sa_sampling_rate, + sa_sampling_rate * sampling_range, + npa_, sa_array, input_size_, + s_allocator); + break; + default:sa_ = nullptr; } - // assert(npa_ != nullptr); - - // switch (sa_sampling_scheme) { - // case SamplingScheme::FLAT_SAMPLE_BY_INDEX: - // sa_ = new SampledByIndexSA(sa_sampling_rate, npa_, sa_stream, input_size_, - // s_allocator); - // break; - // case SamplingScheme::FLAT_SAMPLE_BY_VALUE: - // sa_ = new SampledByValueSA(sa_sampling_rate, npa_, sa_stream, input_size_, - // s_allocator); - // break; - // case SamplingScheme::LAYERED_SAMPLE_BY_INDEX: - // sa_ = new LayeredSampledSA(sa_sampling_rate, - // sa_sampling_rate * sampling_range, npa_, - // sa_stream, input_size_, s_allocator); - // break; - // case SamplingScheme::OPPORTUNISTIC_LAYERED_SAMPLE_BY_INDEX: - // sa_ = new OpportunisticLayeredSampledSA(sa_sampling_rate, - // sa_sampling_rate * sampling_range, - // npa_, sa_stream, input_size_, - // s_allocator); - // break; - // default:sa_ = nullptr; - // } - // sa_stream.Reset(); - // assert(sa_ != nullptr); - - // switch (isa_sampling_scheme) { - // case SamplingScheme::FLAT_SAMPLE_BY_INDEX: - // isa_ = new SampledByIndexISA(isa_sampling_rate, npa_, sa_stream, - // input_size_, s_allocator); - // break; - // case SamplingScheme::FLAT_SAMPLE_BY_VALUE:assert(sa_->GetSamplingScheme() == SamplingScheme::FLAT_SAMPLE_BY_VALUE); - // isa_ = new SampledByValueISA( - // sa_sampling_rate, npa_, sa_stream, input_size_, - // ((SampledByValueSA *) sa_)->GetSampledPositions(), s_allocator); - // break; - // case SamplingScheme::LAYERED_SAMPLE_BY_INDEX: - // isa_ = new LayeredSampledISA(isa_sampling_rate, - // isa_sampling_rate * sampling_range, npa_, - // sa_stream, input_size_, s_allocator); - // break; - // case SamplingScheme::OPPORTUNISTIC_LAYERED_SAMPLE_BY_INDEX: - // isa_ = new OpportunisticLayeredSampledISA( - // isa_sampling_rate, isa_sampling_rate * sampling_range, npa_, - // sa_stream, input_size_, s_allocator); - // break; - // default:isa_ = nullptr; - // } - // sa_stream.Reset(); - // assert(isa_ != nullptr); + sa_array.Reset(); + assert(sa_ != nullptr); + + switch (isa_sampling_scheme) { + case SamplingScheme::FLAT_SAMPLE_BY_INDEX: + isa_ = new SampledByIndexISA(isa_sampling_rate, npa_, sa_array, + input_size_, s_allocator); + break; + case SamplingScheme::FLAT_SAMPLE_BY_VALUE:assert(sa_->GetSamplingScheme() == SamplingScheme::FLAT_SAMPLE_BY_VALUE); + isa_ = new SampledByValueISA( + sa_sampling_rate, npa_, sa_array, input_size_, + ((SampledByValueSA *) sa_)->GetSampledPositions(), s_allocator); + break; + case SamplingScheme::LAYERED_SAMPLE_BY_INDEX: + isa_ = new LayeredSampledISA(isa_sampling_rate, + isa_sampling_rate * sampling_range, npa_, + sa_array, input_size_, s_allocator); + break; + case SamplingScheme::OPPORTUNISTIC_LAYERED_SAMPLE_BY_INDEX: + isa_ = new OpportunisticLayeredSampledISA( + isa_sampling_rate, isa_sampling_rate * sampling_range, npa_, + sa_array, input_size_, s_allocator); + break; + default:isa_ = nullptr; + } + sa_array.Reset(); + assert(isa_ != nullptr); //sa_stream.CloseAndRemove(); diff --git a/pysuccinct/file.cpp b/pysuccinct/file.cpp index 208c89e..c634eff 100644 --- a/pysuccinct/file.cpp +++ b/pysuccinct/file.cpp @@ -116,7 +116,7 @@ struct File { uint32_t npa_sampling_rate, int sampling_opt, int npa_opt){ s_file_ = nullptr; // Compresses a the data from "input" in memory - std::cout << "Constructing Succinct data structures...\n"; + std::cout << "Constructing Succinct data structures... FROM CONTENT\n"; s_file_ = new SuccinctFile(input, SuccinctMode::CONSTRUCT_FROM_CONTENT, sa_sampling_rate, isa_sampling_rate, From a80a8f9cbee568ad5f515a01b8628cc0642d2a6c Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Tue, 30 Jun 2020 15:18:50 -0400 Subject: [PATCH 30/39] Bug fixed: Array input out of bounds error --- .vscode/settings.json | 4 +- core/include/npa/delta_encoded_npa.h | 75 ++++++++++++++++--------- core/include/utils/array_input.h | 6 +- core/include/utils/array_stream.h | 16 ++++++ core/src/npa/elias_gamma_encoded_npa.cc | 2 + core/src/succinct_core.cc | 44 ++++++++++----- 6 files changed, 106 insertions(+), 41 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 090e946..70c1b98 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -48,7 +48,9 @@ "utility": "cpp", "sstream": "cpp", "strstream": "cpp", - "__string": "cpp" + "__string": "cpp", + "thread": "cpp", + "istream": "cpp" }, "python.linting.enabled": false } \ No newline at end of file diff --git a/core/include/npa/delta_encoded_npa.h b/core/include/npa/delta_encoded_npa.h index da101cb..7c43adb 100644 --- a/core/include/npa/delta_encoded_npa.h +++ b/core/include/npa/delta_encoded_npa.h @@ -2,6 +2,7 @@ #define DELTA_ENCODED_NPA_H #include +#include #include "utils/succinct_utils.h" #include "utils/definitions.h" @@ -122,10 +123,14 @@ class DeltaEncodedNPA : public NPA { // Get all NPA values int64_t *lNPA = new int64_t[npa_size_](); + // fprintf(stderr, "the npa_size_ is: %" PRIu64 "\n", npa_size_); uint64_t first_idx, cur_idx, nxt_idx, num_elements_per_chunk; std::thread constructor_thread[8]; ArrayStream isa_stream(isa_file); + // Output file contents + isa_stream.PrintStream(); + first_idx = isa_stream.Get(); num_elements_per_chunk = SuccinctUtils::NumBlocks(npa_size_, 8); for (uint8_t i = 0; i < 8; i++) { @@ -134,6 +139,7 @@ class DeltaEncodedNPA : public NPA { 0 : npa_size_ - i * num_elements_per_chunk; uint64_t num_elements = SuccinctUtils::Min(remaining_elements, num_elements_per_chunk); + // fprintf(stderr, "loop: %i\n", i); constructor_thread[i] = std::thread(&DeltaEncodedNPA::ConstructNPAChunk, lNPA, isa_file, i * num_elements_per_chunk, num_elements, (i == 7) ? first_idx : -1ULL); @@ -169,9 +175,14 @@ class DeltaEncodedNPA : public NPA { // Get all NPA values int64_t *lNPA = new int64_t[npa_size_](); + // fprintf(stderr, "the npa_size_ is: %" PRIu64 "\n", npa_size_); uint64_t first_idx, cur_idx, nxt_idx, num_elements_per_chunk; std::thread constructor_thread[8]; + // for ( int i = 0; i < sizeof(lISA)/sizeof(lISA[0]); i ++){ + // // fprintf(stderr, "lISA: %" PRIu64 "\n", lISA[i]); + // } + ArrayInput isa_array(lISA); first_idx = isa_array.Get(); num_elements_per_chunk = SuccinctUtils::NumBlocks(npa_size_, 8); @@ -181,9 +192,10 @@ class DeltaEncodedNPA : public NPA { 0 : npa_size_ - i * num_elements_per_chunk; uint64_t num_elements = SuccinctUtils::Min(remaining_elements, num_elements_per_chunk); + // fprintf(stderr, "loop: %i\n", i); constructor_thread[i] = std::thread(&DeltaEncodedNPA::ConstructNPAChunkInMem, lNPA, lISA, i * num_elements_per_chunk, - num_elements, (i == 7) ? first_idx : -1ULL); + num_elements, (i == 7) ? first_idx : -1ULL, npa_size_); } for (uint8_t i = 0; i < 8; i++) { @@ -193,7 +205,7 @@ class DeltaEncodedNPA : public NPA { //isa_stream.CloseAndRemove(); //SuccinctUtils::WriteToFile(lNPA, npa_size_, npa_file); - + // fprintf(stderr, "200\n"); del_npa_ = new DeltaEncodedVector[sigma_size_]; ThreadPool pool(8); for (uint64_t i = 0; i < col_offsets_.size(); i++) { @@ -205,6 +217,7 @@ class DeltaEncodedNPA : public NPA { } pool.ShutDown(); delete[] lNPA; + // fprintf(stderr, "finished EncodeInMem\n"); // remove(npa_file.c_str()); } @@ -340,61 +353,75 @@ class DeltaEncodedNPA : public NPA { int64_t first_idx) { // ISA Stream is configured to start reading from correct position uint64_t cur_idx, nxt_idx; + // fprintf(stderr, "*** start_pos: %" PRIu64 ", first_idx: %" PRId64 "\n", start_pos, first_idx); ArrayStream isa_stream(isa_file, start_pos); cur_idx = isa_stream.Get(); for (uint64_t i = 0; i < n_elems; i++) { nxt_idx = isa_stream.Get(); + // fprintf(stderr, "Setting lNPA[ %" PRIu64 "] = %" PRIu64 "\n", cur_idx, nxt_idx); lNPA[cur_idx] = nxt_idx; cur_idx = nxt_idx; } - + // fprintf(stderr, "cur_idx is :%" PRIu64 " when exiting the loop\n", cur_idx); if (first_idx > 0) { + // fprintf(stderr, "TRYING TO SET lNPA[ %" PRIu64 "] = %" PRIu64 "\n", cur_idx, first_idx); lNPA[cur_idx] = first_idx; } isa_stream.Close(); } - //On-disk ConstructNPAChunk function - void EncodeNPAChunk(DeltaEncodedVector *dv, std::string npa_file, uint64_t start_offset, - uint64_t end_offset) { - ArrayStream npa_stream(npa_file, start_offset); - std::vector column; - for (uint64_t j = start_offset; j < end_offset; j++) { - column.push_back(npa_stream.Get()); - } - assert(column.size() > 0); - CreateDeltaEncodedVector(dv, column); - npa_stream.Close(); - } - //In memory ConstructNPAChunk function static void ConstructNPAChunkInMem(int64_t *lNPA, int64_t *lISA, uint64_t start_pos, uint64_t n_elems, - int64_t first_idx) { + int64_t first_idx, uint64_t npa_size) { // ISA Stream is configured to start reading from correct position uint64_t cur_idx, nxt_idx; + //fprintf(stderr, "*** start_pos: %" PRIu64 ", first_idx: %" PRId64 "\n", start_pos, first_idx); ArrayInput isa_array(lISA, start_pos); cur_idx = isa_array.Get(); - - for (uint64_t i = 0; i < n_elems; i++) { - nxt_idx = isa_array.Get(); - lNPA[cur_idx] = nxt_idx; - cur_idx = nxt_idx; + if (cur_idx >= npa_size){ + cur_idx = 0; + } else { + for (uint64_t i = 0; i < n_elems; i++) { + nxt_idx = isa_array.Get(); + // fprintf(stderr, "Setting lNPA[ %" PRIu64 "] = %" PRIu64 "\n", cur_idx, nxt_idx); + lNPA[cur_idx] = nxt_idx; + cur_idx = nxt_idx; + } } + // fprintf(stderr, "cur_idx is :%" PRIu64 " when exiting the loop\n", cur_idx); if (first_idx > 0) { + // fprintf(stderr, "TRYING TO SET lNPA[ %" PRIu64 "] = %" PRIu64 "\n", cur_idx, first_idx); lNPA[cur_idx] = first_idx; } } + //On-disk ConstructNPAChunk function + void EncodeNPAChunk(DeltaEncodedVector *dv, std::string npa_file, uint64_t start_offset, + uint64_t end_offset) { + ArrayStream npa_stream(npa_file, start_offset); + std::vector column; + for (uint64_t j = start_offset; j < end_offset; j++) { + uint64_t temp = npa_stream.Get(); + column.push_back(temp); + // fprintf(stderr, "pushing back %" PRIu64 "\n", temp); + } + assert(column.size() > 0); + CreateDeltaEncodedVector(dv, column); + npa_stream.Close(); + } + //In memory EncodeNPAChunk function void EncodeNPAChunkInMem(DeltaEncodedVector *dv, int64_t *lNPA, uint64_t start_offset, uint64_t end_offset) { ArrayInput npa_array(lNPA, start_offset); std::vector column; for (uint64_t j = start_offset; j < end_offset; j++) { - column.push_back(npa_array.Get()); + uint64_t temp = npa_array.Get(); + column.push_back(temp); + // fprintf(stderr, "pushing back %" PRIu64 "\n", temp); } assert(column.size() > 0); CreateDeltaEncodedVector(dv, column); @@ -402,8 +429,6 @@ class DeltaEncodedNPA : public NPA { - - }; #endif diff --git a/core/include/utils/array_input.h b/core/include/utils/array_input.h index c514217..8ea53f1 100644 --- a/core/include/utils/array_input.h +++ b/core/include/utils/array_input.h @@ -4,6 +4,7 @@ #include #include #include +#include #include "utils/succinct_utils.h" @@ -16,6 +17,7 @@ class ArrayInput { uint64_t Get() { uint64_t val = array_[current_idx_]; + fprintf(stderr, "item at index %" PRIu64 " is %" PRIu64 "\n", current_idx_, val); current_idx_++; return val; } @@ -27,11 +29,11 @@ class ArrayInput { void Reset() { current_idx_ = 0; } - + private: int64_t *array_; uint64_t current_idx_; - + uint64_t size_; }; #endif // CORE_ARRAY_STREAM_H_ diff --git a/core/include/utils/array_stream.h b/core/include/utils/array_stream.h index 1ea0831..9e19385 100644 --- a/core/include/utils/array_stream.h +++ b/core/include/utils/array_stream.h @@ -4,6 +4,8 @@ #include #include #include +#include +#include #include "utils/succinct_utils.h" @@ -24,12 +26,26 @@ class ArrayStream { } } + void PrintStream(){ + in_.clear(); + in_.seekg(0, std::ios::beg); + fprintf(stderr, "PRINTING STREAM\n"); + for (int i = 0; i < 200; i++){ + uint64_t val; + in_.read(reinterpret_cast(&(val)), sizeof(uint64_t)); + fprintf(stderr, "%" PRIu64 "\n", val); + } + in_.clear(); + in_.seekg(0, std::ios::beg); + } + uint64_t Get() { if (memory_map_) { return data_[current_idx_++]; } uint64_t val; in_.read(reinterpret_cast(&(val)), sizeof(uint64_t)); + fprintf(stderr, "item at index %" PRIu64 " is %" PRIu64 "\n", current_idx_, val); current_idx_++; return val; } diff --git a/core/src/npa/elias_gamma_encoded_npa.cc b/core/src/npa/elias_gamma_encoded_npa.cc index a1884e7..7d5321e 100644 --- a/core/src/npa/elias_gamma_encoded_npa.cc +++ b/core/src/npa/elias_gamma_encoded_npa.cc @@ -135,7 +135,9 @@ void EliasGammaEncodedNPA::CreateDeltaEncodedVector( } } else { long delta = data[i] - last_val; + fprintf(stderr, "138\n"); assert(delta > 0); + fprintf(stderr, "140\n"); _deltas.push_back(delta); delta_enc_size = EliasGammaEncodingSize(delta); diff --git a/core/src/succinct_core.cc b/core/src/succinct_core.cc index fc22578..f5ebb16 100644 --- a/core/src/succinct_core.cc +++ b/core/src/succinct_core.cc @@ -32,7 +32,7 @@ SuccinctCore::SuccinctCore(const std::string &filename, SuccinctMode s_mode, break; } case SuccinctMode::CONSTRUCT_MEMORY_MAPPED: { - fprintf(stderr, "Unsupported mode.\n"); + // fprintf(stderr, "Unsupported mode.\n"); assert(0); break; } @@ -140,7 +140,7 @@ void SuccinctCore::ConstructFromContent(const std::string &input, } data[fsize] = 1; - std::cout << "ConstructFromContent Line 143" << std::endl; + // fprintf(stderr, "ConstructFromContent Line 143\n"); //Construct in memory Construct(1, data, fsize + 1, sa_sampling_rate, isa_sampling_rate, @@ -185,7 +185,6 @@ void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, uint32_t sampling_range) { if (in_mem == 0){ - std::cout << "Storing data on disk" << std::endl; //USE FILES ON DISK TO STORE ARRAYS std::string sa_file = ".tmp.sa"; @@ -222,6 +221,7 @@ void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, col_offsets.push_back(0); for (uint64_t i = 1; i < input_size_; i++) { cur_sa = sa_stream.Get(); + //fprintf(stderr, "INITIALIZING lISA[ %" PRIu64 "] = %" PRIu64 "\n", cur_sa, i); lISA[cur_sa] = i; if (input[cur_sa] != input[prv_sa]) { alphabet_map_[input[cur_sa]] = std::pair( @@ -339,13 +339,14 @@ void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, sa_stream.CloseAndRemove(); } else { - std::cout << "Storing data in memory" << std::endl; //USE MEMORY TO STORE ARRAYS + // fprintf(stderr, "Save metadata\n"); // Save metadata input_size_ = input_size; uint32_t bits = SuccinctUtils::IntegerLog2(input_size_ + 1); + // fprintf(stderr, "Construct Suffix Array\n"); // Construct Suffix Array auto *lSA = (int64_t *) s_allocator.s_calloc(sizeof(int64_t), input_size_); divsufsortxx::constructSA(input, (input + input_size_), lSA, @@ -356,22 +357,30 @@ void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, ArrayInput sa_array(lSA); + // fprintf(stderr, "Allocate space for Inverse Suffix Array\n"); // Allocate space for Inverse Suffix Array auto *lISA = (int64_t *) s_allocator.s_calloc(sizeof(int64_t), input_size_); + // fprintf(stderr, "Auxiliary Data Structures for NPA\n"); // Auxiliary Data Structures for NPA std::vector col_offsets; uint64_t cur_sa, prv_sa; - + // fprintf(stderr, "368\n"); prv_sa = cur_sa = sa_array.Get(); + // fprintf(stderr, "370\n"); lISA[cur_sa] = 0; + // fprintf(stderr, "372\n"); alphabet_size_ = 1; + // fprintf(stderr, "374\n"); alphabet_map_[input[cur_sa]] = std::pair(0, 0); + // fprintf(stderr, "376\n"); col_offsets.push_back(0); + // fprintf(stderr, "378\n"); for (uint64_t i = 1; i < input_size_; i++) { cur_sa = sa_array.Get(); lISA[cur_sa] = i; + // fprintf(stderr, "INITIALIZING lISA[ %" PRIu64 "] = %" PRIu64 "\n", cur_sa, i); if (input[cur_sa] != input[prv_sa]) { alphabet_map_[input[cur_sa]] = std::pair( i, alphabet_size_++); @@ -380,11 +389,13 @@ void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, prv_sa = cur_sa; } + // fprintf(stderr, "389\n"); alphabet_map_[(char) 0] = std::pair(input_size_, alphabet_size_); assert(sa_array.GetCurrentIndex() == input_size_); sa_array.Reset(); + // fprintf(stderr, "396\n"); alphabet_ = new char[alphabet_size_ + 1]; for (auto alphabet_entry : alphabet_map_) { alphabet_[alphabet_entry.second.second] = alphabet_entry.first; @@ -396,6 +407,7 @@ void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, //ArrayStream isa_stream(isa_file); // Compact input data (if needed) + // fprintf(stderr, "Compact input data (if needed)\n"); Bitmap *data_bitmap = nullptr; if (npa_encoding_scheme == NPA::NPAEncodingScheme::WAVELET_TREE_ENCODED) { data_bitmap = new Bitmap; @@ -408,6 +420,7 @@ void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, } s_allocator.s_free(input); + // fprintf(stderr, "npa switch\n"); switch (npa_encoding_scheme) { case NPA::NPAEncodingScheme::ELIAS_GAMMA_ENCODED: { npa_ = new EliasGammaEncodedNPA(input_size_, alphabet_size_, context_len, @@ -438,6 +451,7 @@ void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, } assert(npa_ != nullptr); + // fprintf(stderr, "sa switch\n"); switch (sa_sampling_scheme) { case SamplingScheme::FLAT_SAMPLE_BY_INDEX: sa_ = new SampledByIndexSA(sa_sampling_rate, npa_, sa_array, input_size_, @@ -463,6 +477,7 @@ void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, sa_array.Reset(); assert(sa_ != nullptr); + // fprintf(stderr, "isa switch\n"); switch (isa_sampling_scheme) { case SamplingScheme::FLAT_SAMPLE_BY_INDEX: isa_ = new SampledByIndexISA(isa_sampling_rate, npa_, sa_array, @@ -492,6 +507,9 @@ void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, s_allocator.s_free(lISA); s_allocator.s_free(lSA); + + // fprintf(stderr, "npa switch\n"); + } } @@ -529,8 +547,8 @@ size_t SuccinctCore::SerializeFromContent(std::ostream &path) { // mode_t create_mode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; // if (stat(path.c_str(), &st) != 0) { // if (mkdir(path.c_str(), create_mode) != 0) { - // fprintf(stderr, "Failed to create path '%s'\n", path.c_str()); - // fprintf(stderr, "Terminating the serialization process.\n"); + // // fprintf(stderr, "Failed to create path '%s'\n", path.c_str()); + // // fprintf(stderr, "Terminating the serialization process.\n"); // return 0; // } // } @@ -586,8 +604,8 @@ size_t SuccinctCore::Serialize(const std::string &path) { mode_t create_mode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; if (stat(path.c_str(), &st) != 0) { if (mkdir(path.c_str(), create_mode) != 0) { - fprintf(stderr, "Failed to create path '%s'\n", path.c_str()); - fprintf(stderr, "Terminating the serialization process.\n"); + // fprintf(stderr, "Failed to create path '%s'\n", path.c_str()); + // fprintf(stderr, "Terminating the serialization process.\n"); return 0; } } @@ -800,10 +818,10 @@ void SuccinctCore::PrintStorageBreakdown() { + alphabet_map_.size() * (sizeof(char) + sizeof(uint64_t) + sizeof(uint32_t)); metadata_size += sizeof(alphabet_size_) + alphabet_size_ * sizeof(char); - fprintf(stderr, "Metadata size = %zu\n", metadata_size); - fprintf(stderr, "SA size = %zu\n", sa_->StorageSize()); - fprintf(stderr, "ISA size = %zu\n", isa_->StorageSize()); - fprintf(stderr, "NPA size = %zu\n", npa_->StorageSize()); + // fprintf(stderr, "Metadata size = %zu\n", metadata_size); + // fprintf(stderr, "SA size = %zu\n", sa_->StorageSize()); + // fprintf(stderr, "ISA size = %zu\n", isa_->StorageSize()); + // fprintf(stderr, "NPA size = %zu\n", npa_->StorageSize()); } std::pair SuccinctCore::BwdSearch(std::string mgram) { From 2734a7aa49dfbdf93f82b4e49065dda1f87d7bb9 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Tue, 30 Jun 2020 15:20:27 -0400 Subject: [PATCH 31/39] Removed debugging outputs --- core/include/npa/delta_encoded_npa.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/core/include/npa/delta_encoded_npa.h b/core/include/npa/delta_encoded_npa.h index 7c43adb..92dd35c 100644 --- a/core/include/npa/delta_encoded_npa.h +++ b/core/include/npa/delta_encoded_npa.h @@ -128,8 +128,6 @@ class DeltaEncodedNPA : public NPA { std::thread constructor_thread[8]; ArrayStream isa_stream(isa_file); - // Output file contents - isa_stream.PrintStream(); first_idx = isa_stream.Get(); num_elements_per_chunk = SuccinctUtils::NumBlocks(npa_size_, 8); From bfac64561ed812a35710d33b93f84be474b710f7 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Tue, 30 Jun 2020 15:23:10 -0400 Subject: [PATCH 32/39] Removed more debugging outputs --- core/include/utils/array_input.h | 2 +- core/include/utils/array_stream.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/core/include/utils/array_input.h b/core/include/utils/array_input.h index 8ea53f1..b7ee64c 100644 --- a/core/include/utils/array_input.h +++ b/core/include/utils/array_input.h @@ -17,7 +17,7 @@ class ArrayInput { uint64_t Get() { uint64_t val = array_[current_idx_]; - fprintf(stderr, "item at index %" PRIu64 " is %" PRIu64 "\n", current_idx_, val); + // fprintf(stderr, "item at index %" PRIu64 " is %" PRIu64 "\n", current_idx_, val); current_idx_++; return val; } diff --git a/core/include/utils/array_stream.h b/core/include/utils/array_stream.h index 9e19385..3b28758 100644 --- a/core/include/utils/array_stream.h +++ b/core/include/utils/array_stream.h @@ -45,7 +45,7 @@ class ArrayStream { } uint64_t val; in_.read(reinterpret_cast(&(val)), sizeof(uint64_t)); - fprintf(stderr, "item at index %" PRIu64 " is %" PRIu64 "\n", current_idx_, val); + // fprintf(stderr, "item at index %" PRIu64 " is %" PRIu64 "\n", current_idx_, val); current_idx_++; return val; } From 09c5aae0e801ceee4342167dfd0629823760f9fa Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Tue, 30 Jun 2020 15:28:02 -0400 Subject: [PATCH 33/39] Removed all debugging outputs --- core/src/npa/elias_gamma_encoded_npa.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/core/src/npa/elias_gamma_encoded_npa.cc b/core/src/npa/elias_gamma_encoded_npa.cc index 7d5321e..a1884e7 100644 --- a/core/src/npa/elias_gamma_encoded_npa.cc +++ b/core/src/npa/elias_gamma_encoded_npa.cc @@ -135,9 +135,7 @@ void EliasGammaEncodedNPA::CreateDeltaEncodedVector( } } else { long delta = data[i] - last_val; - fprintf(stderr, "138\n"); assert(delta > 0); - fprintf(stderr, "140\n"); _deltas.push_back(delta); delta_enc_size = EliasGammaEncodingSize(delta); From 0de377ab81fb5a3600b84626310f333822bfe67b Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Mon, 6 Jul 2020 20:53:16 -0400 Subject: [PATCH 34/39] sequential and multi-threading example in lambdatest.py --- core/src/succinct_core.cc | 19 +-- examples/lambda/lambdatest.py | 186 ++++++++++++++++++++++++---- examples/pysuccinct/pyfile_input.py | 4 +- 3 files changed, 177 insertions(+), 32 deletions(-) diff --git a/core/src/succinct_core.cc b/core/src/succinct_core.cc index f5ebb16..8ac903a 100644 --- a/core/src/succinct_core.cc +++ b/core/src/succinct_core.cc @@ -219,18 +219,22 @@ void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, alphabet_size_ = 1; alphabet_map_[input[cur_sa]] = std::pair(0, 0); col_offsets.push_back(0); + // fprintf(stderr, "input size: %zd\n", input_size_); for (uint64_t i = 1; i < input_size_; i++) { cur_sa = sa_stream.Get(); - //fprintf(stderr, "INITIALIZING lISA[ %" PRIu64 "] = %" PRIu64 "\n", cur_sa, i); + // fprintf(stderr, "INITIALIZING lISA[ %" PRIu64 "] = %" PRIu64 "\n", cur_sa, i); lISA[cur_sa] = i; if (input[cur_sa] != input[prv_sa]) { alphabet_map_[input[cur_sa]] = std::pair( i, alphabet_size_++); col_offsets.push_back(i); + // fprintf(stderr, "PUSHING BACK %" PRIu64 "\n", i); } prv_sa = cur_sa; } + // fprintf(stderr, "234\n"); + alphabet_map_[(char) 0] = std::pair(input_size_, alphabet_size_); assert(sa_stream.GetCurrentIndex() == input_size_); @@ -242,11 +246,13 @@ void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, } // Write Inverse Suffix Array to file + // fprintf(stderr, "Write Inverse Suffix Array to file\n"); SuccinctUtils::WriteToFile(lISA, input_size_, isa_file); s_allocator.s_free(lISA); ArrayStream isa_stream(isa_file); // Compact input data (if needed) + // fprintf(stderr, "Compact input data (if needed)\n"); Bitmap *data_bitmap = nullptr; if (npa_encoding_scheme == NPA::NPAEncodingScheme::WAVELET_TREE_ENCODED) { data_bitmap = new Bitmap; @@ -259,6 +265,7 @@ void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, } s_allocator.s_free(input); + // fprintf(stderr, "npa switch\n"); switch (npa_encoding_scheme) { case NPA::NPAEncodingScheme::ELIAS_GAMMA_ENCODED: { npa_ = new EliasGammaEncodedNPA(input_size_, alphabet_size_, context_len, @@ -287,6 +294,7 @@ void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, } assert(npa_ != nullptr); + // fprintf(stderr, "sa switch\n"); switch (sa_sampling_scheme) { case SamplingScheme::FLAT_SAMPLE_BY_INDEX: sa_ = new SampledByIndexSA(sa_sampling_rate, npa_, sa_stream, input_size_, @@ -312,6 +320,7 @@ void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, sa_stream.Reset(); assert(sa_ != nullptr); + // fprintf(stderr, "isa switch\n"); switch (isa_sampling_scheme) { case SamplingScheme::FLAT_SAMPLE_BY_INDEX: isa_ = new SampledByIndexISA(isa_sampling_rate, npa_, sa_stream, @@ -366,17 +375,12 @@ void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, // Auxiliary Data Structures for NPA std::vector col_offsets; uint64_t cur_sa, prv_sa; - // fprintf(stderr, "368\n"); prv_sa = cur_sa = sa_array.Get(); - // fprintf(stderr, "370\n"); lISA[cur_sa] = 0; - // fprintf(stderr, "372\n"); alphabet_size_ = 1; - // fprintf(stderr, "374\n"); alphabet_map_[input[cur_sa]] = std::pair(0, 0); - // fprintf(stderr, "376\n"); col_offsets.push_back(0); - // fprintf(stderr, "378\n"); + // fprintf(stderr, "input size: %zd\n", input_size_); for (uint64_t i = 1; i < input_size_; i++) { cur_sa = sa_array.Get(); lISA[cur_sa] = i; @@ -385,6 +389,7 @@ void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, alphabet_map_[input[cur_sa]] = std::pair( i, alphabet_size_++); col_offsets.push_back(i); + // fprintf(stderr, "PUSHING BACK %" PRIu64 "\n", i); } prv_sa = cur_sa; } diff --git a/examples/lambda/lambdatest.py b/examples/lambda/lambdatest.py index beccf8a..cc97a15 100644 --- a/examples/lambda/lambdatest.py +++ b/examples/lambda/lambdatest.py @@ -5,17 +5,143 @@ from os import path import boto3 import tempfile +import time +from threading import Thread # Change the filenames to succinctdi_sa, succinctdir_isa .... and upload to s3 def uploadDirectory(path,bucketname, name): s3 = boto3.client("s3") for root,dirs,files in os.walk(path): for f in files: - s3.upload_file(os.path.join(root,f), bucketname, f) + s3.upload_file(os.path.join(root,f), bucketname, f) + +def read_chunk(i, chunk_string, read_chunks, s3): + # print("reading chunk " + str(i)) + obj = s3.get_object(Bucket='succinct-datasets', Key=chunk_string + str(i) + ".succinct") + read_chunks.append(obj['Body'].read().decode('utf-8')) + +def compress_chunk(i, read_chunks, compressed_chunks): + # print("compressing chunk " + str(i)) + q = file.File(0, read_chunks[i], 32, 32, 128, 0, 1) + compressed_chunks.append(q.GetContent().tobytes()) + read_chunks[i] = None + +def upload_chunk(i, chunk_string, compressed_chunks, s3): + # print("uploading chunk " + str(i)) + s3.put_object(Body=compressed_chunks[i], Bucket='succinct-datasets', Key=chunk_string + "compressed-" + str(i) + ".succinct") + # Remove chunk here + compressed_chunks[i] = None + + # compress file def call_compress (event, context): - # # Upload a file to the bucket + # Define variables + chunk_string = event['key1'] + "-chunk-" + s3 = boto3.client("s3") + # Depends on number of chunks file is split into + num_chunks = 292 + read_chunks = [] + compressed_chunks = [] + + # # **** QUERY FOR NUMBER OF CHUNK FILES **** + # s3 = boto3.resource("s3") + # bucket = s3.Bucket('succinct-datasets') + # for bucket_obj in bucket.objects.all(): + # if (chunk_string in str(bucket_obj)): + # num_chunks += 1 + # print("The num chunks is: " + str(num_chunks)) + + # **** PARALLEL EXECUTION **** + read_threads = [] + compress_threads = [] + upload_threads = [] + + for i in range(0, num_chunks + 2): + if (i == 0): + # Create read thread + read_thread = Thread(target=read_chunk, args=(0, chunk_string, read_chunks, s3)) + read_thread.start() + read_threads.append(read_thread) + elif (i == 1): + # Create read thread + read_thread = Thread(target=read_chunk, args=(1, chunk_string, read_chunks, s3)) + read_thread.start() + read_threads.append(read_thread) + + # If read_threads[0] isn't done, finish + if read_threads[0].is_alive(): + read_threads[0].join() + # Create compress thread + compress_thread = Thread(target=compress_chunk, args=(0, read_chunks, compressed_chunks)) + compress_thread.start() + compress_threads.append(compress_thread) + elif (i == num_chunks): + # If read_threads[num_chunks - 1] isn't done, finish + if read_threads[i - 1].is_alive(): + read_threads[i - 1].join() + # Create compress thread + compress_thread = Thread(target=compress_chunk, args=(i - 1, read_chunks, compressed_chunks)) + compress_thread.start() + compress_threads.append(compress_thread) + + # If compress_threads[num_chunks - 2] isn't done, finish + if compress_threads[i - 2].is_alive(): + compress_threads[i - 2].join() + # Create upload thread + upload_thread = Thread(target=upload_chunk, args=(i - 2, chunk_string, compressed_chunks, s3)) + upload_thread.start() + upload_threads.append(upload_thread) + elif (i == num_chunks + 1): + # If compress_threads[num_chunks - 1] isn't done, finish + if compress_threads[i - 2].is_alive(): + compress_threads[i - 2].join() + # Create upload thread + upload_thread = Thread(target=upload_chunk, args=(i - 2, chunk_string, compressed_chunks, s3)) + upload_thread.start() + upload_threads.append(upload_thread) + else: + # Create read thread + read_thread = Thread(target=read_chunk, args=(i, chunk_string, read_chunks, s3)) + read_thread.start() + read_threads.append(read_thread) + + # If read_threads[i - 1] isn't done, finish + if read_threads[i - 1].is_alive(): + read_threads[i - 1].join() + # Create compress thread + compress_thread = Thread(target=compress_chunk, args=(i - 1, read_chunks, compressed_chunks)) + compress_thread.start() + compress_threads.append(compress_thread) + + # If compress_threads[i - 2] isn't done, finish + if compress_threads[i - 2].is_alive(): + compress_threads[i - 2].join() + # Create upload thread + upload_thread = Thread(target=upload_chunk, args=(i - 2, chunk_string, compressed_chunks, s3)) + upload_thread.start() + upload_threads.append(upload_thread) + + + # # **** SEQUENTIAL EXECUTION **** + # for i in range(0, num_chunks + 2): + # if (i == 0): + # read_chunk(0, chunk_string, read_chunks, s3) + # elif (i == 1): + # read_chunk(1, chunk_string, read_chunks, s3) + # compress_chunk(0, read_chunks, compressed_chunks) + # elif (i == num_chunks): + # compress_chunk(num_chunks - 1, read_chunks, compressed_chunks) + # upload_chunk(num_chunks - 2, chunk_string, compressed_chunks, s3) + # elif (i == num_chunks + 1): + # upload_chunk(num_chunks - 1, chunk_string, compressed_chunks, s3) + # else: + # read_chunk(i, chunk_string, read_chunks, s3) + # compress_chunk(i - 1, read_chunks, compressed_chunks) + # upload_chunk(i - 2, chunk_string, compressed_chunks, s3) + + + # # **** UPLOAD FILE TO BUCKET **** # s3 = boto3.resource("s3") # os.chdir("/tmp") # f = open("test.txt","w") @@ -23,33 +149,47 @@ def call_compress (event, context): # f.close() # s3.meta.client.upload_file("/tmp/test.txt", "succinct-datasets", "test.txt") - # Get file content from S3 and save as "input" - s3 = boto3.client("s3") - obj = client.get_object(Bucket='succinct-datasets', Key=event['key1']) - input = obj.get()['Body'].read().decode('utf-8') - - # with open(event['key1'], 'r') as f: - # print(f.read()) + # # **** COMPRESS BY INPUT **** + # # Get file content from S3 and save as "input" + # s3 = boto3.client("s3") + # obj = s3.get_object(Bucket='succinct-datasets', Key=event['key1']) + # input = obj['Body'].read().decode('utf-8') - # Compress the input using file module - q = file.File(input, 32, 32, 128, 0, 1) - content = (q.GetContent().tobytes()) + # # Compress the input using file module + # q = file.File(0, input, 32, 32, 128, 0, 1) + # content = q.GetContent().tobytes() - # Upload content back onto S3 in .succinct file - client.put_object(Body=content, Bucket='succinct-datasets', Key=event['key1'] + ".succinct") + # # Upload content back onto S3 in .succinct file + # s3.put_object(Body=content, Bucket='succinct-datasets', Key=event['key1'] + ".succinct") - # for f in os.listdir("/tmp"): - # print(f) + # # **** COMPRESS BY FILE **** + # s3 = boto3.resource("s3") + # os.chdir("/tmp") + # s3.Bucket('succinct-datasets').download_file(event['key1'], event['key1']) + # q = file.File(event['key1'], 32, 32, 128, 0, 1) + + + # **** CALCULATE COMPRESSION RATIO **** + # s3 = boto3.resource("s3") + # bucket = s3.Bucket('succinct-datasets') + # orignal_sizes = 0 + # compressed_sizes = 0 + # for i in range (0, num_chunks): + # orignal_sizes += boto3.resource('s3').Bucket('succinct-datasets').Object(event['key1'] + "-chunk-" + str(i) + ".succinct").content_length + # compressed_sizes += boto3.resource('s3').Bucket('succinct-datasets').Object(event['key1'] + "-chunk-compressed-" + str(i) + ".succinct").content_length + + # print(orignal_sizes) + # print(compressed_sizes) + # print(compressed_sizes/orignal_sizes) - # uploadDirectory("/tmp/" + event['key1'] + ".succinct", "succinct-datasets", event['key1']) - - # out = os.path.isfile("/tmp/" + event['key1'] + ".succinct") - # print(out) - # Remove compressed .succinct file contents from s3 + # # **** REMOVE OBJECTS FROM S3 **** # s3 = boto3.resource("s3") - # obj = s3.Object("succinct-datasets", event['key1'] + ".succinct.metadata") - # obj.delete() + # for i in range (0, 200): + # obj = s3.Object("succinct-datasets", event['key1'] + "-chunk-compressed-" + str(i) + ".succinct") + # obj.delete() + # obj = s3.Object("succinct-datasets", event['key1'] + "-chunk-" + str(i) + ".succinct") + # obj.delete() print("File compression and upload is complete") def call_query (event, context): diff --git a/examples/pysuccinct/pyfile_input.py b/examples/pysuccinct/pyfile_input.py index 1e3ff8d..fc63ded 100644 --- a/examples/pysuccinct/pyfile_input.py +++ b/examples/pysuccinct/pyfile_input.py @@ -81,9 +81,9 @@ def RepresentsInt(s): isa_sampling_rate, npa_sampling_rate, sampling_scheme, npa_encoding_scheme) - content = (q.GetContent().tobytes()) + content = string(q.GetContent()).encode('ASCII') # print(str(content,'ISO-8859-1')) - text_file = open(inputpath + ".succinct", "wb") + text_file = open(inputpath + ".succinct", "w") text_file.write(content) else: From 70b2e6fcbe93d41770379837cc56d13bb72974e2 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Fri, 31 Jul 2020 15:42:51 -0400 Subject: [PATCH 35/39] Added python files for benchmark testing --- .vscode/settings.json | 3 +- benchmark-testing/chunk_compressor.py | 465 ++++++++++++++++++++++++ benchmark-testing/file_splitter.py | 22 ++ core/include/npa/delta_encoded_npa.h | 27 +- core/src/npa/elias_gamma_encoded_npa.cc | 5 +- core/src/succinct_core.cc | 2 - examples/lambda/lambdatest.py | 39 +- examples/pysuccinct/pyfile_input.py | 2 +- pysuccinct/file.cpp | 7 + 9 files changed, 542 insertions(+), 30 deletions(-) create mode 100644 benchmark-testing/chunk_compressor.py create mode 100644 benchmark-testing/file_splitter.py diff --git a/.vscode/settings.json b/.vscode/settings.json index 70c1b98..84e9b3b 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -52,5 +52,6 @@ "thread": "cpp", "istream": "cpp" }, - "python.linting.enabled": false + "python.linting.enabled": false, + "git.ignoreLimitWarning": true } \ No newline at end of file diff --git a/benchmark-testing/chunk_compressor.py b/benchmark-testing/chunk_compressor.py new file mode 100644 index 0000000..deb89c5 --- /dev/null +++ b/benchmark-testing/chunk_compressor.py @@ -0,0 +1,465 @@ +import file +import boto3 +import time +import os +from threading import Thread, Event +import resource +import signal + +# Compress time global +compress_time = 0 +read_time = 0 +upload_time = 0 + +# PID = os.getpid() + +# Reusable Thread class from: https://www.codeproject.com/Tips/1271787/Python-Reusable-Thread-Class +class ReusableThread(Thread): + """ + This class provides code for a restartale / reusable thread + + join() will only wait for one (target)functioncall to finish + finish() will finish the whole thread (after that, it's not restartable anymore) + + """ + + def __init__(self, target, args): + self._startSignal = Event() + self._oneRunFinished = Event() + self._finishIndicator = False + self._callable = target + self._callableArgs = args + + Thread.__init__(self) + + def restart(self): + """make sure to always call join() before restarting""" + self._startSignal.set() + + def run(self): + """ This class will reprocess the object "processObject" forever. + Through the change of data inside processObject and start signals + we can reuse the thread's resources""" + + self.restart() + while(True): + # wait until we should process + self._startSignal.wait() + + self._startSignal.clear() + + if(self._finishIndicator):# check, if we want to stop + self._oneRunFinished.set() + return + + # call the threaded function + self._callable(*self._callableArgs) + + # notify about the run's end + self._oneRunFinished.set() + + def join(self): + """ This join will only wait for one single run (target functioncall) to be finished""" + self._oneRunFinished.wait() + self._oneRunFinished.clear() + + def finish(self): + self._finishIndicator = True + self.restart() + self.join() + +def do_nothing(*args): + pass + +# **** PARALLEL EXECUTION WITH REUSABLE THREADS **** +def read_chunk(read_index, chunk_string, read_arg, s3, code_start): + # Starting to read chunk + global read_time + i = read_index[0] + print("reading chunk " + str(i)) + start = time.time() + + ### + obj = s3.get_object(Bucket='succinct-datasets', Key=chunk_string + str(i) + ".succinct") + read_arg[0] = obj['Body'].read().decode('utf-8') + del obj + ### + + # write chunk times to file and mark time + # end = time.time() + # read_time += end - start + # read_time_file = open("read_time_file.txt","a") + # read_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " duration: " + str(end - start) + "\n") + +def compress_chunk(compress_index, read_arg, compressed_arg, code_start): + # Starting to compress chunk + global compress_time + i = compress_index[0] + print("compressing chunk " + str(i)) + start = time.time() + + ### + q = file.File(0, read_arg[0], 32, 32, 128, 0, 1) + compressed_arg[0] = q.GetContent().tobytes() + q.DeleteContent() + del q + ### + + # write chunk times to file and mark time + # end = time.time() + # compress_time += end - start + # compress_time_file = open("compress_time_file.txt","a") + # compress_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " duration: " + str(end - start) + "\n") + +def upload_chunk(upload_index, chunk_string, compressed_arg, s3, code_start): + # Starting to upload chunk + global upload_time + i = upload_index[0] + print("uploading chunk " + str(i)) + start = time.time() + + ### + s3.put_object(Body=compressed_arg[0], Bucket='succinct-datasets', Key=chunk_string + "compressed-" + str(i) + ".succinct") + ### + + # write chunk times to file and mark time + # end = time.time() + # upload_time += end - start + # upload_time_file = open("upload_time_file.txt","a") + # upload_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " duration: " + str(end - start) + "\n") + + +# # **** PARALLEL EXECUTION WITH LIST OF THREAD FUNCTIONS **** + +# def read_chunk(i, chunk_string, read_chunks, s3, code_start): +# print("reading chunk " + str(i)) +# global read_time +# start = time.time() +# obj = s3.get_object(Bucket='succinct-datasets', Key=chunk_string + str(i) + ".succinct") +# read_chunks.append(obj['Body'].read().decode('utf-8')) +# del obj +# end = time.time() +# read_time += end - start + +# # # write chunk times to file +# # read_time_file = open("read_time_file.txt","a") +# # read_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " finish: " + str(end - code_start) + "\n") + +# def compress_chunk(i, read_chunks, compressed_chunks, code_start): +# print("compressing chunk " + str(i)) +# global compress_time +# start = time.time() +# q = file.File(0, read_chunks[i], 32, 32, 128, 0, 1) +# compressed_chunks.append(q.GetContent().tobytes()) +# q.DeleteContent() +# del q +# read_chunks[i] = None +# end = time.time() +# compress_time += end - start + +# # # write chunk times to file +# # compress_time_file = open("compress_time_file.txt","a") +# # compress_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " finish: " + str(end - code_start) + "\n") + +# def upload_chunk(i, chunk_string, compressed_chunks, s3, code_start): +# print("uploading chunk " + str(i)) +# global upload_time +# start = time.time() +# s3.put_object(Body=compressed_chunks[i], Bucket='succinct-datasets', Key=chunk_string + "compressed-" + str(i) + ".succinct") +# compressed_chunks[i] = None +# end = time.time() +# upload_time += end - start + + # # write chunk times to file + # upload_time_file = open("upload_time_file.txt","a") + # upload_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " finish: " + str(end - code_start) + "\n") + +# **** FUNCTION START **** +# compress file +start = time.time() +# Define variables +chunk_string = "sample.txt" + "-chunk-" +s3 = boto3.client("s3", aws_access_key_id="AKIAJS52NT4AB7WQAZKA", + aws_secret_access_key="5nc0Bc5HD8ugeqLRfF3h6kR9OCfLfQ/VTRYOaxy/", + region_name="us-east-2") +# Depends on number of chunks file is split into +num_chunks = 1167 +read_chunks = [] +compressed_chunks = [] + +# COMPRESS WITH FILES +# q = file.File("sample.txt-chunk-1160.succinct", 32, 32, 128, 0, 1) + +# COMPRESS IN MEM +# f = open("sample.txt-chunk-1166.succinct","r") +# content = f.read() +# q = file.File(0, content, 32, 32, 128, 0, 1) + + +# # **** QUERY FOR NUMBER OF CHUNK FILES **** +# s3 = boto3.resource("s3") +# bucket = s3.Bucket('succinct-datasets') +# for bucket_obj in bucket.objects.all(): +# if (chunk_string in str(bucket_obj)): +# num_chunks += 1 +# print("The num chunks is: " + str(num_chunks)) + +# signal.signal(signal.SIGUSR1, do_nothing) + +# **** PARALLEL EXECUTION WITH REUSABLE THREADS **** + +read_index = [0] +compress_index = [0] +upload_index = [0] + + +# Variables to keep track of read and compressed data +read_arg = [0] +compressed_arg = [0] + +read_thread = ReusableThread(target = read_chunk, args=(read_index, chunk_string, read_arg, s3, start)) +compress_thread = ReusableThread(target = compress_chunk, args=(compress_index, read_arg, compressed_arg, start)) +upload_thread = ReusableThread(target = upload_chunk, args=(upload_index, chunk_string, compressed_arg, s3, start)) + +for i in range(0, num_chunks + 2): + if (i == 0): + # START: read + read_index[0] = i + + read_thread.start() + elif (i == 1): + # RESTART: read START: compress + read_thread.join() + + read_index[0] = i + compress_index[0] = i - 1 + + compress_thread.start() + read_thread.restart() + elif (i == 2): + # RESTART: compress, read START: upload + read_thread.join() + compress_thread.join() + + read_index[0] = i + compress_index[0] = i - 1 + upload_index[0] = i - 2 + + upload_thread.start() + compress_thread.restart() + read_thread.restart() + + elif (i == num_chunks): + # RESTART: upload, compress + read_thread.join() + compress_thread.join() + upload_thread.join() + + compress_index [0] = i - 1 + upload_index [0] = i - 2 + + upload_thread.restart() + compress_thread.restart() + elif (i == num_chunks + 1): + # RESTART: upload + compress_thread.join() + upload_thread.join() + + upload_index [0] = i - 2 + + upload_thread.restart() + + # Join last thread and finish all threads + upload_thread.join() + + read_thread.finish() + compress_thread.finish() + upload_thread.finish() + else: + # RESTART: upload, compress, read + read_thread.join() + compress_thread.join() + upload_thread.join() + + read_index[0] = i + compress_index[0] = i - 1 + upload_index[0] = i - 2 + + upload_thread.restart() + compress_thread.restart() + read_thread.restart() + +# # **** PARALLEL EXECUTION WITH LIST OF THREADS **** +# read_threads = [] +# compress_threads = [] +# upload_threads = [] + +# for i in range(0, num_chunks + 2): +# if (i == 0): +# # Create read thread +# read_thread = Thread(target=read_chunk, args=(0, chunk_string, read_chunks, s3, start)) +# read_thread.start() +# read_threads.append(read_thread) +# elif (i == 1): +# # Create read thread +# read_thread = Thread(target=read_chunk, args=(1, chunk_string, read_chunks, s3, start)) +# read_thread.start() +# read_threads.append(read_thread) + +# # If read_threads[0] isn't done, finish +# if read_threads[0].is_alive(): +# read_threads[0].join() +# # Create compress thread +# compress_thread = Thread(target=compress_chunk, args=(0, read_chunks, compressed_chunks, start)) +# compress_thread.start() +# compress_threads.append(compress_thread) +# elif (i == num_chunks): +# # If read_threads[num_chunks - 1] isn't done, finish +# if read_threads[i - 1].is_alive(): +# read_threads[i - 1].join() + +# # If compress_threads[num_chunks - 2] isn't done, finish +# if compress_threads[i - 2].is_alive(): +# compress_threads[i - 2].join() + +# # Create compress thread +# compress_thread = Thread(target=compress_chunk, args=(i - 1, read_chunks, compressed_chunks, start)) +# compress_thread.start() +# compress_threads.append(compress_thread) +# # Create upload thread +# upload_thread = Thread(target=upload_chunk, args=(i - 2, chunk_string, compressed_chunks, s3, start)) +# upload_thread.start() +# upload_threads.append(upload_thread) +# elif (i == num_chunks + 1): +# # If compress_threads[num_chunks - 1] isn't done, finish +# if compress_threads[i - 2].is_alive(): +# compress_threads[i - 2].join() +# # Create upload thread +# upload_thread = Thread(target=upload_chunk, args=(i - 2, chunk_string, compressed_chunks, s3, start)) +# upload_thread.start() +# upload_threads.append(upload_thread) +# else: +# # Create read thread +# read_thread = Thread(target=read_chunk, args=(i, chunk_string, read_chunks, s3, start)) +# read_thread.start() +# read_threads.append(read_thread) + +# # If read_threads[i - 1] isn't done, finish +# if read_threads[i - 1].is_alive(): +# read_threads[i - 1].join() + +# # If compress_threads[i - 2] isn't done, finish +# if compress_threads[i - 2].is_alive(): +# compress_threads[i - 2].join() + +# # Create compress thread +# compress_thread = Thread(target=compress_chunk, args=(i - 1, read_chunks, compressed_chunks, start)) +# compress_thread.start() +# compress_threads.append(compress_thread) + +# # Create upload thread +# upload_thread = Thread(target=upload_chunk, args=(i - 2, chunk_string, compressed_chunks, s3, start)) +# upload_thread.start() +# upload_threads.append(upload_thread) + +# os.kill(PID, signal.SIGUSR1) + + +# # **** SEQUENTIAL EXECUTION **** +# for i in range(0, num_chunks + 2): +# if (i == 0): +# read_chunk(0, chunk_string, read_chunks, s3) +# elif (i == 1): +# read_chunk(1, chunk_string, read_chunks, s3) +# compress_chunk(0, read_chunks, compressed_chunks) +# elif (i == num_chunks): +# compress_chunk(num_chunks - 1, read_chunks, compressed_chunks) +# upload_chunk(num_chunks - 2, chunk_string, compressed_chunks, s3) +# elif (i == num_chunks + 1): +# upload_chunk(num_chunks - 1, chunk_string, compressed_chunks, s3) +# else: +# read_chunk(i, chunk_string, read_chunks, s3) +# compress_chunk(i - 1, read_chunks, compressed_chunks) +# upload_chunk(i - 2, chunk_string, compressed_chunks, s3) + + +# signal.signal(signal.SIGUSR1, do_nothing) + +# #with s3 +# for i in range(11,12): +# obj = s3.get_object(Bucket='succinct-datasets', Key=chunk_string + str(i) + ".succinct") +# print("read " + str(i)) +# q = file.File(0, obj['Body'].read().decode('utf-8'), 32, 32, 128, 0, 1) +# compressed_content = q.GetContent().tobytes() +# print("compressed " + str(i)) +# s3.put_object(Body=compressed_content, Bucket='succinct-datasets', Key=chunk_string + "compressed-" + str(i) + ".succinct") +# print("uploaded " + str(i)) + +# # without s3 +# for i in range(1660, 1667): +# f = open(chunk_string + str(i) + ".succinct","r") +# content = f.read() +# q = file.File(0, content, 32, 32, 128, 0, 1) +# text_file = open(chunk_string + str(i) + "-compressed-local.succinct", "wb") +# text_file.write(q.GetContent().tobytes()) + +# os.kill(PID, signal.SIGUSR1) +# print("done") + +# # **** UPLOAD FILE TO BUCKET **** +# s3 = boto3.resource("s3") +# os.chdir("/tmp") +# f = open("test.txt","w") +# f.write("hello this is a test") +# f.close() +# s3.meta.client.upload_file("/tmp/test.txt", "succinct-datasets", "test.txt") + +# # **** COMPRESS BY INPUT **** +# # Get file content from S3 and save as "input" +# s3 = boto3.client("s3") +# obj = s3.get_object(Bucket='succinct-datasets', Key=event['key1']) +# input = obj['Body'].read().decode('utf-8') + +# # Compress the input using file module +# q = file.File(0, input, 32, 32, 128, 0, 1) +# content = q.GetContent().tobytes() + +# # Upload content back onto S3 in .succinct file +# s3.put_object(Body=content, Bucket='succinct-datasets', Key=event['key1'] + ".succinct") + +# # **** COMPRESS BY FILE **** +# s3 = boto3.resource("s3") +# os.chdir("/tmp") +# s3.Bucket('succinct-datasets').download_file(event['key1'], event['key1']) +# q = file.File(event['key1'], 32, 32, 128, 0, 1) + +# # **** CALCULATE COMPRESSION RATIO **** +# s3 = boto3.resource("s3", aws_access_key_id="AKIAJS52NT4AB7WQAZKA", +# aws_secret_access_key="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", +# region_name="us-east-2") +# bucket = s3.Bucket('succinct-datasets') +# orignal_sizes = 0 +# compressed_sizes = 0 +# for i in range (0, num_chunks): +# orignal_sizes += bucket.Object("sample.txt-chunk-" + str(i) + ".succinct").content_length +# compressed_sizes += bucket.Object("sample.txt-chunk-compressed-" + str(i) + ".succinct").content_length + +# print(orignal_sizes) +# print(compressed_sizes) +# print(orignal_sizes/compressed_sizes) + + +# # **** REMOVE OBJECTS FROM S3 **** +# s3 = boto3.resource("s3") +# for i in range (0, 200): +# obj = s3.Object("succinct-datasets", event['key1'] + "-chunk-compressed-" + str(i) + ".succinct") +# obj.delete() +# obj = s3.Object("succinct-datasets", event['key1'] + "-chunk-" + str(i) + ".succinct") +# obj.delete() + +print("File compression and upload is complete") +print("read time: " + str(read_time)) +print("compress time: " + str(compress_time)) +print("upload time: " + str(upload_time)) +print("TOTAL DURATION: " + str(time.time() - start)) +print(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) \ No newline at end of file diff --git a/benchmark-testing/file_splitter.py b/benchmark-testing/file_splitter.py new file mode 100644 index 0000000..5e182d3 --- /dev/null +++ b/benchmark-testing/file_splitter.py @@ -0,0 +1,22 @@ +# Declare variables +column_12 = [] +output_chunks = [] +num_records = 100000 + +# Open file +f = open("sample.txt", "r", encoding = "ISO-8859-1") + +# Parse input +for line in f: + last_item = (line.split())[-1] + column_12.append(last_item) + +# Separate column_12 list into chunks of size num_records +for i in range(0, len(column_12), num_records): + output_chunks.append(column_12[i:i+num_records]) + +# Output these chunks into their own files +for i in range(0, len(output_chunks)): + output_file = open("sample.txt-chunk-" + str(i) + ".succinct", "w") + for j in range(0, len(output_chunks[i])): + output_file.write(str(output_chunks[i][j]) + "\n") \ No newline at end of file diff --git a/core/include/npa/delta_encoded_npa.h b/core/include/npa/delta_encoded_npa.h index 92dd35c..561deca 100644 --- a/core/include/npa/delta_encoded_npa.h +++ b/core/include/npa/delta_encoded_npa.h @@ -123,7 +123,7 @@ class DeltaEncodedNPA : public NPA { // Get all NPA values int64_t *lNPA = new int64_t[npa_size_](); - // fprintf(stderr, "the npa_size_ is: %" PRIu64 "\n", npa_size_); + fprintf(stderr, "the npa_size_ is: %" PRIu64 "\n", npa_size_); uint64_t first_idx, cur_idx, nxt_idx, num_elements_per_chunk; std::thread constructor_thread[8]; @@ -173,7 +173,7 @@ class DeltaEncodedNPA : public NPA { // Get all NPA values int64_t *lNPA = new int64_t[npa_size_](); - // fprintf(stderr, "the npa_size_ is: %" PRIu64 "\n", npa_size_); + fprintf(stderr, "the npa_size_ is: %" PRIu64 "\n", npa_size_); uint64_t first_idx, cur_idx, nxt_idx, num_elements_per_chunk; std::thread constructor_thread[8]; @@ -197,6 +197,7 @@ class DeltaEncodedNPA : public NPA { } for (uint8_t i = 0; i < 8; i++) { + // fprintf(stderr, "thread loop: %i\n", i); constructor_thread[i].join(); } @@ -351,19 +352,20 @@ class DeltaEncodedNPA : public NPA { int64_t first_idx) { // ISA Stream is configured to start reading from correct position uint64_t cur_idx, nxt_idx; - // fprintf(stderr, "*** start_pos: %" PRIu64 ", first_idx: %" PRId64 "\n", start_pos, first_idx); + fprintf(stderr, "*** start_pos: %" PRIu64 ", first_idx: %" PRId64 "\n", start_pos, first_idx); ArrayStream isa_stream(isa_file, start_pos); cur_idx = isa_stream.Get(); + fprintf(stderr, "n_elems = %" PRIu64 "\n", n_elems); for (uint64_t i = 0; i < n_elems; i++) { nxt_idx = isa_stream.Get(); - // fprintf(stderr, "Setting lNPA[ %" PRIu64 "] = %" PRIu64 "\n", cur_idx, nxt_idx); + fprintf(stderr, "Setting lNPA[ %" PRIu64 "] = %" PRIu64 "\n", cur_idx, nxt_idx); lNPA[cur_idx] = nxt_idx; cur_idx = nxt_idx; } - // fprintf(stderr, "cur_idx is :%" PRIu64 " when exiting the loop\n", cur_idx); + fprintf(stderr, "cur_idx is :%" PRIu64 " when exiting the loop\n", cur_idx); if (first_idx > 0) { - // fprintf(stderr, "TRYING TO SET lNPA[ %" PRIu64 "] = %" PRIu64 "\n", cur_idx, first_idx); + fprintf(stderr, "TRYING TO SET lNPA[ %" PRIu64 "] = %" PRIu64 "\n", cur_idx, first_idx); lNPA[cur_idx] = first_idx; } isa_stream.Close(); @@ -375,23 +377,26 @@ class DeltaEncodedNPA : public NPA { int64_t first_idx, uint64_t npa_size) { // ISA Stream is configured to start reading from correct position uint64_t cur_idx, nxt_idx; - //fprintf(stderr, "*** start_pos: %" PRIu64 ", first_idx: %" PRId64 "\n", start_pos, first_idx); + fprintf(stderr, "*** start_pos: %" PRIu64 ", first_idx: %" PRId64 "\n", start_pos, first_idx); ArrayInput isa_array(lISA, start_pos); cur_idx = isa_array.Get(); + fprintf(stderr, "n_elems = %" PRIu64 "\n", n_elems); + if (cur_idx >= npa_size){ + fprintf(stderr, "out of bounds\n"); cur_idx = 0; } else { for (uint64_t i = 0; i < n_elems; i++) { nxt_idx = isa_array.Get(); - // fprintf(stderr, "Setting lNPA[ %" PRIu64 "] = %" PRIu64 "\n", cur_idx, nxt_idx); + fprintf(stderr, "Setting lNPA[ %" PRIu64 "] = %" PRIu64 "\n", cur_idx, nxt_idx); lNPA[cur_idx] = nxt_idx; cur_idx = nxt_idx; } } - // fprintf(stderr, "cur_idx is :%" PRIu64 " when exiting the loop\n", cur_idx); - if (first_idx > 0) { - // fprintf(stderr, "TRYING TO SET lNPA[ %" PRIu64 "] = %" PRIu64 "\n", cur_idx, first_idx); + fprintf(stderr, "cur_idx is :%" PRIu64 " when exiting the loop\n", cur_idx); + if (first_idx > 0 && cur_idx < npa_size) { + fprintf(stderr, "TRYING TO SET lNPA[ %" PRIu64 "] = %" PRIu64 "\n", cur_idx, first_idx); lNPA[cur_idx] = first_idx; } } diff --git a/core/src/npa/elias_gamma_encoded_npa.cc b/core/src/npa/elias_gamma_encoded_npa.cc index a1884e7..ecd78dc 100644 --- a/core/src/npa/elias_gamma_encoded_npa.cc +++ b/core/src/npa/elias_gamma_encoded_npa.cc @@ -135,7 +135,10 @@ void EliasGammaEncodedNPA::CreateDeltaEncodedVector( } } else { long delta = data[i] - last_val; - assert(delta > 0); + // assert(delta > 0); + if (delta <= 0){ + fprintf(stderr, "WARNING DELTA <= 0 --> %ld = %" PRIu64 " - %" PRIu64 "\n",delta,data[i],last_val); + } _deltas.push_back(delta); delta_enc_size = EliasGammaEncodingSize(delta); diff --git a/core/src/succinct_core.cc b/core/src/succinct_core.cc index 8ac903a..0fde7a3 100644 --- a/core/src/succinct_core.cc +++ b/core/src/succinct_core.cc @@ -513,8 +513,6 @@ void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, s_allocator.s_free(lISA); s_allocator.s_free(lSA); - // fprintf(stderr, "npa switch\n"); - } } diff --git a/examples/lambda/lambdatest.py b/examples/lambda/lambdatest.py index cc97a15..b1f0e87 100644 --- a/examples/lambda/lambdatest.py +++ b/examples/lambda/lambdatest.py @@ -8,29 +8,40 @@ import time from threading import Thread -# Change the filenames to succinctdi_sa, succinctdir_isa .... and upload to s3 -def uploadDirectory(path,bucketname, name): - s3 = boto3.client("s3") - for root,dirs,files in os.walk(path): - for f in files: - s3.upload_file(os.path.join(root,f), bucketname, f) +# Compress time global +compress_time = 0 +read_time = 0 +upload_time = 0 + def read_chunk(i, chunk_string, read_chunks, s3): - # print("reading chunk " + str(i)) + print("reading chunk " + str(i)) + global read_time + start = time.time() obj = s3.get_object(Bucket='succinct-datasets', Key=chunk_string + str(i) + ".succinct") read_chunks.append(obj['Body'].read().decode('utf-8')) + del obj + read_time += time.time() - start def compress_chunk(i, read_chunks, compressed_chunks): - # print("compressing chunk " + str(i)) + print("compressing chunk " + str(i)) + global compress_time + start = time.time() q = file.File(0, read_chunks[i], 32, 32, 128, 0, 1) compressed_chunks.append(q.GetContent().tobytes()) + q.DeleteContent() + del q read_chunks[i] = None + compress_time += time.time() - start def upload_chunk(i, chunk_string, compressed_chunks, s3): - # print("uploading chunk " + str(i)) + print("uploading chunk " + str(i)) + global upload_time + start = time.time() s3.put_object(Body=compressed_chunks[i], Bucket='succinct-datasets', Key=chunk_string + "compressed-" + str(i) + ".succinct") # Remove chunk here compressed_chunks[i] = None + upload_time += time.time() - start @@ -39,8 +50,8 @@ def call_compress (event, context): # Define variables chunk_string = event['key1'] + "-chunk-" s3 = boto3.client("s3") - # Depends on number of chunks file is split into - num_chunks = 292 + # Depends on number of chunks file is split + num_chunks = 1167 read_chunks = [] compressed_chunks = [] @@ -185,11 +196,11 @@ def call_compress (event, context): # # **** REMOVE OBJECTS FROM S3 **** # s3 = boto3.resource("s3") - # for i in range (0, 200): + # for i in range (0, 1500): # obj = s3.Object("succinct-datasets", event['key1'] + "-chunk-compressed-" + str(i) + ".succinct") # obj.delete() - # obj = s3.Object("succinct-datasets", event['key1'] + "-chunk-" + str(i) + ".succinct") - # obj.delete() + #obj = s3.Object("succinct-datasets", event['key1'] + "-chunk-" + str(i) + ".succinct") + #obj.delete() print("File compression and upload is complete") def call_query (event, context): diff --git a/examples/pysuccinct/pyfile_input.py b/examples/pysuccinct/pyfile_input.py index fc63ded..7d4681e 100644 --- a/examples/pysuccinct/pyfile_input.py +++ b/examples/pysuccinct/pyfile_input.py @@ -81,7 +81,7 @@ def RepresentsInt(s): isa_sampling_rate, npa_sampling_rate, sampling_scheme, npa_encoding_scheme) - content = string(q.GetContent()).encode('ASCII') + content = q.GetContent().tobytes() # print(str(content,'ISO-8859-1')) text_file = open(inputpath + ".succinct", "w") text_file.write(content) diff --git a/pysuccinct/file.cpp b/pysuccinct/file.cpp index c634eff..a808a5d 100644 --- a/pysuccinct/file.cpp +++ b/pysuccinct/file.cpp @@ -171,6 +171,12 @@ struct File { return result; } + // Deconstructor + void DeleteContent(){ + delete s_file_; + delete file_content_; + } + //File members SuccinctFile *s_file_; unsigned char *file_content_; @@ -188,6 +194,7 @@ BOOST_PYTHON_MODULE(file){ .def("Search", &File::Search) .def("Count", &File::Count) .def("Extract", &File::Extract) + .def("DeleteContent", &File::DeleteContent) ; } \ No newline at end of file From e63ed44b13506b7ce5352407035f3ea325dca6f5 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Fri, 31 Jul 2020 16:13:47 -0400 Subject: [PATCH 36/39] Removed keys --- benchmark-testing/chunk_compressor.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/benchmark-testing/chunk_compressor.py b/benchmark-testing/chunk_compressor.py index deb89c5..61654ae 100644 --- a/benchmark-testing/chunk_compressor.py +++ b/benchmark-testing/chunk_compressor.py @@ -179,9 +179,6 @@ def upload_chunk(upload_index, chunk_string, compressed_arg, s3, code_start): start = time.time() # Define variables chunk_string = "sample.txt" + "-chunk-" -s3 = boto3.client("s3", aws_access_key_id="AKIAJS52NT4AB7WQAZKA", - aws_secret_access_key="5nc0Bc5HD8ugeqLRfF3h6kR9OCfLfQ/VTRYOaxy/", - region_name="us-east-2") # Depends on number of chunks file is split into num_chunks = 1167 read_chunks = [] @@ -434,9 +431,6 @@ def upload_chunk(upload_index, chunk_string, compressed_arg, s3, code_start): # q = file.File(event['key1'], 32, 32, 128, 0, 1) # # **** CALCULATE COMPRESSION RATIO **** -# s3 = boto3.resource("s3", aws_access_key_id="AKIAJS52NT4AB7WQAZKA", -# aws_secret_access_key="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", -# region_name="us-east-2") # bucket = s3.Bucket('succinct-datasets') # orignal_sizes = 0 # compressed_sizes = 0 From 02b86f8f432d94cb054d6b4db4e6ec9d95f314fc Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Mon, 3 Aug 2020 21:15:06 -0400 Subject: [PATCH 37/39] pipelined threads are now added via indices instead of append --- benchmark-testing/chunk_compressor.py | 433 +++++++++++++------------- 1 file changed, 219 insertions(+), 214 deletions(-) diff --git a/benchmark-testing/chunk_compressor.py b/benchmark-testing/chunk_compressor.py index 61654ae..77bd00a 100644 --- a/benchmark-testing/chunk_compressor.py +++ b/benchmark-testing/chunk_compressor.py @@ -71,108 +71,110 @@ def finish(self): def do_nothing(*args): pass -# **** PARALLEL EXECUTION WITH REUSABLE THREADS **** -def read_chunk(read_index, chunk_string, read_arg, s3, code_start): - # Starting to read chunk - global read_time - i = read_index[0] - print("reading chunk " + str(i)) - start = time.time() - - ### - obj = s3.get_object(Bucket='succinct-datasets', Key=chunk_string + str(i) + ".succinct") - read_arg[0] = obj['Body'].read().decode('utf-8') - del obj - ### - - # write chunk times to file and mark time - # end = time.time() - # read_time += end - start - # read_time_file = open("read_time_file.txt","a") - # read_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " duration: " + str(end - start) + "\n") - -def compress_chunk(compress_index, read_arg, compressed_arg, code_start): - # Starting to compress chunk - global compress_time - i = compress_index[0] - print("compressing chunk " + str(i)) - start = time.time() - - ### - q = file.File(0, read_arg[0], 32, 32, 128, 0, 1) - compressed_arg[0] = q.GetContent().tobytes() - q.DeleteContent() - del q - ### - - # write chunk times to file and mark time - # end = time.time() - # compress_time += end - start - # compress_time_file = open("compress_time_file.txt","a") - # compress_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " duration: " + str(end - start) + "\n") - -def upload_chunk(upload_index, chunk_string, compressed_arg, s3, code_start): - # Starting to upload chunk - global upload_time - i = upload_index[0] - print("uploading chunk " + str(i)) - start = time.time() - - ### - s3.put_object(Body=compressed_arg[0], Bucket='succinct-datasets', Key=chunk_string + "compressed-" + str(i) + ".succinct") - ### - - # write chunk times to file and mark time - # end = time.time() - # upload_time += end - start - # upload_time_file = open("upload_time_file.txt","a") - # upload_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " duration: " + str(end - start) + "\n") - - -# # **** PARALLEL EXECUTION WITH LIST OF THREAD FUNCTIONS **** - -# def read_chunk(i, chunk_string, read_chunks, s3, code_start): -# print("reading chunk " + str(i)) +# # **** PARALLEL EXECUTION WITH REUSABLE THREADS **** +# def read_chunk(read_index, chunk_string, read_arg, s3, code_start): +# # Starting to read chunk # global read_time +# i = read_index[0] +# print("reading chunk " + str(i)) # start = time.time() + +# ### # obj = s3.get_object(Bucket='succinct-datasets', Key=chunk_string + str(i) + ".succinct") -# read_chunks.append(obj['Body'].read().decode('utf-8')) +# read_arg[0] = obj['Body'].read().decode('utf-8') # del obj -# end = time.time() -# read_time += end - start +# ### -# # # write chunk times to file +# # write chunk times to file and mark time +# # end = time.time() +# # read_time += end - start # # read_time_file = open("read_time_file.txt","a") -# # read_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " finish: " + str(end - code_start) + "\n") +# # read_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " duration: " + str(end - start) + "\n") -# def compress_chunk(i, read_chunks, compressed_chunks, code_start): -# print("compressing chunk " + str(i)) +# def compress_chunk(compress_index, read_arg, compressed_arg, code_start): +# # Starting to compress chunk # global compress_time +# i = compress_index[0] +# print("compressing chunk " + str(i)) # start = time.time() -# q = file.File(0, read_chunks[i], 32, 32, 128, 0, 1) -# compressed_chunks.append(q.GetContent().tobytes()) + +# ### +# q = file.File(0, read_arg[0], 32, 32, 128, 0, 1) +# compressed_arg[0] = q.GetContent().tobytes() # q.DeleteContent() # del q -# read_chunks[i] = None -# end = time.time() -# compress_time += end - start +# ### -# # # write chunk times to file +# # write chunk times to file and mark time +# # end = time.time() +# # compress_time += end - start # # compress_time_file = open("compress_time_file.txt","a") -# # compress_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " finish: " + str(end - code_start) + "\n") +# # compress_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " duration: " + str(end - start) + "\n") -# def upload_chunk(i, chunk_string, compressed_chunks, s3, code_start): -# print("uploading chunk " + str(i)) +# def upload_chunk(upload_index, chunk_string, compressed_arg, s3, code_start): +# # Starting to upload chunk # global upload_time +# i = upload_index[0] +# print("uploading chunk " + str(i)) # start = time.time() -# s3.put_object(Body=compressed_chunks[i], Bucket='succinct-datasets', Key=chunk_string + "compressed-" + str(i) + ".succinct") -# compressed_chunks[i] = None -# end = time.time() -# upload_time += end - start - # # write chunk times to file - # upload_time_file = open("upload_time_file.txt","a") - # upload_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " finish: " + str(end - code_start) + "\n") +# ### +# s3.put_object(Body=compressed_arg[0], Bucket='succinct-datasets', Key=chunk_string + "compressed-" + str(i) + ".succinct") +# ### + +# # write chunk times to file and mark time +# # end = time.time() +# # upload_time += end - start +# # upload_time_file = open("upload_time_file.txt","a") +# # upload_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " duration: " + str(end - start) + "\n") + + +# **** PARALLEL EXECUTION WITH LIST OF THREAD FUNCTIONS **** + +def read_chunk(i, chunk_string, read_chunks, s3, code_start): + print("reading chunk " + str(i)) + global read_time + start = time.time() + obj = s3.get_object(Bucket='succinct-datasets', Key=chunk_string + str(i) + ".succinct") + # writing read chunk str(i) to pos len(read_chunks) + read_chunks[i] = obj['Body'].read().decode('utf-8') + del obj + end = time.time() + read_time += end - start + + # write chunk times to file + read_time_file = open("read_time_file.txt","a") + read_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " duration: " + str(end - start) + "\n") + +def compress_chunk(i, read_chunks, compressed_chunks, code_start): + print("compressing chunk " + str(i)) + global compress_time + start = time.time() + q = file.File(0, read_chunks[i], 32, 32, 128, 0, 1) + # writing compressed chunk str(i) to pos len(compressed_chunks) + compressed_chunks[i] = q.GetContent().tobytes() + q.DeleteContent() + del q + read_chunks[i] = None + end = time.time() + compress_time += end - start + + # write chunk times to file + compress_time_file = open("compress_time_file.txt","a") + compress_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " duration: " + str(end - start) + "\n") + +def upload_chunk(i, chunk_string, compressed_chunks, s3, code_start): + print("uploading chunk " + str(i)) + global upload_time + start = time.time() + s3.put_object(Body=compressed_chunks[i], Bucket='succinct-datasets', Key=chunk_string + "compressed-" + str(i) + ".succinct") + compressed_chunks[i] = None + end = time.time() + upload_time += end - start + + # write chunk times to file + upload_time_file = open("upload_time_file.txt","a") + upload_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " duration: " + str(end - start) + "\n") # **** FUNCTION START **** # compress file @@ -181,8 +183,8 @@ def upload_chunk(upload_index, chunk_string, compressed_arg, s3, code_start): chunk_string = "sample.txt" + "-chunk-" # Depends on number of chunks file is split into num_chunks = 1167 -read_chunks = [] -compressed_chunks = [] +read_chunks = [None] * num_chunks +compressed_chunks = [None] * num_chunks # COMPRESS WITH FILES # q = file.File("sample.txt-chunk-1160.succinct", 32, 32, 128, 0, 1) @@ -203,161 +205,164 @@ def upload_chunk(upload_index, chunk_string, compressed_arg, s3, code_start): # signal.signal(signal.SIGUSR1, do_nothing) -# **** PARALLEL EXECUTION WITH REUSABLE THREADS **** +# # **** PARALLEL EXECUTION WITH REUSABLE THREADS **** -read_index = [0] -compress_index = [0] -upload_index = [0] +# read_index = [0] +# compress_index = [0] +# upload_index = [0] -# Variables to keep track of read and compressed data -read_arg = [0] -compressed_arg = [0] +# # Variables to keep track of read and compressed data +# read_arg = [0] +# compressed_arg = [0] -read_thread = ReusableThread(target = read_chunk, args=(read_index, chunk_string, read_arg, s3, start)) -compress_thread = ReusableThread(target = compress_chunk, args=(compress_index, read_arg, compressed_arg, start)) -upload_thread = ReusableThread(target = upload_chunk, args=(upload_index, chunk_string, compressed_arg, s3, start)) +# read_thread = ReusableThread(target = read_chunk, args=(read_index, chunk_string, read_arg, s3, start)) +# compress_thread = ReusableThread(target = compress_chunk, args=(compress_index, read_arg, compressed_arg, start)) +# upload_thread = ReusableThread(target = upload_chunk, args=(upload_index, chunk_string, compressed_arg, s3, start)) -for i in range(0, num_chunks + 2): - if (i == 0): - # START: read - read_index[0] = i +# for i in range(0, num_chunks + 2): +# if (i == 0): +# # START: read +# read_index[0] = i - read_thread.start() - elif (i == 1): - # RESTART: read START: compress - read_thread.join() +# read_thread.start() +# elif (i == 1): +# # RESTART: read START: compress +# read_thread.join() - read_index[0] = i - compress_index[0] = i - 1 +# read_index[0] = i +# compress_index[0] = i - 1 - compress_thread.start() - read_thread.restart() - elif (i == 2): - # RESTART: compress, read START: upload - read_thread.join() - compress_thread.join() +# compress_thread.start() +# read_thread.restart() +# elif (i == 2): +# # RESTART: compress, read START: upload +# read_thread.join() +# compress_thread.join() - read_index[0] = i - compress_index[0] = i - 1 - upload_index[0] = i - 2 +# read_index[0] = i +# compress_index[0] = i - 1 +# upload_index[0] = i - 2 - upload_thread.start() - compress_thread.restart() - read_thread.restart() +# upload_thread.start() +# compress_thread.restart() +# read_thread.restart() - elif (i == num_chunks): - # RESTART: upload, compress - read_thread.join() - compress_thread.join() - upload_thread.join() +# elif (i == num_chunks): +# # RESTART: upload, compress +# read_thread.join() +# compress_thread.join() +# upload_thread.join() - compress_index [0] = i - 1 - upload_index [0] = i - 2 +# compress_index [0] = i - 1 +# upload_index [0] = i - 2 - upload_thread.restart() - compress_thread.restart() - elif (i == num_chunks + 1): - # RESTART: upload - compress_thread.join() - upload_thread.join() +# upload_thread.restart() +# compress_thread.restart() +# elif (i == num_chunks + 1): +# # RESTART: upload +# compress_thread.join() +# upload_thread.join() - upload_index [0] = i - 2 +# upload_index [0] = i - 2 - upload_thread.restart() +# upload_thread.restart() - # Join last thread and finish all threads - upload_thread.join() +# # Join last thread and finish all threads +# upload_thread.join() - read_thread.finish() - compress_thread.finish() - upload_thread.finish() - else: - # RESTART: upload, compress, read - read_thread.join() - compress_thread.join() - upload_thread.join() +# read_thread.finish() +# compress_thread.finish() +# upload_thread.finish() +# else: +# # RESTART: upload, compress, read +# read_thread.join() +# compress_thread.join() +# upload_thread.join() - read_index[0] = i - compress_index[0] = i - 1 - upload_index[0] = i - 2 +# read_index[0] = i +# compress_index[0] = i - 1 +# upload_index[0] = i - 2 - upload_thread.restart() - compress_thread.restart() - read_thread.restart() +# upload_thread.restart() +# compress_thread.restart() +# read_thread.restart() -# # **** PARALLEL EXECUTION WITH LIST OF THREADS **** -# read_threads = [] -# compress_threads = [] -# upload_threads = [] +# **** PARALLEL EXECUTION WITH LIST OF THREADS **** +read_threads = [None] * num_chunks +compress_threads = [None] * num_chunks +upload_threads = [None] * num_chunks -# for i in range(0, num_chunks + 2): -# if (i == 0): -# # Create read thread -# read_thread = Thread(target=read_chunk, args=(0, chunk_string, read_chunks, s3, start)) -# read_thread.start() -# read_threads.append(read_thread) -# elif (i == 1): -# # Create read thread -# read_thread = Thread(target=read_chunk, args=(1, chunk_string, read_chunks, s3, start)) -# read_thread.start() -# read_threads.append(read_thread) +for i in range(0, num_chunks + 2): + if (i == 0): + # Create read thread + read_thread = Thread(target=read_chunk, args=(0, chunk_string, read_chunks, s3, start)) + read_thread.start() + read_threads[i] = read_thread -# # If read_threads[0] isn't done, finish -# if read_threads[0].is_alive(): -# read_threads[0].join() -# # Create compress thread -# compress_thread = Thread(target=compress_chunk, args=(0, read_chunks, compressed_chunks, start)) -# compress_thread.start() -# compress_threads.append(compress_thread) -# elif (i == num_chunks): -# # If read_threads[num_chunks - 1] isn't done, finish -# if read_threads[i - 1].is_alive(): -# read_threads[i - 1].join() + elif (i == 1): + # If read_threads[0] isn't done, finish + if read_threads[0].is_alive(): + read_threads[0].join() -# # If compress_threads[num_chunks - 2] isn't done, finish -# if compress_threads[i - 2].is_alive(): -# compress_threads[i - 2].join() + # Create read thread + read_thread = Thread(target=read_chunk, args=(1, chunk_string, read_chunks, s3, start)) + read_thread.start() + read_threads[i] = read_thread + + # Create compress thread + compress_thread = Thread(target=compress_chunk, args=(0, read_chunks, compressed_chunks, start)) + compress_thread.start() + compress_threads[i-1] = compress_thread + elif (i == num_chunks): + # If read_threads[num_chunks - 1] isn't done, finish + if read_threads[i - 1].is_alive(): + read_threads[i - 1].join() + + # If compress_threads[num_chunks - 2] isn't done, finish + if compress_threads[i - 2].is_alive(): + compress_threads[i - 2].join() -# # Create compress thread -# compress_thread = Thread(target=compress_chunk, args=(i - 1, read_chunks, compressed_chunks, start)) -# compress_thread.start() -# compress_threads.append(compress_thread) -# # Create upload thread -# upload_thread = Thread(target=upload_chunk, args=(i - 2, chunk_string, compressed_chunks, s3, start)) -# upload_thread.start() -# upload_threads.append(upload_thread) -# elif (i == num_chunks + 1): -# # If compress_threads[num_chunks - 1] isn't done, finish -# if compress_threads[i - 2].is_alive(): -# compress_threads[i - 2].join() -# # Create upload thread -# upload_thread = Thread(target=upload_chunk, args=(i - 2, chunk_string, compressed_chunks, s3, start)) -# upload_thread.start() -# upload_threads.append(upload_thread) -# else: -# # Create read thread -# read_thread = Thread(target=read_chunk, args=(i, chunk_string, read_chunks, s3, start)) -# read_thread.start() -# read_threads.append(read_thread) + # Create compress thread + compress_thread = Thread(target=compress_chunk, args=(i - 1, read_chunks, compressed_chunks, start)) + compress_thread.start() + compress_threads[i-1] = compress_thread -# # If read_threads[i - 1] isn't done, finish -# if read_threads[i - 1].is_alive(): -# read_threads[i - 1].join() + # Create upload thread + upload_thread = Thread(target=upload_chunk, args=(i - 2, chunk_string, compressed_chunks, s3, start)) + upload_thread.start() + upload_threads[i-2] = upload_thread + elif (i == num_chunks + 1): + # If compress_threads[num_chunks - 1] isn't done, finish + if compress_threads[i - 2].is_alive(): + compress_threads[i - 2].join() + # Create upload thread + upload_thread = Thread(target=upload_chunk, args=(i - 2, chunk_string, compressed_chunks, s3, start)) + upload_thread.start() + upload_threads[i-2] = upload_thread + else: + # If read_threads[i - 1] isn't done, finish + if read_threads[i - 1].is_alive(): + read_threads[i - 1].join() -# # If compress_threads[i - 2] isn't done, finish -# if compress_threads[i - 2].is_alive(): -# compress_threads[i - 2].join() + # If compress_threads[i - 2] isn't done, finish + if compress_threads[i - 2].is_alive(): + compress_threads[i - 2].join() -# # Create compress thread -# compress_thread = Thread(target=compress_chunk, args=(i - 1, read_chunks, compressed_chunks, start)) -# compress_thread.start() -# compress_threads.append(compress_thread) + # Create read thread + read_thread = Thread(target=read_chunk, args=(i, chunk_string, read_chunks, s3, start)) + read_thread.start() + read_threads[i] = read_thread -# # Create upload thread -# upload_thread = Thread(target=upload_chunk, args=(i - 2, chunk_string, compressed_chunks, s3, start)) -# upload_thread.start() -# upload_threads.append(upload_thread) + # Create compress thread + compress_thread = Thread(target=compress_chunk, args=(i - 1, read_chunks, compressed_chunks, start)) + compress_thread.start() + compress_threads[i-1] = compress_thread + + # Create upload thread + upload_thread = Thread(target=upload_chunk, args=(i - 2, chunk_string, compressed_chunks, s3, start)) + upload_thread.start() + upload_threads[i-2] = upload_thread # os.kill(PID, signal.SIGUSR1) From bd8de37a4d597d3acadec06d632f46b401a68555 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Fri, 7 Aug 2020 14:49:48 -0400 Subject: [PATCH 38/39] Modified chunk compressor to use aiobotocore --- benchmark-testing/chunk_compressor.py | 689 ++++++++++++------------ core/include/npa/delta_encoded_npa.h | 53 +- core/src/npa/elias_gamma_encoded_npa.cc | 8 +- core/src/succinct_core.cc | 4 +- 4 files changed, 377 insertions(+), 377 deletions(-) diff --git a/benchmark-testing/chunk_compressor.py b/benchmark-testing/chunk_compressor.py index 77bd00a..5999b33 100644 --- a/benchmark-testing/chunk_compressor.py +++ b/benchmark-testing/chunk_compressor.py @@ -5,6 +5,10 @@ from threading import Thread, Event import resource import signal +import asyncio +import aiobotocore +# from memory_profiler import profile +# from guppy import hpy # Compress time global compress_time = 0 @@ -72,73 +76,15 @@ def do_nothing(*args): pass # # **** PARALLEL EXECUTION WITH REUSABLE THREADS **** -# def read_chunk(read_index, chunk_string, read_arg, s3, code_start): -# # Starting to read chunk -# global read_time -# i = read_index[0] -# print("reading chunk " + str(i)) -# start = time.time() - -# ### -# obj = s3.get_object(Bucket='succinct-datasets', Key=chunk_string + str(i) + ".succinct") -# read_arg[0] = obj['Body'].read().decode('utf-8') -# del obj -# ### - -# # write chunk times to file and mark time -# # end = time.time() -# # read_time += end - start -# # read_time_file = open("read_time_file.txt","a") -# # read_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " duration: " + str(end - start) + "\n") - -# def compress_chunk(compress_index, read_arg, compressed_arg, code_start): -# # Starting to compress chunk -# global compress_time -# i = compress_index[0] -# print("compressing chunk " + str(i)) -# start = time.time() - -# ### -# q = file.File(0, read_arg[0], 32, 32, 128, 0, 1) -# compressed_arg[0] = q.GetContent().tobytes() -# q.DeleteContent() -# del q -# ### - -# # write chunk times to file and mark time -# # end = time.time() -# # compress_time += end - start -# # compress_time_file = open("compress_time_file.txt","a") -# # compress_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " duration: " + str(end - start) + "\n") - -# def upload_chunk(upload_index, chunk_string, compressed_arg, s3, code_start): -# # Starting to upload chunk -# global upload_time -# i = upload_index[0] -# print("uploading chunk " + str(i)) -# start = time.time() - -# ### -# s3.put_object(Body=compressed_arg[0], Bucket='succinct-datasets', Key=chunk_string + "compressed-" + str(i) + ".succinct") -# ### - -# # write chunk times to file and mark time -# # end = time.time() -# # upload_time += end - start -# # upload_time_file = open("upload_time_file.txt","a") -# # upload_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " duration: " + str(end - start) + "\n") - - -# **** PARALLEL EXECUTION WITH LIST OF THREAD FUNCTIONS **** - -def read_chunk(i, chunk_string, read_chunks, s3, code_start): +async def read_chunk(i, chunk_string, read_chunks, client, code_start): print("reading chunk " + str(i)) global read_time start = time.time() - obj = s3.get_object(Bucket='succinct-datasets', Key=chunk_string + str(i) + ".succinct") - # writing read chunk str(i) to pos len(read_chunks) - read_chunks[i] = obj['Body'].read().decode('utf-8') + + obj = await client.get_object(Bucket='succinct-datasets', Key=chunk_string + str(i) + ".succinct") + read_chunks[i] = await obj['Body'].read() del obj + end = time.time() read_time += end - start @@ -146,16 +92,17 @@ def read_chunk(i, chunk_string, read_chunks, s3, code_start): read_time_file = open("read_time_file.txt","a") read_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " duration: " + str(end - start) + "\n") -def compress_chunk(i, read_chunks, compressed_chunks, code_start): +async def compress_chunk(i, read_chunks, compressed_chunks, code_start): print("compressing chunk " + str(i)) global compress_time start = time.time() + q = file.File(0, read_chunks[i], 32, 32, 128, 0, 1) - # writing compressed chunk str(i) to pos len(compressed_chunks) compressed_chunks[i] = q.GetContent().tobytes() q.DeleteContent() del q read_chunks[i] = None + end = time.time() compress_time += end - start @@ -163,12 +110,14 @@ def compress_chunk(i, read_chunks, compressed_chunks, code_start): compress_time_file = open("compress_time_file.txt","a") compress_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " duration: " + str(end - start) + "\n") -def upload_chunk(i, chunk_string, compressed_chunks, s3, code_start): +async def upload_chunk(i, chunk_string, compressed_chunks, client, code_start): print("uploading chunk " + str(i)) global upload_time start = time.time() - s3.put_object(Body=compressed_chunks[i], Bucket='succinct-datasets', Key=chunk_string + "compressed-" + str(i) + ".succinct") + + await client.put_object(Body=compressed_chunks[i], Bucket='succinct-datasets', Key=chunk_string + "compressed-" + str(i) + ".succinct") compressed_chunks[i] = None + end = time.time() upload_time += end - start @@ -177,288 +126,328 @@ def upload_chunk(i, chunk_string, compressed_chunks, s3, code_start): upload_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " duration: " + str(end - start) + "\n") # **** FUNCTION START **** -# compress file -start = time.time() -# Define variables -chunk_string = "sample.txt" + "-chunk-" -# Depends on number of chunks file is split into -num_chunks = 1167 -read_chunks = [None] * num_chunks -compressed_chunks = [None] * num_chunks - -# COMPRESS WITH FILES -# q = file.File("sample.txt-chunk-1160.succinct", 32, 32, 128, 0, 1) - -# COMPRESS IN MEM -# f = open("sample.txt-chunk-1166.succinct","r") -# content = f.read() -# q = file.File(0, content, 32, 32, 128, 0, 1) - - -# # **** QUERY FOR NUMBER OF CHUNK FILES **** -# s3 = boto3.resource("s3") -# bucket = s3.Bucket('succinct-datasets') -# for bucket_obj in bucket.objects.all(): -# if (chunk_string in str(bucket_obj)): -# num_chunks += 1 -# print("The num chunks is: " + str(num_chunks)) - -# signal.signal(signal.SIGUSR1, do_nothing) - -# # **** PARALLEL EXECUTION WITH REUSABLE THREADS **** - -# read_index = [0] -# compress_index = [0] -# upload_index = [0] - - -# # Variables to keep track of read and compressed data -# read_arg = [0] -# compressed_arg = [0] - -# read_thread = ReusableThread(target = read_chunk, args=(read_index, chunk_string, read_arg, s3, start)) -# compress_thread = ReusableThread(target = compress_chunk, args=(compress_index, read_arg, compressed_arg, start)) -# upload_thread = ReusableThread(target = upload_chunk, args=(upload_index, chunk_string, compressed_arg, s3, start)) - -# for i in range(0, num_chunks + 2): -# if (i == 0): -# # START: read -# read_index[0] = i - -# read_thread.start() -# elif (i == 1): -# # RESTART: read START: compress -# read_thread.join() - -# read_index[0] = i -# compress_index[0] = i - 1 - -# compress_thread.start() -# read_thread.restart() -# elif (i == 2): -# # RESTART: compress, read START: upload -# read_thread.join() -# compress_thread.join() - -# read_index[0] = i -# compress_index[0] = i - 1 -# upload_index[0] = i - 2 - -# upload_thread.start() -# compress_thread.restart() -# read_thread.restart() - -# elif (i == num_chunks): -# # RESTART: upload, compress -# read_thread.join() -# compress_thread.join() -# upload_thread.join() - -# compress_index [0] = i - 1 -# upload_index [0] = i - 2 - -# upload_thread.restart() -# compress_thread.restart() -# elif (i == num_chunks + 1): -# # RESTART: upload -# compress_thread.join() -# upload_thread.join() - -# upload_index [0] = i - 2 - -# upload_thread.restart() - -# # Join last thread and finish all threads -# upload_thread.join() - -# read_thread.finish() -# compress_thread.finish() -# upload_thread.finish() -# else: -# # RESTART: upload, compress, read -# read_thread.join() -# compress_thread.join() -# upload_thread.join() - -# read_index[0] = i -# compress_index[0] = i - 1 -# upload_index[0] = i - 2 - -# upload_thread.restart() -# compress_thread.restart() -# read_thread.restart() - -# **** PARALLEL EXECUTION WITH LIST OF THREADS **** -read_threads = [None] * num_chunks -compress_threads = [None] * num_chunks -upload_threads = [None] * num_chunks - -for i in range(0, num_chunks + 2): - if (i == 0): - # Create read thread - read_thread = Thread(target=read_chunk, args=(0, chunk_string, read_chunks, s3, start)) - read_thread.start() - read_threads[i] = read_thread - - elif (i == 1): - # If read_threads[0] isn't done, finish - if read_threads[0].is_alive(): - read_threads[0].join() - - # Create read thread - read_thread = Thread(target=read_chunk, args=(1, chunk_string, read_chunks, s3, start)) - read_thread.start() - read_threads[i] = read_thread - - # Create compress thread - compress_thread = Thread(target=compress_chunk, args=(0, read_chunks, compressed_chunks, start)) - compress_thread.start() - compress_threads[i-1] = compress_thread - elif (i == num_chunks): - # If read_threads[num_chunks - 1] isn't done, finish - if read_threads[i - 1].is_alive(): - read_threads[i - 1].join() - - # If compress_threads[num_chunks - 2] isn't done, finish - if compress_threads[i - 2].is_alive(): - compress_threads[i - 2].join() +async def execute(): + # compress file + start = time.time() + # # Define variables + chunk_string = "sample.txt" + "-chunk-" + # # Depends on number of chunks file is split into + num_chunks = 10 + read_chunks = [None] * num_chunks + compressed_chunks = [None] * num_chunks + + # COMPRESS WITH FILES + # q = file.File("sample.txt-chunk-1160.succinct", 32, 32, 128, 0, 1) + + # COMPRESS IN MEM + # f = open("sample.txt-chunk-1166.succinct","r") + # content = f.read() + # q = file.File(0, content, 32, 32, 128, 0, 1) + + + # # **** QUERY FOR NUMBER OF CHUNK FILES **** + # s3 = boto3.resource("s3") + # bucket = s3.Bucket('succinct-datasets') + # for bucket_obj in bucket.objects.all(): + # if (chunk_string in str(bucket_obj)): + # num_chunks += 1 + # print("The num chunks is: " + str(num_chunks)) + + # signal.signal(signal.SIGUSR1, do_nothing) + + # **** PARALLEL EXECUTION WITH AIDBOTOCORE **** + + session = aiobotocore.get_session() + async with session.create_client('s3', region_name='us-east-2', + aws_secret_access_key=KEY, + aws_access_key_id=SECRET_KEY) as client: + + #i = 0 + await read_chunk(0, chunk_string, read_chunks, client, start) + + #i = 1 + tasks = [] + tasks.append(read_chunk(1, chunk_string, read_chunks, client, start)) + tasks.append(compress_chunk(0, read_chunks, compressed_chunks, start)) + await asyncio.gather(*tasks) - # Create compress thread - compress_thread = Thread(target=compress_chunk, args=(i - 1, read_chunks, compressed_chunks, start)) - compress_thread.start() - compress_threads[i-1] = compress_thread - - # Create upload thread - upload_thread = Thread(target=upload_chunk, args=(i - 2, chunk_string, compressed_chunks, s3, start)) - upload_thread.start() - upload_threads[i-2] = upload_thread - elif (i == num_chunks + 1): - # If compress_threads[num_chunks - 1] isn't done, finish - if compress_threads[i - 2].is_alive(): - compress_threads[i - 2].join() - # Create upload thread - upload_thread = Thread(target=upload_chunk, args=(i - 2, chunk_string, compressed_chunks, s3, start)) - upload_thread.start() - upload_threads[i-2] = upload_thread - else: - # If read_threads[i - 1] isn't done, finish - if read_threads[i - 1].is_alive(): - read_threads[i - 1].join() + #i = 2 to num_chunks - 1 + for i in range(2, num_chunks): + tasks = [] + tasks.append(read_chunk(i, chunk_string, read_chunks, client, start)) + tasks.append(compress_chunk(i - 1, read_chunks, compressed_chunks, start)) + tasks.append(upload_chunk(i - 2, chunk_string, compressed_chunks, client, start)) + await asyncio.gather(*tasks) + + # for i in range(0, num_chunks + 2): + # if (i == 0): + # await read_chunk(0, chunk_string, read_chunks, client, start) + # elif (i == 1): + # await read_chunk(1, chunk_string, read_chunks, client, start) + # await compress_chunk(0, read_chunks, compressed_chunks, start) + # elif (i == num_chunks): + # await compress_chunk(num_chunks - 1, read_chunks, compressed_chunks, start) + # await upload_chunk(num_chunks - 2, chunk_string, compressed_chunks, client, start) + # elif (i == num_chunks + 1): + # await upload_chunk(num_chunks - 1, chunk_string, compressed_chunks, client, start) + # else: + # await read_chunk(i, chunk_string, read_chunks, client, start) + # await compress_chunk(i - 1, read_chunks, compressed_chunks, start) + # await upload_chunk(i - 2, chunk_string, compressed_chunks, client, start) + + + # # **** PARALLEL EXECUTION WITH REUSABLE THREADS **** + # read_index = [0] + # compress_index = [0] + # upload_index = [0] + + + # read_thread = ReusableThread(target = read_chunk, args=(read_index, chunk_string, read_chunks, client, start)) + # compress_thread = ReusableThread(target = compress_chunk, args=(compress_index, read_chunks, compressed_chunks, start)) + # upload_thread = ReusableThread(target = upload_chunk, args=(upload_index, chunk_string, compressed_chunks, client, start)) + + # for i in range(0, num_chunks + 2): + # if (i == 0): + # # START: read + # read_index[0] = i + + # read_thread.start() + # elif (i == 1): + # # RESTART: read START: compress + # read_thread.join() + + # read_index[0] = i + # compress_index[0] = i - 1 + + # compress_thread.start() + # read_thread.restart() + # elif (i == 2): + # # RESTART: compress, read START: upload + # read_thread.join() + # compress_thread.join() + + # read_index[0] = i + # compress_index[0] = i - 1 + # upload_index[0] = i - 2 + + # upload_thread.start() + # compress_thread.restart() + # read_thread.restart() + # elif (i == num_chunks): + # # RESTART: upload, compress + # read_thread.join() + # compress_thread.join() + # upload_thread.join() + + # compress_index [0] = i - 1 + # upload_index [0] = i - 2 + + # upload_thread.restart() + # compress_thread.restart() + # elif (i == num_chunks + 1): + # # RESTART: upload + # compress_thread.join() + # upload_thread.join() + + # upload_index [0] = i - 2 + + # upload_thread.restart() + + # # Join last thread and finish all threads + # upload_thread.join() + + # read_thread.finish() + # compress_thread.finish() + # upload_thread.finish() + # else: + # # RESTART: upload, compress, read + # read_thread.join() + # compress_thread.join() + # upload_thread.join() + + # read_index[0] = i + # compress_index[0] = i - 1 + # upload_index[0] = i - 2 + + # upload_thread.restart() + # compress_thread.restart() + # read_thread.restart() + + # # **** PARALLEL EXECUTION WITH LIST OF THREADS **** + # read_threads = [None] * num_chunks + # compress_threads = [None] * num_chunks + # upload_threads = [None] * num_chunks + + # for i in range(0, num_chunks + 2): + # if (i == 0): + # # Create read thread + # read_thread = Thread(target=read_chunk, args=(0, chunk_string, read_chunks, s3, start)) + # read_thread.start() + # read_threads[i] = read_thread + + # elif (i == 1): + # # If read_threads[0] isn't done, finish + # if read_threads[0].is_alive(): + # read_threads[0].join() + + # # Create read thread + # read_thread = Thread(target=read_chunk, args=(1, chunk_string, read_chunks, s3, start)) + # read_thread.start() + # read_threads[i] = read_thread + + # # Create compress thread + # compress_thread = Thread(target=compress_chunk, args=(0, read_chunks, compressed_chunks, start)) + # compress_thread.start() + # compress_threads[i-1] = compress_thread + # elif (i == num_chunks): + # # If read_threads[num_chunks - 1] isn't done, finish + # if read_threads[i - 1].is_alive(): + # read_threads[i - 1].join() + + # # If compress_threads[num_chunks - 2] isn't done, finish + # if compress_threads[i - 2].is_alive(): + # compress_threads[i - 2].join() - # If compress_threads[i - 2] isn't done, finish - if compress_threads[i - 2].is_alive(): - compress_threads[i - 2].join() - - # Create read thread - read_thread = Thread(target=read_chunk, args=(i, chunk_string, read_chunks, s3, start)) - read_thread.start() - read_threads[i] = read_thread - - # Create compress thread - compress_thread = Thread(target=compress_chunk, args=(i - 1, read_chunks, compressed_chunks, start)) - compress_thread.start() - compress_threads[i-1] = compress_thread - - # Create upload thread - upload_thread = Thread(target=upload_chunk, args=(i - 2, chunk_string, compressed_chunks, s3, start)) - upload_thread.start() - upload_threads[i-2] = upload_thread - -# os.kill(PID, signal.SIGUSR1) - - -# # **** SEQUENTIAL EXECUTION **** -# for i in range(0, num_chunks + 2): -# if (i == 0): -# read_chunk(0, chunk_string, read_chunks, s3) -# elif (i == 1): -# read_chunk(1, chunk_string, read_chunks, s3) -# compress_chunk(0, read_chunks, compressed_chunks) -# elif (i == num_chunks): -# compress_chunk(num_chunks - 1, read_chunks, compressed_chunks) -# upload_chunk(num_chunks - 2, chunk_string, compressed_chunks, s3) -# elif (i == num_chunks + 1): -# upload_chunk(num_chunks - 1, chunk_string, compressed_chunks, s3) -# else: -# read_chunk(i, chunk_string, read_chunks, s3) -# compress_chunk(i - 1, read_chunks, compressed_chunks) -# upload_chunk(i - 2, chunk_string, compressed_chunks, s3) - - -# signal.signal(signal.SIGUSR1, do_nothing) - -# #with s3 -# for i in range(11,12): -# obj = s3.get_object(Bucket='succinct-datasets', Key=chunk_string + str(i) + ".succinct") -# print("read " + str(i)) -# q = file.File(0, obj['Body'].read().decode('utf-8'), 32, 32, 128, 0, 1) -# compressed_content = q.GetContent().tobytes() -# print("compressed " + str(i)) -# s3.put_object(Body=compressed_content, Bucket='succinct-datasets', Key=chunk_string + "compressed-" + str(i) + ".succinct") -# print("uploaded " + str(i)) - -# # without s3 -# for i in range(1660, 1667): -# f = open(chunk_string + str(i) + ".succinct","r") -# content = f.read() -# q = file.File(0, content, 32, 32, 128, 0, 1) -# text_file = open(chunk_string + str(i) + "-compressed-local.succinct", "wb") -# text_file.write(q.GetContent().tobytes()) - -# os.kill(PID, signal.SIGUSR1) -# print("done") - -# # **** UPLOAD FILE TO BUCKET **** -# s3 = boto3.resource("s3") -# os.chdir("/tmp") -# f = open("test.txt","w") -# f.write("hello this is a test") -# f.close() -# s3.meta.client.upload_file("/tmp/test.txt", "succinct-datasets", "test.txt") - -# # **** COMPRESS BY INPUT **** -# # Get file content from S3 and save as "input" -# s3 = boto3.client("s3") -# obj = s3.get_object(Bucket='succinct-datasets', Key=event['key1']) -# input = obj['Body'].read().decode('utf-8') - -# # Compress the input using file module -# q = file.File(0, input, 32, 32, 128, 0, 1) -# content = q.GetContent().tobytes() - -# # Upload content back onto S3 in .succinct file -# s3.put_object(Body=content, Bucket='succinct-datasets', Key=event['key1'] + ".succinct") - -# # **** COMPRESS BY FILE **** -# s3 = boto3.resource("s3") -# os.chdir("/tmp") -# s3.Bucket('succinct-datasets').download_file(event['key1'], event['key1']) -# q = file.File(event['key1'], 32, 32, 128, 0, 1) - -# # **** CALCULATE COMPRESSION RATIO **** -# bucket = s3.Bucket('succinct-datasets') -# orignal_sizes = 0 -# compressed_sizes = 0 -# for i in range (0, num_chunks): -# orignal_sizes += bucket.Object("sample.txt-chunk-" + str(i) + ".succinct").content_length -# compressed_sizes += bucket.Object("sample.txt-chunk-compressed-" + str(i) + ".succinct").content_length - -# print(orignal_sizes) -# print(compressed_sizes) -# print(orignal_sizes/compressed_sizes) - - -# # **** REMOVE OBJECTS FROM S3 **** -# s3 = boto3.resource("s3") -# for i in range (0, 200): -# obj = s3.Object("succinct-datasets", event['key1'] + "-chunk-compressed-" + str(i) + ".succinct") -# obj.delete() -# obj = s3.Object("succinct-datasets", event['key1'] + "-chunk-" + str(i) + ".succinct") -# obj.delete() - -print("File compression and upload is complete") -print("read time: " + str(read_time)) -print("compress time: " + str(compress_time)) -print("upload time: " + str(upload_time)) -print("TOTAL DURATION: " + str(time.time() - start)) -print(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) \ No newline at end of file + # # Create compress thread + # compress_thread = Thread(target=compress_chunk, args=(i - 1, read_chunks, compressed_chunks, start)) + # compress_thread.start() + # compress_threads[i-1] = compress_thread + + # # Create upload thread + # upload_thread = Thread(target=upload_chunk, args=(i - 2, chunk_string, compressed_chunks, s3, start)) + # upload_thread.start() + # upload_threads[i-2] = upload_thread + # elif (i == num_chunks + 1): + # # If compress_threads[num_chunks - 1] isn't done, finish + # if compress_threads[i - 2].is_alive(): + # compress_threads[i - 2].join() + # # Create upload thread + # upload_thread = Thread(target=upload_chunk, args=(i - 2, chunk_string, compressed_chunks, s3, start)) + # upload_thread.start() + # upload_threads[i-2] = upload_thread + # else: + # # If read_threads[i - 1] isn't done, finish + # if read_threads[i - 1].is_alive(): + # read_threads[i - 1].join() + + # # If compress_threads[i - 2] isn't done, finish + # if compress_threads[i - 2].is_alive(): + # compress_threads[i - 2].join() + + # # Create read thread + # read_thread = Thread(target=read_chunk, args=(i, chunk_string, read_chunks, s3, start)) + # read_thread.start() + # read_threads[i] = read_thread + + # # Create compress thread + # compress_thread = Thread(target=compress_chunk, args=(i - 1, read_chunks, compressed_chunks, start)) + # compress_thread.start() + # compress_threads[i-1] = compress_thread + + # # Create upload thread + # upload_thread = Thread(target=upload_chunk, args=(i - 2, chunk_string, compressed_chunks, s3, start)) + # upload_thread.start() + # upload_threads[i-2] = upload_thread + + # os.kill(PID, signal.SIGUSR1) + + + # **** SEQUENTIAL EXECUTION **** + # for i in range(0, num_chunks + 2): + # if (i == 0): + # read_chunk(0, chunk_string, read_chunks, s3, start) + # elif (i == 1): + # read_chunk(1, chunk_string, read_chunks, s3, start) + # compress_chunk(0, read_chunks, compressed_chunks, start) + # elif (i == num_chunks): + # compress_chunk(num_chunks - 1, read_chunks, compressed_chunks, start) + # upload_chunk(num_chunks - 2, chunk_string, compressed_chunks, s3, start) + # elif (i == num_chunks + 1): + # upload_chunk(num_chunks - 1, chunk_string, compressed_chunks, s3, start) + # else: + # read_chunk(i, chunk_string, read_chunks, s3, start) + # compress_chunk(i - 1, read_chunks, compressed_chunks, start) + # upload_chunk(i - 2, chunk_string, compressed_chunks, s3, start) + + + # signal.signal(signal.SIGUSR1, do_nothing) + + # #with s3 + # for i in range(11,12): + # obj = s3.get_object(Bucket='succinct-datasets', Key=chunk_string + str(i) + ".succinct") + # print("read " + str(i)) + # q = file.File(0, obj['Body'].read().decode('utf-8'), 32, 32, 128, 0, 1) + # compressed_content = q.GetContent().tobytes() + # print("compressed " + str(i)) + # s3.put_object(Body=compressed_content, Bucket='succinct-datasets', Key=chunk_string + "compressed-" + str(i) + ".succinct") + # print("uploaded " + str(i)) + + # # without s3 + # for i in range(1660, 1667): + # f = open(chunk_string + str(i) + ".succinct","r") + # content = f.read() + # q = file.File(0, content, 32, 32, 128, 0, 1) + # text_file = open(chunk_string + str(i) + "-compressed-local.succinct", "wb") + # text_file.write(q.GetContent().tobytes()) + + # os.kill(PID, signal.SIGUSR1) + # print("done") + + # # **** UPLOAD FILE TO BUCKET **** + # s3 = boto3.resource("s3") + # os.chdir("/tmp") + # f = open("test.txt","w") + # f.write("hello this is a test") + # f.close() + # s3.meta.client.upload_file("/tmp/test.txt", "succinct-datasets", "test.txt") + + # # **** COMPRESS BY INPUT **** + # # Get file content from S3 and save as "input" + # s3 = boto3.client("s3") + # obj = s3.get_object(Bucket='succinct-datasets', Key=event['key1']) + # input = obj['Body'].read().decode('utf-8') + + # # Compress the input using file module + # q = file.File(0, input, 32, 32, 128, 0, 1) + # content = q.GetContent().tobytes() + + # # Upload content back onto S3 in .succinct file + # s3.put_object(Body=content, Bucket='succinct-datasets', Key=event['key1'] + ".succinct") + + # # **** COMPRESS BY FILE **** + # s3 = boto3.resource("s3") + # os.chdir("/tmp") + # s3.Bucket('succinct-datasets').download_file(event['key1'], event['key1']) + # q = file.File(event['key1'], 32, 32, 128, 0, 1) + + # # **** CALCULATE COMPRESSION RATIO **** + # bucket = s3.Bucket('succinct-datasets') + # orignal_sizes = 0 + # compressed_sizes = 0 + # for i in range (0, num_chunks): + # orignal_sizes += bucket.Object("sample.txt-chunk-" + str(i) + ".succinct").content_length + # compressed_sizes += bucket.Object("sample.txt-chunk-compressed-" + str(i) + ".succinct").content_length + + # print(orignal_sizes) + # print(compressed_sizes) + # print(orignal_sizes/compressed_sizes) + + + # # **** REMOVE OBJECTS FROM S3 **** + # s3 = boto3.resource("s3") + # for i in range (0, 200): + # obj = s3.Object("succinct-datasets", event['key1'] + "-chunk-compressed-" + str(i) + ".succinct") + # obj.delete() + # obj = s3.Object("succinct-datasets", event['key1'] + "-chunk-" + str(i) + ".succinct") + # obj.delete() + + print("File compression and upload is complete") + print("read time: " + str(read_time)) + print("compress time: " + str(compress_time)) + print("upload time: " + str(upload_time)) + print("TOTAL DURATION: " + str(time.time() - start)) + print(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) + + +loop = asyncio.get_event_loop() +loop.run_until_complete(execute()) \ No newline at end of file diff --git a/core/include/npa/delta_encoded_npa.h b/core/include/npa/delta_encoded_npa.h index 561deca..b1dbcec 100644 --- a/core/include/npa/delta_encoded_npa.h +++ b/core/include/npa/delta_encoded_npa.h @@ -123,7 +123,7 @@ class DeltaEncodedNPA : public NPA { // Get all NPA values int64_t *lNPA = new int64_t[npa_size_](); - fprintf(stderr, "the npa_size_ is: %" PRIu64 "\n", npa_size_); + // fprintf(stderr, "the npa_size_ is: %" PRIu64 "\n", npa_size_); uint64_t first_idx, cur_idx, nxt_idx, num_elements_per_chunk; std::thread constructor_thread[8]; @@ -137,7 +137,7 @@ class DeltaEncodedNPA : public NPA { 0 : npa_size_ - i * num_elements_per_chunk; uint64_t num_elements = SuccinctUtils::Min(remaining_elements, num_elements_per_chunk); - // fprintf(stderr, "loop: %i\n", i); + // // fprintf(stderr, "loop: %i\n", i); constructor_thread[i] = std::thread(&DeltaEncodedNPA::ConstructNPAChunk, lNPA, isa_file, i * num_elements_per_chunk, num_elements, (i == 7) ? first_idx : -1ULL); @@ -147,6 +147,12 @@ class DeltaEncodedNPA : public NPA { constructor_thread[i].join(); } + + // fprintf(stderr, "NPA ARRAY\n"); + // for (uint64_t i= 0; i < npa_size_; i ++){ + // fprintf(stderr, "Setting lNPA[ %" PRIu64 "] = %" PRIu64 "\n", i, lNPA[i]); + // } + isa_stream.CloseAndRemove(); SuccinctUtils::WriteToFile(lNPA, npa_size_, npa_file); @@ -173,12 +179,12 @@ class DeltaEncodedNPA : public NPA { // Get all NPA values int64_t *lNPA = new int64_t[npa_size_](); - fprintf(stderr, "the npa_size_ is: %" PRIu64 "\n", npa_size_); + // fprintf(stderr, "the npa_size_ is: %" PRIu64 "\n", npa_size_); uint64_t first_idx, cur_idx, nxt_idx, num_elements_per_chunk; std::thread constructor_thread[8]; // for ( int i = 0; i < sizeof(lISA)/sizeof(lISA[0]); i ++){ - // // fprintf(stderr, "lISA: %" PRIu64 "\n", lISA[i]); + // // // fprintf(stderr, "lISA: %" PRIu64 "\n", lISA[i]); // } ArrayInput isa_array(lISA); @@ -190,21 +196,26 @@ class DeltaEncodedNPA : public NPA { 0 : npa_size_ - i * num_elements_per_chunk; uint64_t num_elements = SuccinctUtils::Min(remaining_elements, num_elements_per_chunk); - // fprintf(stderr, "loop: %i\n", i); + // // fprintf(stderr, "loop: %i\n", i); constructor_thread[i] = std::thread(&DeltaEncodedNPA::ConstructNPAChunkInMem, lNPA, lISA, i * num_elements_per_chunk, num_elements, (i == 7) ? first_idx : -1ULL, npa_size_); } for (uint8_t i = 0; i < 8; i++) { - // fprintf(stderr, "thread loop: %i\n", i); + // // fprintf(stderr, "thread loop: %i\n", i); constructor_thread[i].join(); } + // fprintf(stderr, "NPA ARRAY\n"); + // for (uint64_t i= 0; i < npa_size_; i ++){ + // fprintf(stderr, "Setting lNPA[ %" PRIu64 "] = %" PRIu64 "\n", i, lNPA[i]); + // } + //isa_stream.CloseAndRemove(); //SuccinctUtils::WriteToFile(lNPA, npa_size_, npa_file); - // fprintf(stderr, "200\n"); + // // fprintf(stderr, "200\n"); del_npa_ = new DeltaEncodedVector[sigma_size_]; ThreadPool pool(8); for (uint64_t i = 0; i < col_offsets_.size(); i++) { @@ -216,7 +227,7 @@ class DeltaEncodedNPA : public NPA { } pool.ShutDown(); delete[] lNPA; - // fprintf(stderr, "finished EncodeInMem\n"); + // // fprintf(stderr, "finished EncodeInMem\n"); // remove(npa_file.c_str()); } @@ -352,20 +363,20 @@ class DeltaEncodedNPA : public NPA { int64_t first_idx) { // ISA Stream is configured to start reading from correct position uint64_t cur_idx, nxt_idx; - fprintf(stderr, "*** start_pos: %" PRIu64 ", first_idx: %" PRId64 "\n", start_pos, first_idx); + // fprintf(stderr, "*** start_pos: %" PRIu64 ", first_idx: %" PRId64 "\n", start_pos, first_idx); ArrayStream isa_stream(isa_file, start_pos); cur_idx = isa_stream.Get(); - fprintf(stderr, "n_elems = %" PRIu64 "\n", n_elems); + // fprintf(stderr, "n_elems = %" PRIu64 "\n", n_elems); for (uint64_t i = 0; i < n_elems; i++) { nxt_idx = isa_stream.Get(); - fprintf(stderr, "Setting lNPA[ %" PRIu64 "] = %" PRIu64 "\n", cur_idx, nxt_idx); + // fprintf(stderr, "Setting lNPA[ %" PRIu64 "] = %" PRIu64 "\n", cur_idx, nxt_idx); lNPA[cur_idx] = nxt_idx; cur_idx = nxt_idx; } - fprintf(stderr, "cur_idx is :%" PRIu64 " when exiting the loop\n", cur_idx); + // fprintf(stderr, "cur_idx is :%" PRIu64 " when exiting the loop\n", cur_idx); if (first_idx > 0) { - fprintf(stderr, "TRYING TO SET lNPA[ %" PRIu64 "] = %" PRIu64 "\n", cur_idx, first_idx); + // fprintf(stderr, "TRYING TO SET lNPA[ %" PRIu64 "] = %" PRIu64 "\n", cur_idx, first_idx); lNPA[cur_idx] = first_idx; } isa_stream.Close(); @@ -377,26 +388,26 @@ class DeltaEncodedNPA : public NPA { int64_t first_idx, uint64_t npa_size) { // ISA Stream is configured to start reading from correct position uint64_t cur_idx, nxt_idx; - fprintf(stderr, "*** start_pos: %" PRIu64 ", first_idx: %" PRId64 "\n", start_pos, first_idx); + // fprintf(stderr, "*** start_pos: %" PRIu64 ", first_idx: %" PRId64 "\n", start_pos, first_idx); ArrayInput isa_array(lISA, start_pos); cur_idx = isa_array.Get(); - fprintf(stderr, "n_elems = %" PRIu64 "\n", n_elems); + // fprintf(stderr, "n_elems = %" PRIu64 "\n", n_elems); if (cur_idx >= npa_size){ - fprintf(stderr, "out of bounds\n"); + // fprintf(stderr, "out of bounds\n"); cur_idx = 0; } else { for (uint64_t i = 0; i < n_elems; i++) { nxt_idx = isa_array.Get(); - fprintf(stderr, "Setting lNPA[ %" PRIu64 "] = %" PRIu64 "\n", cur_idx, nxt_idx); + // fprintf(stderr, "Setting lNPA[ %" PRIu64 "] = %" PRIu64 "\n", cur_idx, nxt_idx); lNPA[cur_idx] = nxt_idx; cur_idx = nxt_idx; } } - fprintf(stderr, "cur_idx is :%" PRIu64 " when exiting the loop\n", cur_idx); + // fprintf(stderr, "cur_idx is :%" PRIu64 " when exiting the loop\n", cur_idx); if (first_idx > 0 && cur_idx < npa_size) { - fprintf(stderr, "TRYING TO SET lNPA[ %" PRIu64 "] = %" PRIu64 "\n", cur_idx, first_idx); + // fprintf(stderr, "TRYING TO SET lNPA[ %" PRIu64 "] = %" PRIu64 "\n", cur_idx, first_idx); lNPA[cur_idx] = first_idx; } } @@ -409,7 +420,7 @@ class DeltaEncodedNPA : public NPA { for (uint64_t j = start_offset; j < end_offset; j++) { uint64_t temp = npa_stream.Get(); column.push_back(temp); - // fprintf(stderr, "pushing back %" PRIu64 "\n", temp); + // // fprintf(stderr, "pushing back %" PRIu64 "\n", temp); } assert(column.size() > 0); CreateDeltaEncodedVector(dv, column); @@ -424,7 +435,7 @@ class DeltaEncodedNPA : public NPA { for (uint64_t j = start_offset; j < end_offset; j++) { uint64_t temp = npa_array.Get(); column.push_back(temp); - // fprintf(stderr, "pushing back %" PRIu64 "\n", temp); + // // fprintf(stderr, "pushing back %" PRIu64 "\n", temp); } assert(column.size() > 0); CreateDeltaEncodedVector(dv, column); diff --git a/core/src/npa/elias_gamma_encoded_npa.cc b/core/src/npa/elias_gamma_encoded_npa.cc index ecd78dc..d38a9b7 100644 --- a/core/src/npa/elias_gamma_encoded_npa.cc +++ b/core/src/npa/elias_gamma_encoded_npa.cc @@ -135,10 +135,10 @@ void EliasGammaEncodedNPA::CreateDeltaEncodedVector( } } else { long delta = data[i] - last_val; - // assert(delta > 0); - if (delta <= 0){ - fprintf(stderr, "WARNING DELTA <= 0 --> %ld = %" PRIu64 " - %" PRIu64 "\n",delta,data[i],last_val); - } + assert(delta > 0); + // if (delta <= 0){ + // fprintf(stderr, "WARNING DELTA <= 0 --> %ld = %" PRIu64 " - %" PRIu64 "\n",delta,data[i],last_val); + // } _deltas.push_back(delta); delta_enc_size = EliasGammaEncodingSize(delta); diff --git a/core/src/succinct_core.cc b/core/src/succinct_core.cc index 0fde7a3..a62724d 100644 --- a/core/src/succinct_core.cc +++ b/core/src/succinct_core.cc @@ -222,7 +222,7 @@ void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, // fprintf(stderr, "input size: %zd\n", input_size_); for (uint64_t i = 1; i < input_size_; i++) { cur_sa = sa_stream.Get(); - // fprintf(stderr, "INITIALIZING lISA[ %" PRIu64 "] = %" PRIu64 "\n", cur_sa, i); + fprintf(stderr, "INITIALIZING lISA[ %" PRIu64 "] = %" PRIu64 "\n", cur_sa, i); lISA[cur_sa] = i; if (input[cur_sa] != input[prv_sa]) { alphabet_map_[input[cur_sa]] = std::pair( @@ -384,7 +384,7 @@ void SuccinctCore::Construct(bool in_mem, uint8_t *input, size_t input_size, for (uint64_t i = 1; i < input_size_; i++) { cur_sa = sa_array.Get(); lISA[cur_sa] = i; - // fprintf(stderr, "INITIALIZING lISA[ %" PRIu64 "] = %" PRIu64 "\n", cur_sa, i); + fprintf(stderr, "INITIALIZING lISA[ %" PRIu64 "] = %" PRIu64 "\n", cur_sa, i); if (input[cur_sa] != input[prv_sa]) { alphabet_map_[input[cur_sa]] = std::pair( i, alphabet_size_++); From ab1a1466f18afe88dde84d2f86ed05195fe53490 Mon Sep 17 00:00:00 2001 From: Daniel Li Date: Sun, 13 Sep 2020 15:24:58 -0400 Subject: [PATCH 39/39] Added aiobotocore compression approach to chunk_compressor.py --- benchmark-testing/chunk_compressor.py | 452 ++++++++++---------------- benchmark-testing/upload.sh | 6 + 2 files changed, 185 insertions(+), 273 deletions(-) create mode 100644 benchmark-testing/upload.sh diff --git a/benchmark-testing/chunk_compressor.py b/benchmark-testing/chunk_compressor.py index 5999b33..27ad689 100644 --- a/benchmark-testing/chunk_compressor.py +++ b/benchmark-testing/chunk_compressor.py @@ -7,6 +7,9 @@ import signal import asyncio import aiobotocore +import sys +from promise import Promise + # from memory_profiler import profile # from guppy import hpy @@ -46,7 +49,7 @@ def run(self): we can reuse the thread's resources""" self.restart() - while(True): + while(True): # wait until we should process self._startSignal.wait() @@ -55,7 +58,7 @@ def run(self): if(self._finishIndicator):# check, if we want to stop self._oneRunFinished.set() return - + # call the threaded function self._callable(*self._callableArgs) @@ -75,8 +78,14 @@ def finish(self): def do_nothing(*args): pass -# # **** PARALLEL EXECUTION WITH REUSABLE THREADS **** + +# # **** PARALLEL EXECUTION FUNCTIONS **** async def read_chunk(i, chunk_string, read_chunks, client, code_start): + if ( i != 0 ): + # Await on previous chunk if not chunk 0 + await read_chunk(i - 1, chunk_string, read_chunks, client, code_start) + + # After calling on previous chunk, run current chunk i read print("reading chunk " + str(i)) global read_time start = time.time() @@ -93,38 +102,121 @@ async def read_chunk(i, chunk_string, read_chunks, client, code_start): read_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " duration: " + str(end - start) + "\n") async def compress_chunk(i, read_chunks, compressed_chunks, code_start): - print("compressing chunk " + str(i)) - global compress_time - start = time.time() - - q = file.File(0, read_chunks[i], 32, 32, 128, 0, 1) - compressed_chunks[i] = q.GetContent().tobytes() - q.DeleteContent() - del q - read_chunks[i] = None - - end = time.time() - compress_time += end - start - - # write chunk times to file + if ( i != 0 ): + # Await on previous chunk if not chunk 0 + await compress_chunk(i - 1, read_chunks, compressed_chunks, code_start) + + # After calling on previous chunk, run current chunk i upload, and check to see if corresponding compression is complete + # Loop that exits when read_chunks thread for chunk i is complete + while (True): + if (read_chunks[i] != None): + print("compressing chunk " + str(i)) + global compress_time + start = time.time() + + # If there is read data in the list, then we can go ahead and compress + await asyncio.sleep(0) + q = file.File(0, read_chunks[i], 32, 32, 128, 0, 1) + compressed_chunks[i] = q.GetContent().tobytes() + q.DeleteContent() + del q + read_chunks[i] = None + + end = time.time() + compress_time += end - start + break + else: + # otherwise wait 0.05 seconds and check read_chunks again + print("waiting for read") + await asyncio.sleep(0.01) + # write chunk times to file compress_time_file = open("compress_time_file.txt","a") compress_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " duration: " + str(end - start) + "\n") async def upload_chunk(i, chunk_string, compressed_chunks, client, code_start): - print("uploading chunk " + str(i)) - global upload_time - start = time.time() + if ( i != 0 ): + # Await on previous chunk if not chunk 0 + await upload_chunk(i - 1, chunk_string, compressed_chunks, client, code_start) - await client.put_object(Body=compressed_chunks[i], Bucket='succinct-datasets', Key=chunk_string + "compressed-" + str(i) + ".succinct") - compressed_chunks[i] = None + # After calling on previous chunk, run current chunk i upload, and check to see if corresponding compression is complete + + # Loop that exits when compressed_chunks thread for chunk i is complete + while (True): + if (compressed_chunks[i] != None): + print("uploading chunk " + str(i)) + global upload_time + start = time.time() + + # If there is compressed data in the list, then we can go ahead and upload + await client.put_object(Body=compressed_chunks[i], Bucket='succinct-datasets', Key=chunk_string + "compressed-" + str(i) + ".succinct", StorageClass='REDUCED_REDUNDANCY') + compressed_chunks[i] = None + + end = time.time() + upload_time += end - start + break + else: + # otherwise wait 0.05 seconds and check compressed_chunks again + print("waiting for compress") + await asyncio.sleep(0.01) - end = time.time() - upload_time += end - start # write chunk times to file upload_time_file = open("upload_time_file.txt","a") upload_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " duration: " + str(end - start) + "\n") + # # # **** SEQUENTIAL EXECUTION FUNCTIONS **** +# def read_chunk(i, chunk_string, read_chunks, client, code_start): +# print("reading chunk " + str(i)) +# global read_time +# start = time.time() + +# obj = client.get_object(Bucket='succinct-datasets', Key=chunk_string + str(i) + ".succinct") +# read_chunks[i] = obj['Body'].read() +# del obj + +# end = time.time() +# read_time += end - start + +# # # write chunk times to file +# # read_time_file = open("read_time_file.txt","a") +# # read_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " duration: " + str(end - start) + "\n") + +# def compress_chunk(i, read_chunks, compressed_chunks, code_start): +# start = time.time() + +# print("compressing chunk " + str(i)) +# global compress_time + +# # await asyncio.sleep(0) +# q = file.File(0, read_chunks[i], 32, 32, 128, 0, 1) +# compressed_chunks[i] = q.GetContent().tobytes() +# q.DeleteContent() +# del q +# read_chunks[i] = None + +# end = time.time() +# compress_time += end - start + +# # write chunk times to file +# # compress_time_file = open("compress_time_file.txt","a") +# # compress_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " duration: " + str(end - start) + "\n") + + +# def upload_chunk(i, chunk_string, compressed_chunks, client, code_start): +# print("uploading chunk " + str(i)) +# global upload_time +# start = time.time() + +# client.put_object(Body=compressed_chunks[i], Bucket='succinct-datasets', Key=chunk_string + "compressed-" + str(i) + ".succinct", StorageClass='REDUCED_REDUNDANCY') +# compressed_chunks[i] = None + +# end = time.time() +# upload_time += end - start + +# # write chunk times to file +# # upload_time_file = open("upload_time_file.txt","a") +# # upload_time_file.write("chunk " + str(i) + " start: " + str(start - code_start) + " duration: " + str(end - start) + "\n") + # **** FUNCTION START **** async def execute(): # compress file @@ -132,7 +224,7 @@ async def execute(): # # Define variables chunk_string = "sample.txt" + "-chunk-" # # Depends on number of chunks file is split into - num_chunks = 10 + num_chunks = 1167 read_chunks = [None] * num_chunks compressed_chunks = [None] * num_chunks @@ -155,282 +247,74 @@ async def execute(): # signal.signal(signal.SIGUSR1, do_nothing) - # **** PARALLEL EXECUTION WITH AIDBOTOCORE **** + # client = boto3.client('s3', region_name='us-east-2', + # aws_secret_access_key="AWS_SECRET_KEY", + # aws_access_key_id="AWS_ACCESS_ID") +# **** PARALLEL EXECUTION WITH AIDBOTOCORE **** + + # Event driven approach + # launch thread(0), right before compress, launch thread 1, + # Create a "promise" object right before compress, pass it to thread 1 + # Create another "promise" object right before upload, pass it to thread 1 + # thread(1) reads, wait for compress_promise object to be resolved before doing it's compress + + # compress operation session = aiobotocore.get_session() async with session.create_client('s3', region_name='us-east-2', - aws_secret_access_key=KEY, - aws_access_key_id=SECRET_KEY) as client: - - #i = 0 - await read_chunk(0, chunk_string, read_chunks, client, start) + aws_secret_access_key="AWS_SECRET_KEY", + aws_access_key_id="AWS_ACCESS_ID") as client: - #i = 1 tasks = [] - tasks.append(read_chunk(1, chunk_string, read_chunks, client, start)) - tasks.append(compress_chunk(0, read_chunks, compressed_chunks, start)) - await asyncio.gather(*tasks) - - #i = 2 to num_chunks - 1 - for i in range(2, num_chunks): - tasks = [] - tasks.append(read_chunk(i, chunk_string, read_chunks, client, start)) - tasks.append(compress_chunk(i - 1, read_chunks, compressed_chunks, start)) - tasks.append(upload_chunk(i - 2, chunk_string, compressed_chunks, client, start)) - await asyncio.gather(*tasks) + tasks.append(read_chunk(num_chunks-1, chunk_string, read_chunks, client, start)) + tasks.append(compress_chunk(num_chunks-1, read_chunks, compressed_chunks, start)) + tasks.append(upload_chunk(num_chunks-1, chunk_string, compressed_chunks, client, start)) + await asyncio.wait(tasks) + # await asyncio.gather(*tasks) + # for i in range(0, num_chunks + 2): + # tasks = [] + # if (i == 0): - # await read_chunk(0, chunk_string, read_chunks, client, start) + # tasks.append(read_chunk(i, chunk_string, read_chunks, client, start)) # elif (i == 1): - # await read_chunk(1, chunk_string, read_chunks, client, start) - # await compress_chunk(0, read_chunks, compressed_chunks, start) + # tasks.append(read_chunk(i, chunk_string, read_chunks, client, start)) + # tasks.append(compress_chunk(i - 1, read_chunks, compressed_chunks, start)) # elif (i == num_chunks): - # await compress_chunk(num_chunks - 1, read_chunks, compressed_chunks, start) - # await upload_chunk(num_chunks - 2, chunk_string, compressed_chunks, client, start) + # tasks.append(compress_chunk(i - 1, read_chunks, compressed_chunks, start)) + # tasks.append(upload_chunk(i - 2, chunk_string, compressed_chunks, client, start)) # elif (i == num_chunks + 1): - # await upload_chunk(num_chunks - 1, chunk_string, compressed_chunks, client, start) + # tasks.append(upload_chunk(i - 2, chunk_string, compressed_chunks, client, start)) # else: - # await read_chunk(i, chunk_string, read_chunks, client, start) - # await compress_chunk(i - 1, read_chunks, compressed_chunks, start) - # await upload_chunk(i - 2, chunk_string, compressed_chunks, client, start) - - - # # **** PARALLEL EXECUTION WITH REUSABLE THREADS **** - # read_index = [0] - # compress_index = [0] - # upload_index = [0] - - - # read_thread = ReusableThread(target = read_chunk, args=(read_index, chunk_string, read_chunks, client, start)) - # compress_thread = ReusableThread(target = compress_chunk, args=(compress_index, read_chunks, compressed_chunks, start)) - # upload_thread = ReusableThread(target = upload_chunk, args=(upload_index, chunk_string, compressed_chunks, client, start)) - - # for i in range(0, num_chunks + 2): - # if (i == 0): - # # START: read - # read_index[0] = i - - # read_thread.start() - # elif (i == 1): - # # RESTART: read START: compress - # read_thread.join() - - # read_index[0] = i - # compress_index[0] = i - 1 - - # compress_thread.start() - # read_thread.restart() - # elif (i == 2): - # # RESTART: compress, read START: upload - # read_thread.join() - # compress_thread.join() - - # read_index[0] = i - # compress_index[0] = i - 1 - # upload_index[0] = i - 2 - - # upload_thread.start() - # compress_thread.restart() - # read_thread.restart() - # elif (i == num_chunks): - # # RESTART: upload, compress - # read_thread.join() - # compress_thread.join() - # upload_thread.join() - - # compress_index [0] = i - 1 - # upload_index [0] = i - 2 - - # upload_thread.restart() - # compress_thread.restart() - # elif (i == num_chunks + 1): - # # RESTART: upload - # compress_thread.join() - # upload_thread.join() - - # upload_index [0] = i - 2 - - # upload_thread.restart() - - # # Join last thread and finish all threads - # upload_thread.join() - - # read_thread.finish() - # compress_thread.finish() - # upload_thread.finish() - # else: - # # RESTART: upload, compress, read - # read_thread.join() - # compress_thread.join() - # upload_thread.join() - - # read_index[0] = i - # compress_index[0] = i - 1 - # upload_index[0] = i - 2 - - # upload_thread.restart() - # compress_thread.restart() - # read_thread.restart() - - # # **** PARALLEL EXECUTION WITH LIST OF THREADS **** - # read_threads = [None] * num_chunks - # compress_threads = [None] * num_chunks - # upload_threads = [None] * num_chunks - - # for i in range(0, num_chunks + 2): - # if (i == 0): - # # Create read thread - # read_thread = Thread(target=read_chunk, args=(0, chunk_string, read_chunks, s3, start)) - # read_thread.start() - # read_threads[i] = read_thread - - # elif (i == 1): - # # If read_threads[0] isn't done, finish - # if read_threads[0].is_alive(): - # read_threads[0].join() - - # # Create read thread - # read_thread = Thread(target=read_chunk, args=(1, chunk_string, read_chunks, s3, start)) - # read_thread.start() - # read_threads[i] = read_thread - - # # Create compress thread - # compress_thread = Thread(target=compress_chunk, args=(0, read_chunks, compressed_chunks, start)) - # compress_thread.start() - # compress_threads[i-1] = compress_thread - # elif (i == num_chunks): - # # If read_threads[num_chunks - 1] isn't done, finish - # if read_threads[i - 1].is_alive(): - # read_threads[i - 1].join() - - # # If compress_threads[num_chunks - 2] isn't done, finish - # if compress_threads[i - 2].is_alive(): - # compress_threads[i - 2].join() - - # # Create compress thread - # compress_thread = Thread(target=compress_chunk, args=(i - 1, read_chunks, compressed_chunks, start)) - # compress_thread.start() - # compress_threads[i-1] = compress_thread - - # # Create upload thread - # upload_thread = Thread(target=upload_chunk, args=(i - 2, chunk_string, compressed_chunks, s3, start)) - # upload_thread.start() - # upload_threads[i-2] = upload_thread - # elif (i == num_chunks + 1): - # # If compress_threads[num_chunks - 1] isn't done, finish - # if compress_threads[i - 2].is_alive(): - # compress_threads[i - 2].join() - # # Create upload thread - # upload_thread = Thread(target=upload_chunk, args=(i - 2, chunk_string, compressed_chunks, s3, start)) - # upload_thread.start() - # upload_threads[i-2] = upload_thread - # else: - # # If read_threads[i - 1] isn't done, finish - # if read_threads[i - 1].is_alive(): - # read_threads[i - 1].join() - - # # If compress_threads[i - 2] isn't done, finish - # if compress_threads[i - 2].is_alive(): - # compress_threads[i - 2].join() - - # # Create read thread - # read_thread = Thread(target=read_chunk, args=(i, chunk_string, read_chunks, s3, start)) - # read_thread.start() - # read_threads[i] = read_thread - - # # Create compress thread - # compress_thread = Thread(target=compress_chunk, args=(i - 1, read_chunks, compressed_chunks, start)) - # compress_thread.start() - # compress_threads[i-1] = compress_thread - - # # Create upload thread - # upload_thread = Thread(target=upload_chunk, args=(i - 2, chunk_string, compressed_chunks, s3, start)) - # upload_thread.start() - # upload_threads[i-2] = upload_thread + # tasks.append(read_chunk(i, chunk_string, read_chunks, client, start)) + # tasks.append(compress_chunk(i - 1, read_chunks, compressed_chunks, start)) + # tasks.append(upload_chunk(i - 2, chunk_string, compressed_chunks, client, start)) + # await asyncio.wait(tasks) # os.kill(PID, signal.SIGUSR1) # **** SEQUENTIAL EXECUTION **** # for i in range(0, num_chunks + 2): # if (i == 0): - # read_chunk(0, chunk_string, read_chunks, s3, start) + # read_chunk(0, chunk_string, read_chunks, client, start) # elif (i == 1): - # read_chunk(1, chunk_string, read_chunks, s3, start) + # read_chunk(1, chunk_string, read_chunks, client, start) # compress_chunk(0, read_chunks, compressed_chunks, start) # elif (i == num_chunks): # compress_chunk(num_chunks - 1, read_chunks, compressed_chunks, start) - # upload_chunk(num_chunks - 2, chunk_string, compressed_chunks, s3, start) + # upload_chunk(num_chunks - 2, chunk_string, compressed_chunks, client, start) # elif (i == num_chunks + 1): - # upload_chunk(num_chunks - 1, chunk_string, compressed_chunks, s3, start) + # upload_chunk(num_chunks - 1, chunk_string, compressed_chunks, client, start) # else: - # read_chunk(i, chunk_string, read_chunks, s3, start) + # read_chunk(i, chunk_string, read_chunks, client, start) # compress_chunk(i - 1, read_chunks, compressed_chunks, start) - # upload_chunk(i - 2, chunk_string, compressed_chunks, s3, start) + # upload_chunk(i - 2, chunk_string, compressed_chunks, client, start) # signal.signal(signal.SIGUSR1, do_nothing) - # #with s3 - # for i in range(11,12): - # obj = s3.get_object(Bucket='succinct-datasets', Key=chunk_string + str(i) + ".succinct") - # print("read " + str(i)) - # q = file.File(0, obj['Body'].read().decode('utf-8'), 32, 32, 128, 0, 1) - # compressed_content = q.GetContent().tobytes() - # print("compressed " + str(i)) - # s3.put_object(Body=compressed_content, Bucket='succinct-datasets', Key=chunk_string + "compressed-" + str(i) + ".succinct") - # print("uploaded " + str(i)) - - # # without s3 - # for i in range(1660, 1667): - # f = open(chunk_string + str(i) + ".succinct","r") - # content = f.read() - # q = file.File(0, content, 32, 32, 128, 0, 1) - # text_file = open(chunk_string + str(i) + "-compressed-local.succinct", "wb") - # text_file.write(q.GetContent().tobytes()) - - # os.kill(PID, signal.SIGUSR1) - # print("done") - - # # **** UPLOAD FILE TO BUCKET **** - # s3 = boto3.resource("s3") - # os.chdir("/tmp") - # f = open("test.txt","w") - # f.write("hello this is a test") - # f.close() - # s3.meta.client.upload_file("/tmp/test.txt", "succinct-datasets", "test.txt") - - # # **** COMPRESS BY INPUT **** - # # Get file content from S3 and save as "input" - # s3 = boto3.client("s3") - # obj = s3.get_object(Bucket='succinct-datasets', Key=event['key1']) - # input = obj['Body'].read().decode('utf-8') - - # # Compress the input using file module - # q = file.File(0, input, 32, 32, 128, 0, 1) - # content = q.GetContent().tobytes() - - # # Upload content back onto S3 in .succinct file - # s3.put_object(Body=content, Bucket='succinct-datasets', Key=event['key1'] + ".succinct") - - # # **** COMPRESS BY FILE **** - # s3 = boto3.resource("s3") - # os.chdir("/tmp") - # s3.Bucket('succinct-datasets').download_file(event['key1'], event['key1']) - # q = file.File(event['key1'], 32, 32, 128, 0, 1) - - # # **** CALCULATE COMPRESSION RATIO **** - # bucket = s3.Bucket('succinct-datasets') - # orignal_sizes = 0 - # compressed_sizes = 0 - # for i in range (0, num_chunks): - # orignal_sizes += bucket.Object("sample.txt-chunk-" + str(i) + ".succinct").content_length - # compressed_sizes += bucket.Object("sample.txt-chunk-compressed-" + str(i) + ".succinct").content_length - - # print(orignal_sizes) - # print(compressed_sizes) - # print(orignal_sizes/compressed_sizes) # # **** REMOVE OBJECTS FROM S3 **** @@ -448,6 +332,28 @@ async def execute(): print("TOTAL DURATION: " + str(time.time() - start)) print(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) - +# modify recursion limit +sys.setrecursionlimit(1500) loop = asyncio.get_event_loop() -loop.run_until_complete(execute()) \ No newline at end of file +loop.run_until_complete(execute()) +loop.close() + +# # **** CALCULATE COMPRESSION RATIO **** +# s3 = boto3.resource("s3", region_name='us-east-2', +# aws_secret_access_key="AWS_SECRET_KEY", +# aws_access_key_id="AWS_ACCESS_ID") +# bucket = s3.Bucket('succinct-datasets') +# orignal_sizes = 0 +# compressed_sizes = 0 +# num_chunks = 1167 +# for i in range (0, num_chunks): +# orignal_sizes += bucket.Object("sample.txt-chunk-" + str(i) + ".succinct").content_length +# compressed_sizes += bucket.Object("sample.txt-chunk-compressed-" + str(i) + ".succinct").content_length + +# print(orignal_sizes) +# print(compressed_sizes) +# print(orignal_sizes/compressed_sizes) + + + + diff --git a/benchmark-testing/upload.sh b/benchmark-testing/upload.sh new file mode 100644 index 0000000..2f50363 --- /dev/null +++ b/benchmark-testing/upload.sh @@ -0,0 +1,6 @@ +#! /bin/bash +FILES=/home/ec2-user/file_chunks/sample.txt-chunk-*.succinct +for f in $FILES +do + aws s3 cp $f s3://succinct-datasets +done \ No newline at end of file