diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..614d10d --- /dev/null +++ b/.gitignore @@ -0,0 +1,324 @@ +#==============================================================================# +# This file specifies intentionally untracked files that git should ignore. +#==============================================================================# + +#==============================================================================# +# File extensions to be ignored anywhere in the tree. +#==============================================================================# +# Temp files created by most text editors. +*~ +# Merge files created by git. +*.orig +# Java bytecode +*.class +# Byte compiled python modules. +*.pyc +# egg dirs from python +*.egg-info/ +# vim swap files +.*.sw? +.sw? +#OS X specific files. +.DS_store +# Core files +#core + +#==============================================================================# +# Explicit files to ignore (only matches one). +#==============================================================================# +# Various tag programs +/tags +/TAGS +/GPATH +/GRTAGS +/GSYMS +/GTAGS +.gitusers +autom4te.cache +cscope.files +cscope.out +autoconf/aclocal.m4 +autoconf/autom4te.cache +/compile_commands.json + +#==============================================================================# +# Directories to ignore (do not add trailing '/'s, they skip symlinks). +#==============================================================================# +# External projects that are tracked independently. 
+projects/* +!projects/*.* +!projects/Makefile + + +#==============================================================================# +# Autotools artifacts +#==============================================================================# +config/ +!/velox/**/config +configure +config-h.in +autom4te.cache +*Makefile.in +third_party/*/Makefile +libtool +aclocal.m4 +config.log +config.status +stamp-h1 +config.h +!/velox/**/config.h +m4/libtool.m4 +m4/ltoptions.m4 +m4/ltsugar.m4 +m4/ltversion.m4 +m4/lt~obsolete.m4 + +#==============================================================================# +# Build artifacts +#==============================================================================# +#m4/ +build/ +_build/ +.ccache/ +#*.m4 +*.o +*.lo +*.la +*~ +*.pdf +*.swp +a.out +CMake/resolve_dependency_module/boost/FindBoost.cmake +__cmake_systeminformation/ + +#==============================================================================# +# Kate Swap Files +#==============================================================================# +*.kate-swp +.#kate-* + +#==============================================================================# +# Backup artifacts +#==============================================================================# +~* +*~ +tmp/ + +#==============================================================================# +# KDevelop files +#==============================================================================# +.kdev4 +*.kdev4 +.dirstamp +.deps +.libs + +#==============================================================================# +# Eclipse files +#==============================================================================# +.wtpmodules +.classpath +.project +.cproject +.pydevproject +.settings +.autotools +.csettings + +/Debug/ +/misc/ + +#==============================================================================# +# Intellij files +#==============================================================================# +.idea +*.iml + 
+#==============================================================================# +# Code Coverage files +#==============================================================================# +*.gcno +*.gcda + +#==============================================================================# +# Scripts +#==============================================================================# +*.jar +scripts/PelotonTest/out +scripts/PelotonTest/lib + +#==============================================================================# +# Protobuf +#==============================================================================# +*.pb-c.c +*.pb-c.h +*.pb.cc +*.pb.h +*.pb.go + +#==============================================================================# +# Third party +#==============================================================================# +third_party/nanomsg/ +third_party/nvml/ +third_party/logcabin/ + +#==============================================================================# +# Eclipse +#==============================================================================# + +.metadata +bin/ +tmp/ +*.tmp +*.bak +*.swp +*~.nib +local.properties +.settings/ +.loadpath +.recommenders + +# Eclipse Core +.project + +# External tool builders +.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# PyDev specific (Python IDE for Eclipse) +*.pydevproject + +# CDT-specific (C/C++ Development Tooling) +.cproject + +# JDT-specific (Eclipse Java Development Tools) +.classpath + +# Java annotation processor (APT) +.factorypath + +# PDT-specific (PHP Development Tools) +.buildpath + +# sbteclipse plugin +.target + +# Tern plugin +.tern-project + +# TeXlipse plugin +.texlipse + +# STS (Spring Tool Suite) +.springBeans + +# Code Recommenders +.recommenders/ +io_file + +## General + +# Compiled Object files +*.slo +*.lo +*.o +*.cuo + +# Compiled Dynamic libraries +*.so +*.dylib + +# Compiled Static libraries +*.lai +*.la +*.a + +# Compiled protocol 
buffers +*.pb.h +*.pb.cc +*_pb2.py + +# Compiled python +*.pyc + +# Compiled MATLAB +*.mex* + +# IPython notebook checkpoints +.ipynb_checkpoints + +# Editor temporaries +*.swp +*~ + +# Sublime Text settings +*.sublime-workspace +*.sublime-project + +# Eclipse Project settings +*.*project +.settings +.csettings + +# Visual Studio +.vs +settings.json +.vscode + +# QtCreator files +*.user + +# PyCharm files +.idea + +# OSX dir files +.DS_Store + +# User's build configuration +Makefile.config +CMakeUserPresets.json + +# build, distribute, and bins (+ python proto bindings) +build +.build_debug/* +.build_release/* +distribute/* +*.testbin +*.bin +cmake_build +.cmake_build +cmake-build-debug +cmake-build-release + +# tests +test/test.sql + +# SQLite logic tests +test/evidence/ +third_party/sqllogictest + +#imdb dataset +third_party/imdb/data + +# Format timer +.last_format +# Benchmarks +.last_benchmarked_commit +benchmark_results/ +duckdb_unittest_tempdir/ +grammar.y.tmp +src/amalgamation/ + +#eclipse +.project +.cproject +.settings +~ + +#docs +velox/docs/sphinx/source/README_generated_* +velox/docs/bindings/python/_generate/* +scripts/bm-report/report.html diff --git a/CMakeLists.txt b/CMakeLists.txt index ce6cc19..aa4d343 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,6 +19,8 @@ project(Nimble) set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED True) +add_compile_definitions(DISABLE_META_INTERNAL_COMPRESSOR=1) + # Sets new behavior for CMP0135, which controls how timestamps are extracted # when using ExternalProject_Add(): # https://cmake.org/cmake/help/latest/policy/CMP0135.html diff --git a/README.md b/README.md index a7ae340..34eebbe 100644 --- a/README.md +++ b/README.md @@ -1,70 +1,70 @@ # The Nimble File Format -Nimble (formerly known as *“Alpha”*) is a new columnar file format for large +Nimble (formerly known as _“Alpha”_) is a new columnar file format for large datasets created by Meta. 
Nimble is meant to be a replacement for file formats -such as Apache Parquet and ORC.  +such as Apache Parquet and ORC. Watch [this talk](https://www.youtube.com/watch?v=bISBNVtXZ6M) to learn more about Nimble’s internals. Nimble has the following design principles: -* **Wide:** Nimble is better suited for workloads that are wide in nature, such +- **Wide:** Nimble is better suited for workloads that are wide in nature, such as tables with thousands of columns (or streams) which are commonly found in - feature engineering workloads and training tables for machine learning.  + feature engineering workloads and training tables for machine learning. -* **Extensible:** Since the state-of-the-art in data encoding evolves faster +- **Extensible:** Since the state-of-the-art in data encoding evolves faster than the file layout itself, Nimble decouples stream encoding from the underlying physical layout. Nimble allows encodings to be extended by library - users and recursively applied (cascading).  + users and recursively applied (cascading). -* **Parallel:** Nimble is meant to fully leverage highly parallel hardware by +- **Parallel:** Nimble is meant to fully leverage highly parallel hardware by providing encodings which are SIMD and GPU friendly. Although this is not implemented yet, we intend to expose metadata to allow developers to better plan decoding trees and schedule kernels without requiring the data streams - themselves.  + themselves. -* **Unified:** More than a specification, Nimble is a product. We strongly - discourage developers to (re-)implement Nimble’s spec to prevent - environmental fragmentation issues observed with similar projects in the - past. We encourage developers to leverage the single unified Nimble library, - and create high-quality bindings to other languages as needed. +- **Unified:** More than a specification, Nimble is a product. 
We strongly + discourage developers from (re-)implementing Nimble’s spec to prevent + environmental fragmentation issues observed with similar projects in the past. + We encourage developers to leverage the single unified Nimble library, and + create high-quality bindings to other languages as needed. Nimble has the following features: -* Lighter metadata organization to efficiently support thousands to tens of -thousands of columns and streams. +- Lighter metadata organization to efficiently support thousands to tens of + thousands of columns and streams. -* Use Flatbuffers instead of thrift/protobuf to more efficiently access large - metadata sections.  +- Use Flatbuffers instead of thrift/protobuf to more efficiently access large + metadata sections. -* Use block encoding instead of stream encoding to provide predictable memory +- Use block encoding instead of stream encoding to provide predictable memory usage while decoding/reading. -* Supports many encodings out-of-the-box, and additional encodings can be added - as needed.  +- Supports many encodings out-of-the-box, and additional encodings can be added + as needed. -* Supports cascading (recursive/composite) encoding of streams.  +- Supports cascading (recursive/composite) encoding of streams. -* Supports pluggable encoding selection policies. +- Supports pluggable encoding selection policies. -* Provide extensibility APIs where encodings and other aspects of the file can - be extended.  +- Provide extensibility APIs where encodings and other aspects of the file can + be extended. -* Clear separation between logical and physical encoded types. - -* And more. +- Clear separation between logical and physical encoded types. +- And more. Nimble is a work in progress, and many of these features above are still under design and/or active development. As such, Nimble does not provide stability or versioning guarantees (yet). They will be eventually provided with a future -stable release. Use it at your own risk.  
+stable release. Use it at your own risk. ## Build Nimble’s CMake build system is self-sufficient and able to either locate its -main dependencies or compile them locally. In order to compile it, one can simply: +main dependencies or compile them locally. In order to compile it, one can +simply: ```shell $ git clone git@github.com:facebookexternal/nimble.git @@ -81,29 +81,34 @@ $ folly_SOURCE=BUNDLED make Nimble builds have been tested using clang 15 and 16. It should automatically compile the following dependencies: gtest, glog, folly, abseil, and velox. You -may need to first install the following system dependencies for these to -compile (example from Ubuntu 22.04): +may need to first install the following system dependencies for these to compile +(example from Ubuntu 22.04): ```shell $ sudo apt install -y \ + git \ + cmake \ flatbuffers-compiler \ + protobuf-compiler \ libflatbuffers-dev \ libgflags-dev \ libunwind-dev \ libgoogle-glog-dev \ libdouble-conversion-dev \ libevent-dev \ + liblz4-dev \ liblzo2-dev \ libelf-dev \ libdwarf-dev \ libsnappy-dev \ + libssl-dev \ bison \ flex \ libfl-dev ``` -Although Nimble’s codebase is today closely coupled with velox, we intend to decouple -them in the future. +Although Nimble’s codebase is today closely coupled with velox, we intend to +decouple them in the future. 
## License diff --git a/dwio/nimble/encodings/Compression.cpp b/dwio/nimble/encodings/Compression.cpp index eb956ed..53be6ec 100644 --- a/dwio/nimble/encodings/Compression.cpp +++ b/dwio/nimble/encodings/Compression.cpp @@ -18,7 +18,7 @@ #include "dwio/nimble/common/Exceptions.h" #include "dwio/nimble/encodings/ZstdCompressor.h" -#ifdef META_INTERNAL_COMPRESSOR +#ifndef DISABLE_META_INTERNAL_COMPRESSOR #include "dwio/nimble/encodings/fb/MetaInternalCompressor.h" #endif @@ -31,7 +31,7 @@ struct CompressorRegistry { compressors.reserve(2); compressors.emplace( CompressionType::Zstd, std::make_unique<ZstdCompressor>()); -#ifdef META_INTERNAL_COMPRESSOR +#ifndef DISABLE_META_INTERNAL_COMPRESSOR compressors.emplace( CompressionType::MetaInternal, std::make_unique<MetaInternalCompressor>()); diff --git a/dwio/nimble/encodings/EncodingSelectionPolicy.h b/dwio/nimble/encodings/EncodingSelectionPolicy.h index 45a1830..d208847 100644 --- a/dwio/nimble/encodings/EncodingSelectionPolicy.h +++ b/dwio/nimble/encodings/EncodingSelectionPolicy.h @@ -198,7 +198,7 @@ class ManualEncodingSelectionPolicy : public EncodingSelectionPolicy { : compressionOptions_{std::move(compressionOptions)} {} CompressionInformation compression() const override { -#ifdef META_INTERNAL_COMPRESSOR +#ifndef DISABLE_META_INTERNAL_COMPRESSOR CompressionInformation information{ .compressionType = CompressionType::MetaInternal}; information.parameters.metaInternal.compressionLevel =