Merge pull request #42 from CESNET/deduplicator-module
Deduplicator module
hynekkar authored Oct 21, 2024
2 parents 8e87067 + 9d60f88 commit ec647d7
Showing 17 changed files with 1,124 additions and 1 deletion.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -23,7 +23,7 @@ option(NM_NG_BUILD_WITH_UBSAN "Build with Undefined Behavior Sanitizer (only f

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pedantic -Wall -Wextra -Wunused -Wconversion -Wsign-conversion")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -Werror")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -ggdb3")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -ggdb3 -fsanitize=address")

if (NM_NG_BUILD_WITH_ASAN)
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fno-omit-frame-pointer -fsanitize=address -fsanitize-recover=address")
1 change: 1 addition & 0 deletions README.md
@@ -6,3 +6,4 @@ functionality/purposes are:

* [Sampler](modules/sampler/): sample records at the given rate.
* [Telemetry](modules/telemetry/): provides unirec telemetry of the input interface.
* [Deduplicator](modules/deduplicator/): omit duplicate records.
1 change: 1 addition & 0 deletions common/CMakeLists.txt
@@ -16,6 +16,7 @@ target_link_libraries(common PUBLIC
unirec::unirec++
)


target_include_directories(common PUBLIC
include
spdlog::spdlog
1 change: 1 addition & 0 deletions common/external/CMakeLists.txt
@@ -7,3 +7,4 @@ include(telemetry.cmake)
include(spdlog.cmake)
include(rapidcsv.cmake)
include(argparse.cmake)
include(xxhash.cmake)
19 changes: 19 additions & 0 deletions common/external/xxhash.cmake
@@ -0,0 +1,19 @@
# xxHash library (a C library that provides fast hash functions)

set(GIT_REPO https://github.com/Cyan4973/xxHash)

FetchContent_Declare(
xxhash
GIT_REPOSITORY ${GIT_REPO}
GIT_TAG v0.8.2
)

FetchContent_MakeAvailable(xxhash)

set(XXHASH_SRC
${xxhash_SOURCE_DIR}/xxhash.c
)

add_library(xxhash STATIC ${XXHASH_SRC})

target_include_directories(xxhash PUBLIC ${xxhash_SOURCE_DIR})
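
The xxhash target defined above is linked by the deduplicator executable (see modules/deduplicator/src/CMakeLists.txt further down) and used in deduplicator.cpp through `XXH3_64bits`. A minimal standalone sketch of that call, assuming the xxhash include directory is already on the compiler's search path:

```
// Standalone sketch: hashing a plain-old-data key with xxHash's 64-bit XXH3,
// mirroring the xxHasher helper in deduplicator.cpp further below.
#include <cstdint>
#include <cstdio>

#include <xxhash.h>

int main()
{
    const std::uint64_t key = 0x0123456789abcdefULL;
    const std::uint64_t hash = XXH3_64bits(&key, sizeof(key));
    std::printf("XXH3 hash: %016llx\n", static_cast<unsigned long long>(hash));
    return 0;
}
```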
1 change: 1 addition & 0 deletions modules/CMakeLists.txt
@@ -1,2 +1,3 @@
add_subdirectory(sampler)
add_subdirectory(telemetry)
add_subdirectory(deduplicator)
1 change: 1 addition & 0 deletions modules/deduplicator/CMakeLists.txt
@@ -0,0 +1 @@
add_subdirectory(src)
52 changes: 52 additions & 0 deletions modules/deduplicator/README.md
@@ -0,0 +1,52 @@
# Deduplicator module - README

## Description
The module avoids forwarding duplicate Unirec records, which appear when the same flow
is exported twice by different exporters and sent to the same collector.
It identifies and forwards only unique records, dropping records that have already been seen.
Previously seen records are tracked in a hash map.

## Interfaces
- Input: 1
- Output: 1

## Parameters
### Common TRAP parameters
- `-h [trap,1]` Print help message for this module / for libtrap specific parameters.
- `-i IFC_SPEC` Specification of interface types and their parameters.
- `-v` Be verbose.
- `-vv` Be more verbose.
- `-vvv` Be even more verbose.

### Module specific parameters
- `-s, --size <int>` Exponent of the hash table capacity: the table keeps up to 2^size records simultaneously. Default is 20, i.e. 2^20 records.
- `-t, --timeout <int>` Time window, in milliseconds, during which similar flows are considered duplicates. Default value is 5000 (5 s).
- `-m, --appfs-mountpoint <path>` Path where the appFs directory will be mounted.

## Identification of duplicate flows
Flows are considered duplicates when they (a code sketch of this rule follows the list):
- arrive at the collector less than `--timeout` milliseconds apart
- have the same source and destination IP addresses, ports and protocol field values
- have distinct `LINK_BIT_FIELD` values
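
The following self-contained sketch restates the rule above in code. It is an illustration only, not the module's code path (the real check is `Deduplicator::isDuplicate` in deduplicator.cpp below), and it uses plain types instead of `Nemea::IpAddress` so that it stands alone.

```
#include <array>
#include <chrono>
#include <cstdint>

// Plain stand-in for the module's FlowKey (flowKey.hpp uses Nemea::IpAddress).
struct SketchFlowKey {
    std::array<std::uint8_t, 16> srcIp;
    std::array<std::uint8_t, 16> dstIp;
    std::uint16_t srcPort;
    std::uint16_t dstPort;
    std::uint8_t proto;
};

static bool sameKey(const SketchFlowKey& a, const SketchFlowKey& b)
{
    return a.srcIp == b.srcIp && a.dstIp == b.dstIp && a.srcPort == b.srcPort
        && a.dstPort == b.dstPort && a.proto == b.proto;
}

// A later record duplicates an earlier one when the key fields match, the two
// records arrive within the timeout window and their LINK_BIT_FIELD values differ.
bool isDuplicateOf(
    const SketchFlowKey& earlierKey, std::uint64_t earlierLinkBitField,
    std::chrono::steady_clock::time_point earlierArrival,
    const SketchFlowKey& laterKey, std::uint64_t laterLinkBitField,
    std::chrono::steady_clock::time_point laterArrival,
    std::chrono::milliseconds timeout)
{
    return sameKey(earlierKey, laterKey) && (laterArrival - earlierArrival) < timeout
        && earlierLinkBitField != laterLinkBitField;
}
```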

## Usage Examples
```
# Data from the input unix socket interface "in" is processed; records that duplicate
# records received during the last 1000 milliseconds are omitted, and the rest are
# forwarded to the output interface "out". Transient storage is a hash map with
# 2^15 records.
$ deduplicator -i "u:in,u:out" -s 15 -t 1000
```

## Telemetry data format
```
├─ input/
│ └─ stats
└─ deduplicator/
└─ statistics
```

The statistics file contains the following flow counters:
- Replaced flows: flows whose insertion into a full bucket evicted the oldest flow in that bucket.
- Deduplicated flows: flows that were identified as duplicates and omitted.
- Inserted flows: flows that were inserted normally (neither Replaced nor Deduplicated).
18 changes: 18 additions & 0 deletions modules/deduplicator/src/CMakeLists.txt
@@ -0,0 +1,18 @@
add_executable(deduplicator
main.cpp
deduplicator.cpp
)

target_link_libraries(deduplicator PRIVATE
telemetry::telemetry
telemetry::appFs
common
rapidcsv
unirec::unirec++
unirec::unirec
trap::trap
argparse
xxhash
)

install(TARGETS deduplicator DESTINATION ${INSTALL_DIR_BIN})
105 changes: 105 additions & 0 deletions modules/deduplicator/src/deduplicator.cpp
@@ -0,0 +1,105 @@
/**
* @file
* @author Damir Zainullin <[email protected]>
* @brief Definition of the Deduplicator class
*
* SPDX-License-Identifier: BSD-3-Clause
*/

#include "deduplicator.hpp"

#include <stdexcept>
#include <type_traits>
#include <xxhash.h>

using namespace Nemea;

namespace Deduplicator {

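/**
 * @brief Hashes the raw bytes of @p key with xxHash's 64-bit XXH3 variant.
 */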
template <typename Key>
static uint64_t xxHasher(const Key& key)
{
return XXH3_64bits(reinterpret_cast<const void*>(&key), sizeof(key));
}

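/**
 * @brief Returns @p value shifted forward by @p timeout milliseconds.
 */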
static Deduplicator::Timestamp timeSum(const Deduplicator::Timestamp& value, uint64_t timeout)
{
return value + std::chrono::milliseconds(timeout);
}

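/**
 * @brief Resolves the Unirec field id for @p str; throws std::runtime_error on an invalid name.
 */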
static ur_field_id_t getUnirecIdByName(const char* str)
{
auto unirecId = ur_get_id_by_name(str);
if (unirecId == UR_E_INVALID_NAME) {
throw std::runtime_error(std::string("Invalid Unirec name: ") + str);
}
return static_cast<ur_field_id_t>(unirecId);
}

Deduplicator::Deduplicator(const DeduplicatorHashMap::TimeoutHashMapParameters& parameters)
: m_hashMap(parameters, xxHasher<FlowKey>, std::less<>(), timeSum)
{
constexpr const size_t timeoutBucketSize = 256;
static_assert(
sizeof(DeduplicatorHashMap::HashMapTimeoutBucket) == timeoutBucketSize,
"TimeoutBucket size is not 256 bytes");
}

void Deduplicator::updateUnirecIds()
{
m_ids.srcIpId = getUnirecIdByName("SRC_IP");
m_ids.dstIpId = getUnirecIdByName("DST_IP");
m_ids.srcPortId = getUnirecIdByName("SRC_PORT");
m_ids.dstPortId = getUnirecIdByName("DST_PORT");
m_ids.protocolId = getUnirecIdByName("PROTOCOL");
m_ids.linkBitFieldId = getUnirecIdByName("LINK_BIT_FIELD");
m_ids.timeLastId = getUnirecIdByName("TIME_LAST");
}

bool Deduplicator::isDuplicate(UnirecRecordView& view)
{
FlowKey flowKey;
flowKey.srcIp = view.getFieldAsType<IpAddress>(m_ids.srcIpId);
flowKey.dstIp = view.getFieldAsType<IpAddress>(m_ids.dstIpId);
flowKey.srcPort = view.getFieldAsType<uint16_t>(m_ids.srcPortId);
flowKey.dstPort = view.getFieldAsType<uint16_t>(m_ids.dstPortId);
flowKey.proto = view.getFieldAsType<uint8_t>(m_ids.protocolId);
auto linkBitField = view.getFieldAsType<uint64_t>(m_ids.linkBitFieldId);

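// A single insert into the timeout hash map decides the outcome: the flow is
// either newly inserted, replaces the oldest entry of a full bucket, or hits
// an entry already stored within the timeout window.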
const auto [it, insertResult]
= m_hashMap.insert({flowKey, linkBitField}, std::chrono::steady_clock::now());

if (insertResult == DeduplicatorHashMap::HashMapTimeoutBucket::InsertResult::INSERTED) {
m_inserted++;
return false;
}
if (insertResult == DeduplicatorHashMap::HashMapTimeoutBucket::InsertResult::REPLACED) {
m_replaced++;
return false;
}
if (*it != linkBitField) {
m_deduplicated++;
return true;
}
m_inserted++;
return false;
}

void Deduplicator::setTelemetryDirectory(const std::shared_ptr<telemetry::Directory>& directory)
{
m_holder.add(directory);

const telemetry::FileOps fileOps
= {[this]() {
telemetry::Dict dict;
dict["replacedCount"] = telemetry::Scalar((long unsigned int) m_replaced);
dict["insertedCount"] = telemetry::Scalar((long unsigned int) m_inserted);
dict["deduplicatedCount"] = telemetry::Scalar((long unsigned int) m_deduplicated);
return dict;
},
nullptr};

m_holder.add(directory->addFile("statistics", fileOps));
}

} // namespace Deduplicator
88 changes: 88 additions & 0 deletions modules/deduplicator/src/deduplicator.hpp
@@ -0,0 +1,88 @@
/**
* @file
* @author Damir Zainullin <[email protected]>
* @brief Declaration of the Deduplicator class
*
* SPDX-License-Identifier: BSD-3-Clause
*/

#pragma once

#include "flowKey.hpp"
#include "timeoutHashMap.hpp"
#include "unirecidstorage.hpp"

#include <atomic>
#include <memory>
#include <telemetry.hpp>
#include <thread>
#include <unirec++/unirecRecordView.hpp>
#include <unirec++/urTime.hpp>
#include <vector>

namespace Deduplicator {

/**
* @brief Deduplicator class to omit duplicate records
*/
class Deduplicator {
public:
/**
* @brief Timestamp type used by deduplicator.
*/
using Timestamp = std::chrono::time_point<std::chrono::steady_clock>;
/**
* @brief Link bit field is represented by uint64_t.
*/
using LinkBitField = uint64_t;
/**
* @brief Timeout hash map type used by the deduplicator.
*/
using DeduplicatorHashMap = TimeoutHashMap<
FlowKey,
LinkBitField,
Timestamp,
std::function<size_t(const FlowKey&)>,
std::function<bool(const Timestamp&, const Timestamp&)>,
std::function<Timestamp(const Timestamp&, uint64_t)>>;

static inline const int DEFAULT_HASHMAP_TIMEOUT = 5000; ///< Default timeout - 5s

/**
* @brief Deduplicator constructor
*
* @param parameters Parameters to build hash table of deduplicator
*/
explicit Deduplicator(const DeduplicatorHashMap::TimeoutHashMapParameters& parameters);

/**
* @brief Checks whether the given UnirecRecordView is a duplicate.
* @param view The Unirec record to check.
* @return True if the record is a duplicate, false otherwise.
*/
bool isDuplicate(Nemea::UnirecRecordView& view);

/**
* @brief Sets the telemetry directory for the deduplicator.
* @param directory directory for deduplicator telemetry.
*/
void setTelemetryDirectory(const std::shared_ptr<telemetry::Directory>& directory);

/**
* @brief Update the Unirec IDs of required fields after a template format change.
*/
void updateUnirecIds();

private:
DeduplicatorHashMap m_hashMap; ///< Hash map to keep flows

uint32_t m_replaced {0}; ///< Count of replaced flows
uint32_t m_deduplicated {0}; ///< Count of deduplicated flows
uint32_t m_inserted {0}; ///< Count of inserted flows

telemetry::Holder m_holder;

UnirecIdStorage m_ids; ///< Ids of Unirec fields used by deduplicator module
};

} // namespace Deduplicator
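
main.cpp is listed in this commit but not shown in this diff. The sketch below outlines one way the class declared above could be driven; it is an assumption, not the actual driver, and the record receive/forward steps are left as comments because the Unirec++ interface handling lives outside this file.

```
#include "deduplicator.hpp"

#include <memory>
#include <telemetry.hpp>

void runDeduplication(
    const Deduplicator::Deduplicator::DeduplicatorHashMap::TimeoutHashMapParameters& parameters,
    const std::shared_ptr<telemetry::Directory>& telemetryRootDir)
{
    Deduplicator::Deduplicator deduplicator(parameters);
    deduplicator.setTelemetryDirectory(telemetryRootDir);

    // Pseudocode for the processing loop; the real main.cpp may differ.
    // for each Nemea::UnirecRecordView view received on the input interface:
    //     after a template format change, call deduplicator.updateUnirecIds();
    //     if (!deduplicator.isDuplicate(view)) {
    //         forward the record to the output interface;
    //     }
}
```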
27 changes: 27 additions & 0 deletions modules/deduplicator/src/flowKey.hpp
@@ -0,0 +1,27 @@
/**
* @file
* @author Damir Zainullin <[email protected]>
* @brief Declaration of the FlowKey structure
*
* SPDX-License-Identifier: BSD-3-Clause
*/

#pragma once

#include <chrono>
#include <unirec++/ipAddress.hpp>

namespace Deduplicator {

/**
* @brief Represents the key fields of a flow used to identify duplicates.
*/
struct FlowKey {
Nemea::IpAddress srcIp; ///< Source IP address.
Nemea::IpAddress dstIp; ///< Destination IP address.
uint16_t srcPort; ///< Source port.
uint16_t dstPort; ///< Destination port.
uint8_t proto; ///< Protocol ID.
};

} // namespace Deduplicator
