// Provenance: commit 1f2fc9885b9894c110acc9556ad173ec80aee256 by Damir Zainullin
// (Mon, 2 Dec 2024), "Top 10 ports - Introduce Count-Min Sketch class";
// introduces input/countminsketch.hpp.

/**
 * \file countminsketch.hpp
 * \brief Template class implementing Count-Min Sketch algorithm.
 *        Used to estimate frequency of events in a stream and effectively find top-k frequent
 *        events.
 * \author Damir Zainullin
 * \date 2024
 */
#pragma once

#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <limits>
#include <queue>
#include <unordered_map>
#include <utility>
#include <vector>

namespace ipxp {

namespace detail {

/**
 * @brief Compute the sketch row length ceil(e / (relative_error / 10000)) at compile time.
 *
 * std::ceil and std::exp are not constexpr in ISO C++17, so the rounding is done by hand to keep
 * the value usable as an array bound on every conforming compiler.
 *
 * @param relative_error Relative error on the 1..9999 scale.
 * @return Row length; 0 when @p relative_error is 0, so the class-level static_assert reports the
 *         misuse instead of a division by zero inside a constant expression.
 */
constexpr size_t count_min_sketch_row_length(size_t relative_error) noexcept
{
	if (relative_error == 0) {
		return 0;
	}
	constexpr double EULER_NUMBER = 2.718281828459045235360287;
	const double exact_length = EULER_NUMBER / (static_cast<double>(relative_error) / 10000.0);
	const auto truncated = static_cast<size_t>(exact_length);
	return static_cast<double>(truncated) < exact_length ? truncated + 1 : truncated;
}

} // namespace detail

/**
 * @brief Template class implementing Count-Min Sketch algorithm.
 * Used to estimate frequency of events in a stream and effectively find top-k frequent events.
 *
 * Besides the counter table, the class keeps up to 5 * TopEventsCount candidate events in a
 * min-heap (ordered by frequency) together with a map holding the latest estimate of each
 * candidate. An event that is not a candidate replaces the least frequent candidate once its
 * estimate becomes strictly greater.
 *
 * NOTE(review): the template argument lists and include targets of this file were lost when the
 * patch was extracted; they are reconstructed from usage (size_t counters, hash functions of the
 * form size_t(const EventType&)) - confirm against the original header.
 *
 * @tparam EventType Type of tracked event. It is default-constructed inside get_top_events() and
 *         compared with operator< to break frequency ties.
 * @tparam HashFunctionsCount Count of passed hash functions.
 * @tparam TopEventsCount Count of top frequent events to store.
 * @tparam RelativeError Relative error of the algorithm on scale from 1 to 9999, where 1 is the
 *         highest precision. Lower error leads to higher memory consumption.
 * @tparam EventsEqual Function object to compare events.
 */
template<
	typename EventType,
	size_t HashFunctionsCount,
	size_t TopEventsCount,
	size_t RelativeError,
	typename EventsEqual = std::equal_to<EventType>>
class CountMinSketch {
	// Checked at class scope so that misuse is reported as soon as the class is instantiated.
	static_assert(TopEventsCount > 0, "TopEventsCount must be greater than 0");
	static_assert(
		RelativeError > 0 && RelativeError < 10000,
		"RelativeError must be between 0 and 10000");
	static_assert(HashFunctionsCount > 0, "HashFunctionsCount must be greater than 0");

	/** @brief Event together with its estimated frequency. */
	struct EventCount {
		EventType event;
		size_t frequency;
	};

	/** @brief Heap ordering that keeps the least frequent candidate on top of the heap. */
	struct MoreFrequent {
		bool operator()(const EventCount& a, const EventCount& b) const noexcept
		{
			return a.frequency > b.frequency;
		}
	};

	using HashFunction = std::function<size_t(const EventType&)>;

	/** @brief Count of candidate events tracked in the heap. */
	static constexpr size_t MOST_FREQUENT_EVENTS_COUNT = TopEventsCount * 5;

public:
	/** @brief Length of row for each hash function in table. */
	static constexpr size_t ROW_LENGTH = detail::count_min_sketch_row_length(RelativeError);

	/**
	 * @brief Constructor.
	 * @param hash_functions Array of hash functions to use. The first one is additionally used
	 *        as the hasher of the internal candidate map.
	 */
	CountMinSketch(std::array<HashFunction, HashFunctionsCount> hash_functions) noexcept
		: m_event_counts {} // value-initialization zeroes every counter
		, m_hash_functions(std::move(hash_functions))
		, m_minimal_heap()
		, m_in_heap(0, m_hash_functions[0], EventsEqual())
	{
	}

	/**
	 * @brief Insert event into the sketch.
	 *
	 * Increments one counter per hash function; the minimum of the touched counters is the
	 * frequency estimate of the event. The estimate is then used to maintain the candidate set.
	 *
	 * @param event Event to insert.
	 */
	void insert(const EventType& event) noexcept
	{
		size_t event_frequency = std::numeric_limits<size_t>::max();
		for (size_t row = 0; row < HashFunctionsCount; ++row) {
			size_t& counter = m_event_counts[row][get_event_index(row, event)];
			++counter;
			event_frequency = std::min(event_frequency, counter);
		}

		// Already a candidate: only the map is updated, the heap entry is refreshed lazily.
		if (const auto tracked = m_in_heap.find(event); tracked != m_in_heap.end()) {
			tracked->second = event_frequency;
			return;
		}

		if (m_minimal_heap.size() < MOST_FREQUENT_EVENTS_COUNT) {
			m_minimal_heap.push({event, event_frequency});
			m_in_heap.emplace(event, event_frequency);
			return;
		}

		// The eviction decision must be made against the real least frequent candidate.
		update_least_freq_event();

		if (event_frequency > m_minimal_heap.top().frequency) {
			m_in_heap.erase(m_minimal_heap.top().event);
			m_minimal_heap.pop();
			m_minimal_heap.push({event, event_frequency});
			m_in_heap.emplace(event, event_frequency);
		}
	}

	/**
	 * @brief Function to get current most frequent events.
	 *
	 * Events are ordered by descending frequency; events with equal frequency are ordered by
	 * ascending event value.
	 *
	 * @return Pair of array of top frequent events and its real size. Array elements past the
	 *         real size are value-initialized.
	 */
	std::pair<std::array<EventCount, TopEventsCount>, uint16_t> get_top_events() const noexcept
	{
		std::array<EventCount, MOST_FREQUENT_EVENTS_COUNT> candidates {};
		// const auto& binds to the map's pair<const Key, Value> without creating a temporary.
		const auto candidates_end = std::transform(
			m_in_heap.begin(),
			m_in_heap.end(),
			candidates.begin(),
			[](const auto& entry) -> EventCount { return {entry.first, entry.second}; });

		const size_t inserted = std::min(m_in_heap.size(), TopEventsCount);
		// Only the populated part of the buffer takes part in the sort.
		std::partial_sort(
			candidates.begin(),
			candidates.begin() + static_cast<std::ptrdiff_t>(inserted),
			candidates_end,
			[](const EventCount& a, const EventCount& b) {
				return a.frequency > b.frequency
					|| (a.frequency == b.frequency && a.event < b.event);
			});

		std::array<EventCount, TopEventsCount> res {};
		std::copy_n(candidates.begin(), inserted, res.begin());
		return {res, static_cast<uint16_t>(inserted)};
	}

private:
	/**
	 * @brief Map an event to its counter position in the row of the given hash function.
	 * @param hash_function_index Index of the hash function (and of the table row).
	 * @param event Event to hash.
	 * @return Column index in range [0, ROW_LENGTH).
	 */
	size_t get_event_index(size_t hash_function_index, const EventType& event) const noexcept
	{
		return m_hash_functions[hash_function_index](event) % ROW_LENGTH;
	}

	/**
	 * @brief Bring the top of the heap in sync with the candidate map.
	 *
	 * Frequencies stored in the heap become stale when a candidate is inserted again, because
	 * only the map is updated then. Stored values never exceed the map values (counters only
	 * grow), so once the top entry agrees with the map it is the true least frequent candidate.
	 * Every iteration makes one entry up to date, hence the loop runs at most heap-size times.
	 */
	void update_least_freq_event() noexcept
	{
		while (!m_minimal_heap.empty()) {
			const EventCount& top = m_minimal_heap.top();
			const auto tracked = m_in_heap.find(top.event);
			if (tracked == m_in_heap.end() || tracked->second == top.frequency) {
				return;
			}
			EventCount refreshed {top.event, tracked->second};
			m_minimal_heap.pop();
			m_minimal_heap.push(std::move(refreshed));
		}
	}

	/** Counter table: one row of ROW_LENGTH counters per hash function. */
	std::array<std::array<size_t, ROW_LENGTH>, HashFunctionsCount> m_event_counts;
	/** Hash functions, one per table row. */
	std::array<HashFunction, HashFunctionsCount> m_hash_functions;
	/** Candidate events, least frequent (possibly stale) entry on top. */
	std::priority_queue<EventCount, std::vector<EventCount>, MoreFrequent> m_minimal_heap;
	/** Latest frequency estimate of every candidate stored in the heap. */
	std::unordered_map<EventType, size_t, HashFunction, EventsEqual> m_in_heap;
};

} // namespace ipxp