diff --git a/src/components/sscma-micro/sscma/core/engine/ma_engine.h b/src/components/sscma-micro/sscma/core/engine/ma_engine.h
index ac71936..e7b5fd1 100644
--- a/src/components/sscma-micro/sscma/core/engine/ma_engine.h
+++ b/src/components/sscma-micro/sscma/core/engine/ma_engine.h
@@ -13,9 +13,9 @@ using EngineDefault = ma::engine::EngineTFLite;
 using EngineDefault = ma::engine::EngineCVI;
 #endif
 
-#ifdef MA_USE_ENGINE_HALIO
-#include "ma_engine_halio.h"
-using EngineDefault = ma::engine::EngineHalio;
+#ifdef MA_USE_ENGINE_HAILO
+#include "ma_engine_hailo.h"
+using EngineDefault = ma::engine::EngineHailo;
 #endif
 
 #endif  // _MA_ENGINE_H_
\ No newline at end of file
diff --git a/src/components/sscma-micro/sscma/core/engine/ma_engine_halio.cpp b/src/components/sscma-micro/sscma/core/engine/ma_engine_hailo.cpp
similarity index 85%
rename from src/components/sscma-micro/sscma/core/engine/ma_engine_halio.cpp
rename to src/components/sscma-micro/sscma/core/engine/ma_engine_hailo.cpp
index 3a2bbb3..d8792a7 100644
--- a/src/components/sscma-micro/sscma/core/engine/ma_engine_halio.cpp
+++ b/src/components/sscma-micro/sscma/core/engine/ma_engine_hailo.cpp
@@ -1,7 +1,8 @@
-#include "ma_engine_halio.h"
+#include "ma_engine_hailo.h"
 
-#if MA_USE_ENGINE_HALIO
+#if MA_USE_ENGINE_HAILO
 
+#include <atomic>
 #include <chrono>
 #include <cstdint>
 #include <cstring>
@@ -11,33 +12,57 @@ namespace ma::engine {
 
 using namespace std;
 
-EngineHalio::EngineHalio() : _vdevice(nullptr), _model(nullptr), _configured_model(nullptr), _bindings(nullptr) {}
+EngineHailo::EngineHailo() : _vdevice(nullptr), _model(nullptr), _configured_model(nullptr), _bindings(nullptr) {}
 
-EngineHalio::~EngineHalio() {}
+EngineHailo::~EngineHailo() {}
 
-ma_err_t EngineHalio::init() {
+// Acquires the Hailo virtual device used by this engine.
+//
+// The call is idempotent: an engine that already holds a device returns MA_OK at
+// once. Otherwise it joins the device currently shared by other engines, or
+// creates one through VDevice::create() when no engine holds it.
+//
+// Returns MA_OK on success and MA_FAILED when no device could be obtained.
+ma_err_t EngineHailo::init() {
     if (_vdevice) {
         return MA_OK;
     }
 
-    auto vdevice = VDevice::create();
-    if (!vdevice) {
+    // Only a weak reference is cached, so ownership stays with the engines: the
+    // device is destroyed when the last engine drops its shared_ptr and is created
+    // again by the next init(); no separate reference count is needed.
+    // NOTE(review): the cache is not locked; concurrent init() calls are assumed
+    // to be serialised by the caller -- confirm.
+    static weak_ptr<VDevice> shared_device;
+
+    _vdevice = shared_device.lock();
+    if (!_vdevice) {
+        // No engine holds a device right now: create a fresh one.
+        auto created = VDevice::create();
+        if (!created) {
+            return MA_FAILED;
+        }
+        _vdevice = created.release();
+        shared_device = _vdevice;
+    }
+
+    // Defensive: a create() that succeeds with an empty pointer is still a failure.
+    if (!_vdevice) {
         return MA_FAILED;
     }
-    _vdevice = move(vdevice.value());
 
     return MA_OK;
 }
 
-ma_err_t EngineHalio::init(size_t size) {
+ma_err_t EngineHailo::init(size_t size) {
     return init();
 }
 
-ma_err_t EngineHalio::init(void* pool, size_t size) {
+ma_err_t EngineHailo::init(void* pool, size_t size) {
     return init();
 }
 
-ma_err_t EngineHalio::run() {
+ma_err_t EngineHailo::run() {
     if (!_configured_model || !_bindings) {
         return MA_FAILED;
     }
@@ -65,7 +90,7 @@ ma_err_t EngineHalio::run() {
 }
 
 #if MA_USE_FILESYSTEM
-ma_err_t EngineHalio::load(const string& model_path) {
+ma_err_t EngineHailo::load(const string& model_path) {
 
     {
         _input_tensors.clear();
@@ -324,12 +349,12 @@ ma_err_t EngineHalio::load(const string& model_path) {
     return MA_OK;
 }
 
-ma_err_t EngineHalio::load(const char* model_path) {
+ma_err_t EngineHailo::load(const char* model_path) {
     return load(string(model_path));
 }
 #endif
 
-ma_err_t EngineHalio::load(const void* model_data, size_t model_size) {
+ma_err_t EngineHailo::load(const void* model_data, size_t model_size) {
 #if MA_USE_FILESYSTEM
     string model_path(reinterpret_cast<const char*>(model_data), model_size);
     return load(model_path);
@@ -338,15 +363,15 @@ ma_err_t EngineHalio::load(const void* model_data, size_t model_size) {
 #endif
 }
 
-int32_t EngineHalio::getInputSize() {
+int32_t EngineHailo::getInputSize() {
     return _input_tensors.size();
 }
 
-int32_t EngineHalio::getOutputSize() {
+int32_t EngineHailo::getOutputSize() {
     return _output_tensors.size();
 }
 
-ma_tensor_t EngineHalio::getInput(int32_t index) {
+ma_tensor_t EngineHailo::getInput(int32_t index) {
     if (index < 0 || index >= static_cast<int32_t>(_input_tensors.size())) {
         return {0};
     }
@@ -354,7 +379,7 @@ ma_tensor_t EngineHalio::getInput(int32_t index) {
     return _input_tensors[index] ? *_input_tensors[index] : ma_tensor_t{0};
 }
 
-ma_tensor_t EngineHalio::getOutput(int32_t index) {
+ma_tensor_t EngineHailo::getOutput(int32_t index) {
     if (index < 0 || index >= static_cast<int32_t>(_output_tensors.size())) {
         return {0};
     }
@@ -362,7 +387,7 @@ ma_tensor_t EngineHalio::getOutput(int32_t index) {
     return _output_tensors[index] ? *_output_tensors[index] : ma_tensor_t{0};
 }
 
-ma_shape_t EngineHalio::getInputShape(int32_t index) {
+ma_shape_t EngineHailo::getInputShape(int32_t index) {
     if (index < 0 || index >= static_cast<int32_t>(_input_tensors.size())) {
         return {0};
     }
@@ -370,7 +395,7 @@ ma_shape_t EngineHalio::getInputShape(int32_t index) {
     return _input_tensors[index] ? _input_tensors[index]->shape : ma_shape_t{0};
 }
 
-ma_shape_t EngineHalio::getOutputShape(int32_t index) {
+ma_shape_t EngineHailo::getOutputShape(int32_t index) {
     if (index < 0 || index >= static_cast<int32_t>(_output_tensors.size())) {
         return {0};
     }
@@ -378,7 +403,7 @@ ma_shape_t EngineHalio::getOutputShape(int32_t index) {
     return _output_tensors[index] ? _output_tensors[index]->shape : ma_shape_t{0};
 }
 
-ma_quant_param_t EngineHalio::getInputQuantParam(int32_t index) {
+ma_quant_param_t EngineHailo::getInputQuantParam(int32_t index) {
     if (index < 0 || index >= static_cast<int32_t>(_input_tensors.size())) {
         return {0};
     }
@@ -386,7 +411,7 @@ ma_quant_param_t EngineHalio::getInputQuantParam(int32_t index) {
     return _input_tensors[index] ? _input_tensors[index]->quant_param : ma_quant_param_t{0};
 }
 
-ma_quant_param_t EngineHalio::getOutputQuantParam(int32_t index) {
+ma_quant_param_t EngineHailo::getOutputQuantParam(int32_t index) {
     if (index < 0 || index >= static_cast<int32_t>(_output_tensors.size())) {
         return {0};
     }
@@ -395,7 +420,17 @@ ma_quant_param_t EngineHalio::getOutputQuantParam(int32_t index) {
 }
 
 
-ma_err_t EngineHalio::setInput(int32_t index, const ma_tensor_t& tensor) {
+// Copies the payload of `tensor` into input tensor `index`. Returns MA_EINVAL for
+// a bad index, a missing tensor or buffer, or a size mismatch; MA_OK otherwise.
+ma_err_t EngineHailo::setInput(int32_t index, const ma_tensor_t& tensor) {
+    if (index < 0 || index >= static_cast<int32_t>(_input_tensors.size())) {
+        return MA_EINVAL;
+    }
+    const auto& dst = _input_tensors[index];  // slots may be empty, as getInput() also checks
+    if (!dst || !dst->data.data || !tensor.data.data || tensor.size != dst->size) {
+        return MA_EINVAL;
+    }
+    std::memcpy(dst->data.data, tensor.data.data, tensor.size);
-    return MA_ENOTSUP;
+    return MA_OK;
 }
 
diff --git a/src/components/sscma-micro/sscma/core/engine/ma_engine_halio.h b/src/components/sscma-micro/sscma/core/engine/ma_engine_hailo.h
similarity index 87%
rename from src/components/sscma-micro/sscma/core/engine/ma_engine_halio.h
rename to src/components/sscma-micro/sscma/core/engine/ma_engine_hailo.h
index 2175767..76265c7 100644
--- a/src/components/sscma-micro/sscma/core/engine/ma_engine_halio.h
+++ b/src/components/sscma-micro/sscma/core/engine/ma_engine_hailo.h
@@ -1,11 +1,11 @@
-#ifndef _MA_ENGINE_HALIO_H_
-#define _MA_ENGINE_HALIO_H_
+#ifndef _MA_ENGINE_HAILO_H_
+#define _MA_ENGINE_HAILO_H_
 
 #include "../ma_common.h"
 
-#if MA_USE_ENGINE_HALIO
+#if MA_USE_ENGINE_HAILO
 
-#include "ma_engine.h"
+#include "ma_engine_base.h"
 
 #include <cstddef>
 #include <cstdint>
@@ -21,12 +21,12 @@ namespace ma::engine {
 using namespace std;
 using namespace hailort;
 
-class EngineHalio final : public Engine {
+class EngineHailo final : public Engine {
 public:
     using ExternalHandler = function<ma_err_t(int, void*, size_t)>;
 
-    EngineHalio();
-    ~EngineHalio() override;
+    EngineHailo();
+    ~EngineHailo() override;
 
     ma_err_t init() override;
     ma_err_t init(size_t size) override;
@@ -52,7 +52,7 @@ class EngineHalio final : public Engine {
     ma_err_t setInput(int32_t index, const ma_tensor_t& tensor) override;
 
 private:
-    unique_ptr<VDevice> _vdevice;
+    shared_ptr<VDevice> _vdevice;
     shared_ptr<InferModel> _model;
     shared_ptr<ConfiguredInferModel> _configured_model;
     shared_ptr<ConfiguredInferModel::Bindings> _bindings;
diff --git a/src/components/sscma-micro/sscma/core/ma_common.h b/src/components/sscma-micro/sscma/core/ma_common.h
index 3d185d5..ebe01c7 100644
--- a/src/components/sscma-micro/sscma/core/ma_common.h
+++ b/src/components/sscma-micro/sscma/core/ma_common.h
@@ -15,7 +15,7 @@
 #include "ma_exception.h"
 #include "ma_types.h"
 
-#define MA_VERSION           "2024.11.13"
+#define MA_VERSION           "2024.11.25"
 #define MA_VERSION_LENTH_MAX 32
 
 #endif  // MA_COMMON_H
diff --git a/src/components/sscma-micro/sscma/core/ma_types.h b/src/components/sscma-micro/sscma/core/ma_types.h
index 984c65e..4b5d7f3 100644
--- a/src/components/sscma-micro/sscma/core/ma_types.h
+++ b/src/components/sscma-micro/sscma/core/ma_types.h
@@ -267,7 +267,7 @@ typedef enum {
     MA_OUTPUT_TYPE_POINT        = 0x0200,
     MA_OUTPUT_TYPE_BBOX         = 0x0300,
     MA_OUTPUT_TYPE_KEYPOINT     = 0x0400,
-    MA_OUTPUT_TYPE_SEGMENTATION = 0x0500,
+    MA_OUTPUT_TYPE_SEGMENT = 0x0500,
 } ma_output_type_t;
 
 
diff --git a/src/components/sscma-micro/sscma/core/math/ma_math.h b/src/components/sscma-micro/sscma/core/math/ma_math.h
index 2822488..1b1e2dd 100644
--- a/src/components/sscma-micro/sscma/core/math/ma_math.h
+++ b/src/components/sscma-micro/sscma/core/math/ma_math.h
@@ -3,5 +3,6 @@
 
 #include "ma_math_scalars.h"
 #include "ma_math_vectors.h"
+#include "ma_math_matrix.h"
 
 #endif  // _MA_MATH_H
diff --git a/src/components/sscma-micro/sscma/core/math/ma_math_matrix.cpp b/src/components/sscma-micro/sscma/core/math/ma_math_matrix.cpp
new file mode 100644
index 0000000..46187be
--- /dev/null
+++ b/src/components/sscma-micro/sscma/core/math/ma_math_matrix.cpp
@@ -0,0 +1,22 @@
+#include "ma_math_matrix.h"
+#include "ma_math_vectors.h"
+
+#include <cmath>
+
+namespace ma::math {
+
+// Runs softmax() over every consecutive `cols`-wide row of `data` (`rows` rows in total).
+void softmax2D(float* data, size_t rows, size_t cols) {
+    for (size_t offset = 0; offset < rows * cols; offset += cols) {
+        softmax(data + offset, cols);
+    }
+}
+
+// Runs fastSoftmax() over every consecutive `cols`-wide row of `data` (`rows` rows in total).
+void fastSoftmax2D(float* data, size_t rows, size_t cols) {
+    for (size_t offset = 0; offset < rows * cols; offset += cols) {
+        fastSoftmax(data + offset, cols);
+    }
+}
+
+}
\ No newline at end of file
diff --git a/src/components/sscma-micro/sscma/core/math/ma_math_matrix.h b/src/components/sscma-micro/sscma/core/math/ma_math_matrix.h
new file mode 100644
index 0000000..c2ffb18
--- /dev/null
+++ b/src/components/sscma-micro/sscma/core/math/ma_math_matrix.h
@@ -0,0 +1,41 @@
+#ifndef _MA_MATH_MATRIX_H_
+#define _MA_MATH_MATRIX_H_
+
+#include <cstddef>
+#include <cstdint>
+
+// NOTE(review): MA_USE_LIB_XTENSOR and float32_t are not provided by the headers
+// included here; presumably the including file pulls in the project configuration
+// and types first -- confirm.
+#if MA_USE_LIB_XTENSOR
+#include <xtensor/xarray.hpp>
+#include <xtensor/xmath.hpp>
+#include <xtensor/xview.hpp>
+#endif
+
+namespace ma::math {
+
+// Runs softmax() over every consecutive `cols`-wide row of the `rows * cols` floats at `data`.
+void softmax2D(float* data, size_t rows, size_t cols);
+
+// Same row-by-row traversal as softmax2D(), handing each row to fastSoftmax() instead.
+void fastSoftmax2D(float* data, size_t rows, size_t cols);
+
+#if MA_USE_LIB_XTENSOR
+// Dequantizes the 2-D slice quantized_outputs(index, i, j), for i < dim1 and j < dim2, into
+// dequantized_outputs(i, j) as (float(q) - qp_zp) * qp_scale.
+// NOTE(review): dequantized_outputs is indexed directly, so it is assumed to be sized
+// to at least dim1 x dim2 by the caller -- confirm.
+template <typename QT>
+static void dequantizeValues2D(xt::xarray<float>& dequantized_outputs, int index, const xt::xarray<QT>& quantized_outputs, size_t dim1, size_t dim2, float32_t qp_scale, float32_t qp_zp) {
+    for (size_t i = 0; i < dim1; i++) {
+        for (size_t j = 0; j < dim2; j++) {
+            dequantized_outputs(i, j) = (float(quantized_outputs(index, i, j)) - qp_zp) * qp_scale;
+        }
+    }
+}
+#endif
+
+}  // namespace ma::math
+
+#endif  // _MA_MATH_MATRIX_H_
diff --git a/src/components/sscma-micro/sscma/core/math/ma_math_scalars.h b/src/components/sscma-micro/sscma/core/math/ma_math_scalars.h
index 1f8b494..cad47bf 100644
--- a/src/components/sscma-micro/sscma/core/math/ma_math_scalars.h
+++ b/src/components/sscma-micro/sscma/core/math/ma_math_scalars.h
@@ -21,8 +21,8 @@ constexpr inline float fastLn(float x) {
         return -std::numeric_limits<float>::infinity();
     }
 
-    auto       bx{*reinterpret_cast<unsigned int*>(&x)};
-    auto       ex{bx >> 23};
+    auto bx{*reinterpret_cast<unsigned int*>(&x)};
+    auto ex{bx >> 23};
     const auto t{static_cast<signed int>(ex) - static_cast<signed int>(127)};
 
     bx = 1065353216 | (bx & 8388607);
@@ -41,7 +41,8 @@ constexpr inline float fastExp(float x) {
     const float c{8388608.f};
     const float d{2139095040.f};
 
-    if ((x < c) | (x > d)) x = (x < c) ? 0.0f : d;
+    if ((x < c) | (x > d))
+        x = (x < c) ? 0.0f : d;
 
     uint32_t n = static_cast<uint32_t>(x);
     x          = *reinterpret_cast<float*>(&n);
@@ -49,9 +50,13 @@ constexpr inline float fastExp(float x) {
     return x;
 }
 
-constexpr inline float sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }
+constexpr inline float sigmoid(float x) {
+    return 1.0f / (1.0f + std::exp(-x));
+}
 
-constexpr inline float fastSigmoid(float x) { return 1.0f / (1.0f + fastExp(-x)); }
+constexpr inline float fastSigmoid(float x) {
+    return 1.0f / (1.0f + fastExp(-x));
+}
 
 constexpr inline float inverseSigmoid(float x) {
     float denominator = 1.0f - x;
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_classifier.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_classifier.cpp
index 9b51db1..d9f07c5 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_classifier.cpp
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_classifier.cpp
@@ -134,8 +134,9 @@ const void* Classifier::getInput() {
 }
 
 ma_err_t Classifier::run(const ma_img_t* img) {
-    // MA_ASSERT(img != nullptr);
+    // No null check on img here: the pointer is only recorded for the run below.
     input_img_ = img;
+
     return underlyingRun();
 }
 
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_detector.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_detector.cpp
index 52a0b53..90a44bf 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_detector.cpp
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_detector.cpp
@@ -63,8 +63,9 @@ const void* Detector::getInput() {
 }
 
 ma_err_t Detector::run(const ma_img_t* img) {
-    // MA_ASSERT(img != nullptr);
+    // No null check on img here: the pointer is only recorded for the run below.
     input_img_ = img;
+
     return underlyingRun();
 }
 
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_factory.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_factory.cpp
index 551d9f3..573899d 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_factory.cpp
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_factory.cpp
@@ -11,58 +11,63 @@ Model* ModelFactory::create(Engine* engine, size_t algorithm_id) {
     }
 
     switch (algorithm_id) {
-    case 0:
-    case MA_MODEL_TYPE_FOMO:
-        if (FOMO::isValid(engine)) {
-            return new FOMO(engine);
-        }
-    
-    case MA_MODEL_TYPE_IMCLS:
-        if (Classifier::isValid(engine)) {
-            return new Classifier(engine);
-        }
+        case 0:
+        case MA_MODEL_TYPE_FOMO:
+            if (FOMO::isValid(engine)) {
+                return new FOMO(engine);
+            }
 
-    case MA_MODEL_TYPE_PFLD:
-        if (PFLD::isValid(engine)) {
-            return new PFLD(engine);
-        }
+        case MA_MODEL_TYPE_IMCLS:
+            if (Classifier::isValid(engine)) {
+                return new Classifier(engine);
+            }
 
-    case MA_MODEL_TYPE_YOLOV5:
-        if (YoloV5::isValid(engine)) {
-            return new YoloV5(engine);
-        }
+        case MA_MODEL_TYPE_PFLD:
+            if (PFLD::isValid(engine)) {
+                return new PFLD(engine);
+            }
 
-    case MA_MODEL_TYPE_YOLOV8_POSE:
-        if (YoloV8Pose::isValid(engine)) {
-            return new YoloV8Pose(engine);
-        }
+        case MA_MODEL_TYPE_YOLOV5:
+            if (YoloV5::isValid(engine)) {
+                return new YoloV5(engine);
+            }
 
-    case MA_MODEL_TYPE_YOLOV8:
-        if (YoloV8::isValid(engine)) {
-            return new YoloV8(engine);
-        }
+        case MA_MODEL_TYPE_YOLOV8_POSE:
+#if MA_USE_ENGINE_HAILO
+            if (YoloV8PoseHailo::isValid(engine)) {
+                return new YoloV8PoseHailo(engine);
+            }
+#endif
+            if (YoloV8Pose::isValid(engine)) {
+                return new YoloV8Pose(engine);
+            }
 
-    case MA_MODEL_TYPE_NVIDIA_DET:
-        if (NvidiaDet::isValid(engine)) {
-            return new NvidiaDet(engine);
-        }
+        case MA_MODEL_TYPE_YOLOV8:
+            if (YoloV8::isValid(engine)) {
+                return new YoloV8(engine);
+            }
 
-    case MA_MODEL_TYPE_YOLO_WORLD:
-        if (YoloWorld::isValid(engine)) {
-            return new YoloWorld(engine);
-        }
-    case MA_MODEL_TYPE_YOLO11:
-        if (Yolo11::isValid(engine)) {
-            return new Yolo11(engine);
-        }
-    case MA_MODEL_TYPE_YOLO11_POSE:
-        if (Yolo11Pose::isValid(engine)) {
-            return new Yolo11Pose(engine);
-        }
-    case MA_MODEL_TYPE_YOLO11_SEG:
-        if (Yolo11Seg::isValid(engine)) {
-            return new Yolo11Seg(engine);
-        }
+        case MA_MODEL_TYPE_NVIDIA_DET:
+            if (NvidiaDet::isValid(engine)) {
+                return new NvidiaDet(engine);
+            }
+
+        case MA_MODEL_TYPE_YOLO_WORLD:
+            if (YoloWorld::isValid(engine)) {
+                return new YoloWorld(engine);
+            }
+        case MA_MODEL_TYPE_YOLO11:
+            if (Yolo11::isValid(engine)) {
+                return new Yolo11(engine);
+            }
+        case MA_MODEL_TYPE_YOLO11_POSE:
+            if (Yolo11Pose::isValid(engine)) {
+                return new Yolo11Pose(engine);
+            }
+        case MA_MODEL_TYPE_YOLO11_SEG:
+            if (Yolo11Seg::isValid(engine)) {
+                return new Yolo11Seg(engine);
+            }
     }
 
     return nullptr;
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_factory.h b/src/components/sscma-micro/sscma/core/model/ma_model_factory.h
index c462610..f86bfda 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_factory.h
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_factory.h
@@ -20,6 +20,7 @@
 #include "ma_model_yolov5.h"
 #include "ma_model_yolov8.h"
 #include "ma_model_yolov8_pose.h"
+#include "ma_model_yolov8_pose_hailo.h"
 
 namespace ma {
 
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_pfld.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_pfld.cpp
index e52c189..190bb80 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_pfld.cpp
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_pfld.cpp
@@ -1,7 +1,7 @@
 #include "ma_model_pfld.h"
 
 #include <algorithm>
-#include <vector>
+#include <forward_list>
 
 namespace ma::model {
 
@@ -96,11 +96,9 @@ ma_err_t PFLD::postProcessI8() {
         point.score  = 1.0;
         point.target = i / 2;
 
-        results_.push_back(std::move(point));
+        results_.push_front(std::move(point));
     }
 
-    results_.shrink_to_fit();
-
     return MA_OK;
 }
 
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_pfld.h b/src/components/sscma-micro/sscma/core/model/ma_model_pfld.h
index c8b173a..a9da5e6 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_pfld.h
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_pfld.h
@@ -1,8 +1,6 @@
 #ifndef _MA_MODEL_PFLD_
 #define _MA_MODEL_PFLD_
 
-#include <vector>
-
 #include "ma_model_point_detector.h"
 
 namespace ma::model {
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_point_detector.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_point_detector.cpp
index f4c78c4..b0c326f 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_point_detector.cpp
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_point_detector.cpp
@@ -30,13 +30,18 @@ PointDetector::PointDetector(Engine* p_engine, const char* name, ma_model_type_t
 
 PointDetector::~PointDetector() {}
 
-const std::vector<ma_point_t>& PointDetector::getResults() const {
+const std::forward_list<ma_point_t>& PointDetector::getResults() const {
     return results_;
 }
 
 ma_err_t PointDetector::preprocess() {
     ma_err_t ret = MA_OK;
 
+
+    if (input_img_ == nullptr) {
+        return MA_OK;
+    }
+
     ret = ma::cv::convert(input_img_, &img_);
     if (ret != MA_OK) {
         return ret;
@@ -51,7 +56,6 @@ ma_err_t PointDetector::preprocess() {
 }
 
 ma_err_t PointDetector::run(const ma_img_t* img) {
-    MA_ASSERT(img != nullptr);
 
     input_img_ = img;
 
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_point_detector.h b/src/components/sscma-micro/sscma/core/model/ma_model_point_detector.h
index 80761b6..e98d65f 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_point_detector.h
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_point_detector.h
@@ -1,7 +1,7 @@
 #ifndef _MA_MODEL_POINT_DETECTOR_H_
 #define _MA_MODEL_POINT_DETECTOR_H_
 
-#include <vector>
+#include <forward_list>
 
 #include "ma_model_base.h"
 
@@ -17,7 +17,7 @@ class PointDetector : public Model {
 
     bool is_nhwc_;
 
-    std::vector<ma_point_t> results_;
+    std::forward_list<ma_point_t> results_;
 
 protected:
     ma_err_t preprocess() override;
@@ -26,7 +26,7 @@ class PointDetector : public Model {
     PointDetector(Engine* engine, const char* name, ma_model_type_t type);
     virtual ~PointDetector();
 
-    const std::vector<ma_point_t>& getResults() const;
+    const std::forward_list<ma_point_t>& getResults() const;
 
     ma_err_t run(const ma_img_t* img);
 
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_pose_detector.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_pose_detector.cpp
index f0efde5..1d84ac1 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_pose_detector.cpp
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_pose_detector.cpp
@@ -42,6 +42,10 @@ const void* PoseDetector::getInput() {
 ma_err_t PoseDetector::preprocess() {
     ma_err_t ret = MA_OK;
 
+    if (input_img_ == nullptr) {
+        return MA_OK;
+    }
+
     ret = ma::cv::convert(input_img_, &img_);
     if (ret != MA_OK) {
         return ret;
@@ -56,7 +60,6 @@ ma_err_t PoseDetector::preprocess() {
 }
 
 ma_err_t PoseDetector::run(const ma_img_t* img) {
-    MA_ASSERT(img != nullptr);
 
     input_img_ = img;
 
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_segmenter.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_segmentor.cpp
similarity index 81%
rename from src/components/sscma-micro/sscma/core/model/ma_model_segmenter.cpp
rename to src/components/sscma-micro/sscma/core/model/ma_model_segmentor.cpp
index e6c54ef..a3b1ad6 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_segmenter.cpp
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_segmentor.cpp
@@ -1,12 +1,12 @@
-#include "ma_model_segmenter.h"
+#include "ma_model_segmentor.h"
 
 #include "../cv/ma_cv.h"
 
 namespace ma::model {
 
-constexpr char TAG[] = "ma::model::segmenter";
+constexpr char TAG[] = "ma::model::Segmentor";
 
-Segmenter::Segmenter(Engine* p_engine, const char* name, ma_model_type_t type) : Model(p_engine, name, MA_INPUT_TYPE_IMAGE | MA_OUTPUT_TYPE_SEGMENTATION | type) {
+Segmentor::Segmentor(Engine* p_engine, const char* name, ma_model_type_t type) : Model(p_engine, name, MA_INPUT_TYPE_IMAGE | MA_OUTPUT_TYPE_SEGMENT | type) {
     input_           = p_engine_->getInput(0);
     threshold_nms_   = 0.45;
     threshold_score_ = 0.25;
@@ -29,10 +29,14 @@ Segmenter::Segmenter(Engine* p_engine, const char* name, ma_model_type_t type) :
     img_.data = input_.data.u8;
 }
 
-Segmenter::~Segmenter() {}
-ma_err_t Segmenter::preprocess() {
+Segmentor::~Segmentor() {}
+ma_err_t Segmentor::preprocess() {
     ma_err_t ret = MA_OK;
 
+    if (input_img_ == nullptr) {
+        return MA_OK;
+    }
+
     ret = ma::cv::convert(input_img_, &img_);
     if (ret != MA_OK) {
         return ret;
@@ -46,23 +50,22 @@ ma_err_t Segmenter::preprocess() {
     return ret;
 }
 
-const void* Segmenter::getInput() {
+const void* Segmentor::getInput() {
     return static_cast<const void*>(&img_);
 }
 
-const std::forward_list<ma_segm2f_t>& Segmenter::getResults() const {
+const std::forward_list<ma_segm2f_t>& Segmentor::getResults() const {
     return results_;
 }
 
-ma_err_t Segmenter::run(const ma_img_t* img) {
-    MA_ASSERT(img != nullptr);
+ma_err_t Segmentor::run(const ma_img_t* img) {
 
     input_img_ = img;
 
     return underlyingRun();
 }
 
-ma_err_t Segmenter::setConfig(ma_model_cfg_opt_t opt, ...) {
+ma_err_t Segmentor::setConfig(ma_model_cfg_opt_t opt, ...) {
     ma_err_t ret = MA_OK;
     va_list args;
     va_start(args, opt);
@@ -83,7 +86,7 @@ ma_err_t Segmenter::setConfig(ma_model_cfg_opt_t opt, ...) {
     return ret;
 }
 
-ma_err_t Segmenter::getConfig(ma_model_cfg_opt_t opt, ...) {
+ma_err_t Segmentor::getConfig(ma_model_cfg_opt_t opt, ...) {
     ma_err_t ret = MA_OK;
     va_list args;
     void* p_arg = nullptr;
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_segmenter.h b/src/components/sscma-micro/sscma/core/model/ma_model_segmentor.h
similarity index 74%
rename from src/components/sscma-micro/sscma/core/model/ma_model_segmenter.h
rename to src/components/sscma-micro/sscma/core/model/ma_model_segmentor.h
index 258b340..05d2023 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_segmenter.h
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_segmentor.h
@@ -1,5 +1,5 @@
-#ifndef _MA_MODEL_SEGMENTER_H_
-#define _MA_MODEL_SEGMENTER_H_
+#ifndef _MA_MODEL_SEGMENTOR_H_
+#define _MA_MODEL_SEGMENTOR_H_
 
 #include <vector>
 
@@ -7,7 +7,7 @@
 
 namespace ma::model {
 
-class Segmenter : public Model {
+class Segmentor : public Model {
 protected:
     ma_tensor_t input_;
     ma_img_t img_;
@@ -24,8 +24,8 @@ class Segmenter : public Model {
     ma_err_t preprocess() override;
 
 public:
-    Segmenter(Engine* engine, const char* name, ma_model_type_t type);
-    virtual ~Segmenter();
+    Segmentor(Engine* engine, const char* name, ma_model_type_t type);
+    virtual ~Segmentor();
 
     const std::forward_list<ma_segm2f_t>& getResults() const;
 
@@ -40,4 +40,4 @@ class Segmenter : public Model {
 
 }  // namespace ma::model
 
-#endif  // _MA_MODEL_SEGMENTER_H_
+#endif  // _MA_MODEL_SEGMENTOR_H_
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_yolo11_seg.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_yolo11_seg.cpp
index a9ccfe8..5df2ea4 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_yolo11_seg.cpp
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_yolo11_seg.cpp
@@ -15,7 +15,7 @@ constexpr char TAG[] = "ma::model::yolo11_seg";
 
 namespace ma::model {
 
-Yolo11Seg::Yolo11Seg(Engine* p_engine_) : Segmenter(p_engine_, "yolo11_seg", MA_MODEL_TYPE_YOLO11_SEG) {
+Yolo11Seg::Yolo11Seg(Engine* p_engine_) : Segmentor(p_engine_, "yolo11_seg", MA_MODEL_TYPE_YOLO11_SEG) {
     MA_ASSERT(p_engine_ != nullptr);
 
     bboxes_ = p_engine_->getOutput(0);
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_yolo11_seg.h b/src/components/sscma-micro/sscma/core/model/ma_model_yolo11_seg.h
index f7025e3..b0e7c6a 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_yolo11_seg.h
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_yolo11_seg.h
@@ -6,11 +6,11 @@
 #include <utility>
 #include <vector>
 
-#include "ma_model_segmenter.h"
+#include "ma_model_segmentor.h"
 
 namespace ma::model {
 
-class Yolo11Seg : public Segmenter {
+class Yolo11Seg : public Segmentor {
 private:
     ma_tensor_t bboxes_;
     ma_tensor_t protos_;
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_yolov5.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_yolov5.cpp
index 07619f2..9aee8ab 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_yolov5.cpp
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_yolov5.cpp
@@ -1,7 +1,7 @@
 #include <algorithm>
 #include <forward_list>
-#include <vector> 
 #include <utility>
+#include <vector>
 
 #include "../utils/ma_nms.h"
 
@@ -24,7 +24,7 @@ YoloV5::YoloV5(Engine* p_engine_) : Detector(p_engine_, "yolov5", MA_MODEL_TYPE_
 YoloV5::~YoloV5() {}
 
 static bool generalValid(Engine* engine) {
-    const auto inputs_count = engine->getInputSize();
+    const auto inputs_count  = engine->getInputSize();
     const auto outputs_count = engine->getOutputSize();
 
     if (inputs_count != 1 || outputs_count != 1) {
@@ -37,8 +37,7 @@ static bool generalValid(Engine* engine) {
     if (input_shape.size != 4)
         return false;
 
-    int n = input_shape.dims[0], h = input_shape.dims[1], w = input_shape.dims[2],
-        c        = input_shape.dims[3];
+    int n = input_shape.dims[0], h = input_shape.dims[1], w = input_shape.dims[2], c = input_shape.dims[3];
     bool is_nhwc = c == 3 || c == 1;
 
     if (!is_nhwc)
@@ -55,19 +54,18 @@ static bool generalValid(Engine* engine) {
     if (output_shape.size != 3 && output_shape.size != 4)
         return false;
 
-    if (output_shape.dims[0] != 1 || output_shape.dims[1] != ibox_len || output_shape.dims[2] < 6 ||
-        output_shape.dims[2] > 85)
+    if (output_shape.dims[0] != 1 || output_shape.dims[1] != ibox_len || output_shape.dims[2] < 6 || output_shape.dims[2] > 85)
         return false;
 
     return true;
 }
 
 static bool nmsValid(Engine* engine) {
-#if MA_USE_ENGINE_HALIO
+#if MA_USE_ENGINE_HAILO
     if (engine->getInputSize() != 1 || engine->getOutputSize() != 1)
         return false;
 
-    auto input = engine->getInput(0);
+    auto input  = engine->getInput(0);
     auto output = engine->getOutput(0);
 
     if (input.shape.size != 4 || output.shape.size != 4)
@@ -86,7 +84,7 @@ static bool nmsValid(Engine* engine) {
     auto mb = output.shape.dims[2];
     auto f  = output.shape.dims[3];
 
-    if (b != 1 || cs <= 0 || mb <= 1 || f != 0) 
+    if (b != 1 || cs <= 0 || mb <= 1 || f != 0)
         return false;
 
     return true;
@@ -146,12 +144,7 @@ ma_err_t YoloV5::generalPostProcess() {
                 h /= img_.height;
             }
 
-            ma_bbox_t box{.x      = MA_CLIP(x, 0, 1.0f),
-                          .y      = MA_CLIP(y, 0, 1.0f),
-                          .w      = MA_CLIP(w, 0, 1.0f),
-                          .h      = MA_CLIP(h, 0, 1.0f),
-                          .score  = score,
-                          .target = target};
+            ma_bbox_t box{.x = MA_CLIP(x, 0, 1.0f), .y = MA_CLIP(y, 0, 1.0f), .w = MA_CLIP(w, 0, 1.0f), .h = MA_CLIP(h, 0, 1.0f), .score = score, .target = target};
 
             results_.emplace_front(box);
         }
@@ -187,12 +180,7 @@ ma_err_t YoloV5::generalPostProcess() {
                 h /= img_.height;
             }
 
-            ma_bbox_t box{.x      = MA_CLIP(x, 0, 1.0f),
-                          .y      = MA_CLIP(y, 0, 1.0f),
-                          .w      = MA_CLIP(w, 0, 1.0f),
-                          .h      = MA_CLIP(h, 0, 1.0f),
-                          .score  = score,
-                          .target = target};
+            ma_bbox_t box{.x = MA_CLIP(x, 0, 1.0f), .y = MA_CLIP(y, 0, 1.0f), .w = MA_CLIP(w, 0, 1.0f), .h = MA_CLIP(h, 0, 1.0f), .score = score, .target = target};
 
             results_.emplace_front(box);
         }
@@ -208,7 +196,7 @@ ma_err_t YoloV5::generalPostProcess() {
 }
 
 ma_err_t YoloV5::nmsPostProcess() {
-#if MA_USE_ENGINE_HALIO
+#if MA_USE_ENGINE_HAILO
 
     auto& output = output_;
 
@@ -222,7 +210,7 @@ ma_err_t YoloV5::nmsPostProcess() {
 
     hailo_nms_shape_t nms_shape;
     if (output.external_handler) {
-        auto rc = (*reinterpret_cast<ma::engine::EngineHalio::ExternalHandler*>(output.external_handler))(4, &nms_shape, sizeof(hailo_nms_shape_t));
+        auto rc = (*reinterpret_cast<ma::engine::EngineHailo::ExternalHandler*>(output.external_handler))(4, &nms_shape, sizeof(hailo_nms_shape_t));
         if (rc == MA_OK) {
             w = nms_shape.number_of_classes;
             h = nms_shape.max_bboxes_per_class;
@@ -254,7 +242,7 @@ ma_err_t YoloV5::nmsPostProcess() {
                     ptr += sizeof(P);
 
                     ma_bbox_t res;
-                    
+
                     auto x_min = static_cast<float>(bbox.x_min - zp) * scale;
                     auto y_min = static_cast<float>(bbox.y_min - zp) * scale;
                     auto x_max = static_cast<float>(bbox.x_max - zp) * scale;
@@ -264,7 +252,7 @@ ma_err_t YoloV5::nmsPostProcess() {
                     res.x      = x_min + res.w * 0.5;
                     res.y      = y_min + res.h * 0.5;
                     res.score  = static_cast<float>(bbox.score - zp) * scale;
-                    
+
                     res.target = static_cast<int>(i);
 
                     res.x = MA_CLIP(res.x, 0, 1.0f);
@@ -276,7 +264,7 @@ ma_err_t YoloV5::nmsPostProcess() {
                 }
             }
         } break;
-            
+
         case MA_TENSOR_TYPE_NMS_BBOX_F32: {
             using T = float32_t;
             using P = hailo_bbox_float32_t;
@@ -297,13 +285,13 @@ ma_err_t YoloV5::nmsPostProcess() {
                     ptr += sizeof(P);
 
                     ma_bbox_t res;
-                    
+
                     res.w     = bbox.x_max - bbox.x_min;
                     res.h     = bbox.y_max - bbox.y_min;
                     res.x     = bbox.x_min + res.w * 0.5;
                     res.y     = bbox.y_min + res.h * 0.5;
                     res.score = bbox.score;
-                    
+
                     res.target = static_cast<int>(i);
 
                     res.x = MA_CLIP(res.x, 0, 1.0f);
@@ -315,11 +303,15 @@ ma_err_t YoloV5::nmsPostProcess() {
                 }
             }
         } break;
-           
+
         default:
             return MA_ENOTSUP;
     }
 
+    ma::utils::nms(results_, threshold_nms_, threshold_score_, false, false);
+
+    results_.sort([](const ma_bbox_t& a, const ma_bbox_t& b) { return a.x < b.x; });
+
     return MA_OK;
 #else
     return MA_FAILED;
@@ -332,17 +324,17 @@ ma_err_t YoloV5::postprocess() {
     switch (output_.type) {
         case MA_TENSOR_TYPE_NMS_BBOX_U16:
         case MA_TENSOR_TYPE_NMS_BBOX_F32: {
-#if MA_USE_ENGINE_HALIO
+#if MA_USE_ENGINE_HAILO
             // TODO: can be optimized by whihout calling this handler for each frame
-            if (output.external_handler) {
-                auto ph   = reinterpret_cast<ma::engine::EngineHalio::ExternalHandler*>(output.external_handler);
+            if (output_.external_handler) {
+                auto ph   = reinterpret_cast<ma::engine::EngineHailo::ExternalHandler*>(output_.external_handler);
                 float thr = threshold_score_;
                 auto rc   = (*ph)(1, &thr, sizeof(float));
                 if (rc == MA_OK) {
                     threshold_score_ = thr;
                 }
                 thr = threshold_nms_;
-                rc   = (*ph)(3, &thr, sizeof(float));
+                rc  = (*ph)(3, &thr, sizeof(float));
                 if (rc == MA_OK) {
                     threshold_nms_ = thr;
                 }
@@ -355,6 +347,7 @@ ma_err_t YoloV5::postprocess() {
             return generalPostProcess();
     }
 
+
     return MA_ENOTSUP;
 }
 }  // namespace ma::model
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_yolov8_pose.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_yolov8_pose.cpp
index 5fbbc38..781b836 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_yolov8_pose.cpp
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_yolov8_pose.cpp
@@ -24,7 +24,7 @@ static inline decltype(auto) estimateTensorHW(const ma_shape_t& shape) {
     return is_nhwc ? std::make_pair(shape.dims[1], shape.dims[2]) : std::make_pair(shape.dims[2], shape.dims[3]);
 }
 
-YoloV8Pose::YoloV8Pose(Engine* p_engine_) : PoseDetector(p_engine_, "yolo_world", MA_MODEL_TYPE_YOLO_WORLD) {
+YoloV8Pose::YoloV8Pose(Engine* p_engine_) : PoseDetector(p_engine_, "yolov8_pose", MA_MODEL_TYPE_YOLOV8_POSE) {
     MA_ASSERT(p_engine_ != nullptr);
 
     for (size_t i = 0; i < num_outputs_; ++i) {
@@ -153,7 +153,7 @@ bool YoloV8Pose::isValid(Engine* engine) {
 }
 
 const char* YoloV8Pose::getTag() {
-    return "ma::model::yolo_world";
+    return "ma::model::yolov8_pose";
 }
 
 ma_err_t YoloV8Pose::postprocess() {
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_yolov8_pose_hailo.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_yolov8_pose_hailo.cpp
new file mode 100644
index 0000000..b637dc8
--- /dev/null
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_yolov8_pose_hailo.cpp
@@ -0,0 +1,373 @@
+#include "ma_model_yolov8_pose_hailo.h"
+
+#if MA_USE_ENGINE_HAILO
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <forward_list>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+#include <xtensor/xarray.hpp>
+#include <xtensor/xview.hpp>
+#include <xtensor/xmath.hpp>
+#include <xtensor/xsort.hpp>
+#include <xtensor/xadapt.hpp>
+#include <xtensor/xtensor.hpp>
+
+#include "../math/ma_math.h"
+#include "../utils/ma_anchors.h"
+#include "../utils/ma_nms.h"
+
+namespace ma::model {
+
+// Spatial extents of a rank-4 shape as a pair: dims[1..2] when the last dim is 1 or 3 (read as channels), else dims[2..3]; any other rank gives (0, 0).
+static inline decltype(auto) estimateTensorHW(const ma_shape_t& shape) {
+    const int32_t none = 0;
+    if (shape.size != 4)
+        return std::make_pair(none, none);
+    const bool channels_last = shape.dims[3] == 3 || shape.dims[3] == 1;
+    const auto first         = channels_last ? 1 : 2;
+    return std::make_pair(shape.dims[first], shape.dims[first + 1]);
+}
+
+std::vector<int> YoloV8PoseHailo::strides_ = {8, 16, 32};  // divisors of the input size; isValid() expects output grids of (w / stride) x (h / stride)
+
+/**
+ * Copyright (c) 2021-2022 Hailo Technologies Ltd. All rights reserved.
+ * Distributed under the LGPL license (https://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt)
+ **/
+static decltype(auto) getBoxesScoresKeypoints(std::vector<ma_tensor_t>& tensors, int num_classes) {
+    std::vector<ma_tensor_t> outputs_boxes(tensors.size() / 3);
+    std::vector<ma_tensor_t> outputs_keypoints(tensors.size() / 3);
+    // Splits tensors (read in groups of three: [i] boxes, [i + 1] scores, [i + 2] keypoints) into box tensors, one score matrix and keypoint tensors.
+    int total_scores = 0;
+    for (uint i = 0; i < tensors.size(); i = i + 3) {
+        auto w = tensors[i + 1].shape.dims[1];  // w
+        auto h = tensors[i + 1].shape.dims[2];  // h
+        total_scores += w * h;
+    }
+    // scores has one row per dims[1] x dims[2] cell of every score tensor and num_classes columns.
+    std::vector<size_t> scores_shape = {(long unsigned int)total_scores, (long unsigned int)num_classes};
+
+    xt::xarray<float> scores(scores_shape);
+    // Row of scores at which the next score tensor's values start.
+    int view_index_scores = 0;
+
+    for (uint i = 0; i < tensors.size(); i = i + 3) {
+        outputs_boxes[i / 3] = tensors[i];
+        // Read the u8 score data as a (dims[1], dims[2], dims[3]) array and dequantize it as (q - zero_point) * scale.
+        auto& tensor                = tensors[i + 1];
+        std::vector<size_t> shape   = {(size_t)tensor.shape.dims[1], (size_t)tensor.shape.dims[2], (size_t)tensor.shape.dims[3]};
+        xt::xarray<uint8_t> xtensor = xt::adapt(tensor.data.u8, tensor.size, xt::no_ownership(), shape);
+        auto dequantized_output_s   = (xtensor - tensor.quant_param.zero_point) * tensor.quant_param.scale;
+
+        int num_proposals_scores = dequantized_output_s.shape(0) * dequantized_output_s.shape(1);
+        // Reshape to (num_proposals_scores, num_classes) and store into the next rows of scores.
+        auto output_scores                                                                                  = xt::view(dequantized_output_s, xt::all(), xt::all(), xt::all());
+        xt::view(scores, xt::range(view_index_scores, view_index_scores + num_proposals_scores), xt::all()) = xt::reshape_view(output_scores, {num_proposals_scores, num_classes});
+        view_index_scores += num_proposals_scores;
+
+        outputs_keypoints[i / 3] = tensors[i + 2];
+    }
+    // Box and keypoint tensors are passed through unchanged; only the scores are dequantized here.
+    return _internal::Triple{outputs_boxes, scores, outputs_keypoints};
+}
+
+
+YoloV8PoseHailo::YoloV8PoseHailo(Engine* p_engine_) : PoseDetector(p_engine_, "yolov8_pose", MA_MODEL_TYPE_YOLOV8_POSE) {
+    MA_ASSERT(p_engine_ != nullptr);
+    // Sets default thresholds, then caches and reorders the engine outputs and precomputes the anchor centres.
+    threshold_score_ = 0.6;
+    threshold_nms_   = 0.7;
+    // Cache the first nine engine outputs (isValid() requires exactly nine).
+    outputs_.resize(9);
+    for (size_t i = 0; i < outputs_.size(); ++i) {
+        outputs_[i] = p_engine_->getOutput(i);
+    }
+    // Order by dims[1], largest first; with isValid()'s three-per-grid rule each consecutive group of three then shares a grid.
+    std::sort(outputs_.begin(), outputs_.end(), [](const ma_tensor_t& a, const ma_tensor_t& b) { return a.shape.dims[1] > b.shape.dims[1]; });
+    // route_ is a bitmask of element types: bit i is set when output i is U8, bit (i + 9) when it is U16.
+    auto update_route_f = [&route = route_](ma_tensor_type_t t, int i) {
+        switch (t) {
+            case MA_TENSOR_TYPE_U8:
+                route |= 1 << i;
+                break;
+            case MA_TENSOR_TYPE_U16:
+                route |= 1 << (i + 9);
+                break;
+            default:
+                break;
+        }
+    };
+    // Within each group of three: slot 0 <- the 64-channel tensor, slot 1 <- the 1-channel tensor, slot 2 <- any other.
+    std::vector<size_t> idx(outputs_.size());
+    for (size_t i = 0; i < outputs_.size(); i += 3) {
+        for (size_t j = 0; j < 3; ++j) {
+            auto at = i + j;
+            switch (outputs_[at].shape.dims[3]) {
+                case 1:
+                    idx[i + 1] = at;
+                    break;
+                case 64:
+                    idx[i] = at;
+                    break;
+                default:
+                    idx[i + 2] = at;
+            }
+        }
+    }
+    std::vector<ma_tensor_t> reordered_outputs(outputs_.size()); 
+    for (size_t i = 0; i < outputs_.size(); ++i) {
+        reordered_outputs[i] = outputs_[idx[i]];
+        update_route_f(reordered_outputs[i].type, i);
+    }
+    outputs_ = std::move(reordered_outputs);
+
+    const auto [h, w] = estimateTensorHW(p_engine_->getInputShape(0));
+    // The trailing 0, 0 arguments are overwritten inside generateAnchorMatrix.
+    centers_      = ma::utils::generateAnchorMatrix(strides_, {static_cast<int>(w), static_cast<int>(h)}, 3, 0, 0);
+    network_dims_ = {w, h};
+}
+
+YoloV8PoseHailo::~YoloV8PoseHailo() = default;  // nothing to release by hand; members clean up after themselves
+
+bool YoloV8PoseHailo::isValid(Engine* engine) {  // true only when the engine's input and output tensors pass every check below
+    const auto inputs_count  = engine->getInputSize();
+    const auto outputs_count = engine->getOutputSize();
+    // Exactly one input and nine outputs are required.
+    if (inputs_count != 1 || outputs_count != 9) {
+        return false;
+    }
+
+    const auto input_shape{engine->getInputShape(0)};
+
+    if (input_shape.size != 4) {
+        return false;
+    }
+    // A last dimension of 1 or 3 is treated as channels, i.e. (N, H, W, C); otherwise the shape is read as (N, C, H, W).
+    const auto is_nhwc{input_shape.dims[3] == 3 || input_shape.dims[3] == 1};
+
+    size_t n = 0, h = 0, w = 0, c = 0;
+
+    if (is_nhwc) {
+        n = input_shape.dims[0];
+        h = input_shape.dims[1];
+        w = input_shape.dims[2];
+        c = input_shape.dims[3];
+    } else {
+        n = input_shape.dims[0];
+        c = input_shape.dims[1];
+        h = input_shape.dims[2];
+        w = input_shape.dims[3];
+    }
+    // Batch 1, square (h ^ w is non-zero when they differ), side at least 32 and a multiple of 32, 1 or 3 channels.
+    if (n != 1 || h ^ w || h < 32 || h % 32 || (c != 3 && c != 1)) {
+        return false;
+    }
+    // NOTE(review): this repeats the output-count check already made at the top of the function.
+    const auto output_nums = engine->getOutputSize();
+    if (output_nums != 9) {
+        return false;
+    }
+
+    std::vector<ma_tensor_t> outputs(output_nums);
+    for (size_t i = 0; i < output_nums; ++i) {
+        outputs[i] = engine->getOutput(i);
+    }
+    // One record per stride: {expected dims[1], expected dims[2], number of matching outputs seen so far}.
+    std::vector<std::vector<int>> dims{std::vector<int>{int(w / strides_[0]), int(h / strides_[0]), 0},
+                                       std::vector<int>{int(w / strides_[1]), int(h / strides_[1]), 0},
+                                       std::vector<int>{int(w / strides_[2]), int(h / strides_[2]), 0}};
+    // Each output must be rank 4 with dims[0] == 1, match one of the grids above, and use an allowed element type.
+    for (auto& out : outputs) {
+        if (out.shape.size != 4 || out.shape.dims[0] != 1) {
+            return false;
+        }
+        auto it = std::find_if(dims.begin(), dims.end(), [&out](const std::vector<int>& dim) { return dim[0] == out.shape.dims[1] && dim[1] == out.shape.dims[2]; });
+        if (it == dims.end()) {
+            return false;
+        }
+        switch (out.shape.dims[3]) {
+            case 1:
+                if (out.type != MA_TENSOR_TYPE_U8) {
+                    return false;
+                }
+                (*it)[2] += 1;
+                break;
+            case 64:
+                if (out.type != MA_TENSOR_TYPE_U8) {
+                    return false;
+                }
+                (*it)[2] += 1;
+                break;
+            default:
+                if (out.shape.dims[3] % 3 != 0) {
+                    return false;
+                }
+                if (out.type != MA_TENSOR_TYPE_U8 && out.type != MA_TENSOR_TYPE_U16) {
+                    return false;
+                }
+                (*it)[2] += 1;
+        }
+    }
+    // Every grid must have been matched by exactly three outputs.
+    for (const auto& dim : dims) {
+        if (dim[2] != 3) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+const char* YoloV8PoseHailo::getTag() {
+    return "ma::model::yolov8_pose";  // same tag string that YoloV8Pose::getTag() returns
+}
+
+template <typename KptsType>
+static decltype(auto) decodeBoxesAndKeypoints(const std::vector<ma_tensor_t>& raw_boxes_outputs,
+                                              xt::xarray<float>& scores,
+                                              const std::vector<ma_tensor_t>& raw_keypoints,
+                                              const std::vector<int>& network_dims,
+                                              const std::vector<int>& strides,
+                                              const std::vector<xt::xarray<double>>& centers,
+                                              int regression_length,
+                                              float score_threshold) {
+    // Returns one ma_keypoint3f_t (box + keypoints, divided by network_dims) per proposal whose score is >= score_threshold.
+    int class_index = 0;
+    std::forward_list<ma_keypoint3f_t> decodings;
+    // instance_index walks the rows of scores in step with the proposals of all scales; KptsType is the keypoint element type.
+    int instance_index = 0;
+    float confidence   = 0.0;
+    // class_index stays 0: every decoded box is assigned target 0.
+
+    // Box distribution to distance
+    auto regression_distance = xt::reshape_view(xt::arange(0, regression_length + 1), {1, 1, regression_length + 1});
+
+    for (uint i = 0; i < raw_boxes_outputs.size(); ++i) {
+        // Boxes setup
+        float32_t qp_scale = raw_boxes_outputs[i].quant_param.scale;
+        float32_t qp_zp    = raw_boxes_outputs[i].quant_param.zero_point;
+
+        std::vector<size_t> output_b_shape = {(size_t)raw_boxes_outputs[i].shape.dims[1], (size_t)raw_boxes_outputs[i].shape.dims[2], (size_t)raw_boxes_outputs[i].shape.dims[3]};
+        auto output_b = xt::adapt(raw_boxes_outputs[i].data.u8, raw_boxes_outputs[i].size, xt::no_ownership(), output_b_shape);
+
+        int num_proposals    = output_b.shape(0) * output_b.shape(1);
+        auto output_boxes    = xt::view(output_b, xt::all(), xt::all(), xt::all());
+        auto quantized_boxes = xt::reshape_view(output_boxes, {num_proposals, 4, regression_length + 1});
+
+        auto shape = {quantized_boxes.shape(1), quantized_boxes.shape(2)};
+
+        // Keypoints setup
+        float32_t qp_scale_kpts = raw_keypoints[i].quant_param.scale;
+        float32_t qp_zp_kpts    = raw_keypoints[i].quant_param.zero_point;
+
+        std::vector<size_t> output_keypoints_shape = {(size_t)raw_keypoints[i].shape.dims[1], (size_t)raw_keypoints[i].shape.dims[2], (size_t)raw_keypoints[i].shape.dims[3]};
+
+        size_t output_keypoints_size = output_keypoints_shape[0] * output_keypoints_shape[1] * output_keypoints_shape[2];
+        auto output_keypoints        = xt::adapt(static_cast<KptsType*>(raw_keypoints[i].data.data), output_keypoints_size, xt::no_ownership(), output_keypoints_shape);
+
+        int num_proposals_keypoints     = output_keypoints.shape(0) * output_keypoints.shape(1);
+        auto output_keypoints_quantized = xt::view(output_keypoints, xt::all(), xt::all(), xt::all());
+        auto quantized_keypoints        = xt::reshape_view(output_keypoints_quantized, {num_proposals_keypoints, int(output_keypoints_shape[2] / 3), 3});
+
+        auto keypoints_shape = {quantized_keypoints.shape(1), quantized_keypoints.shape(2)};
+
+        // Bbox decoding
+        for (uint j = 0; j < (uint)num_proposals; ++j) {
+            confidence = xt::row(scores, instance_index)(0);
+            instance_index++;
+            if (confidence < score_threshold)
+                continue;
+
+            xt::xarray<float> box(shape);
+            xt::xarray<float> kpts_corrdinates_and_scores(keypoints_shape);
+            // Fill box (4 x (regression_length + 1)) from proposal j via dequantizeValues2D, then run softmax2D over it.
+            ma::math::dequantizeValues2D<uint8_t>(box, j, quantized_boxes, box.shape(0), box.shape(1), qp_scale, qp_zp);
+            ma::math::softmax2D(box.data(), box.shape(0), box.shape(1));
+
+            auto box_distance                   = box * regression_distance;
+            xt::xarray<float> reduced_distances = xt::sum(box_distance, {2});
+            auto strided_distances              = reduced_distances * strides[i];
+
+            using namespace xt::placeholders;
+            auto distance_view1 = xt::view(strided_distances, xt::all(), xt::range(_, 2)) * -1;
+            auto distance_view2 = xt::view(strided_distances, xt::all(), xt::range(2, _));
+            auto distance_view  = xt::concatenate(xt::xtuple(distance_view1, distance_view2), 1);
+            auto decoded_box    = centers[i] + distance_view;
+
+            ma_keypoint3f_t kp;
+            auto x_min = decoded_box(j, 0) / network_dims[0];
+            auto y_min = decoded_box(j, 1) / network_dims[1];
+            auto w     = (decoded_box(j, 2) - decoded_box(j, 0)) / network_dims[0];
+            auto h     = (decoded_box(j, 3) - decoded_box(j, 1)) / network_dims[1];
+
+            kp.box.x      = x_min + (w / 2);
+            kp.box.y      = y_min + (h / 2);
+            kp.box.w      = w;
+            kp.box.h      = h;
+            kp.box.score  = confidence;
+            kp.box.target = class_index;
+
+            ma::math::dequantizeValues2D<KptsType>(
+                kpts_corrdinates_and_scores, j, quantized_keypoints, kpts_corrdinates_and_scores.shape(0), kpts_corrdinates_and_scores.shape(1), qp_scale_kpts, qp_zp_kpts);
+
+            auto kpts_corrdinates = xt::view(kpts_corrdinates_and_scores, xt::all(), xt::range(0, 2));
+            auto keypoints_scores = xt::view(kpts_corrdinates_and_scores, xt::all(), xt::range(2, _));
+
+            kpts_corrdinates *= 2;
+
+            auto center        = xt::view(centers[i], xt::all(), xt::range(0, 2));
+            auto center_values = xt::xarray<float>{(float)center(j, 0), (float)center(j, 1)};
+
+            kpts_corrdinates = strides[i] * (kpts_corrdinates - 0.5) + center_values;
+
+            auto sigmoided_scores = 1 / (1 + xt::exp(-keypoints_scores));
+
+            // Emit one (x, y, score) point per keypoint; k is kept distinct from the scale index i.
+
+            int pt_size = kpts_corrdinates.shape(0);
+            for (int k = 0; k < pt_size; ++k) {
+                ma_pt3f_t pt;
+                pt.x = kpts_corrdinates(k, 0) / network_dims[0];
+                pt.y = kpts_corrdinates(k, 1) / network_dims[1];
+                pt.z = sigmoided_scores(k, 0);
+                kp.pts.push_back(pt);
+            }
+
+            decodings.push_front(std::move(kp));
+        }
+    }
+
+    return decodings;
+}
+
+
+ma_err_t YoloV8PoseHailo::postprocess() {
+    // route_ encodes the output element types: bit i means output i is U8, bit (i + 9) means output i is U16.
+    constexpr int32_t kAllU8          = 511;     // bits 0..8
+    constexpr int32_t kKeypointsAsU16 = 149723;  // bits 0, 1, 3, 4, 6, 7 (U8) and 11, 14, 17 (U16 for slots 2, 5, 8)
+
+    // TODO: could be optimized
+    boxes_scores_keypoints_ = getBoxesScoresKeypoints(outputs_, 1);
+    auto& [boxes, scores, keypoints] = boxes_scores_keypoints_;
+
+    if (route_ == kAllU8) {
+        results_ = decodeBoxesAndKeypoints<uint8_t>(boxes, scores, keypoints, network_dims_, strides_, centers_, 15, threshold_score_);
+    } else if (route_ == kKeypointsAsU16) {
+        results_ = decodeBoxesAndKeypoints<uint16_t>(boxes, scores, keypoints, network_dims_, strides_, centers_, 15, threshold_score_);
+    } else {
+        return MA_ENOTSUP;
+    }
+
+    // The final `true` lets boxes with different targets suppress each other.
+    ma::utils::nms(results_, threshold_nms_, true);
+    return MA_OK;
+}
+
+}  // namespace ma::model
+
+#endif
\ No newline at end of file
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_yolov8_pose_hailo.h b/src/components/sscma-micro/sscma/core/model/ma_model_yolov8_pose_hailo.h
new file mode 100644
index 0000000..5743d1b
--- /dev/null
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_yolov8_pose_hailo.h
@@ -0,0 +1,53 @@
+#ifndef _MA_MODEL_YOLOV8_POSE_HAILO_H_
+#define _MA_MODEL_YOLOV8_POSE_HAILO_H_
+
+#include "ma_model_pose_detector.h"
+
+#if MA_USE_ENGINE_HAILO
+
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+#include <xtensor/xtensor.hpp>
+#include <xtensor/xarray.hpp>
+
+namespace ma::model {
+
+namespace _internal {
+
+struct Triple {  // bundle built by getBoxesScoresKeypoints() in the matching .cpp
+    std::vector<ma_tensor_t> boxes;      // first tensor of each group of three outputs
+    xt::xarray<float> scores;            // dequantized scores, one row per grid cell across all groups
+    std::vector<ma_tensor_t> keypoints;  // third tensor of each group of three outputs
+};
+
+}  // namespace _internal
+
+class YoloV8PoseHailo : public PoseDetector {  // pose detector whose isValid() accepts engines with one input and nine outputs
+private:
+    std::vector<xt::xarray<double>> centers_;   // per-stride centre tables from ma::utils::generateAnchorMatrix
+    static std::vector<int> strides_;           // defined in the .cpp as {8, 16, 32}
+    std::vector<int> network_dims_;             // {w, h} of the input tensor, set by the constructor
+    std::vector<ma_tensor_t> outputs_;          // the nine engine outputs, regrouped by the constructor
+    _internal::Triple boxes_scores_keypoints_;  // overwritten on every postprocess() call
+    int32_t route_ = 0;                         // bitmask of output element types, built by the constructor
+
+protected:
+    ma_err_t postprocess() override;
+
+public:
+    YoloV8PoseHailo(Engine* engine);
+    ~YoloV8PoseHailo();
+
+    static bool isValid(Engine* engine);  // true when the engine's tensors match what this class decodes
+
+    static const char* getTag();
+};
+
+}  // namespace ma::model
+
+#endif
+
+#endif  // _MA_MODEL_YOLOV8_POSE_HAILO_H_
diff --git a/src/components/sscma-micro/sscma/core/utils/ma_anchors.cpp b/src/components/sscma-micro/sscma/core/utils/ma_anchors.cpp
index b4d6670..1bfa640 100644
--- a/src/components/sscma-micro/sscma/core/utils/ma_anchors.cpp
+++ b/src/components/sscma-micro/sscma/core/utils/ma_anchors.cpp
@@ -4,7 +4,7 @@ namespace ma::utils {
 
 std::vector<ma_anchor_stride_t> generateAnchorStrides(size_t input_size, std::vector<size_t> strides) {
     std::vector<ma_anchor_stride_t> anchor_strides(strides.size());
-    size_t                          nth_anchor = 0;
+    size_t nth_anchor = 0;
 
     for (size_t i = 0; i < strides.size(); ++i) {
         const size_t stride = strides[i];
@@ -17,19 +17,17 @@ std::vector<ma_anchor_stride_t> generateAnchorStrides(size_t input_size, std::ve
     return anchor_strides;
 }
 
-std::vector<std::vector<ma_pt2f_t>> generateAnchorMatrix(const std::vector<ma_anchor_stride_t>& anchor_strides,
-                                                         float                                  shift_right,
-                                                         float                                  shift_down) {
-    const auto                          anchor_matrix_size = anchor_strides.size();
+std::vector<std::vector<ma_pt2f_t>> generateAnchorMatrix(const std::vector<ma_anchor_stride_t>& anchor_strides, float shift_right, float shift_down) {
+    const auto anchor_matrix_size = anchor_strides.size();
     std::vector<std::vector<ma_pt2f_t>> anchor_matrix(anchor_matrix_size);
-    const float                         shift_right_init = shift_right * 0.5f;
-    const float                         shift_down_init  = shift_down * 0.5f;
+    const float shift_right_init = shift_right * 0.5f;
+    const float shift_down_init  = shift_down * 0.5f;
 
     for (size_t i = 0; i < anchor_matrix_size; ++i) {
-        const auto& anchor_stride   = anchor_strides[i];
-        const auto  split           = anchor_stride.split;
-        const auto  size            = anchor_stride.size;
-        auto&       anchor_matrix_i = anchor_matrix[i];
+        const auto& anchor_stride = anchor_strides[i];
+        const auto split          = anchor_stride.split;
+        const auto size           = anchor_stride.size;
+        auto& anchor_matrix_i     = anchor_matrix[i];
 
         anchor_matrix[i].resize(size);
 
@@ -43,4 +41,38 @@ std::vector<std::vector<ma_pt2f_t>> generateAnchorMatrix(const std::vector<ma_an
     return anchor_matrix;
 }
 
+#if MA_USE_LIB_XTENSOR
+
+/**
+ * Copyright (c) 2021-2022 Hailo Technologies Ltd. All rights reserved.
+ * Distributed under the LGPL license (https://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt)
+ **/
+
+std::vector<xt::xarray<double>> generateAnchorMatrix(std::vector<int>& strides, std::vector<int> network_dims, std::size_t boxes_num, int strided_width, int strided_height) {
+    std::vector<xt::xarray<double>> centers(boxes_num);
+    // For each of the first boxes_num strides, builds a 4-column table of grid-cell centres, (index + 0.5) * stride.
+    for (uint i = 0; i < boxes_num; ++i) {
+        strided_width  = network_dims[0] / strides[i];
+        strided_height = network_dims[1] / strides[i];
+        // NOTE: the strided_width / strided_height arguments are overwritten above, so the values passed in are ignored.
+        // Create a meshgrid of the proper strides
+        xt::xarray<int> grid_x = xt::arange(0, strided_width);
+        xt::xarray<int> grid_y = xt::arange(0, strided_height);
+
+        auto mesh = xt::meshgrid(grid_x, grid_y);
+        grid_x    = std::get<1>(mesh);
+        grid_y    = std::get<0>(mesh);
+
+        // Use the meshgrid to build up box center prototypes
+        auto ct_row = (xt::flatten(grid_y) + 0.5) * strides[i];
+        auto ct_col = (xt::flatten(grid_x) + 0.5) * strides[i];
+        // Columns of the stacked result: (ct_col, ct_row, ct_col, ct_row).
+        centers[i] = xt::stack(xt::xtuple(ct_col, ct_row, ct_col, ct_row), 1);
+    }
+
+    return centers;
+}
+
+#endif
+
 }  // namespace ma::utils
diff --git a/src/components/sscma-micro/sscma/core/utils/ma_anchors.h b/src/components/sscma-micro/sscma/core/utils/ma_anchors.h
index 8142c8a..a6d6108 100644
--- a/src/components/sscma-micro/sscma/core/utils/ma_anchors.h
+++ b/src/components/sscma-micro/sscma/core/utils/ma_anchors.h
@@ -7,13 +7,21 @@
 
 #include "../ma_types.h"
 
+#if MA_USE_LIB_XTENSOR
+#include <xtensor/xarray.hpp>
+#include <xtensor/xview.hpp>
+#include <xtensor/xbuilder.hpp>
+#endif
+
 namespace ma::utils {
 
 std::vector<ma_anchor_stride_t> generateAnchorStrides(size_t input_size, std::vector<size_t> strides = {8, 16, 32});
 
-std::vector<std::vector<ma_pt2f_t>> generateAnchorMatrix(const std::vector<ma_anchor_stride_t>& anchor_strides,
-                                                         float                                  shift_right = 1.f,
-                                                         float                                  shift_down  = 1.f);
+std::vector<std::vector<ma_pt2f_t>> generateAnchorMatrix(const std::vector<ma_anchor_stride_t>& anchor_strides, float shift_right = 1.f, float shift_down = 1.f);
+
+#if MA_USE_LIB_XTENSOR
+std::vector<xt::xarray<double>> generateAnchorMatrix(std::vector<int>& strides, std::vector<int> network_dims, std::size_t boxes_num, int strided_width, int strided_height);
+#endif
 
 }  // namespace ma::utils
 
diff --git a/src/components/sscma-micro/sscma/core/utils/ma_nms.cpp b/src/components/sscma-micro/sscma/core/utils/ma_nms.cpp
index 50b9a81..8246b97 100644
--- a/src/components/sscma-micro/sscma/core/utils/ma_nms.cpp
+++ b/src/components/sscma-micro/sscma/core/utils/ma_nms.cpp
@@ -30,20 +30,23 @@ static constexpr void nms_impl(Container& bboxes, float threshold_iou, float thr
     if constexpr (std::is_same_v<Container, std::forward_list<T>>) {
         bboxes.sort([](const auto& box1, const auto& box2) { return box1.score > box2.score; });
     } else {
-        std::sort(
-          bboxes.begin(), bboxes.end(), [](const auto& box1, const auto& box2) { return box1.score > box2.score; });
+        std::sort(bboxes.begin(), bboxes.end(), [](const auto& box1, const auto& box2) { return box1.score > box2.score; });
     }
 
     for (auto it = bboxes.begin(); it != bboxes.end(); ++it) {
-        if (it->score == 0) continue;
+        if (it->score == 0)
+            continue;
         for (auto it2 = std::next(it); it2 != bboxes.end(); ++it2) {
-            if (it2->score == 0) continue;
-            if (multi_target && it->target != it2->target) continue;
+            if (it2->score == 0)
+                continue;
+            if (multi_target && it->target != it2->target)
+                continue;
             const auto iou = compute_iou(*it, *it2);
             if (iou > threshold_iou) {
                 if (soft_nms) {
                     it2->score = it2->score * (1 - iou);
-                    if (it2->score < threshold_score) it2->score = 0;
+                    if (it2->score < threshold_score)
+                        it2->score = 0;
                 } else {
                     it2->score = 0;
                 }
@@ -54,22 +57,34 @@ static constexpr void nms_impl(Container& bboxes, float threshold_iou, float thr
     if constexpr (std::is_same_v<Container, std::forward_list<T>>) {
         bboxes.remove_if([](const auto& box) { return box.score == 0; });
     } else {
-        bboxes.erase(std::remove_if(bboxes.begin(), bboxes.end(), [](const auto& box) { return box.score == 0; }),
-                     bboxes.end());
+        bboxes.erase(std::remove_if(bboxes.begin(), bboxes.end(), [](const auto& box) { return box.score == 0; }), bboxes.end());
     }
 }
 
-void nms(
-  std::forward_list<ma_bbox_t>& bboxes, float threshold_iou, float threshold_score, bool soft_nms, bool multi_target) {
+void nms(std::forward_list<ma_bbox_t>& bboxes, float threshold_iou, float threshold_score, bool soft_nms, bool multi_target) {
     nms_impl(bboxes, threshold_iou, threshold_score, soft_nms, multi_target);
 }
 
-void nms(std::forward_list<ma_bbox_ext_t>& bboxes,
-         float                             threshold_iou,
-         float                             threshold_score,
-         bool                              soft_nms,
-         bool                              multi_target) {
+void nms(std::forward_list<ma_bbox_ext_t>& bboxes, float threshold_iou, float threshold_score, bool soft_nms, bool multi_target) {
     nms_impl(bboxes, threshold_iou, threshold_score, soft_nms, multi_target);
 }
 
+// Greedy NMS over pose detections, in place. Boxes are visited from highest to lowest score; a later box whose
+// IoU with a kept box is >= iou_thr has its score zeroed, and all zero-score entries are removed at the end.
+void nms(std::forward_list<ma_keypoint3f_t>& decodings, const float iou_thr, bool should_nms_cross_classes) {
+    decodings.sort([](const auto& a, const auto& b) { return a.box.score > b.box.score; });
+    for (auto it = decodings.begin(); it != decodings.end(); ++it) {
+        if (it->box.score == 0.0f)
+            continue;
+        for (auto it2 = std::next(it); it2 != decodings.end(); ++it2) {
+            // Unless cross-class suppression is requested, only boxes with the same target are compared.
+            if (it2->box.score == 0.0f || !(should_nms_cross_classes || it->box.target == it2->box.target))
+                continue;
+            if (compute_iou(it->box, it2->box) >= iou_thr)
+                it2->box.score = 0.0f;
+        }
+    }
+    decodings.remove_if([](const auto& box) { return box.box.score == 0.0f; });
+}
+
 }  // namespace ma::utils
\ No newline at end of file
diff --git a/src/components/sscma-micro/sscma/core/utils/ma_nms.h b/src/components/sscma-micro/sscma/core/utils/ma_nms.h
index 14e2ef1..cc4d52e 100644
--- a/src/components/sscma-micro/sscma/core/utils/ma_nms.h
+++ b/src/components/sscma-micro/sscma/core/utils/ma_nms.h
@@ -4,7 +4,6 @@
 #include <algorithm>
 #include <forward_list>
 #include <iterator>
-#include <vector>
 
 #include "../ma_types.h"
 
@@ -14,11 +13,9 @@ namespace ma::utils {
 
 void nms(std::forward_list<ma_bbox_t>& bboxes, float threshold_iou, float threshold_score, bool soft_nms, bool multi_target);
 
-void nms(std::forward_list<ma_bbox_ext_t>& bboxes,
-         float                             threshold_iou,
-         float                             threshold_score,
-         bool                              soft_nms,
-         bool                              multi_target);
+void nms(std::forward_list<ma_bbox_ext_t>& bboxes, float threshold_iou, float threshold_score, bool soft_nms, bool multi_target);
+
+void nms(std::forward_list<ma_keypoint3f_t>& decodings, const float iou_thr, bool should_nms_cross_classes);
 
 }  // namespace ma::utils