diff --git a/src/components/sscma-micro/sscma/core/engine/ma_engine_hailo.cpp b/src/components/sscma-micro/sscma/core/engine/ma_engine_hailo.cpp
index d8792a7..40de878 100644
--- a/src/components/sscma-micro/sscma/core/engine/ma_engine_hailo.cpp
+++ b/src/components/sscma-micro/sscma/core/engine/ma_engine_hailo.cpp
@@ -207,6 +207,16 @@ ma_err_t EngineHailo::load(const string& model_path) {
                     tensor->shape.dims[2] = shape.features;
                     tensor->shape.dims[3] = shape.width;
                     break;
+                case HAILO_FORMAT_ORDER_NC:
+                    if (shape.width != 1 || shape.height != 1) {
+                        tensor->shape.dims[1] = shape.features;
+                        tensor->shape.dims[2] = shape.height;
+                        tensor->shape.dims[3] = shape.width;
+                        break;
+                    }
+                    tensor->shape.dims[1] = shape.features;
+                    tensor->shape.size    = 2;
+                    break;
                 default:
                     break;
             }
diff --git a/src/components/sscma-micro/sscma/core/ma_definations.h b/src/components/sscma-micro/sscma/core/ma_definations.h
index b3ef63a..796286a 100644
--- a/src/components/sscma-micro/sscma/core/ma_definations.h
+++ b/src/components/sscma-micro/sscma/core/ma_definations.h
@@ -42,6 +42,8 @@
 
 #define MA_STORAGE_KEY_TRIGGER_RULES   "trigger#rules"
 
+#define MA_STORAGE_KEY_WS_PORT           "ws#port"
+
 
 #define MA_AT_CMD_PREFIX               "AT+"
 #define MA_AT_CMD_QUERY                "?"
diff --git a/src/components/sscma-micro/sscma/core/ma_types.h b/src/components/sscma-micro/sscma/core/ma_types.h
index 4b5d7f3..8507e1f 100644
--- a/src/components/sscma-micro/sscma/core/ma_types.h
+++ b/src/components/sscma-micro/sscma/core/ma_types.h
@@ -97,7 +97,7 @@ typedef struct {
     } data;
     bool is_physical;  // For physical tensor
     bool is_variable;  // For constant tensor
-    void* external_handler = nullptr;
+    void* external_handler;
 } ma_tensor_t;
 
 typedef enum {
@@ -241,7 +241,8 @@ typedef enum {
     MA_TRANSPORT_MQTT    = 5,
     MA_TRANSPORT_TCP     = 6,
     MA_TRANSPORT_UDP     = 7,
-    MA_TRANSPORT_RTSP    = 8
+    MA_TRANSPORT_RTSP    = 8,
+    MA_TRANSPORT_WS     = 9
 } ma_transport_type_t;
 
 typedef enum {
@@ -283,7 +284,9 @@ typedef enum {
     MA_MODEL_TYPE_YOLO_WORLD  = 8u,
     MA_MODEL_TYPE_YOLO11      = 9u,
     MA_MODEL_TYPE_YOLO11_POSE = 10u,
-    MA_MODEL_TYPE_YOLO11_SEG = 11u,
+    MA_MODEL_TYPE_YOLO11_SEG  = 11u,
+    MA_MODEL_TYPE_YOLOV8_SGE  = 12u,
+    MA_MODEL_TYPE_RTMDET      = 13u
 } ma_model_type_t;
 
 typedef struct {
@@ -444,4 +447,4 @@ typedef struct in6_info_t {
 
 #endif
 
-#endif  // _MA_TYPES_H_
\ No newline at end of file
+#endif  // _MA_TYPES_H_
diff --git a/src/components/sscma-micro/sscma/core/math/ma_math_vectors.h b/src/components/sscma-micro/sscma/core/math/ma_math_vectors.h
index 667c216..9d66571 100644
--- a/src/components/sscma-micro/sscma/core/math/ma_math_vectors.h
+++ b/src/components/sscma-micro/sscma/core/math/ma_math_vectors.h
@@ -1,8 +1,14 @@
 #ifndef _MA_MATH_VECTORS_H_
 #define _MA_MATH_VECTORS_H_
 
-#include <cstdint>
 #include <cstddef>
+#include <cstdint>
+
+#if MA_USE_LIB_XTENSOR
+#include <xtensor/xarray.hpp>
+#include <xtensor/xmath.hpp>
+#include <xtensor/xview.hpp>
+#endif
 
 namespace ma::math {
 
@@ -10,6 +16,15 @@ void softmax(float* data, size_t size);
 
 void fastSoftmax(float* data, size_t size);
 
+#if MA_USE_LIB_XTENSOR
+template <typename QT>  // QT: element type stored in the quantized source array
+static void dequantizeValues1D(xt::xarray<float>& dequantized_outputs, int index, const xt::xarray<QT>& quantized_outputs, size_t dim1, float32_t qp_scale, float32_t qp_zp) {
+    for (size_t col = 0; col != dim1; ++col) {  // row `index` of the source fills slots [0, dim1) of the destination
+        dequantized_outputs(col) = (static_cast<float>(quantized_outputs(index, col)) - qp_zp) * qp_scale;
+    }
+}
+#endif
+
 }  // namespace ma::math
 
 #endif
\ No newline at end of file
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_classifier.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_classifier.cpp
index d9f07c5..62fee76 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_classifier.cpp
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_classifier.cpp
@@ -40,9 +40,7 @@ bool Classifier::isValid(Engine* engine) {
         return false;
     }
 
-    const auto& input_shape = engine->getInputShape(0);
-    const auto& output_shape{engine->getOutputShape(0)};
-
+    const auto input_shape = engine->getInputShape(0);
     int n = input_shape.dims[0], h = input_shape.dims[1], w = input_shape.dims[2], c = input_shape.dims[3];
     bool is_nhwc = c == 3 || c == 1;
 
@@ -52,6 +50,10 @@ bool Classifier::isValid(Engine* engine) {
     if (n != 1 || h < 32 || h % 32 != 0 || (c != 3 && c != 1))
         return false;
 
+    const auto output_shape = engine->getOutputShape(0);
+    if (output_shape.size != 2) {
+        return false;
+    }
 
     if (output_shape.dims[0] != 1 ||  // N = 1
         output_shape.dims[1] < 2      // C >= 2
@@ -59,10 +61,6 @@ bool Classifier::isValid(Engine* engine) {
         return false;
     }
 
-    if (output_shape.size >= 3) {
-        return false;
-    }
-
     return true;
 }
 
@@ -91,32 +89,65 @@ ma_err_t Classifier::preprocess() {
 ma_err_t Classifier::postprocess() {
     results_.clear();
 
-    if (output_.type == MA_TENSOR_TYPE_S8) {
-        auto scale{output_.quant_param.scale};
-        auto zero_point{output_.quant_param.zero_point};
-        bool rescale{scale < 0.1f ? true : false};
-        auto* data = output_.data.s8;
+    switch (output_.type) {
+        case MA_TENSOR_TYPE_S8: {
+            auto scale{output_.quant_param.scale};
+            auto zero_point{output_.quant_param.zero_point};
+            bool rescale{scale < 0.1f ? true : false};
+            auto* data = output_.data.s8;
+            auto pred_l{output_.shape.dims[1]};
+
+            for (decltype(pred_l) i{0}; i < pred_l; ++i) {
+                auto score{static_cast<decltype(scale)>(data[i] - zero_point) * scale};
+                score = rescale ? score : score / 100.f;
+                if (score > threshold_score_)
+                    results_.emplace_front(ma_class_t{score, i});
+            }
+        } break;
+
+        case MA_TENSOR_TYPE_U8: {
+            auto scale{output_.quant_param.scale};
+            auto zero_point{output_.quant_param.zero_point};
+            bool rescale{scale < 0.1f ? true : false};
+            auto* data = output_.data.u8;
+            auto pred_l{output_.shape.dims[1]};
+
+            for (decltype(pred_l) i{0}; i < pred_l; ++i) {
+                auto score{static_cast<decltype(scale)>(data[i] - zero_point) * scale};
+                score = rescale ? score : score / 100.f;
+                if (score > threshold_score_)
+                    results_.emplace_front(ma_class_t{score, i});
+            }
+        } break;
+
+        case MA_TENSOR_TYPE_U16: {
+            auto scale{output_.quant_param.scale};
+            auto zero_point{output_.quant_param.zero_point};
+            bool rescale{scale < 0.1f ? true : false};
+            auto* data = output_.data.u16;
+            auto pred_l{output_.shape.dims[1]};
+
+            for (decltype(pred_l) i{0}; i < pred_l; ++i) {
+                auto score{static_cast<decltype(scale)>(data[i] - zero_point) * scale};
+                score = rescale ? score : score / 100.f;
+                if (score > threshold_score_)
+                    results_.emplace_front(ma_class_t{score, i});
+            }
+        } break;
+
+    
+        case MA_TENSOR_TYPE_F32: {
+            auto* data = output_.data.f32;
+            auto pred_l{output_.shape.dims[1]};
+            for (decltype(pred_l) i{0}; i < pred_l; ++i) {
+                auto score{data[i]};
+                if (score > threshold_score_)
+                    results_.emplace_front(ma_class_t{score, i});
+            }
+        } break;
 
-        auto pred_l{output_.shape.dims[1]};
-
-        for (decltype(pred_l) i{0}; i < pred_l; ++i) {
-            auto score{static_cast<decltype(scale)>(data[i] - zero_point) * scale};
-            score = rescale ? score : score / 100.f;
-            if (score > threshold_score_)
-                results_.emplace_front(ma_class_t{score, i});
-        }
-    }
-    if (output_.type == MA_TENSOR_TYPE_F32) {
-        auto* data = output_.data.f32;
-        auto pred_l{output_.shape.dims[1]};
-        for (decltype(pred_l) i{0}; i < pred_l; ++i) {
-            auto score{data[i]};
-            if (score > threshold_score_)
-                results_.emplace_front(ma_class_t{score, i});
-        }
-
-    } else {
-        return MA_ENOTSUP;
+        default:
+            return MA_ENOTSUP;
     }
 
     results_.sort([](const ma_class_t& a, const ma_class_t& b) { return a.score > b.score; });
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_factory.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_factory.cpp
index 573899d..9ac7858 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_factory.cpp
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_factory.cpp
@@ -37,6 +37,10 @@ Model* ModelFactory::create(Engine* engine, size_t algorithm_id) {
             if (YoloV8PoseHailo::isValid(engine)) {
                 return new YoloV8PoseHailo(engine);
             }
+
+            if (YoloV8SegHailo::isValid(engine)) {
+                return new YoloV8SegHailo(engine);
+            }
 #endif
             if (YoloV8Pose::isValid(engine)) {
                 return new YoloV8Pose(engine);
@@ -52,6 +56,11 @@ Model* ModelFactory::create(Engine* engine, size_t algorithm_id) {
                 return new NvidiaDet(engine);
             }
 
+        case MA_MODEL_TYPE_RTMDET:
+            if (RTMDet::isValid(engine)) {
+                return new RTMDet(engine);
+            }
+        
         case MA_MODEL_TYPE_YOLO_WORLD:
             if (YoloWorld::isValid(engine)) {
                 return new YoloWorld(engine);
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_factory.h b/src/components/sscma-micro/sscma/core/model/ma_model_factory.h
index f86bfda..ddfdbbf 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_factory.h
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_factory.h
@@ -21,6 +21,8 @@
 #include "ma_model_yolov8.h"
 #include "ma_model_yolov8_pose.h"
 #include "ma_model_yolov8_pose_hailo.h"
+#include "ma_model_yolov8_seg_hailo.h"
+#include "ma_model_rtmdet.h"
 
 namespace ma {
 
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_rtmdet.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_rtmdet.cpp
new file mode 100644
index 0000000..265d540
--- /dev/null
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_rtmdet.cpp
@@ -0,0 +1,466 @@
+#include "ma_model_rtmdet.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <forward_list>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+#include "../math/ma_math.h"
+#include "../utils/ma_anchors.h"
+#include "../utils/ma_nms.h"
+
+namespace ma::model {
+
+// Returns {height, width} read from a 4-D shape, or {0, 0} when the shape is not 4-D.
+static inline decltype(auto) estimateTensorHW(const ma_shape_t& shape) {
+    if (shape.size != 4) {
+        return std::make_pair(int32_t{0}, int32_t{0});
+    }
+    const bool channels_last = shape.dims[3] == 3 || shape.dims[3] == 1;  // last dim of 3 or 1 is treated as NHWC
+    const size_t h_axis      = channels_last ? 1 : 2;                      // H,W sit at 1,2 for NHWC, otherwise at 2,3
+    return std::make_pair(shape.dims[h_axis], shape.dims[h_axis + 1]);
+}
+
+RTMDet::RTMDet(Engine* p_engine_) : Detector(p_engine_, "rtmdet", MA_MODEL_TYPE_RTMDET) {  // builds anchors and maps every engine output to a score/bbox slot
+    MA_ASSERT(p_engine_ != nullptr);
+    // Keep a local copy of every output tensor descriptor.
+    for (size_t i = 0; i < num_outputs_; ++i) {
+        outputs_[i] = p_engine_->getOutput(i);
+    }
+    // Anchor strides and anchor points are generated from the smaller of the input height/width.
+    const auto [h, w] = estimateTensorHW(p_engine_->getInputShape(0));
+
+    anchor_strides_ = ma::utils::generateAnchorStrides(std::min(h, w));
+    anchor_matrix_  = ma::utils::generateAnchorMatrix(anchor_strides_);
+    // An output whose dims[2] is 4 is recorded as a bbox tensor, any other as a score tensor; it is stored in
+    // slot j of the first stride whose `size` equals the output's dims[1].
+    for (size_t i = 0; i < num_outputs_; ++i) {
+        const auto dim_1 = outputs_[i].shape.dims[1];
+        const auto dim_2 = outputs_[i].shape.dims[2];
+
+        if (dim_2 == 4) {
+            for (size_t j = 0; j < anchor_variants_; ++j) {
+                if (dim_1 == static_cast<int>(anchor_strides_[j].size)) {
+                    output_bboxes_ids_[j] = i;
+                    break;
+                }
+            }
+        } else {
+            for (size_t j = 0; j < anchor_variants_; ++j) {
+                if (dim_1 == static_cast<int>(anchor_strides_[j].size)) {
+                    output_scores_ids_[j] = i;
+                    break;
+                }
+            }
+        }
+    }
+    // NOTE(review): the id arrays are not initialised in this constructor, so a slot that no output matched is read below with an indeterminate value.
+    using CheckType       = uint8_t;
+    size_t    check_bytes = sizeof(CheckType) * 8u;  // despite the name this is the bit width of CheckType (8)
+    CheckType check       = 0;                       // bit k is set once output index k has been assigned
+    for (size_t i = 0; i < anchor_variants_; ++i) {
+        CheckType f_s = 1 << (output_scores_ids_[i] % check_bytes);
+        CheckType f_b = 1 << (output_bboxes_ids_[i] % check_bytes);
+        MA_ASSERT(!(f_s & f_b));    // score and bbox slot of one level must be different outputs
+        MA_ASSERT(!(f_s & check));  // the score output was not already used by an earlier level
+        MA_ASSERT(!(f_b & check));  // the bbox output was not already used by an earlier level
+        check |= f_s | f_b;
+    }
+    MA_ASSERT(!(check ^ 0b00111111));  // exactly bits 0..5 set: all six outputs were assigned
+}
+
+RTMDet::~RTMDet() {}  // empty body: no explicit cleanup is performed here
+
+bool RTMDet::isValid(Engine* engine) {  // true when the engine's tensor layout matches what this decoder expects
+    const auto inputs_count  = engine->getInputSize();
+    const auto outputs_count = engine->getOutputSize();
+    // Exactly one input and num_outputs_ outputs are required.
+    if (inputs_count != 1 || outputs_count != num_outputs_) {
+        return false;
+    }
+    // The input must be 4-D; a last dimension of 3 or 1 is read as the channel axis (NHWC), otherwise NCHW.
+    const auto input_shape{engine->getInputShape(0)};
+
+    if (input_shape.size != 4) {
+        return false;
+    }
+
+    const auto is_nhwc{input_shape.dims[3] == 3 || input_shape.dims[3] == 1};
+
+    size_t n = 0, h = 0, w = 0, c = 0;
+
+    if (is_nhwc) {
+        n = input_shape.dims[0];
+        h = input_shape.dims[1];
+        w = input_shape.dims[2];
+        c = input_shape.dims[3];
+    } else {
+        n = input_shape.dims[0];
+        c = input_shape.dims[1];
+        h = input_shape.dims[2];
+        w = input_shape.dims[3];
+    }
+    // Batch of one, square input (h ^ w is non-zero when they differ), side a multiple of 32 and >= 32, 1 or 3 channels.
+    if (n != 1 || h ^ w || h < 32 || h % 32 || (c != 3 && c != 1)) {
+        return false;
+    }
+    // Two work lists of expected strides: list 1 is consumed by score outputs, list 2 by bbox outputs.
+    auto anchor_strides_1 = ma::utils::generateAnchorStrides(std::min(h, w));
+    auto anchor_strides_2 = anchor_strides_1;
+    // Every output must be 3-D with dims[0] == 1 and must consume one not-yet-matched stride of its kind.
+    for (size_t i = 0; i < num_outputs_; ++i) {
+        const auto output_shape{engine->getOutputShape(i)};
+        if (output_shape.size != 3 || output_shape.dims[0] != 1) {
+            return false;
+        }
+
+        if (output_shape.dims[2] == 4) {  // last dimension of 4 => bbox output
+            auto it = std::find_if(
+              anchor_strides_2.begin(), anchor_strides_2.end(), [&output_shape](const ma_anchor_stride_t& anchor_stride) {
+                  return static_cast<int>(anchor_stride.size) == output_shape.dims[1];
+              });
+            if (it == anchor_strides_2.end())
+                return false;
+            else
+                anchor_strides_2.erase(it);
+        } else {  // any other last dimension => score output
+            auto it = std::find_if(
+              anchor_strides_1.begin(), anchor_strides_1.end(), [&output_shape](const ma_anchor_stride_t& anchor_stride) {
+                  return static_cast<int>(anchor_stride.size) == output_shape.dims[1];
+              });
+            if (it == anchor_strides_1.end())
+                return false;
+            else
+                anchor_strides_1.erase(it);
+        }
+    }
+    // Any stride left over means a score or bbox output for that level was missing.
+    if (anchor_strides_1.size() || anchor_strides_2.size()) {
+        return false;
+    }
+
+    return true;
+}
+
+const char* RTMDet::getTag() {  // static tag string for this model, spelled to match the "rtmdet" model name
+    return "ma::model::rtmdet";
+}
+
+/**
+ * Decodes the engine outputs with the routine that matches their element type:
+ * S8 -> postProcessI8(), U8 -> postProcessU8(), F32 -> postProcessF32()
+ * (F32 only when MA_MODEL_POSTPROCESS_FP32_VARIANT is defined).
+ *
+ * All outputs must share a single element type. The types are compared directly rather than
+ * folded into a weighted sum of per-type codes, because such sums collide for mixed sets
+ * (with weights S8=1, U8=2, F32=4, four S8 plus two F32 tensors total 12, the same as six U8).
+ *
+ * @return the selected decoder's result; MA_ENOTSUP when the outputs mix element types or use
+ *         an unsupported one.
+ */
+ma_err_t RTMDet::postprocess() {
+    const auto type = outputs_[0].type;
+
+    // Each decoder reads every tensor through one pointer type, so heterogeneous outputs
+    // cannot be decoded correctly and are rejected up front.
+    for (size_t i = 1; i < num_outputs_; ++i) {
+        if (outputs_[i].type != type) {
+            return MA_ENOTSUP;
+        }
+    }
+
+    switch (type) {
+        case MA_TENSOR_TYPE_S8:
+            return postProcessI8();
+
+        case MA_TENSOR_TYPE_U8:
+            return postProcessU8();
+
+#ifdef MA_MODEL_POSTPROCESS_FP32_VARIANT
+        case MA_TENSOR_TYPE_F32:
+            return postProcessF32();
+#endif
+
+        default:
+            // Includes F32 when the FP32 variant is compiled out.
+            return MA_ENOTSUP;
+    }
+}
+
+ma_err_t RTMDet::postProcessI8() {  // decodes signed 8-bit quantized outputs into results_, then runs NMS
+    results_.clear();
+
+    const int8_t* output_data[num_outputs_];
+
+    for (size_t i = 0; i < num_outputs_; ++i) {
+        output_data[i] = outputs_[i].data.s8;
+    }
+
+    const auto score_threshold = threshold_score_;
+    const auto iou_threshold   = threshold_nms_;  // NOTE(review): not read again in this function
+    // The threshold is passed through inverseSigmoid once so candidates can be filtered before any sigmoid is evaluated.
+    const float score_threshold_non_sigmoid = ma::math::inverseSigmoid(score_threshold);
+
+    const auto anchor_matrix_size = anchor_matrix_.size();
+    // One pass per anchor level. NOTE(review): the id arrays hold anchor_variants_ entries; this assumes anchor_matrix_ has no more levels than that.
+    for (size_t i = 0; i < anchor_matrix_size; ++i) {
+        const auto output_scores_id             = output_scores_ids_[i];
+        const auto* output_scores               = output_data[output_scores_id];
+        const size_t output_scores_shape_dims_2 = outputs_[output_scores_id].shape.dims[2];
+        const auto output_scores_quant_parm     = outputs_[output_scores_id].quant_param;
+
+        const auto output_bboxes_id             = output_bboxes_ids_[i];
+        const auto* output_bboxes               = output_data[output_bboxes_id];
+        const size_t output_bboxes_shape_dims_2 = outputs_[output_bboxes_id].shape.dims[2];
+        const auto output_bboxes_quant_parm     = outputs_[output_bboxes_id].quant_param;
+        // Decoded values are multiplied by stride / image size (presumably grid units -> normalized coordinates; confirm).
+        const auto  stride  = anchor_strides_[i];
+        const float scale_w = float(stride.stride) / float(img_.width);
+        const float scale_h = float(stride.stride) / float(img_.height);
+
+        const auto& anchor_array     = anchor_matrix_[i];
+        const auto anchor_array_size = anchor_array.size();
+        // Threshold expressed with this score tensor's quantization so raw values can be compared without dequantizing each one.
+        const int32_t score_threshold_quan_non_sigmoid = ma::math::quantizeValueFloor(score_threshold_non_sigmoid, output_scores_quant_parm.scale, output_scores_quant_parm.zero_point);
+
+        for (size_t j = 0; j < anchor_array_size; ++j) {
+            const auto j_mul_output_scores_shape_dims_2 = j * output_scores_shape_dims_2;
+            // Arg-max over the classes of anchor j; target stays -1 when no raw score reaches the threshold.
+            auto max_score_raw = score_threshold_quan_non_sigmoid;
+            int32_t target     = -1;
+
+            for (size_t k = 0; k < output_scores_shape_dims_2; ++k) {
+                int8_t score = output_scores[j_mul_output_scores_shape_dims_2 + k];
+
+                if (static_cast<decltype(max_score_raw)>(score) < max_score_raw) [[likely]]
+                    continue;
+
+                max_score_raw = score;
+                target        = k;
+            }
+
+            if (target < 0)
+                continue;
+
+            const float real_score = ma::math::sigmoid(ma::math::dequantizeValue(max_score_raw, output_scores_quant_parm.scale, output_scores_quant_parm.zero_point));
+
+            // Four consecutive box values per anchor, dequantized with the bbox tensor's parameters.
+            float dist[4];
+            const auto pre = j * output_bboxes_shape_dims_2;
+            for (size_t m = 0; m < 4; ++m) {
+                const size_t offset = pre + m;
+                dist[m]  = ma::math::dequantizeValue(static_cast<int32_t>(output_bboxes[offset]), output_bboxes_quant_parm.scale, output_bboxes_quant_parm.zero_point);
+            }
+
+            const auto anchor = anchor_array[j];
+            // dist[0]/dist[2] act as the left/right and dist[1]/dist[3] as the top/bottom distances from the anchor point.
+            float cx = anchor.x + ((dist[2] - dist[0]) * 0.5f);
+            float cy = anchor.y + ((dist[3] - dist[1]) * 0.5f);
+            float w  = dist[0] + dist[2];
+            float h  = dist[1] + dist[3];
+
+            ma_bbox_t res;
+            // x/y hold the box centre, w/h its extent, all after scaling.
+            res.x      = cx * scale_w;
+            res.y      = cy * scale_h;
+            res.w      = w  * scale_w;
+            res.h      = h  * scale_h;
+            res.score  = real_score;
+            res.target = target;
+
+            results_.emplace_front(
+                std::move(res)
+            );
+        }
+    }
+
+    ma::utils::nms(results_, threshold_nms_, threshold_score_, false, true);
+
+    return MA_OK;
+}
+
+ma_err_t RTMDet::postProcessU8() {  // decodes unsigned 8-bit quantized outputs into results_, then runs NMS
+    results_.clear();
+
+    const uint8_t* output_data[num_outputs_];
+
+    for (size_t i = 0; i < num_outputs_; ++i) {
+        output_data[i] = outputs_[i].data.u8;
+    }
+
+    const auto score_threshold = threshold_score_;
+    const auto iou_threshold   = threshold_nms_;  // NOTE(review): not read again in this function
+    // The threshold is passed through inverseSigmoid once so candidates can be filtered before any sigmoid is evaluated.
+    const float score_threshold_non_sigmoid = ma::math::inverseSigmoid(score_threshold);
+
+    const auto anchor_matrix_size = anchor_matrix_.size();
+    // One pass per anchor level. NOTE(review): the id arrays hold anchor_variants_ entries; this assumes anchor_matrix_ has no more levels than that.
+    for (size_t i = 0; i < anchor_matrix_size; ++i) {
+        const auto output_scores_id             = output_scores_ids_[i];
+        const auto* output_scores               = output_data[output_scores_id];
+        const size_t output_scores_shape_dims_2 = outputs_[output_scores_id].shape.dims[2];
+        const auto output_scores_quant_parm     = outputs_[output_scores_id].quant_param;
+
+        const auto output_bboxes_id             = output_bboxes_ids_[i];
+        const auto* output_bboxes               = output_data[output_bboxes_id];
+        const size_t output_bboxes_shape_dims_2 = outputs_[output_bboxes_id].shape.dims[2];
+        const auto output_bboxes_quant_parm     = outputs_[output_bboxes_id].quant_param;
+        // Decoded values are multiplied by stride / image size (presumably grid units -> normalized coordinates; confirm).
+        const auto  stride  = anchor_strides_[i];
+        const float scale_w = float(stride.stride) / float(img_.width);
+        const float scale_h = float(stride.stride) / float(img_.height);
+
+        const auto& anchor_array     = anchor_matrix_[i];
+        const auto anchor_array_size = anchor_array.size();
+        // Threshold expressed with this score tensor's quantization so raw values can be compared without dequantizing each one.
+        const int32_t score_threshold_quan_non_sigmoid = ma::math::quantizeValueFloor(score_threshold_non_sigmoid, output_scores_quant_parm.scale, output_scores_quant_parm.zero_point);
+
+        for (size_t j = 0; j < anchor_array_size; ++j) {
+            const auto j_mul_output_scores_shape_dims_2 = j * output_scores_shape_dims_2;
+            // Arg-max over the classes of anchor j; target stays -1 when no raw score reaches the threshold.
+            auto max_score_raw = score_threshold_quan_non_sigmoid;
+            int32_t target     = -1;
+
+            for (size_t k = 0; k < output_scores_shape_dims_2; ++k) {
+                uint8_t score = output_scores[j_mul_output_scores_shape_dims_2 + k];
+
+                if (static_cast<decltype(max_score_raw)>(score) < max_score_raw) [[likely]]
+                    continue;
+
+                max_score_raw = score;
+                target        = k;
+            }
+
+            if (target < 0)
+                continue;
+
+            const float real_score = ma::math::sigmoid(ma::math::dequantizeValue(max_score_raw, output_scores_quant_parm.scale, output_scores_quant_parm.zero_point));
+
+            // Four consecutive box values per anchor, dequantized directly (no per-side bin softmax is applied here).
+            float dist[4];
+            const auto pre = j * output_bboxes_shape_dims_2;
+            for (size_t m = 0; m < 4; ++m) {
+                const size_t offset = pre + m;
+                dist[m]  = ma::math::dequantizeValue(static_cast<int32_t>(output_bboxes[offset]), output_bboxes_quant_parm.scale, output_bboxes_quant_parm.zero_point);
+            }
+
+            const auto anchor = anchor_array[j];
+            // dist[0]/dist[2] act as the left/right and dist[1]/dist[3] as the top/bottom distances from the anchor point.
+            float cx = anchor.x + ((dist[2] - dist[0]) * 0.5f);
+            float cy = anchor.y + ((dist[3] - dist[1]) * 0.5f);
+            float w  = dist[0] + dist[2];
+            float h  = dist[1] + dist[3];
+
+            ma_bbox_t res;
+            // x/y hold the box centre, w/h its extent, all after scaling.
+            res.x      = cx * scale_w;
+            res.y      = cy * scale_h;
+            res.w      = w  * scale_w;
+            res.h      = h  * scale_h;
+            res.score  = real_score;
+            res.target = target;
+
+            results_.emplace_front(
+                std::move(res)
+            );
+        }
+    }
+
+    ma::utils::nms(results_, threshold_nms_, threshold_score_, false, true);
+
+    return MA_OK;
+}
+
+#ifdef MA_MODEL_POSTPROCESS_FP32_VARIANT
+ma_err_t RTMDet::postProcessF32() {  // decodes 32-bit float outputs into results_, then runs NMS; returns MA_OK
+    results_.clear();
+
+    const float* output_data[num_outputs_];
+
+    for (size_t i = 0; i < num_outputs_; ++i) {
+        output_data[i] = outputs_[i].data.f32;
+    }
+
+    // The score threshold is passed through inverseSigmoid once, so the per-class loop can
+    // compare raw float scores against it and only the winning score of an anchor needs a
+    // sigmoid afterwards.
+    const float score_threshold_non_sigmoid = ma::math::inverseSigmoid(threshold_score_);
+
+    const auto anchor_matrix_size = anchor_matrix_.size();
+
+    for (size_t i = 0; i < anchor_matrix_size; ++i) {
+        const auto output_scores_id             = output_scores_ids_[i];
+        const auto* output_scores               = output_data[output_scores_id];
+        const size_t output_scores_shape_dims_2 = outputs_[output_scores_id].shape.dims[2];
+
+        const auto output_bboxes_id             = output_bboxes_ids_[i];
+        const auto* output_bboxes               = output_data[output_bboxes_id];
+        const size_t output_bboxes_shape_dims_2 = outputs_[output_bboxes_id].shape.dims[2];
+
+        const auto  stride  = anchor_strides_[i];
+        const float scale_w = float(stride.stride) / float(img_.width);
+        const float scale_h = float(stride.stride) / float(img_.height);
+
+        const auto& anchor_array     = anchor_matrix_[i];
+        const auto anchor_array_size = anchor_array.size();
+
+        for (size_t j = 0; j < anchor_array_size; ++j) {
+            const auto j_mul_output_scores_shape_dims_2 = j * output_scores_shape_dims_2;
+
+            auto max_score_raw = score_threshold_non_sigmoid;
+            int32_t target     = -1;
+
+            for (size_t k = 0; k < output_scores_shape_dims_2; ++k) {
+                const float score = output_scores[j_mul_output_scores_shape_dims_2 + k];  // must stay float: an 8-bit integer would drop the fraction and cannot hold large values
+
+                if (score < max_score_raw) [[likely]]
+                    continue;
+
+                max_score_raw = score;
+                target        = k;
+            }
+
+            if (target < 0)
+                continue;
+
+            const float real_score = ma::math::sigmoid(max_score_raw);
+            // Four consecutive box values per anchor, used as left/top/right/bottom distances below.
+            float dist[4];
+            const auto pre = j * output_bboxes_shape_dims_2;
+            for (size_t m = 0; m < 4; ++m) {
+                const size_t offset = pre + m;
+                dist[m] = output_bboxes[offset];
+            }
+
+            const auto anchor = anchor_array[j];
+
+            float cx = anchor.x + ((dist[2] - dist[0]) * 0.5f);
+            float cy = anchor.y + ((dist[3] - dist[1]) * 0.5f);
+            float w  = dist[0] + dist[2];
+            float h  = dist[1] + dist[3];
+
+            ma_bbox_t res;
+
+            res.x      = cx * scale_w;
+            res.y      = cy * scale_h;
+            res.w      = w  * scale_w;
+            res.h      = h  * scale_h;
+            res.score  = real_score;
+            res.target = target;
+
+            results_.emplace_front(
+                std::move(res)
+            );
+        }
+    }
+
+    ma::utils::nms(results_, threshold_nms_, threshold_score_, false, true);
+
+    return MA_OK;
+}
+#endif
+
+}  // namespace ma::model
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_rtmdet.h b/src/components/sscma-micro/sscma/core/model/ma_model_rtmdet.h
new file mode 100644
index 0000000..9193023
--- /dev/null
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_rtmdet.h
@@ -0,0 +1,46 @@
+#ifndef _MA_MODEL_RTMDET_H_
+#define _MA_MODEL_RTMDET_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+#include "ma_model_detector.h"
+
+namespace ma::model {
+
+class RTMDet : public Detector {  // detector that decodes six engine outputs: one score and one bbox tensor per anchor level
+   private:
+    static constexpr size_t num_outputs_     = 6;  // number of engine output tensors required
+    static constexpr size_t anchor_variants_ = 3;  // number of anchor levels; each owns one score and one bbox output
+
+    ma_tensor_t outputs_[num_outputs_];  // copies of the engine's output tensor descriptors
+
+    std::vector<ma_anchor_stride_t>     anchor_strides_;  // per-level stride records
+    std::vector<std::vector<ma_pt2f_t>> anchor_matrix_;   // anchor points, one inner vector per level
+
+    size_t output_scores_ids_[anchor_variants_];  // index into outputs_ of each level's score tensor
+    size_t output_bboxes_ids_[anchor_variants_];  // index into outputs_ of each level's bbox tensor
+
+   protected:
+    ma_err_t postprocess() override;  // picks one of the decoders below from the outputs' element type
+
+    ma_err_t postProcessI8();  // decoder for signed 8-bit outputs
+    ma_err_t postProcessU8();  // decoder for unsigned 8-bit outputs
+#ifdef MA_MODEL_POSTPROCESS_FP32_VARIANT
+    ma_err_t postProcessF32();  // decoder for 32-bit float outputs (compiled only with this macro)
+#endif
+
+   public:
+    RTMDet(Engine* engine);  // engine must be non-null
+    ~RTMDet();
+
+    static bool isValid(Engine* engine);  // true when the engine's input/output shapes fit this decoder
+
+    static const char* getTag();  // tag string for this model
+};
+
+}  // namespace ma::model
+
+#endif  // _MA_MODEL_RTMDET_H_
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_yolov8_pose.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_yolov8_pose.cpp
index 781b836..c849717 100644
--- a/src/components/sscma-micro/sscma/core/model/ma_model_yolov8_pose.cpp
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_yolov8_pose.cpp
@@ -221,7 +221,7 @@ ma_err_t YoloV8Pose::postProcessI8() {
         const auto& anchor_array     = anchor_matrix_[i];
         const auto anchor_array_size = anchor_array.size();
 
-        const int32_t score_threshold_quan_non_sigmoid = ma::math::quantizeValueFloor(score_threshold_non_sigmoid, output_scores_quant_parm.zero_point, output_scores_quant_parm.scale);
+        const int32_t score_threshold_quan_non_sigmoid = ma::math::quantizeValueFloor(score_threshold_non_sigmoid, output_scores_quant_parm.scale, output_scores_quant_parm.zero_point);
 
         for (size_t j = 0; j < anchor_array_size; ++j) {
             const auto j_mul_output_scores_shape_dims_2 = j * output_scores_shape_dims_2;
@@ -242,7 +242,7 @@ ma_err_t YoloV8Pose::postProcessI8() {
             if (target < 0)
                 continue;
 
-            const float real_score = ma::math::sigmoid(ma::math::dequantizeValue(max_score_raw, output_scores_quant_parm.zero_point, output_scores_quant_parm.scale));
+            const float real_score = ma::math::sigmoid(ma::math::dequantizeValue(max_score_raw, output_scores_quant_parm.scale, output_scores_quant_parm.zero_point));
 
             // DFL
             float dist[4];
@@ -252,7 +252,7 @@ ma_err_t YoloV8Pose::postProcessI8() {
             for (size_t m = 0; m < 4; ++m) {
                 const size_t offset = pre + m * 16;
                 for (size_t n = 0; n < 16; ++n) {
-                    matrix[n] = ma::math::dequantizeValue(static_cast<int32_t>(output_bboxes[offset + n]), output_bboxes_quant_parm.zero_point, output_bboxes_quant_parm.scale);
+                    matrix[n] = ma::math::dequantizeValue(static_cast<int32_t>(output_bboxes[offset + n]), output_bboxes_quant_parm.scale, output_bboxes_quant_parm.zero_point);
                 }
 
                 ma::math::softmax(matrix, 16);
@@ -304,11 +304,11 @@ ma_err_t YoloV8Pose::postProcessI8() {
         for (size_t i = 0; i < keypoint_nums; ++i) {
             const auto offset = pre + i * 3;
 
-            const float x = ma::math::dequantizeValue(static_cast<int32_t>(output_keypoints[offset]), output_keypoints_quant_parm.zero_point, output_keypoints_quant_parm.scale);
+            const float x = ma::math::dequantizeValue(static_cast<int32_t>(output_keypoints[offset]), output_keypoints_quant_parm.scale, output_keypoints_quant_parm.zero_point);
 
-            const float y = ma::math::dequantizeValue(static_cast<int32_t>(output_keypoints[offset + 1]), output_keypoints_quant_parm.zero_point, output_keypoints_quant_parm.scale);
+            const float y = ma::math::dequantizeValue(static_cast<int32_t>(output_keypoints[offset + 1]), output_keypoints_quant_parm.scale, output_keypoints_quant_parm.zero_point);
 
-            const float z = ma::math::sigmoid(ma::math::dequantizeValue(static_cast<int32_t>(output_keypoints[offset + 2]), output_keypoints_quant_parm.zero_point, output_keypoints_quant_parm.scale));
+            const float z = ma::math::sigmoid(ma::math::dequantizeValue(static_cast<int32_t>(output_keypoints[offset + 2]), output_keypoints_quant_parm.scale, output_keypoints_quant_parm.zero_point));
 
             n_keypoint[i] = {x, y, z};
         }
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_yolov8_seg_hailo.cpp b/src/components/sscma-micro/sscma/core/model/ma_model_yolov8_seg_hailo.cpp
new file mode 100644
index 0000000..22d0844
--- /dev/null
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_yolov8_seg_hailo.cpp
@@ -0,0 +1,435 @@
+#include "ma_model_yolov8_seg_hailo.h"
+
+#if MA_USE_ENGINE_HAILO
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <forward_list>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+#include <xtensor/xadapt.hpp>
+#include <xtensor/xarray.hpp>
+#include <xtensor/xmath.hpp>
+#include <xtensor/xsort.hpp>
+#include <xtensor/xtensor.hpp>
+#include <xtensor/xview.hpp>
+
+#include "../math/ma_math.h"
+#include "../utils/ma_anchors.h"
+#include "../utils/ma_nms.h"
+
+namespace ma::model {
+
+static inline decltype(auto) estimateTensorHW(const ma_shape_t& shape) {  // derive a (height, width) pair from a tensor shape
+    if (shape.size != 4) {  // only rank-4 shapes are interpreted
+        int32_t ph = 0;
+        return std::make_pair(ph, ph);  // placeholder pair of zeros for every other rank
+    }
+    const auto is_nhwc{shape.dims[3] == 3 || shape.dims[3] == 1};  // a last dimension of 1 or 3 is taken to be the channel axis
+
+    return is_nhwc ? std::make_pair(shape.dims[1], shape.dims[2]) : std::make_pair(shape.dims[2], shape.dims[3]);  // otherwise H and W are read from dims[2] and dims[3]
+}
+
+// Greedy non-maximum suppression over (box, mask-coefficient) candidates. Candidates are visited in descending score
+// order; a later one is dropped when its IoU with a kept one is >= iou_thr (same class only unless should_nms_cross_classes).
+static void nms(std::forward_list<std::pair<ma_bbox_t, xt::xarray<float>>>& decodings, const float iou_thr, bool should_nms_cross_classes) {
+    decodings.sort([](const auto& a, const auto& b) { return a.first.score > b.first.score; });  // without this a weak box seen first could suppress a stronger one
+    for (auto it = decodings.begin(); it != decodings.end(); ++it) {
+        if (it->first.score != 0.0f) {  // a zero score marks an entry that has already been suppressed
+            for (auto it2 = std::next(it); it2 != decodings.end(); ++it2) {
+                if ((should_nms_cross_classes || (it->first.target == it2->first.target)) && it2->first.score != 0.0f && ma::utils::compute_iou(it->first, it2->first) >= iou_thr) {
+                    it2->first.score = 0.0f;
+                }
+            }
+        }
+    }
+    decodings.remove_if([](const auto& p) { return p.first.score == 0.0f; });
+}
+
+std::vector<int> YoloV8SegHailo::strides_ = {8, 16, 32};  // isValid() divides the input size by these to get the expected output grid sizes
+
+/**
+ * Copyright (c) 2021-2022 Hailo Technologies Ltd. All rights reserved.
+ * Distributed under the LGPL license (https://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt)
+ **/
+static decltype(auto) getBoxesScoresMasks(std::vector<ma_tensor_t> tensors, int num_classes) {
+    // Splits the outputs into still-quantized box and mask tensors plus dequantized class scores and mask prototypes.
+    auto raw_proto = [&tensors]() {  // removes the 1x160x160x32 prototype tensor from the local copy; zeroed tensor if absent
+        auto it = std::find_if(
+            tensors.begin(), tensors.end(), [](const ma_tensor_t& t) { return t.shape.size == 4 && t.shape.dims[0] == 1 && t.shape.dims[1] == 160 && t.shape.dims[2] == 160 && t.shape.dims[3] == 32; });
+        if (it == tensors.end()) {
+            return ma_tensor_t{0};
+        }
+        ma_tensor_t found = *it;  // copy first: erase() invalidates the iterator
+        tensors.erase(it);
+        return found;
+    }();
+
+    // What is left is read as (boxes, scores, masks) triples; an incomplete trailing triple is ignored.
+    std::vector<ma_tensor_t> outputs_boxes(tensors.size() / 3);
+    std::vector<ma_tensor_t> outputs_masks(tensors.size() / 3);
+
+    int total_scores = 0;
+    for (size_t i = 0; i + 2 < tensors.size(); i = i + 3) {
+        auto w = tensors[i + 1].shape.dims[1];  // w
+        auto h = tensors[i + 1].shape.dims[2];  // h
+        total_scores += w * h;
+    }
+
+    std::vector<size_t> scores_shape = {(long unsigned int)total_scores, (long unsigned int)num_classes};
+    xt::xarray<float> scores(scores_shape);
+
+    std::vector<size_t> proto_shape = {(long unsigned int)raw_proto.shape.dims[1], (long unsigned int)raw_proto.shape.dims[2], (long unsigned int)raw_proto.shape.dims[3]};
+    xt::xarray<float> proto(proto_shape);
+
+    int view_index_scores = 0;
+    for (size_t i = 0; i + 2 < tensors.size(); i = i + 3) {
+        outputs_boxes[i / 3] = tensors[i];
+        // Dequantize this triple's scores and append them as rows of the (total_scores, num_classes) matrix.
+        auto& tensor                = tensors[i + 1];
+        std::vector<size_t> shape   = {(size_t)tensor.shape.dims[1], (size_t)tensor.shape.dims[2], (size_t)tensor.shape.dims[3]};
+        xt::xarray<uint8_t> xtensor = xt::adapt(tensor.data.u8, tensor.size, xt::no_ownership(), shape);
+        auto dequantized_output_s   = (xtensor - tensor.quant_param.zero_point) * tensor.quant_param.scale;
+        int num_proposals_scores    = dequantized_output_s.shape(0) * dequantized_output_s.shape(1);
+        auto output_scores          = xt::view(dequantized_output_s, xt::all(), xt::all(), xt::all());
+        xt::view(scores, xt::range(view_index_scores, view_index_scores + num_proposals_scores), xt::all()) = xt::reshape_view(output_scores, {num_proposals_scores, num_classes});
+        view_index_scores += num_proposals_scores;
+
+        outputs_masks[i / 3] = tensors[i + 2];
+    }
+
+    auto proto_tensor = xt::adapt(raw_proto.data.u8, raw_proto.size, xt::no_ownership(), proto_shape);
+    proto             = (proto_tensor - raw_proto.quant_param.zero_point) * raw_proto.quant_param.scale;
+
+    return _internal::Quadruple{outputs_boxes, scores, outputs_masks, proto};
+}
+
+
+YoloV8SegHailo::YoloV8SegHailo(Engine* p_engine_) : Segmentor(p_engine_, "yolov8_seg", MA_MODEL_TYPE_YOLOV8_SGE) {
+    MA_ASSERT(p_engine_ != nullptr);
+    // Default thresholds first; the rest of the constructor collects the engine outputs and puts them in a fixed order.
+    threshold_score_ = 0.6;
+    threshold_nms_   = 0.7;
+
+    outputs_.resize(10);  // ten output tensors are fetched, indices 0..9
+    for (size_t i = 0; i < outputs_.size(); ++i) {
+        outputs_[i] = p_engine_->getOutput(i);
+    }
+    // Descending by dims[1], so the tensor with the largest second dimension ends up at index 0.
+    std::sort(outputs_.begin(), outputs_.end(), [](const ma_tensor_t& a, const ma_tensor_t& b) { return a.shape.dims[1] > b.shape.dims[1]; });
+    // route_ is a bitmask of output element types: bit i for a U8 tensor at position i, bit (i + outputs_.size()) for U16.
+    auto update_route_f = [&route = route_, this](ma_tensor_type_t t, int i) {
+        switch (t) {
+            case MA_TENSOR_TYPE_U8:
+                route |= 1 << i;
+                break;
+            case MA_TENSOR_TYPE_U16:
+                route |= 1 << (i + this->outputs_.size());
+                break;
+            default:
+                break;  // other element types leave route_ untouched
+        }
+    };
+    // Within each group of three after index 0: last dim 64 goes first, 32 goes last, anything else in the middle.
+    std::vector<size_t> idx(outputs_.size());
+    for (size_t i = 1; i < outputs_.size(); i += 3) {
+        for (size_t j = 0; j < 3; ++j) {
+            auto at = i + j;
+            switch (outputs_[at].shape.dims[3]) {
+                case 32:
+                    idx[i + 2] = at;
+                    break;
+                case 64:
+                    idx[i] = at;
+                    break;
+                default:
+                    idx[i + 1] = at;
+            }
+        }
+    }
+    std::vector<ma_tensor_t> reordered_outputs(outputs_.size());
+    reordered_outputs[0] = outputs_[0];
+    update_route_f(reordered_outputs[0].type, 0);
+    for (size_t i = 1; i < outputs_.size(); ++i) {
+        reordered_outputs[i] = outputs_[idx[i]];
+        update_route_f(reordered_outputs[i].type, i);
+    }
+    outputs_ = std::move(reordered_outputs);
+    classes_ = outputs_[2].shape.dims[3];  // last dimension of the middle tensor of the first group
+
+    const auto [h, w] = estimateTensorHW(p_engine_->getInputShape(0));
+
+    centers_      = ma::utils::generateAnchorMatrix(strides_, {static_cast<int>(w), static_cast<int>(h)}, 3, 0, 0);
+    network_dims_ = {w, h};
+}
+
+YoloV8SegHailo::~YoloV8SegHailo() {}  // empty body: no explicit cleanup is performed here
+
+bool YoloV8SegHailo::isValid(Engine* engine) {
+    const auto inputs_count  = engine->getInputSize();
+    const auto outputs_count = engine->getOutputSize();
+    // Structural check only: one square 1- or 3-channel input and ten outputs laid out as described below.
+    if (inputs_count != 1 || outputs_count != 10) {
+        return false;
+    }
+
+    const auto input_shape{engine->getInputShape(0)};
+
+    if (input_shape.size != 4) {
+        return false;
+    }
+
+    const auto is_nhwc{input_shape.dims[3] == 3 || input_shape.dims[3] == 1};  // a last dimension of 1 or 3 is taken as channels
+
+    size_t n = 0, h = 0, w = 0, c = 0;
+
+    if (is_nhwc) {
+        n = input_shape.dims[0];
+        h = input_shape.dims[1];
+        w = input_shape.dims[2];
+        c = input_shape.dims[3];
+    } else {
+        n = input_shape.dims[0];
+        c = input_shape.dims[1];
+        h = input_shape.dims[2];
+        w = input_shape.dims[3];
+    }
+
+    if (n != 1 || h ^ w || h < 32 || h % 32 || (c != 3 && c != 1)) {  // batch 1, square, side a multiple of 32 and at least 32
+        return false;
+    }
+
+    // The ten outputs validated above are the mask prototypes plus one (boxes, scores, masks)
+    // triple per stride, so nine tensors must remain once the prototype tensor is erased below.
+    // Requiring 9 here as well could never hold together with the check for 10.
+    const auto output_nums = outputs_count;
+
+    std::vector<ma_tensor_t> outputs(output_nums);
+    for (size_t i = 0; i < output_nums; ++i) {
+        outputs[i] = engine->getOutput(i);
+    }
+
+    auto it = std::find_if(
+        outputs.begin(), outputs.end(), [](const ma_tensor_t& t) { return t.shape.size == 4 && t.shape.dims[0] == 1 && t.shape.dims[1] == 160 && t.shape.dims[2] == 160 && t.shape.dims[3] == 32; });
+    if (it == outputs.end()) {  // the 1x160x160x32 prototype tensor is mandatory
+        return false;
+    }
+    outputs.erase(it);
+
+    std::vector<std::vector<int>> dims{std::vector<int>{int(w / strides_[0]), int(h / strides_[0]), 0},  // {grid w, grid h, tensors seen}
+                                       std::vector<int>{int(w / strides_[1]), int(h / strides_[1]), 0},
+                                       std::vector<int>{int(w / strides_[2]), int(h / strides_[2]), 0}};
+
+    for (auto& out : outputs) {
+        if (out.shape.size != 4 || out.shape.dims[0] != 1) {
+            return false;
+        }
+        auto it = std::find_if(dims.begin(), dims.end(), [&out](const std::vector<int>& dim) { return dim[0] == out.shape.dims[1] && dim[1] == out.shape.dims[2]; });
+        if (it == dims.end()) {  // spatial size matches none of the three stride grids
+            return false;
+        }
+        switch (out.shape.dims[3]) {  // every branch currently demands U8 and counts the tensor for its grid
+            case 32:
+                if (out.type != MA_TENSOR_TYPE_U8) {
+                    return false;
+                }
+                (*it)[2] += 1;
+                break;
+            case 64:
+                if (out.type != MA_TENSOR_TYPE_U8) {
+                    return false;
+                }
+                (*it)[2] += 1;
+                break;
+            default:
+                if (out.type != MA_TENSOR_TYPE_U8) {
+                    return false;
+                }
+                (*it)[2] += 1;
+        }
+    }
+
+    for (const auto& dim : dims) {
+        if (dim[2] != 3) {  // each grid must own exactly three tensors
+            return false;
+        }
+    }
+
+    return true;
+}
+
+const char* YoloV8SegHailo::getTag() {  // constant tag string identifying this model class
+    return "ma::model::yolov8_seg";  // matches the "yolov8_seg" name passed to Segmentor; the pose tag was a copy-paste slip
+}
+
+template <typename T>  // T is not referenced anywhere in the body
+static decltype(auto) decodeBoxesAndExtractMasks(const std::vector<ma_tensor_t>& raw_boxes_outputs,
+                                                 const std::vector<ma_tensor_t>& raw_masks_outputs,
+                                                 xt::xarray<float>& scores,
+                                                 const std::vector<int>& network_dims,
+                                                 const std::vector<int>& strides,
+                                                 const std::vector<xt::xarray<double>>& centers,
+                                                 int regression_length,
+                                                 float score_threshold) {
+    // Returns a forward_list of (bbox, mask coefficients) for every proposal whose best class score reaches score_threshold.
+    int class_index = 0;
+    std::forward_list<std::pair<ma_bbox_t, xt::xarray<float>>> decodings;
+
+    int instance_index = 0;  // row into `scores`; keeps counting across all box tensors
+    float confidence   = 0.0;
+    std::string label;  // never used below
+
+    // Box distribution to distance
+    auto regression_distance = xt::reshape_view(xt::arange(0, regression_length + 1), {1, 1, regression_length + 1});
+
+    for (uint i = 0; i < raw_boxes_outputs.size(); ++i) {
+        // Boxes setup
+        float32_t qp_scale = raw_boxes_outputs[i].quant_param.scale;
+        float32_t qp_zp    = raw_boxes_outputs[i].quant_param.zero_point;
+
+        std::vector<size_t> output_b_shape = {(size_t)raw_boxes_outputs[i].shape.dims[1], (size_t)raw_boxes_outputs[i].shape.dims[2], (size_t)raw_boxes_outputs[i].shape.dims[3]};
+        auto output_b                      = xt::adapt(raw_boxes_outputs[i].data.u8, raw_boxes_outputs[i].size, xt::no_ownership(), output_b_shape);  // wraps the raw buffer without taking ownership
+
+        int num_proposals    = output_b.shape(0) * output_b.shape(1);
+        auto output_boxes    = xt::view(output_b, xt::all(), xt::all(), xt::all());
+        auto quantized_boxes = xt::reshape_view(output_boxes, {num_proposals, 4, regression_length + 1});  // per proposal: 4 sides x (regression_length + 1) bins
+
+        auto shape = {quantized_boxes.shape(1), quantized_boxes.shape(2)};
+
+        // Masks setup
+        float32_t qp_scale_masks = raw_masks_outputs[i].quant_param.scale;
+        float32_t qp_zp_masks    = raw_masks_outputs[i].quant_param.zero_point;
+
+        std::vector<size_t> output_m_shape = {(size_t)raw_masks_outputs[i].shape.dims[1], (size_t)raw_masks_outputs[i].shape.dims[2], (size_t)raw_masks_outputs[i].shape.dims[3]};
+        auto output_m                      = xt::adapt(raw_masks_outputs[i].data.u8, raw_masks_outputs[i].size, xt::no_ownership(), output_m_shape);
+
+        int num_proposals_masks = output_m.shape(0) * output_m.shape(1);
+        auto output_masks       = xt::view(output_m, xt::all(), xt::all(), xt::all());
+        auto quantized_masks    = xt::reshape_view(output_masks, {num_proposals_masks, 1, regression_length + 1});  // NOTE(review): mask tensors are picked by a last dim of 32 elsewhere in this file, which does not obviously fit regression_length + 1 columns - confirm
+
+        auto mask_shape = {quantized_masks.shape(1)};  // NOTE(review): shape(1) is 1 after the reshape above, so `mask` holds a single value - confirm this is intended
+
+        // Bbox decoding
+        for (uint j = 0; j < (uint)num_proposals; ++j) {
+            class_index = xt::argmax(xt::row(scores, instance_index))(0);  // best class for this proposal
+            confidence  = scores(instance_index, class_index);
+            instance_index++;
+            if (confidence < score_threshold)
+                continue;
+
+            xt::xarray<float> box(shape);
+            xt::xarray<float> mask(mask_shape);
+
+            ma::math::dequantizeValues2D<uint8_t>(box, j, quantized_boxes, box.shape(0), box.shape(1), qp_scale, qp_zp);
+            ma::math::softmax2D(box.data(), box.shape(0), box.shape(1));
+
+            ma::math::dequantizeValues1D<uint8_t>(mask, j, quantized_masks, mask.shape(0), qp_scale_masks, qp_zp_masks);
+
+            auto box_distance                   = box * regression_distance;  // weight each bin by its index
+            xt::xarray<float> reduced_distances = xt::sum(box_distance, {2});  // sum over the bin axis: one distance per side
+            auto strided_distances              = reduced_distances * strides[i];
+
+            using namespace xt::placeholders;
+            auto distance_view1 = xt::view(strided_distances, xt::all(), xt::range(_, 2)) * -1;  // first two distances are subtracted from the center
+            auto distance_view2 = xt::view(strided_distances, xt::all(), xt::range(2, _));  // last two are added
+            auto distance_view  = xt::concatenate(xt::xtuple(distance_view1, distance_view2), 1);
+            auto decoded_box    = centers[i] + distance_view;
+
+            ma_bbox_t bbox;
+            auto x_min = decoded_box(j, 0) / network_dims[0];  // normalized by network_dims
+            auto y_min = decoded_box(j, 1) / network_dims[1];
+            auto w     = (decoded_box(j, 2) - decoded_box(j, 0)) / network_dims[0];
+            auto h     = (decoded_box(j, 3) - decoded_box(j, 1)) / network_dims[1];
+
+            bbox.x      = x_min + (w / 2);  // stored as box center
+            bbox.y      = y_min + (h / 2);
+            bbox.w      = w;
+            bbox.h      = h;
+            bbox.score  = confidence;
+            bbox.target = class_index;
+
+            decodings.emplace_front(std::make_pair(bbox, mask));
+        }
+    }
+
+    return decodings;
+}
+
+
+static xt::xarray<float> dot(xt::xarray<float> mask, xt::xarray<float> reshaped_proto, size_t proto_height, size_t proto_width, size_t mask_num = 32) {
+    // Weighted sum of prototype planes: result(i, j) = sum over k < mask_num of mask(k) * reshaped_proto(k, i, j).
+    auto shape = {proto_height, proto_width};
+    xt::xarray<float> mask_product(shape);
+    std::fill(mask_product.begin(), mask_product.end(), 0.0f);  // the shape-only constructor leaves the storage uninitialized
+    for (size_t i = 0; i < mask_product.shape(0); i++) {
+        for (size_t j = 0; j < mask_product.shape(1); j++) {
+            for (size_t k = 0; k < mask_num; k++) {
+                mask_product(i, j) += mask(k) * reshaped_proto(k, i, j);
+            }
+        }
+    }
+    return mask_product;
+}
+
+ma_err_t YoloV8SegHailo::postprocess() {
+    // Decodes the engine outputs into results_: candidate boxes, NMS, then one bit-packed mask per surviving box.
+    // TODO: could be optimized
+    boxes_scores_masks_mask_matrix_ = getBoxesScoresMasks(outputs_, classes_);
+    std::forward_list<std::pair<ma_bbox_t, xt::xarray<float>>> decodings;
+
+    switch (route_) {
+        case 1023:  // bits 0..9 set, i.e. all ten outputs are U8
+            decodings = decodeBoxesAndExtractMasks<uint8_t>(
+                boxes_scores_masks_mask_matrix_.boxes, boxes_scores_masks_mask_matrix_.masks, boxes_scores_masks_mask_matrix_.scores, network_dims_, strides_, centers_, 15, threshold_score_);
+            break;
+        default:
+            return MA_ENOTSUP;
+    }
+
+    nms(decodings, threshold_nms_, true);
+
+    xt::xarray<float> proto = boxes_scores_masks_mask_matrix_.proto_data;
+    int mask_height         = static_cast<int>(proto.shape(0));
+    int mask_width          = static_cast<int>(proto.shape(1));
+    int mask_features       = static_cast<int>(proto.shape(2));
+    auto reshaped_proto     = xt::reshape_view(xt::transpose(xt::reshape_view(proto, {-1, mask_features}), {1, 0}), {-1, mask_height, mask_width});  // (H, W, C) -> (C, H, W)
+
+    for (const auto& [bbox, curr_mask] : decodings) {
+        ma_segm2f_t segm;
+        segm.box = bbox;
+
+        auto mask_product = dot(curr_mask, reshaped_proto, reshaped_proto.shape(1), reshaped_proto.shape(2), curr_mask.shape(0));
+        for (auto& v : mask_product) {
+            v = ma::math::sigmoid(v);
+        }
+        // Box corners in prototype-grid cells, clamped to the grid so the indexing below stays in bounds.
+        int x1 = std::max(0, static_cast<int>((bbox.x - bbox.w / 2) * mask_width));
+        int y1 = std::max(0, static_cast<int>((bbox.y - bbox.h / 2) * mask_height));
+        int x2 = std::min(mask_width, static_cast<int>((bbox.x + bbox.w / 2) * mask_width));
+        int y2 = std::min(mask_height, static_cast<int>((bbox.y + bbox.h / 2) * mask_height));
+
+        segm.mask.width  = mask_width;
+        segm.mask.height = mask_height;
+        auto sz = mask_width * mask_height;
+        segm.mask.data.resize(static_cast<size_t>(std::ceil(static_cast<float>(sz) / 8.f)), 0);  // bitwise
+        // One bit per grid cell, row-major, least significant bit first within each byte.
+        for (int i = y1; i < y2; ++i) {
+            for (int j = x1; j < x2; ++j) {
+                if (mask_product(i, j) > 0.5) {
+                    const int bit = i * mask_width + j;  // flat cell index; the row index alone would only ever touch the first bytes
+                    segm.mask.data[bit / 8] |= 1 << (bit % 8);
+                }
+            }
+        }
+        results_.emplace_front(std::move(segm));
+    }
+
+    return MA_OK;
+}
+
+}  // namespace ma::model
+
+#endif
\ No newline at end of file
diff --git a/src/components/sscma-micro/sscma/core/model/ma_model_yolov8_seg_hailo.h b/src/components/sscma-micro/sscma/core/model/ma_model_yolov8_seg_hailo.h
new file mode 100644
index 0000000..bc878b9
--- /dev/null
+++ b/src/components/sscma-micro/sscma/core/model/ma_model_yolov8_seg_hailo.h
@@ -0,0 +1,55 @@
+#ifndef _MA_MODEL_YOLOV8_SEG_HAILO_H_
+#define _MA_MODEL_YOLOV8_SEG_HAILO_H_
+
+#include "ma_model_segmentor.h"
+
+#if MA_USE_ENGINE_HAILO
+
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+#include <xtensor/xtensor.hpp>
+#include <xtensor/xarray.hpp>
+
+namespace ma::model {
+
+namespace _internal {
+
+struct Quadruple {  // bundle built by getBoxesScoresMasks() in the matching .cpp
+    std::vector<ma_tensor_t> boxes;  // box tensors, copied as-is (one per group of three outputs)
+    xt::xarray<float> scores;  // dequantized scores, one row per proposal and one column per class
+    std::vector<ma_tensor_t> masks;  // mask tensors, copied as-is (one per group of three outputs)
+    xt::xarray<float> proto_data;  // dequantized prototype tensor
+};
+
+}  // namespace _internal
+
+class YoloV8SegHailo : public Segmentor {  // YOLOv8 segmentation model wrapper; only compiled when MA_USE_ENGINE_HAILO is set
+private:
+    std::vector<xt::xarray<double>> centers_;  // produced by generateAnchorMatrix in the constructor; indexed per box tensor when decoding
+    static std::vector<int> strides_;  // shared by all instances; defined in the .cpp
+    std::vector<int> network_dims_;  // {width, height} of the model input
+    std::vector<ma_tensor_t> outputs_;  // engine output tensors after the constructor's reordering
+    _internal::Quadruple boxes_scores_masks_mask_matrix_;  // intermediate data written by postprocess()
+    int classes_ = 0;  // read from the last dimension of one output tensor in the constructor
+    int32_t route_ = 0;  // bitmask of output tensor element types, filled in by the constructor
+
+protected:
+    ma_err_t postprocess();  // NOTE(review): presumably overrides a Segmentor hook - confirm and consider `override`
+
+public:
+    YoloV8SegHailo(Engine* engine);
+    ~YoloV8SegHailo();
+
+    static bool isValid(Engine* engine);  // checks whether the engine's input/output tensors have the layout this class expects
+
+    static const char* getTag();  // constant tag string for this model class
+};
+
+}  // namespace ma::model
+
+#endif
+
+#endif  // _MA_MODEL_YOLOV8_SEG_HAILO_H_
diff --git a/src/components/sscma-micro/sscma/core/utils/ma_nms.cpp b/src/components/sscma-micro/sscma/core/utils/ma_nms.cpp
index 8246b97..1cf655b 100644
--- a/src/components/sscma-micro/sscma/core/utils/ma_nms.cpp
+++ b/src/components/sscma-micro/sscma/core/utils/ma_nms.cpp
@@ -9,22 +9,6 @@
 
 namespace ma::utils {
 
-template <typename T, std::enable_if_t<std::is_base_of_v<ma_bbox_t, T>, bool> = true>
-static inline float compute_iou(const T& box1, const T& box2) {
-    const float x1    = std::max(box1.x, box2.x);
-    const float y1    = std::max(box1.y, box2.y);
-    const float x2    = std::min(box1.x + box1.w, box2.x + box2.w);
-    const float y2    = std::min(box1.y + box1.h, box2.y + box2.h);
-    const float w     = std::max(0.0f, x2 - x1);
-    const float h     = std::max(0.0f, y2 - y1);
-    const float inter = w * h;
-    const float d     = box1.w * box1.h + box2.w * box2.h - inter;
-    if (std::abs(d) < std::numeric_limits<float>::epsilon()) [[unlikely]] {
-        return 0;
-    }
-    return inter / d;
-}
-
 template <typename Container, typename T = typename Container::value_type>
 static constexpr void nms_impl(Container& bboxes, float threshold_iou, float threshold_score, bool soft_nms, bool multi_target) {
     if constexpr (std::is_same_v<Container, std::forward_list<T>>) {
@@ -54,7 +38,7 @@ static constexpr void nms_impl(Container& bboxes, float threshold_iou, float thr
         }
     }
 
-    if constexpr (std::is_same_v<Container, std::forward_list<T>>) {
+    if constexpr (std::is_same<Container, std::forward_list<T>>::value) {
         bboxes.remove_if([](const auto& box) { return box.score == 0; });
     } else {
         bboxes.erase(std::remove_if(bboxes.begin(), bboxes.end(), [](const auto& box) { return box.score == 0; }), bboxes.end());
diff --git a/src/components/sscma-micro/sscma/core/utils/ma_nms.h b/src/components/sscma-micro/sscma/core/utils/ma_nms.h
index cc4d52e..8a152c3 100644
--- a/src/components/sscma-micro/sscma/core/utils/ma_nms.h
+++ b/src/components/sscma-micro/sscma/core/utils/ma_nms.h
@@ -2,8 +2,10 @@
 #define _MA_NMS_H_
 
 #include <algorithm>
+#include <cmath>
 #include <forward_list>
 #include <iterator>
+#include <type_traits>
 
 #include "../ma_types.h"
 
@@ -11,6 +13,22 @@ namespace ma::utils {
 
 // skip use of template since it is not allowed
 
+template <typename T, std::enable_if_t<std::is_base_of<ma_bbox_t, T>::value, bool> = true>  // enabled only for ma_bbox_t and types derived from it
+inline float compute_iou(const T& box1, const T& box2) {  // intersection-over-union of two boxes
+    const float x1    = std::max(box1.x, box2.x);  // (x, y) is used as the minimum corner and (x + w, y + h) as the maximum
+    const float y1    = std::max(box1.y, box2.y);
+    const float x2    = std::min(box1.x + box1.w, box2.x + box2.w);
+    const float y2    = std::min(box1.y + box1.h, box2.y + box2.h);
+    const float w     = std::max(0.0f, x2 - x1);  // zero when the boxes do not overlap horizontally
+    const float h     = std::max(0.0f, y2 - y1);  // zero when the boxes do not overlap vertically
+    const float inter = w * h;
+    const float d     = box1.w * box1.h + box2.w * box2.h - inter;  // union area
+    if (std::abs(d) < std::numeric_limits<float>::epsilon()) [[unlikely]] {  // near-zero union: avoid dividing by it
+        return 0;
+    }
+    return inter / d;
+}
+
 void nms(std::forward_list<ma_bbox_t>& bboxes, float threshold_iou, float threshold_score, bool soft_nms, bool multi_target);
 
 void nms(std::forward_list<ma_bbox_ext_t>& bboxes, float threshold_iou, float threshold_score, bool soft_nms, bool multi_target);