Merge branch 'develop' into onnxruntime-sync-2023-12-22

ROCm · Dec 29, 2023 · 3770c8e · 3770c8e
2 parents 6cefecb + fff12a2
commit 3770c8e
Show file tree

Hide file tree

Showing 9 changed files with 432 additions and 44 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -323,5 +323,5 @@ rocm_create_package(
     MAINTAINER "AMDMIGraphX Maintainer <[email protected]>"
     LDCONFIG
     PTH
-    DEPENDS miopen-hip rocblas hip-rocclr hip-base half
+    DEPENDS miopen-hip rocblas hip-rocclr hip-base half libtbb2
 )
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -35,6 +35,7 @@ add_library(migraphx
     analyze_streams.cpp
     apply_alpha_beta.cpp
     argument.cpp
+    autocast_fp8.cpp
     auto_contiguous.cpp
     common.cpp
     common_dims.cpp

diff --git a/src/autocast_fp8.cpp b/src/autocast_fp8.cpp
@@ -0,0 +1,81 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/autocast_fp8.hpp>
+#include <migraphx/instruction.hpp>
+#include <migraphx/iterator_for.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/program.hpp>
+#include <migraphx/ranges.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+void autocast_fp8_pass::apply(module& m) const
+{
+    std::vector<instruction_ref> remove_parameters;
+    for(auto ins : iterator_for(m))
+    {
+        const auto& ins_name = ins->name();
+        if(ins_name == "@param" and contains(fp8_types, ins->get_shape().type()))
+        {
+            shape::type_t fp8_type    = ins->get_shape().type();
+            migraphx::shape new_shape = ins->get_shape().with_type(target_type);
+            std::string param_name = ins->get_operator().to_value()["parameter"].to<std::string>();
+            m.rename_parameter(ins, param_name + "_old");
+            auto new_param = m.add_parameter(param_name, new_shape);
+            auto new_ins   = m.insert_instruction(
+                ins,
+                migraphx::make_op("convert", {{"target_type", migraphx::to_value(fp8_type)}}),
+                new_param);
+            m.replace_instruction(ins, new_ins);
+            remove_parameters.push_back(ins);
+        }
+
+        if(ins_name == "@return")
+        {
+            std::vector<instruction_ref> inputs = ins->inputs();
+            std::vector<instruction_ref> new_inputs;
+            std::transform(
+                inputs.begin(), inputs.end(), std::back_inserter(new_inputs), [&](auto i) {
+                    if(contains(fp8_types, i->get_shape().type()))
+                    {
+                        return m.insert_instruction(
+                            ins,
+                            migraphx::make_op("convert",
+                                              {{"target_type", migraphx::to_value(target_type)}}),
+                            i);
+                    }
+                    else
+                        return i;
+                });
+            m.replace_return({new_inputs});
+        }
+    }
+    // Remove unused parameters with fp8 type
+    for(const auto& i : remove_parameters)
+        m.remove_instruction(i);
+}
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
diff --git a/src/include/migraphx/autocast_fp8.hpp b/src/include/migraphx/autocast_fp8.hpp
@@ -0,0 +1,49 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_AUTOCAST_FP8_HPP
+#define MIGRAPHX_GUARD_AMDMIGRAPHX_AUTOCAST_FP8_HPP
+
+#include <migraphx/shape.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+struct program;
+struct module;
+
+/**
+This pass will convert model with fp8 input parameter to model with fp32
+input parameter and internally add casts to fp8 for those converted params.*/
+struct MIGRAPHX_EXPORT autocast_fp8_pass
+{
+    std::set<shape::type_t> fp8_types = {migraphx::shape::fp8e4m3fnuz_type};
+    shape::type_t target_type         = migraphx::shape::float_type;
+    std::string name() const { return "autocast_fp8_pass"; }
+    void apply(module& m) const;
+};
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
diff --git a/src/include/migraphx/module.hpp b/src/include/migraphx/module.hpp
@@ -175,6 +175,8 @@ struct MIGRAPHX_EXPORT module
 
     instruction_ref get_parameter(std::string name) const;
 
+    void rename_parameter(instruction_ref ins, const std::string& name);
+
     std::unordered_map<std::string, shape> get_parameter_shapes() const;
 
     bool has_instruction(instruction_ref ins) const;

diff --git a/src/include/migraphx/op/nonmaxsuppression.hpp b/src/include/migraphx/op/nonmaxsuppression.hpp
@@ -38,7 +38,59 @@
 #include <migraphx/check_shapes.hpp>
 #include <migraphx/output_iterator.hpp>
 #include <migraphx/argument.hpp>
+#include <migraphx/par.hpp>
 
+/*
+https://github.com/onnx/onnx/blob/main/docs/Operators.md#NonMaxSuppression
+
+Filter out boxes that have high intersection-over-union (IOU) overlap with previously selected
+boxes. Bounding boxes with score less than score_threshold are removed. Bounding box format is
+indicated by attribute center_point_box. Note that this algorithm is agnostic to where the origin is
+in the coordinate system and more generally is invariant to orthogonal transformations and
+translations of the coordinate system; thus translating or reflections of the coordinate system
+result in the same boxes being selected by the algorithm. The selected_indices output is a set of
+integers indexing into the input collection of bounding boxes representing the selected boxes. The
+bounding box coordinates corresponding to the selected indices can then be obtained using the Gather
+or GatherND operation.
+
+Version
+This version of the operator has been available since version 11 of the default ONNX operator set.
+Other versions of this operator: 10
+
+Attributes
+center_point_box : int (default is 0)
+Integer indicate the format of the box data. The default is 0. 0 - the box data is supplied as [y1,
+x1, y2, x2] where (y1, x1) and (y2, x2) are the coordinates of any diagonal pair of box corners and
+the coordinates can be provided as normalized (i.e., lying in the interval [0, 1]) or absolute.
+Mostly used for TF models. 1 - the box data is supplied as [x_center, y_center, width, height].
+Mostly used for Pytorch models.
+
+Inputs (2 - 5)
+---------------------------------------------------------------------------------------------------------------------
+boxes : tensor(float)
+An input tensor with shape [num_batches, spatial_dimension, 4].
+The single box data format is indicated by center_point_box.
+
+scores : tensor(float)
+An input tensor with shape [num_batches, num_classes, spatial_dimension]
+
+max_output_boxes_per_class (optional) : tensor(int64)
+Integer representing the maximum number of boxes to be selected per batch per class.
+It is a scalar. Default to 0, which means no output.
+
+iou_threshold (optional) : tensor(float)
+Float representing the threshold for deciding whether boxes overlap too much with respect to IOU.
+It is scalar. Value range [0, 1]. Default to 0.
+
+score_threshold (optional) : tensor(flo187Gat)
+Float representing the threshold for deciding when to remove boxes based on score. It is a scalar.
+----------------------------------------------------------------------------------------------------------------------
+Outputs
+selected_indices : tensor(int64)
+selected indices from the boxes tensor. [num_selected_indices, 3],
+the selected index format is [batch_index, class_index, box_index].
+----------------------------------------------------------------------------------------------------------------------
+*/
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace op {
@@ -184,13 +236,20 @@ struct nonmaxsuppression
             result.y = {static_cast<double>(start[0]), static_cast<double>(start[2])};
         }
 
+        result.sort();
+
         return result;
     }
 
     inline bool suppress_by_iou(box b1, box b2, double iou_threshold) const
     {
-        b1.sort();
-        b2.sort();
+        const double area1 = b1.area();
+        const double area2 = b2.area();
+
+        if(area1 <= .0f or area2 <= .0f)
+        {
+            return false;
+        }
 
         box intersection{};
         for(auto i : range(2))
@@ -203,12 +262,10 @@ struct nonmaxsuppression
             }
         }
 
-        const double area1             = b1.area();
-        const double area2             = b2.area();
         const double intersection_area = intersection.area();
         const double union_area        = area1 + area2 - intersection_area;
 
-        if(area1 <= .0f or area2 <= .0f or union_area <= .0f)
+        if(union_area <= .0f)
         {
             return false;
         }
@@ -220,30 +277,42 @@ struct nonmaxsuppression
 
     // filter boxes below score_threshold
     template <class T>
-    std::priority_queue<std::pair<double, int64_t>>
+    std::vector<std::pair<double, int64_t>>
     filter_boxes_by_score(T scores_start, std::size_t num_boxes, double score_threshold) const
     {
-        std::priority_queue<std::pair<double, int64_t>> boxes_heap;
-        auto insert_to_boxes_heap =
-            make_function_output_iterator([&](const auto& x) { boxes_heap.push(x); });
+        std::vector<std::pair<double, int64_t>> boxes_heap;
         int64_t box_idx = 0;
-        transform_if(
-            scores_start,
-            scores_start + num_boxes,
-            insert_to_boxes_heap,
-            [&](auto sc) {
-                box_idx++;
-                return sc >= score_threshold;
-            },
-            [&](auto sc) { return std::make_pair(sc, box_idx - 1); });
+
+        if(score_threshold > 0.0)
+        {
+            transform_if(
+                scores_start,
+                scores_start + num_boxes,
+                std::back_inserter(boxes_heap),
+                [&](auto sc) {
+                    box_idx++;
+                    return sc >= score_threshold;
+                },
+                [&](auto sc) { return std::make_pair(sc, box_idx - 1); });
+        }
+        else
+        { // score is irrelevant, just push into boxes_heap and make a score-index pair
+            std::transform(scores_start,
+                           scores_start + num_boxes,
+                           std::back_inserter(boxes_heap),
+                           [&](auto sc) {
+                               box_idx++;
+                               return std::make_pair(sc, box_idx - 1);
+                           });
+        }
+        par_sort(boxes_heap.begin(), boxes_heap.end(), std::greater<std::pair<double, int64_t>>{});
         return boxes_heap;
     }
 
     template <class Output, class Boxes, class Scores>
     std::size_t compute_nms(Output output,
                             Boxes boxes,
                             Scores scores,
-                            const shape& max_output_shape,
                             std::size_t max_output_boxes_per_class,
                             double iou_threshold,
                             double score_threshold) const
@@ -254,9 +323,7 @@ struct nonmaxsuppression
         const auto num_classes = lens[1];
         const auto num_boxes   = lens[2];
         // boxes of a class with NMS applied [score, index]
-        std::vector<std::pair<double, int64_t>> selected_boxes_inside_class;
         std::vector<int64_t> selected_indices;
-        selected_boxes_inside_class.reserve(max_output_shape.elements());
         // iterate over batches and classes
         shape comp_s{shape::double_type, {num_batches, num_classes}};
         shape_for_each(comp_s, [&](const auto& idx) {
@@ -267,31 +334,33 @@ struct nonmaxsuppression
             // iterator to first value of this batch
             auto batch_boxes_start = boxes.begin() + batch_idx * num_boxes * 4;
             auto boxes_heap = filter_boxes_by_score(scores_start, num_boxes, score_threshold);
-            selected_boxes_inside_class.clear();
+            int64_t selected_boxes_inside_class = 0;
             while(not boxes_heap.empty() &&
-                  selected_boxes_inside_class.size() < max_output_boxes_per_class)
+                  selected_boxes_inside_class < max_output_boxes_per_class)
             {
                 // select next top scorer box and remove any boxes from boxes_heap that exceeds IOU
                 // threshold with the selected box
-                const auto next_top_score = boxes_heap.top();
-                boxes_heap.pop();
-                selected_boxes_inside_class.push_back(next_top_score);
+                const auto next_top_score = boxes_heap.front();
+                auto next_box             = batch_box(batch_boxes_start, next_top_score.second);
+                auto next_box_idx         = next_top_score.second;
+
+                selected_boxes_inside_class++;
                 selected_indices.push_back(batch_idx);
                 selected_indices.push_back(class_idx);
-                selected_indices.push_back(next_top_score.second);
-                std::priority_queue<std::pair<double, int64_t>> remainder_boxes;
-                while(not boxes_heap.empty())
-                {
-                    auto iou_candidate_box = boxes_heap.top();
-                    if(not this->suppress_by_iou(
-                           batch_box(batch_boxes_start, iou_candidate_box.second),
-                           batch_box(batch_boxes_start, next_top_score.second),
-                           iou_threshold))
-                    {
-                        remainder_boxes.push(iou_candidate_box);
-                    }
-                    boxes_heap.pop();
-                }
+                selected_indices.push_back(next_box_idx);
+
+                std::vector<std::pair<double, int64_t>> remainder_boxes(boxes_heap.size());
+
+                auto it = par_copy_if(
+                    boxes_heap.begin() + 1,
+                    boxes_heap.end(),
+                    remainder_boxes.begin(),
+                    [&](auto iou_candidate_box) {
+                        auto iou_box = batch_box(batch_boxes_start, iou_candidate_box.second);
+                        return not this->suppress_by_iou(iou_box, next_box, iou_threshold);
+                    });
+
+                remainder_boxes.resize(it - remainder_boxes.begin());
                 boxes_heap = remainder_boxes;
             }
         });
@@ -320,7 +389,6 @@ struct nonmaxsuppression
                 num_selected = compute_nms(output,
                                            boxes,
                                            scores,
-                                           max_output_shape,
                                            max_output_boxes_per_class,
                                            iou_threshold,
                                            score_threshold);

diff --git a/src/include/migraphx/op/scatternd_op.hpp b/src/include/migraphx/op/scatternd_op.hpp
@@ -128,14 +128,15 @@ struct scatternd_op : op_name<Derived>
                     auto updates_idx = updates_std.multi(i);
                     std::vector<std::size_t> indices_idx(q, 0);
                     std::copy(
-                        updates_idx.begin(), updates_idx.begin() + q - 1, indices_idx.begin());
+                        updates_idx.begin(), updates_idx.begin() + (q - 1), indices_idx.begin());
                     auto index_start = indices.begin() +
                                        indices_shape.index(indices_idx.begin(), indices_idx.end());
                     auto index_end = index_start + k;
 
                     std::vector<std::size_t> out_idx(r, 0);
                     std::copy(index_start, index_end, out_idx.begin());
-                    std::copy(updates_idx.begin() + q - 1, updates_idx.end(), out_idx.begin() + k);
+                    std::copy(
+                        updates_idx.begin() + (q - 1), updates_idx.end(), out_idx.begin() + k);
 
                     self.reduction()(output[dyn_out.computed_shape.index(out_idx)], updates[i]);
                 }