ROCm · causten · Oct 30, 2023 · Oct 26, 2023 · Oct 26, 2023 · Oct 26, 2023
@@ -32,7 +32,7 @@
 #define MIGRAPHX_MIOPEN_ASSERT(x) (assert((x) == miopenStatusSuccess))
 #define MIGRAPHX_HIP_ASSERT(x) (assert((x) == hipSuccess))
 
-inline miopenTensorDescriptor_t make_miopen_tensor(const migraphx::shape& s, bool pack = false)
+inline miopenTensorDescriptor_t make_miopen_tensor(const migraphx::shape& s)
 {
     miopenTensorDescriptor_t t;
     MIGRAPHX_MIOPEN_ASSERT(miopenCreateTensorDescriptor(&t));
@@ -49,23 +49,9 @@ inline miopenTensorDescriptor_t make_miopen_tensor(const migraphx::shape& s, boo
     else if(s.type() == migraphx_shape_int32_type)
         d = miopenInt32;
     else if(s.type() == migraphx_shape_int8_type)
-    {
-        if(pack)
-        {
-            // update the lens and corresponding strides
-            d          = miopenInt8x4;
-            lens[1]    = ((lens[1] + 3) / 4) * 4;
-            strides[0] = strides[1] * lens[1];
-        }
-        else
-        {
-            d = miopenInt8;
-        }
-    }
+        d = miopenInt8;
     else
-    {
         throw("MAKE_TENSOR: unsupported type");
-    }
     miopenSetTensorDescriptor(t, d, s_lens.size(), lens.data(), strides.data());
     return t;
 }

@@ -149,9 +149,6 @@ gpu::gelu
 gpu::gelu_new
 gpu::gemm
 gpu::greater
-gpu::int8_conv_pack
-gpu::int8_gemm_pack_a
-gpu::int8_gemm_pack_b
 gpu::layernorm
 gpu::leaky_relu
 gpu::less

@@ -128,8 +128,6 @@ add_library(migraphx_gpu
     gather.cpp
     gemm_impl.cpp
     hip.cpp
-    int8_conv_pack.cpp
-    int8_gemm_pack.cpp
     kernel.cpp
     lowering.cpp
     logsoftmax.cpp
@@ -140,7 +138,6 @@ add_library(migraphx_gpu
     no_device.cpp
     nonzero.cpp
     pack_args.cpp
-    pack_int8_args.cpp
     prefuse_ops.cpp
     pad.cpp
     perfdb.cpp
@@ -184,18 +181,13 @@ register_migraphx_gpu_ops(hip_
 register_migraphx_gpu_ops(miopen_
     abs
     contiguous
-    int8_conv_pack
     lrn
     pooling
 )
 register_op(migraphx_gpu
     HEADER migraphx/gpu/rnn_variable_seq_lens.hpp
     OPERATORS gpu::hip_rnn_var_sl_shift_sequence gpu::hip_rnn_var_sl_shift_output gpu::hip_rnn_var_sl_last_output
     INCLUDES migraphx/gpu/context.hpp)
-register_op(migraphx_gpu
-    HEADER migraphx/gpu/int8_gemm_pack.hpp
-    OPERATORS gpu::hip_int8_gemm_pack_a gpu::hip_int8_gemm_pack_b
-    INCLUDES migraphx/gpu/context.hpp)
 register_op(migraphx_gpu
     HEADER migraphx/gpu/gemm.hpp
     OPERATORS gpu::rocblas_gemm<op::dot> gpu::rocblas_gemm<op::quant_dot>

@@ -60,35 +60,24 @@ struct miopen_op
 };
 MIGRAPHX_REGISTER_OP(miopen_op);
 
-std::size_t compile_miopen::compile(operation& op, instruction_ref ins, bool format) const
+std::size_t compile_miopen::compile(operation& op, instruction_ref ins) const
 {
-    op.from_value({{"int8_x4_format", format}});
     auto v = op.compile(*ctx, ins->get_shape(), to_shapes(ins->inputs()));
     return v.get<std::size_t>("workspace", 0);
 }
 
 void compile_miopen::apply(module& m) const
 {
     assert(ctx);
-    const bool int8_x4_format = get_int8_x4_format(any_cast<migraphx::gpu::context>(*ctx));
     for(auto ins : iterator_for(m))
     {
         if(ins->name() != "gpu::miopen_op")
             continue;
         auto op        = any_cast<miopen_op>(ins->get_operator()).op;
         std::size_t ws = 0;
-        try
-        {
-            // for the regular convolution and convolution_backwards, this try would always succeed
-            ws = compile(op, ins, int8_x4_format);
-        }
-        catch(migraphx::exception&)
-        {
-            // In case no solver supports the default format, retry using the other format.
-            ws = compile(op, ins, not int8_x4_format);
-        }
-        auto inputs = ins->inputs();
-        auto alloc  = m.insert_instruction(
+        ws             = compile(op, ins);
+        auto inputs    = ins->inputs();
+        auto alloc     = m.insert_instruction(
             ins, make_op("allocate", {{"shape", to_value(shape{shape::int8_type, {ws}})}}));
         inputs.insert(std::prev(inputs.end()), alloc);
 

@@ -108,7 +108,6 @@ void gemm_impl(context& ctx,
                const std::vector<argument>& args,
                T alpha,
                T beta,
-               bool int8_x4_format,
                bool compute_fp32)
 {
     const bool is_3inputs = (args.size() == 4);
@@ -141,11 +140,6 @@ void gemm_impl(context& ctx,
     }
 
     rocblas_gemm_flags flag = rocblas_gemm_flags_none;
-#if ROCBLAS_VERSION_MAJOR < 3
-    if(int8_x4_format)
-        flag = rocblas_gemm_flags_pack_int8x4;
-#endif
-
     auto a_lens = args[0].get_shape().lens();
     auto b_lens = args[1].get_shape().lens();
     output_shape.visit_type([&](auto as) {
@@ -167,10 +161,6 @@ void gemm_impl(context& ctx,
         rocblas_int n   = out_lens[dim_1];
         rocblas_int k   = args[0].get_shape().lens()[dim_1];
         auto to_pointer = [&](auto&& arg) { return as.from(arg.data()); };
-        if(args[0].get_shape().type() == shape::int8_type and (k % 4) != 0 and int8_x4_format)
-        {
-            MIGRAPHX_THROW("ROCBLAS_GEMM: k size of int8 type input must be mutlple of 4!");
-        }
 
         auto num_matrices = std::accumulate(
             out_lens.rbegin() + 2, out_lens.rend(), std::size_t{1}, std::multiplies<std::size_t>());
@@ -256,21 +246,19 @@ void gemm(context& ctx,
           const std::vector<argument>& args,
           float alpha,
           float beta,
-          bool int8_x4_format,
           bool compute_fp32)
 {
-    gemm_impl(ctx, output_shape, args, alpha, beta, int8_x4_format, compute_fp32);
+    gemm_impl(ctx, output_shape, args, alpha, beta, compute_fp32);
 }
 
 void gemm(context& ctx,
           const shape& output_shape,
           const std::vector<argument>& args,
           int32_t alpha,
           int32_t beta,
-          bool int8_x4_format,
           bool compute_fp32)
 {
-    gemm_impl(ctx, output_shape, args, alpha, beta, int8_x4_format, compute_fp32);
+    gemm_impl(ctx, output_shape, args, alpha, beta, compute_fp32);
 }
 
 } // namespace gpu

@@ -42,7 +42,7 @@ struct compile_miopen
     context* ctx = nullptr;
     std::string name() const { return "gpu::compile_miopen"; }
     void apply(module& m) const;
-    std::size_t compile(operation& op, instruction_ref ins, bool format) const;
+    std::size_t compile(operation& op, instruction_ref ins) const;
 };
 
 } // namespace gpu

@@ -57,7 +57,6 @@ template <class Op>
 struct miopen_convolution
 {
     Op op;
-    bool int8_x4_format               = false;
     shared<convolution_descriptor> cd = nullptr;
     miopenConvFwdAlgorithm_t algo{};
 #ifdef MIGRAPHX_HAS_FIND_2_API
@@ -74,7 +73,6 @@ struct miopen_convolution
                     f(self.solution_object, "solution_object"),
 #endif
                     f(self.algo, "algo"),
-                    f(self.int8_x4_format, "int8_x4_format"),
                     f(self.solution_id, "solution_id"));
     }
 
@@ -94,9 +92,9 @@ struct miopen_convolution
     argument
     compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
     {
-        auto x_desc = make_tensor(reshape_if_1d(args[0].get_shape()), int8_x4_format);
-        auto w_desc = make_tensor(reshape_if_1d(args[1].get_shape()), int8_x4_format);
-        auto y_desc = make_tensor(reshape_if_1d(output_shape));
+        auto x_desc                = make_tensor(reshape_if_1d(args[0].get_shape()));
+        auto w_desc                = make_tensor(reshape_if_1d(args[1].get_shape()));
+        auto y_desc                = make_tensor(reshape_if_1d(output_shape));
         auto* miopen_stream_handle = ctx.get_stream().get_miopen();
         auto workspace_size        = args[2].get_shape().bytes();
 
@@ -162,8 +160,8 @@ struct miopen_convolution
     shape find(context& ctx, const shape& output_shape, const std::vector<shape>& inputs)
     {
         shape workspace_shape{};
-        auto x_desc = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
-        auto w_desc = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
+        auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
+        auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
         auto y_desc = make_tensor(reshape_if_1d(output_shape));
 
         auto* miopen_stream_handle = ctx.get_stream().get_miopen();
@@ -179,13 +177,8 @@ struct miopen_convolution
 
         workspace_shape = shape{shape::int8_type, {workspace_size}};
 
-        auto x_shape = inputs[0];
-        auto w_shape = inputs[1];
-        if(int8_x4_format)
-        {
-            x_shape = pack_int8_shape(x_shape);
-            w_shape = pack_int8_shape(w_shape);
-        }
+        const auto& x_shape = inputs[0];
+        const auto& w_shape = inputs[1];
 
 #ifdef MIGRAPHX_HAS_FIND_2_API
         {
@@ -327,8 +320,8 @@ struct miopen_convolution
                                    ": workspace has changed during finalization.");
             }
 
-            auto x_desc = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
-            auto w_desc = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
+            auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
+            auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
             auto y_desc = make_tensor(reshape_if_1d(output_shape));
 
             auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(),
@@ -347,21 +340,6 @@ struct miopen_convolution
     {
         return shapes.size() - 1;
     }
-
-    inline shape pack_int8_shape(const shape& s) const
-    {
-        if(s.type() != shape::int8_type)
-        {
-            return s;
-        }
-
-        auto lens    = s.lens();
-        auto strides = s.strides();
-        lens[1]      = (lens[1] + 3) / 4 * 4;
-        strides[0]   = strides[1] * lens[1];
-
-        return {s.type(), lens, strides};
-    }
 };
 
 } // namespace gpu