A specialized Winograd Conv2d op #971

Draft · wants to merge 14 commits into base: master
17 changes: 17 additions & 0 deletions include/ggml.h
@@ -510,6 +510,8 @@ extern "C" {
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
GGML_OP_LEAKY_RELU,
GGML_OP_WINOGRAD_STAGE0,
GGML_OP_WINOGRAD_STAGE1,

GGML_OP_FLASH_ATTN_EXT,
GGML_OP_FLASH_ATTN_BACK,
@@ -1696,6 +1698,21 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b,
int stride);

GGML_API struct ggml_tensor * ggml_winograd_stage0(
struct ggml_context * ctx,
struct ggml_tensor * a);

GGML_API struct ggml_tensor * ggml_winograd_stage1(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);

GGML_API struct ggml_tensor * ggml_conv_2d_3x3(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);


enum ggml_op_pool {
GGML_OP_POOL_MAX,
9 changes: 9 additions & 0 deletions src/ggml-cuda.cu
@@ -10,6 +10,7 @@
#include "ggml-cuda/clamp.cuh"
#include "ggml-cuda/concat.cuh"
#include "ggml-cuda/conv-transpose-1d.cuh"
#include "ggml-cuda/conv-winograd.cuh"
#include "ggml-cuda/convert.cuh"
#include "ggml-cuda/cpy.cuh"
#include "ggml-cuda/cross-entropy-loss.cuh"
@@ -2331,6 +2332,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_CONV_TRANSPOSE_1D:
ggml_cuda_op_conv_transpose_1d(ctx,dst);
break;
case GGML_OP_WINOGRAD_STAGE0:
ggml_cuda_op_winograd_stage0(ctx, dst);
break;
case GGML_OP_WINOGRAD_STAGE1:
ggml_cuda_op_winograd_stage1(ctx, dst);
break;
case GGML_OP_POOL_2D:
ggml_cuda_op_pool2d(ctx, dst);
break;
@@ -2950,6 +2957,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
}
return false;
} break;
case GGML_OP_WINOGRAD_STAGE0:
case GGML_OP_WINOGRAD_STAGE1:
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
855 changes: 855 additions & 0 deletions src/ggml-cuda/conv-winograd.cu

Large diffs are not rendered by default.

44 changes: 44 additions & 0 deletions src/ggml-cuda/conv-winograd.cuh
@@ -0,0 +1,44 @@
#include "common.cuh"


#define BC 8
#define BN 32
#define BK 64
#define TW 8
#define TH 16
#define BN_p 138

__constant__ int access_f_s[2][32];
__constant__ int access_s[2][32];
__constant__ int tileid[2][32];
Comment on lines +11 to +13
Collaborator: What happens in the case of multiple GPUs? Is the constant memory duplicated across GPUs?
Contributor (author): I am pretty ignorant about multi-GPU. I guess they will be duplicated; I don't have a setup to test. Plus, I think this kernel only works on a single GPU anyway.



// access_f_s
const int aux[2][32] = {
    {0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,
     0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7},
    {8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,
     8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15}
};
// access_s
const int aux2[2][32] = {
    {0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,
     2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3},
    {4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5,
     6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7}
};
// tileid
const int tid[2][32] = {
    {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29,
     0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29},
    {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,
     2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31}
};



void ggml_cuda_op_winograd_stage0(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_winograd_stage1(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
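
Note: the __constant__ tables above are per-device and presumably get filled from the host copies (aux, aux2, tid) before the kernels launch. A minimal sketch of that setup, assuming ggml's CUDA_CHECK macro (the helper name is illustrative; constant memory is per-GPU, so on multi-GPU systems this would have to run once per device, which ties into the review question above):

// illustrative helper (not part of this PR): upload the host lookup tables
// into device constant memory; must run with each target device current
static void winograd_upload_constants(void) {
    CUDA_CHECK(cudaMemcpyToSymbol(access_f_s, aux,  sizeof(aux)));
    CUDA_CHECK(cudaMemcpyToSymbol(access_s,   aux2, sizeof(aux2)));
    CUDA_CHECK(cudaMemcpyToSymbol(tileid,     tid,  sizeof(tid)));
}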

108 changes: 106 additions & 2 deletions src/ggml.c
@@ -2995,6 +2995,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"TIMESTEP_EMBEDDING",
"ARGSORT",
"LEAKY_RELU",
"WINOGRAD_STAGE0",
"WINOGRAD_STAGE1",

"FLASH_ATTN_EXT",
"FLASH_ATTN_BACK",
@@ -3024,7 +3026,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};

static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80");
static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");

static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -3089,6 +3091,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
"leaky_relu(x)",
"winograd_stage0(x)",
"winograd_stage1(x)",

"flash_attn_ext(x)",
"flash_attn_back(x)",
@@ -3118,7 +3122,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};

static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80");
static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");

static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -7166,6 +7170,73 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0(
return result;
}


// ggml_winograd_stage0
// transforms each 3x3 filter into its 4x4 Winograd tile
// a:      [OC, IC, 3, 3]
// result: [IC, 4, 4, OC] (16 transformed coefficients per filter)
struct ggml_tensor * ggml_winograd_stage0(
struct ggml_context * ctx,
struct ggml_tensor * a) {
bool is_node = false;

if (a->grad) {
is_node = true;
}
Comment on lines +7181 to +7185
Collaborator: If #966 is merged first, this will need to be removed (should be very straightforward).
Contributor (author): Looking forward to it...


struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[3], 4, 4, a->ne[2]);

result->op = GGML_OP_WINOGRAD_STAGE0;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;

return result;
}
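
For reference, stage 0 corresponds to the filter transform of the standard F(2x2, 3x3) Winograd algorithm (Lavin & Gray, 2015): each 3x3 filter g is mapped to a 4x4 tile U = G g G^T, which is where the 4x4 dimensions above come from. A minimal CPU sketch of that transform, assuming the standard G matrix (the helper name is illustrative; this PR implements the transform only in CUDA):

static void winograd_f2x3_filter_transform(const float g[3][3], float U[4][4]) {
    // standard F(2x2, 3x3) filter-transform matrix G (4x3)
    static const float G[4][3] = {
        { 1.0f,  0.0f, 0.0f },
        { 0.5f,  0.5f, 0.5f },
        { 0.5f, -0.5f, 0.5f },
        { 0.0f,  0.0f, 1.0f },
    };
    float T[4][3]; // T = G * g
    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 3; j++) {
            T[i][j] = G[i][0]*g[0][j] + G[i][1]*g[1][j] + G[i][2]*g[2][j];
        }
    }
    // U = T * G^T
    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 4; j++) {
            U[i][j] = T[i][0]*G[j][0] + T[i][1]*G[j][1] + T[i][2]*G[j][2];
        }
    }
}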

// ggml_winograd_stage1
// a:      [IC, 4, 4, OC] (transformed filters from ggml_winograd_stage0)
// b:      [1, IC, IH, IW]
// result: [1, OC, OH, OW]
struct ggml_tensor * ggml_winograd_stage1(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
bool is_node = false;
if (a->grad) {
is_node = true;
}

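// stride 1 / padding 1 is the only configuration routed here, so the output spatial dims equal the input's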
int OW = b->ne[0];
int OH = b->ne[1];
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, OW, OH, a->ne[0] /* OC */, 1);

result->op = GGML_OP_WINOGRAD_STAGE1;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;

return result;
}
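
Stage 1 then presumably performs the remaining steps of F(2x2, 3x3): each 4x4 input tile d is transformed as V = B^T d B, multiplied elementwise with the transformed filters and summed over input channels, and the result M is mapped back to a 2x2 output tile via Y = A^T M A. The 855-line CUDA kernel is not rendered here, so this is an assumption based on the tile shapes; for reference, the standard transform matrices are:

// standard F(2x2, 3x3) input/output transform matrices (Lavin & Gray, 2015)
static const float Bt[4][4] = { // V = B^T d B
    { 1,  0, -1,  0 },
    { 0,  1,  1,  0 },
    { 0, -1,  1,  0 },
    { 0,  1,  0, -1 },
};
static const float At[2][4] = { // Y = A^T M A
    { 1,  1,  1,  0 },
    { 0,  1, -1, -1 },
};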

struct ggml_tensor * ggml_conv_2d_3x3(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b){
GGML_ASSERT(a->ne[0] == 3 && a->ne[1] == 3); // kernel should be 3x3
GGML_ASSERT(b->ne[3] == 1); // only works for 1 input image
GGML_ASSERT(b->ne[2] == a->ne[2]); // number of channels must match
if (a->ne[3] % 64 != 0 || a->ne[2] % 8 != 0)          // the Winograd path requires OC to be a multiple of 64
    return ggml_conv_2d(ctx, a, b, 1, 1, 1, 1, 1, 1); // and IC to be a multiple of 8; otherwise fall back to im2col

// struct ggml_tensor* ra = ggml_cont(ctx, ggml_permute(ctx, a, 1, 2, 3, 0)); // [N, OC, OH, OW]
struct ggml_tensor* W = ggml_winograd_stage0(ctx, a);
struct ggml_tensor * result = ggml_winograd_stage1(ctx, W, b);

return result;

}
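
A usage sketch (hypothetical shapes, assuming the usual ggml graph-building flow): with IC a multiple of 8 and OC a multiple of 64 the call takes the Winograd path; otherwise it silently falls back to the im2col-based ggml_conv_2d.

// hypothetical illustration: IC = 8 and OC = 64 satisfy the constraints above
struct ggml_tensor * kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3, 3, 8, 64);  // ne = {KW, KH, IC, OC}
struct ggml_tensor * input  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 56, 80, 8, 1); // ne = {IW, IH, IC, N=1}
struct ggml_tensor * out    = ggml_conv_2d_3x3(ctx, kernel, input);                 // same spatial dims as input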


// ggml_pool_*

static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, float p) {
@@ -15124,6 +15195,23 @@ static void ggml_compute_forward_conv_transpose_1d(
}
}


static void ggml_compute_forward_winograd_stage0(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {

GGML_ASSERT(false && "CPU backend not implemented!");
return;
}

static void ggml_compute_forward_winograd_stage1(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {

GGML_ASSERT(false && "CPU backend not implemented!");
return;
}
Comment on lines +15199 to +15213
Collaborator: If at all possible, a CPU implementation should always be provided, since it serves both as a fallback and as a reference implementation to test other backends against.
Contributor (author): A CPU backend should be added, but I am not sure what benefit it would bring over the current im2col+GEMM version.


// ggml_compute_forward_im2col_f32
// src0: kernel [OC, IC, KH, KW]
// src1: image [N, IC, IH, IW]
@@ -17820,6 +17908,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_conv_transpose_1d(params, tensor);
} break;
case GGML_OP_WINOGRAD_STAGE0:
{
ggml_compute_forward_winograd_stage0(params, tensor);
} break;
case GGML_OP_WINOGRAD_STAGE1:
{
ggml_compute_forward_winograd_stage1(params, tensor);
} break;
case GGML_OP_IM2COL:
{
ggml_compute_forward_im2col(params, tensor);
@@ -18893,6 +18989,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{
GGML_ABORT("fatal error"); // TODO: not implemented
}
case GGML_OP_WINOGRAD_STAGE0:
{
GGML_ABORT("fatal error"); // TODO: not implemented
}
case GGML_OP_WINOGRAD_STAGE1:
{
GGML_ABORT("fatal error"); // TODO: not implemented
}
case GGML_OP_POOL_1D:
{
GGML_ABORT("fatal error"); // TODO: not implemented
9 changes: 9 additions & 0 deletions tests/CMakeLists.txt
@@ -408,6 +408,15 @@ target_link_libraries(${TEST_TARGET} PRIVATE ggml)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")

#
# test-conv2d-winograd
#

set(TEST_TARGET test-conv2d-winograd)
add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")


#
# test-mul-mat
49 changes: 49 additions & 0 deletions tests/test-backend-ops.cpp
@@ -2246,6 +2246,51 @@ struct test_im2col : public test_case {
}
};

// ggml_conv_2d_3x3 (Winograd conv2d)
struct test_conv2d : public test_case {
const ggml_type type_input;
const ggml_type type_kernel;
const ggml_type dst_type;
const std::array<int64_t, 4> ne_input;
const std::array<int64_t, 4> ne_kernel;
// stride
const int s0;
const int s1;
// padding
const int p0;
const int p1;
// dilation
const int d0;
const int d1;

std::string vars() override {
return VARS_TO_STR11(type_input, type_kernel, dst_type, ne_input, ne_kernel, s0, s1, p0, p1, d0, d1);
}

test_conv2d(ggml_type type_input = GGML_TYPE_F32, ggml_type type_kernel = GGML_TYPE_F16, ggml_type dst_type = GGML_TYPE_F32,
std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
std::array<int64_t, 4> ne_kernel = {3, 3, 3, 1}, // [kernel_width, kernel_height, input_channels, 1]
int s0 = 1, int s1 = 1,
int p0 = 1, int p1 = 1,
int d0 = 1, int d1 = 1)
: type_input(type_input), type_kernel(type_kernel), dst_type(dst_type), ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1)
{}

ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
ggml_set_name(input, "input");

ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data());
ggml_set_name(kernel, "kernel");

ggml_tensor * out = ggml_conv_2d_3x3(ctx, kernel, input);
ggml_set_name(out, "out");

return out;
}
};

// GGML_OP_CONCAT
struct test_concat : public test_case {
const ggml_type type;
@@ -3252,6 +3297,10 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));

test_cases.emplace_back(new test_conv2d(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {56, 80, 640, 1}, {3, 3, 640, 960}, 1, 1, 1, 1, 1, 1));
test_cases.emplace_back(new test_conv2d(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {56, 80, 1280, 1}, {3, 3, 1280, 1280}, 1, 1, 1, 1, 1, 1));
test_cases.emplace_back(new test_conv2d(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {56, 80, 1280, 1}, {3, 3, 1280, 2560}, 1, 1, 1, 1, 1, 1));

// sycl backend will limit task global_range < MAX_INT
// test cases for 2D im2col with large input W and H (occurs in stable-diffusion)
// however these cases need to alloc more memory which may fail in some devices (Intel Arc770, etc.)