flexflow · reyna-abhyankar · Aug 25, 2024 · Aug 27, 2024 · Aug 27, 2024 · Aug 27, 2024
diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt
@@ -8,6 +8,8 @@ file(GLOB_RECURSE SRC
      LIST_DIRECTORIES False
      src/*.cc
      src/cuda/cuda_helper.cu
+     src/cuda/loss_function_kernels.cu
+     src/cuda/optimizer_kernels.cu
      src/cuda/ops/*.cu
      )
 

diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h
@@ -17,6 +17,7 @@ struct ArrayShape {
   ArrayShape(size_t *dims, size_t num_dims);
   ArrayShape(TensorShape const &shape);
   ArrayShape(std::vector<std::size_t> const &);
+  ArrayShape(LegionTensorDims const &);
 
   /**
    * @brief Alias of ArrayShape::num_elements for compatibility with
@@ -42,9 +43,16 @@ struct ArrayShape {
   std::optional<std::size_t> at_maybe(legion_dim_t) const;
   std::optional<std::size_t> at_maybe(ff_dim_t) const;
 
-  ArrayShape
-      sub_shape(std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
-                std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const;
+  ArrayShape sub_shape(legion_dim_t start, ff_dim_t end) const;
+
+  ArrayShape sub_shape(std::optional<ff_dim_t> start,
+                       std::optional<ff_dim_t> end) const;
+
+  ArrayShape sub_shape(std::optional<legion_dim_t> start,
+                       std::optional<legion_dim_t> end) const;
+
+  bool operator==(ArrayShape const &) const;
+  bool operator!=(ArrayShape const &) const;
 
 public:
   LegionTensorDims dims;

diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h
@@ -1,7 +1,8 @@
 #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H
 #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H
 
-#include "device.h"
+#include "kernels/device.h"
+#include "kernels/ff_handle.h"
 
 namespace FlexFlow {
 
@@ -20,7 +21,8 @@ void sgd_nccl_update_task_gpu(ffStream_t,
                               float lr,
                               float momentum,
                               bool nesterov,
-                              float weight_decay PerDeviceFFHandle const &,
+                              float weight_decay,
+                              PerDeviceFFHandle const &,
                               float const *weight_grad_ptr,
                               size_t size,
                               float *weight_ptr,
@@ -32,6 +34,8 @@ void adam_ps_update_task_gpu(ffStream_t,
                              float beta2,
                              float weight_decay,
                              float epsilon,
+                             size_t size,
+                             int num_replicas,
                              float const *weight_grad_ptr,
                              float *adam_m_ptr,
                              float *adam_v_ptr,
@@ -43,6 +47,7 @@ void adam_nccl_update_task_gpu(ffStream_t,
                                float beta2,
                                float weight_decay,
                                float epsilon,
+                               size_t size,
                                PerDeviceFFHandle const &,
                                float const *weight_grad_ptr,
                                float *adam_m_ptr,

diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc
@@ -1,4 +1,5 @@
 #include "kernels/array_shape.h"
+#include "op-attrs/dim_ordered/slice.h"
 #include "utils/containers/product.h"
 
 namespace FlexFlow {
@@ -19,6 +20,9 @@
 ArrayShape::ArrayShape(std::vector<std::size_t> const &input_dims)
     : dims(input_dims) {}
 
+ArrayShape::ArrayShape(LegionTensorDims const &legion_tensor_dims)
+    : dims(legion_tensor_dims) {}
+
 std::size_t ArrayShape::get_volume() const {
   return this->num_elements();
 }
@@ -50,10 +54,20 @@
   return dims.at(legion_dim_from_ff_dim(idx, this->num_dims()));
 }
 
-ArrayShape ArrayShape::sub_shape(
-    std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
-    std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const {
-  NOT_IMPLEMENTED();
+ArrayShape ArrayShape::sub_shape(legion_dim_t start, ff_dim_t end) const {
+  legion_dim_t legion_end = legion_dim_from_ff_dim(end, num_dims());
+  return this->sub_shape(start, legion_end);
+}
+
+ArrayShape ArrayShape::sub_shape(std::optional<ff_dim_t> start,
+                                 std::optional<ff_dim_t> end) const {
+  return ArrayShape{legion_dims_from_ff_dims(
+      slice(ff_ordered_from_legion_ordered(this->dims), start, end))};
+}
+
+ArrayShape ArrayShape::sub_shape(std::optional<legion_dim_t> start,
+                                 std::optional<legion_dim_t> end) const {
+  return ArrayShape{slice(this->dims, start, end)};
 }
 
 std::optional<std::size_t> ArrayShape::at_maybe(legion_dim_t index) const {
@@ -77,6 +91,14 @@
                      dtype};
 }
 
+bool ArrayShape::operator==(ArrayShape const &other) const {
+  return this->dims == other.dims;
+}
+
+bool ArrayShape::operator!=(ArrayShape const &other) const {
+  return this->dims != other.dims;
+}
+
 std::string format_as(ArrayShape const &x) {
   std::ostringstream oss;
   oss << "<ArrayShape";

diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu
@@ -29,13 +29,13 @@ cudaError_t get_legion_stream(cudaStream_t *stream) {
 #error "Unknown device, please make sure if CUDA is enabled"
 #endif
 
-__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) {
+__global__ void scale_kernel(float *ptr, size_t size, float a, float b) {
   CUDA_KERNEL_LOOP(i, size) {
     ptr[i] = (b - a) * ptr[i] + a;
   }
 }
 
-__global__ void ones_kernel(float *ptr, coord_t size) {
+__global__ void ones_kernel(float *ptr, size_t size) {
   CUDA_KERNEL_LOOP(i, size) {
     ptr[i] = 1.0f;
   }

diff --git a/lib/kernels/src/cuda/optimizer_kernel.cu → lib/kernels/src/cuda/optimizer_kernels.cu b/lib/kernels/src/cuda/optimizer_kernel.cu → lib/kernels/src/cuda/optimizer_kernels.cu
@@ -13,6 +13,8 @@
  * limitations under the License.
  */
 
+#include "device.h"
+#include "kernels/nccl.h"
 #include "kernels/optimizer_kernels.h"
 
 namespace FlexFlow {
@@ -40,66 +42,70 @@ __global__ void sgd_update(size_t count,
   }
 }
 
-__host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op,
-                                               float const *w_grad_ptr,
-                                               size_t size,
-                                               int num_replicas,
-                                               float *w_ptr,
-                                               float *v_ptr) {
-  cudaStream_t stream;
+void sgd_ps_update_task_gpu(cudaStream_t stream,
+                            float lr,
+                            float momentum,
+                            bool nesterov,
+                            float weight_decay,
+                            float const *weight_grad_ptr,
+                            size_t size,
+                            int num_replicas,
+                            float *weight_ptr,
+                            float *sgd_v_ptr) {
   checkCUDA(get_legion_stream(&stream));
   // Step 1: Gather gradients in the first replica
   for (int i = 1; i < num_replicas; i++) {
-    float const *src = w_grad_ptr + i * size;
+    float const *src = weight_grad_ptr + i * size;
     apply_add_with_scale<float>
         <<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
-            (float *)w_grad_ptr, src, size, 1.0f);
+            (float *)weight_grad_ptr, src, size, 1.0f);
   }
   // checkCUDA(cudaDeviceSynchronize());
   //  Step 2: SGD update
-  sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
-      size,
-      op->lr,
-      op->weight_decay,
-      op->momentum,
-      op->nesterov,
-      w_grad_ptr,
-      v_ptr,
-      w_ptr);
+  sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(size,
+                                                                lr,
+                                                                weight_decay,
+                                                                momentum,
+                                                                nesterov,
+                                                                weight_grad_ptr,
+                                                                sgd_v_ptr,
+                                                                weight_ptr);
   // checkCUDA(cudaDeviceSynchronize());
 }
 
 #ifdef FF_USE_NCCL
-__host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op,
-                                                 PerDeviceOpState const *meta,
-                                                 float const *w_grad_ptr,
-                                                 size_t size,
-                                                 float *w_ptr,
-                                                 float *v_ptr) {
+void sgd_nccl_update_task_gpu(cudaStream_t stream,
+                              float lr,
+                              float momentum,
+                              bool nesterov,
+                              float weight_decay,
+                              PerDeviceFFHandle const &handle,
+                              float const *weight_grad_ptr,
+                              size_t size,
+                              float *weight_ptr,
+                              float *sgd_v_ptr) {
   // Use NCCL to sync gradients
   // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr);
-  cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
-  checkNCCL(ncclAllReduce(w_grad_ptr,
-                          (float *)w_grad_ptr,
+  checkNCCL(ncclAllReduce(weight_grad_ptr,
+                          (float *)weight_grad_ptr,
                           size,
-                          ncclFloat,
-                          ncclSum,
-                          meta->handle.ncclComm,
+                          ncclDataType_t::ncclFloat,
+                          ncclRedOp_t::ncclSum,
+                          handle.ncclComm,
                           stream));
   // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr);
   // print_tensor<float>((float*)w_grad_ptr, 16, "[After ncclAllReduce]");
 
   // Step 2: SGD update
-  sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
-      size,
-      op->lr,
-      op->weight_decay,
-      op->momentum,
-      op->nesterov,
-      w_grad_ptr,
-      v_ptr,
-      w_ptr);
+  sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(size,
+                                                                lr,
+                                                                weight_decay,
+                                                                momentum,
+                                                                nesterov,
+                                                                weight_grad_ptr,
+                                                                sgd_v_ptr,
+                                                                weight_ptr);
   // checkCUDA(cudaDeviceSynchronize());
 }
 #endif
@@ -144,71 +150,79 @@ __global__ void adam_update(int count,
   }
 }
 
-__host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op,
-                                                float const *w_grad_ptr,
-                                                size_t size,
-                                                int num_replicas,
-                                                float *w_ptr,
-                                                float *v_ptr,
-                                                float *m_ptr) {
-  cudaStream_t stream;
+void adam_ps_update_task_gpu(cudaStream_t stream,
+                             float alpha_t,
+                             float beta1,
+                             float beta2,
+                             float weight_decay,
+                             float epsilon,
+                             size_t size,
+                             int num_replicas,
+                             float const *weight_grad_ptr,
+                             float *adam_m_ptr,
+                             float *adam_v_ptr,
+                             float *weight_ptr) {
   checkCUDA(get_legion_stream(&stream));
   // Step 1: Gather gradients in the first replica
   for (int i = 1; i < num_replicas; i++) {
-    float const *src = w_grad_ptr + i * size;
+    float const *src = weight_grad_ptr + i * size;
     add_kernel<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
-        size, 1.0f, src, (float *)w_grad_ptr);
+        size, 1.0f, src, (float *)weight_grad_ptr);
   }
   // checkCUDA(cudaDeviceSynchronize());
   // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n",
   //         op->alpha, op->alpha_t, op->weight_decay);
   //  Step 2: Adam update
   adam_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
       size,
-      op->alpha_t,
-      op->beta1,
-      op->beta2,
-      op->weight_decay,
-      op->epsilon,
-      w_grad_ptr,
-      m_ptr,
-      v_ptr,
-      w_ptr);
+      alpha_t,
+      beta1,
+      beta2,
+      weight_decay,
+      epsilon,
+      weight_grad_ptr,
+      adam_m_ptr,
+      adam_v_ptr,
+      weight_ptr);
   // checkCUDA(cudaDeviceSynchronize());
 }
 
 #ifdef FF_USE_NCCL
-__host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op,
-                                                  PerDeviceOpState const *meta,
-                                                  float const *w_grad_ptr,
-                                                  size_t size,
-                                                  float *w_ptr,
-                                                  float *v_ptr,
-                                                  float *m_ptr) {
+void adam_nccl_update_task_gpu(cudaStream_t stream,
+                               float alpha_t,
+                               float beta1,
+                               float beta2,
+                               float weight_decay,
+                               float epsilon,
+                               size_t size,
+                               PerDeviceFFHandle const &handle,
+                               float const *weight_grad_ptr,
+                               float *adam_m_ptr,
+                               float *adam_v_ptr,
+                               float *weight_ptr) {
   // Use NCCL to sync gradients
-  cudaStream_t stream;
   checkCUDA(get_legion_stream(&stream));
-  checkNCCL(ncclAllReduce(w_grad_ptr,
-                          (float *)w_grad_ptr,
+  checkNCCL(ncclAllReduce(weight_grad_ptr,
+                          (float *)weight_grad_ptr,
                           size,
-                          ncclFloat,
-                          ncclSum,
-                          meta->handle.ncclComm,
+                          ncclDataType_t::ncclFloat,
+                          ncclRedOp_t::ncclSum,
+                          handle.ncclComm,
                           stream));
   // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n",
   //         op->alpha, op->alpha_t, op->weight_decay);
   //  Step 2: Adam update
   adam_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
       size,
-      op->alpha_t,
-      op->beta1,
-      op->beta2,
-      op->weight_decay,
-      op->epsilon,
-      w_grad_ptr,
-      m_ptr,
-      v_ptr,
-      w_ptr);
+      alpha_t,
+      beta1,
+      beta2,
+      weight_decay,
+      epsilon,
+      weight_grad_ptr,
+      adam_m_ptr,
+      adam_v_ptr,
+      weight_ptr);
   // checkCUDA(cudaDeviceSynchronize());
 }
 #endif