feat(cuda): use generic for example

x-pt · Sep 11, 2024 · 3f65611 · 3f65611
1 parent a31beb6
commit 3f65611
Show file tree

Hide file tree

Showing 5 changed files with 159 additions and 68 deletions.
diff --git a/template/cuda/{{cookiecutter.project_slug}}/include/matrix_add.h b/template/cuda/{{cookiecutter.project_slug}}/include/matrix_add.h
@@ -2,5 +2,22 @@
 
 #include <cuda_runtime.h>
 
-__global__ void addMatricesKernel(const float* A, const float* B, float* C, int rows, int cols);
-void addMatricesOnGPU(const float* A, const float* B, float* C, int rows, int cols);
+namespace cuda_kernel {
+
+// CUDA kernel for matrix addition
+template <typename T>
+__global__ void addMatricesKernel(const T* matrixA, const T* matrixB, T* resultMatrix, int numRows, int numCols);
+
+// Debug kernel for printing matrices (if necessary)
+template <typename T>
+__global__ void printDebugInfo(const T* matrixA, const T* matrixB, T* resultMatrix);
+
+} // namespace cuda_kernel
+
+// Function to perform matrix addition on the GPU
+template <typename T>
+void addMatricesOnGPU(const T* hostMatrixA, const T* hostMatrixB, T* hostResultMatrix, int numRows, int numCols);
+
+// Explicit instantiation declarations for addMatricesOnGPU
+extern template void addMatricesOnGPU<float>(const float*, const float*, float*, int, int);
+extern template void addMatricesOnGPU<double>(const double*, const double*, double*, int, int);
diff --git a/template/cuda/{{cookiecutter.project_slug}}/include/matrix_mult.h b/template/cuda/{{cookiecutter.project_slug}}/include/matrix_mult.h
@@ -3,4 +3,11 @@
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 
-void multiplyMatricesOnGPU(const float* A, const float* B, float* C, int rowsA, int colsA, int colsB);
+// Function to perform matrix multiplication on the GPU using cuBLAS
+template <typename T>
+void multiplyMatricesOnGPU(const T* hostMatrixA, const T* hostMatrixB, T* hostResultMatrix,
+                           int numRowsA, int numColsA, int numColsB);
+
+// Explicit instantiation declarations for multiplyMatricesOnGPU
+extern template void multiplyMatricesOnGPU<float>(const float*, const float*, float*, int, int, int);
+extern template void multiplyMatricesOnGPU<double>(const double*, const double*, double*, int, int, int);
diff --git a/template/cuda/{{cookiecutter.project_slug}}/src/matrix_add.cu b/template/cuda/{{cookiecutter.project_slug}}/src/matrix_add.cu
@@ -1,31 +1,36 @@
 #include "cuda_utils.h"
+#include "matrix_add.h"
 
-// CUDA kernel for adding two matrices element-wise
-__global__ void addMatricesKernel(const float* matrixA, const float* matrixB, float* resultMatrix, int numRows, int numCols) {
-    // Calculate the global row and column indices for this thread
-    int rowIndex = blockIdx.y * blockDim.y + threadIdx.y;
-    int colIndex = blockIdx.x * blockDim.x + threadIdx.x;
-
-    // Check if this thread is within the matrix bounds
-    if (rowIndex < numRows && colIndex < numCols) {
-        // Calculate the linear index for the current element
-        int elementIndex = rowIndex * numCols + colIndex;
-        // Perform element-wise addition
-        resultMatrix[elementIndex] = matrixA[elementIndex] + matrixB[elementIndex];
+namespace cuda_kernel {
+
+template <typename T>
+__global__ void addMatricesKernel(const T* matrixA, const T* matrixB, T* resultMatrix, int numRows, int numCols) {
+    int row = blockIdx.y * blockDim.y + threadIdx.y;
+    int col = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (row < numRows && col < numCols) {
+        int index = row * numCols + col;
+        resultMatrix[index] = matrixA[index] + matrixB[index];
+    }
+}
+
+template <typename T>
+__global__ void printDebugInfo(const T* matrixA, const T* matrixB, T* resultMatrix) {
+    if (threadIdx.x == 0 && blockIdx.x == 0) {
+        printf("GPU matrixA=%p\n", matrixA);
+        printf("GPU matrixB=%p\n", matrixB);
+        printf("GPU resultMatrix=%p\n", resultMatrix);
     }
 }
 
-// Host function to set up and execute matrix addition on GPU
-void addMatricesOnGPU(const float* hostMatrixA, const float* hostMatrixB, float* hostResultMatrix, int numRows, int numCols) {
-    // Calculate total size of the matrices in bytes
-    size_t matrixSizeBytes = numRows * numCols * sizeof(float);
+} // namespace cuda_kernel
+
+template <typename T>
+void addMatricesOnGPU(const T* hostMatrixA, const T* hostMatrixB, T* hostResultMatrix, int numRows, int numCols) {
+    size_t matrixSizeBytes = numRows * numCols * sizeof(T);
 
-    // Declare pointers for device (GPU) memory
-    float* deviceMatrixA;
-    float* deviceMatrixB;
-    float* deviceResultMatrix;
+    T *deviceMatrixA, *deviceMatrixB, *deviceResultMatrix;
 
-    // Allocate memory on the GPU
     CUDA_CHECK(cudaMalloc(&deviceMatrixA, matrixSizeBytes));
     CUDA_CHECK(cudaMalloc(&deviceMatrixB, matrixSizeBytes));
     CUDA_CHECK(cudaMalloc(&deviceResultMatrix, matrixSizeBytes));
@@ -39,8 +44,10 @@ void addMatricesOnGPU(const float* hostMatrixA, const float* hostMatrixB, float*
     dim3 numBlocks((numCols + threadsPerBlock.x - 1) / threadsPerBlock.x,
                    (numRows + threadsPerBlock.y - 1) / threadsPerBlock.y);
 
-    // Launch the CUDA kernel
-    addMatricesKernel<<<numBlocks, threadsPerBlock>>>(deviceMatrixA, deviceMatrixB, deviceResultMatrix, numRows, numCols);
+    cuda_kernel::addMatricesKernel<<<numBlocks, threadsPerBlock>>>(
+        deviceMatrixA, deviceMatrixB, deviceResultMatrix, numRows, numCols);
+
+    cuda_kernel::printDebugInfo<<<1, 1>>>(deviceMatrixA, deviceMatrixB, deviceResultMatrix);
 
     // Check for errors
     CUDA_CHECK(cudaGetLastError());
@@ -56,3 +63,7 @@ void addMatricesOnGPU(const float* hostMatrixA, const float* hostMatrixB, float*
     CUDA_CHECK(cudaFree(deviceMatrixB));
     CUDA_CHECK(cudaFree(deviceResultMatrix));
 }
+
+// Explicit instantiations
+template void addMatricesOnGPU<float>(const float*, const float*, float*, int, int);
+template void addMatricesOnGPU<double>(const double*, const double*, double*, int, int);
diff --git a/template/cuda/{{cookiecutter.project_slug}}/src/matrix_mult.cu b/template/cuda/{{cookiecutter.project_slug}}/src/matrix_mult.cu
@@ -1,15 +1,14 @@
 #include "cuda_utils.h"
+#include "matrix_mult.h"
 
-// Function to perform matrix multiplication on GPU using cuBLAS
-void multiplyMatricesOnGPU(const float* hostMatrixA, const float* hostMatrixB, float* hostResultMatrix,
+template <typename T>
+void multiplyMatricesOnGPU(const T* hostMatrixA, const T* hostMatrixB, T* hostResultMatrix,
                            int numRowsA, int numColsA, int numColsB) {
-    // Calculate sizes in bytes for each matrix
-    size_t byteSizeA = numRowsA * numColsA * sizeof(float);
-    size_t byteSizeB = numColsA * numColsB * sizeof(float);
-    size_t byteSizeC = numRowsA * numColsB * sizeof(float);
+    size_t byteSizeA = numRowsA * numColsA * sizeof(T);
+    size_t byteSizeB = numColsA * numColsB * sizeof(T);
+    size_t byteSizeC = numRowsA * numColsB * sizeof(T);
 
-    // Declare pointers for device (GPU) memory
-    float *deviceMatrixA, *deviceMatrixB, *deviceResultMatrix;
+    T *deviceMatrixA, *deviceMatrixB, *deviceResultMatrix;
 
     // Allocate memory on the GPU
     CUDA_CHECK(cudaMalloc(&deviceMatrixA, byteSizeA));
@@ -20,30 +19,44 @@ void multiplyMatricesOnGPU(const float* hostMatrixA, const float* hostMatrixB, f
     CUDA_CHECK(cudaMemcpy(deviceMatrixA, hostMatrixA, byteSizeA, cudaMemcpyHostToDevice));
     CUDA_CHECK(cudaMemcpy(deviceMatrixB, hostMatrixB, byteSizeB, cudaMemcpyHostToDevice));
 
-    // Create cuBLAS handle
     cublasHandle_t cublasHandle;
     CUBLAS_CHECK(cublasCreate(&cublasHandle));
 
-    // Set up parameters for cublasSgemm
-    const float alpha = 1.0f;
-    const float beta = 0.0f;
+    const T alpha = 1.0;
+    const T beta = 0.0;
 
     // Perform matrix multiplication using cuBLAS
-    CUBLAS_CHECK(cublasSgemm(cublasHandle,
-                             CUBLAS_OP_N, CUBLAS_OP_N,
-                             numColsB, numRowsA, numColsA,
-                             &alpha,
-                             deviceMatrixB, numColsB,
-                             deviceMatrixA, numColsA,
-                             &beta,
-                             deviceResultMatrix, numColsB));
-
-    // Copy the result back to host memory
+    if constexpr (std::is_same_v<T, float>) {
+        CUBLAS_CHECK(cublasSgemm(cublasHandle,
+                                 CUBLAS_OP_N, CUBLAS_OP_N,
+                                 numColsB, numRowsA, numColsA,
+                                 &alpha,
+                                 deviceMatrixB, numColsB,
+                                 deviceMatrixA, numColsA,
+                                 &beta,
+                                 deviceResultMatrix, numColsB));
+    } else if constexpr (std::is_same_v<T, double>) {
+        CUBLAS_CHECK(cublasDgemm(cublasHandle,
+                                 CUBLAS_OP_N, CUBLAS_OP_N,
+                                 numColsB, numRowsA, numColsA,
+                                 &alpha,
+                                 deviceMatrixB, numColsB,
+                                 deviceMatrixA, numColsA,
+                                 &beta,
+                                 deviceResultMatrix, numColsB));
+    } else {
+        static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
+                      "Only float and double types are supported");
+    }
+
     CUDA_CHECK(cudaMemcpy(hostResultMatrix, deviceResultMatrix, byteSizeC, cudaMemcpyDeviceToHost));
 
-    // Clean up: Free GPU memory and destroy cuBLAS handle
+    CUBLAS_CHECK(cublasDestroy(cublasHandle));
     CUDA_CHECK(cudaFree(deviceMatrixA));
     CUDA_CHECK(cudaFree(deviceMatrixB));
     CUDA_CHECK(cudaFree(deviceResultMatrix));
-    CUBLAS_CHECK(cublasDestroy(cublasHandle));
 }
+
+// Explicit instantiations
+template void multiplyMatricesOnGPU<float>(const float*, const float*, float*, int, int, int);
+template void multiplyMatricesOnGPU<double>(const double*, const double*, double*, int, int, int);
diff --git a/template/cuda/{{cookiecutter.project_slug}}/tests/test_matrix.cpp b/template/cuda/{{cookiecutter.project_slug}}/tests/test_matrix.cpp
@@ -1,42 +1,85 @@
 #include <gtest/gtest.h>
 #include "matrix_add.h"
 #include "matrix_mult.h"
+#include <vector>
 
+template <typename T>
 class MatrixOperationsTest : public ::testing::Test {
 protected:
     static constexpr int kMatrixSize = 2;
     static constexpr int kMatrixElements = kMatrixSize * kMatrixSize;
 
-    float matrixA[kMatrixElements];
-    float matrixB[kMatrixElements];
-    float resultMatrix[kMatrixElements];
+    std::vector<T> matrixA;
+    std::vector<T> matrixB;
+    std::vector<T> resultMatrix;
 
     void SetUp() override {
         // Initialize matrices A and B with test data
-        float dataA[kMatrixElements] = {1, 2, 3, 4};
-        float dataB[kMatrixElements] = {5, 6, 7, 8};
-        std::copy(std::begin(dataA), std::end(dataA), std::begin(matrixA));
-        std::copy(std::begin(dataB), std::end(dataB), std::begin(matrixB));
+        matrixA = {1, 2, 3, 4};
+        matrixB = {5, 6, 7, 8};
+        resultMatrix.resize(kMatrixElements);
     }
 
-    void verifyResult(const float expected[kMatrixElements]) {
-        for (int i = 0; i < kMatrixElements; i++) {
-            EXPECT_FLOAT_EQ(resultMatrix[i], expected[i])
+    void verifyResult(const std::vector<T>& expected) {
+        ASSERT_EQ(resultMatrix.size(), expected.size());
+        for (size_t i = 0; i < expected.size(); i++) {
+            EXPECT_NEAR(resultMatrix[i], expected[i], 1e-5)
                 << "Mismatch at index " << i;
         }
     }
 };
 
-TEST_F(MatrixOperationsTest, AdditionTest) {
-    addMatricesOnGPU(matrixA, matrixB, resultMatrix, kMatrixSize, kMatrixSize);
+using TestTypes = ::testing::Types<float, double>;
+TYPED_TEST_SUITE(MatrixOperationsTest, TestTypes);
 
-    float expectedSum[kMatrixElements] = {6, 8, 10, 12};
-    verifyResult(expectedSum);
+TYPED_TEST(MatrixOperationsTest, AdditionTest) {
+    addMatricesOnGPU(this->matrixA.data(), this->matrixB.data(), this->resultMatrix.data(),
+                     this->kMatrixSize, this->kMatrixSize);
+
+    std::vector<TypeParam> expectedSum = {6, 8, 10, 12};
+    this->verifyResult(expectedSum);
+}
+
+TYPED_TEST(MatrixOperationsTest, MultiplicationTest) {
+    multiplyMatricesOnGPU(this->matrixA.data(), this->matrixB.data(), this->resultMatrix.data(),
+                          this->kMatrixSize, this->kMatrixSize, this->kMatrixSize);
+
+    std::vector<TypeParam> expectedProduct = {19, 22, 43, 50};
+    this->verifyResult(expectedProduct);
 }
 
-TEST_F(MatrixOperationsTest, MultiplicationTest) {
-    multiplyMatricesOnGPU(matrixA, matrixB, resultMatrix, kMatrixSize, kMatrixSize, kMatrixSize);
+TYPED_TEST(MatrixOperationsTest, NonSquareAdditionTest) {
+    const int rows = 2;
+    const int cols = 3;
+    std::vector<TypeParam> nonSquareA = {1, 2, 3, 4, 5, 6};
+    std::vector<TypeParam> nonSquareB = {7, 8, 9, 10, 11, 12};
+    std::vector<TypeParam> nonSquareResult(rows * cols);
+
+    addMatricesOnGPU(nonSquareA.data(), nonSquareB.data(), nonSquareResult.data(), rows, cols);
 
-    float expectedProduct[kMatrixElements] = {19, 22, 43, 50};
-    verifyResult(expectedProduct);
+    std::vector<TypeParam> expectedSum = {8, 10, 12, 14, 16, 18};
+    ASSERT_EQ(nonSquareResult.size(), expectedSum.size());
+    for (size_t i = 0; i < expectedSum.size(); i++) {
+        EXPECT_NEAR(nonSquareResult[i], expectedSum[i], 1e-5)
+            << "Mismatch at index " << i;
+    }
+}
+
+TYPED_TEST(MatrixOperationsTest, NonSquareMultiplicationTest) {
+    const int rowsA = 2;
+    const int colsA = 3;
+    const int colsB = 2;
+    std::vector<TypeParam> nonSquareA = {1, 2, 3, 4, 5, 6};
+    std::vector<TypeParam> nonSquareB = {7, 8, 9, 10, 11, 12};
+    std::vector<TypeParam> nonSquareResult(rowsA * colsB);
+
+    multiplyMatricesOnGPU(nonSquareA.data(), nonSquareB.data(), nonSquareResult.data(),
+                          rowsA, colsA, colsB);
+
+    std::vector<TypeParam> expectedProduct = {58, 64, 139, 154};
+    ASSERT_EQ(nonSquareResult.size(), expectedProduct.size());
+    for (size_t i = 0; i < expectedProduct.size(); i++) {
+        EXPECT_NEAR(nonSquareResult[i], expectedProduct[i], 1e-5)
+            << "Mismatch at index " << i;
+    }
 }