Skip to content

Commit

Permalink
feat(cuda): use generic for example
Browse files Browse the repository at this point in the history
  • Loading branch information
pplmx committed Sep 11, 2024
1 parent a31beb6 commit 3f65611
Show file tree
Hide file tree
Showing 5 changed files with 159 additions and 68 deletions.
21 changes: 19 additions & 2 deletions template/cuda/{{cookiecutter.project_slug}}/include/matrix_add.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,22 @@

#include <cuda_runtime.h>

__global__ void addMatricesKernel(const float* A, const float* B, float* C, int rows, int cols);
void addMatricesOnGPU(const float* A, const float* B, float* C, int rows, int cols);
namespace cuda_kernel {

// CUDA kernel for matrix addition
template <typename T>
__global__ void addMatricesKernel(const T* matrixA, const T* matrixB, T* resultMatrix, int numRows, int numCols);

// Debug kernel for printing matrices (if necessary)
template <typename T>
__global__ void printDebugInfo(const T* matrixA, const T* matrixB, T* resultMatrix);

} // namespace cuda_kernel

// Function to perform matrix addition on the GPU
template <typename T>
void addMatricesOnGPU(const T* hostMatrixA, const T* hostMatrixB, T* hostResultMatrix, int numRows, int numCols);

// Explicit instantiation declarations for addMatricesOnGPU
extern template void addMatricesOnGPU<float>(const float*, const float*, float*, int, int);
extern template void addMatricesOnGPU<double>(const double*, const double*, double*, int, int);
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,11 @@
#include <cuda_runtime.h>
#include <cublas_v2.h>

void multiplyMatricesOnGPU(const float* A, const float* B, float* C, int rowsA, int colsA, int colsB);
// Function to perform matrix multiplication on the GPU using cuBLAS
template <typename T>
void multiplyMatricesOnGPU(const T* hostMatrixA, const T* hostMatrixB, T* hostResultMatrix,
int numRowsA, int numColsA, int numColsB);

// Explicit instantiation declarations for multiplyMatricesOnGPU
extern template void multiplyMatricesOnGPU<float>(const float*, const float*, float*, int, int, int);
extern template void multiplyMatricesOnGPU<double>(const double*, const double*, double*, int, int, int);
57 changes: 34 additions & 23 deletions template/cuda/{{cookiecutter.project_slug}}/src/matrix_add.cu
Original file line number Diff line number Diff line change
@@ -1,31 +1,36 @@
#include "cuda_utils.h"
#include "matrix_add.h"

// CUDA kernel for adding two matrices element-wise
__global__ void addMatricesKernel(const float* matrixA, const float* matrixB, float* resultMatrix, int numRows, int numCols) {
// Calculate the global row and column indices for this thread
int rowIndex = blockIdx.y * blockDim.y + threadIdx.y;
int colIndex = blockIdx.x * blockDim.x + threadIdx.x;

// Check if this thread is within the matrix bounds
if (rowIndex < numRows && colIndex < numCols) {
// Calculate the linear index for the current element
int elementIndex = rowIndex * numCols + colIndex;
// Perform element-wise addition
resultMatrix[elementIndex] = matrixA[elementIndex] + matrixB[elementIndex];
namespace cuda_kernel {

template <typename T>
__global__ void addMatricesKernel(const T* matrixA, const T* matrixB, T* resultMatrix, int numRows, int numCols) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;

if (row < numRows && col < numCols) {
int index = row * numCols + col;
resultMatrix[index] = matrixA[index] + matrixB[index];
}
}

template <typename T>
__global__ void printDebugInfo(const T* matrixA, const T* matrixB, T* resultMatrix) {
if (threadIdx.x == 0 && blockIdx.x == 0) {
printf("GPU matrixA=%p\n", matrixA);
printf("GPU matrixB=%p\n", matrixB);
printf("GPU resultMatrix=%p\n", resultMatrix);
}
}

// Host function to set up and execute matrix addition on GPU
void addMatricesOnGPU(const float* hostMatrixA, const float* hostMatrixB, float* hostResultMatrix, int numRows, int numCols) {
// Calculate total size of the matrices in bytes
size_t matrixSizeBytes = numRows * numCols * sizeof(float);
} // namespace cuda_kernel

template <typename T>
void addMatricesOnGPU(const T* hostMatrixA, const T* hostMatrixB, T* hostResultMatrix, int numRows, int numCols) {
size_t matrixSizeBytes = numRows * numCols * sizeof(T);

// Declare pointers for device (GPU) memory
float* deviceMatrixA;
float* deviceMatrixB;
float* deviceResultMatrix;
T *deviceMatrixA, *deviceMatrixB, *deviceResultMatrix;

// Allocate memory on the GPU
CUDA_CHECK(cudaMalloc(&deviceMatrixA, matrixSizeBytes));
CUDA_CHECK(cudaMalloc(&deviceMatrixB, matrixSizeBytes));
CUDA_CHECK(cudaMalloc(&deviceResultMatrix, matrixSizeBytes));
Expand All @@ -39,8 +44,10 @@ void addMatricesOnGPU(const float* hostMatrixA, const float* hostMatrixB, float*
dim3 numBlocks((numCols + threadsPerBlock.x - 1) / threadsPerBlock.x,
(numRows + threadsPerBlock.y - 1) / threadsPerBlock.y);

// Launch the CUDA kernel
addMatricesKernel<<<numBlocks, threadsPerBlock>>>(deviceMatrixA, deviceMatrixB, deviceResultMatrix, numRows, numCols);
cuda_kernel::addMatricesKernel<<<numBlocks, threadsPerBlock>>>(
deviceMatrixA, deviceMatrixB, deviceResultMatrix, numRows, numCols);

cuda_kernel::printDebugInfo<<<1, 1>>>(deviceMatrixA, deviceMatrixB, deviceResultMatrix);

// Check for errors
CUDA_CHECK(cudaGetLastError());
Expand All @@ -56,3 +63,7 @@ void addMatricesOnGPU(const float* hostMatrixA, const float* hostMatrixB, float*
CUDA_CHECK(cudaFree(deviceMatrixB));
CUDA_CHECK(cudaFree(deviceResultMatrix));
}

// Explicit instantiations
template void addMatricesOnGPU<float>(const float*, const float*, float*, int, int);
template void addMatricesOnGPU<double>(const double*, const double*, double*, int, int);
61 changes: 37 additions & 24 deletions template/cuda/{{cookiecutter.project_slug}}/src/matrix_mult.cu
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
#include "cuda_utils.h"
#include "matrix_mult.h"

// Function to perform matrix multiplication on GPU using cuBLAS
void multiplyMatricesOnGPU(const float* hostMatrixA, const float* hostMatrixB, float* hostResultMatrix,
template <typename T>
void multiplyMatricesOnGPU(const T* hostMatrixA, const T* hostMatrixB, T* hostResultMatrix,
int numRowsA, int numColsA, int numColsB) {
// Calculate sizes in bytes for each matrix
size_t byteSizeA = numRowsA * numColsA * sizeof(float);
size_t byteSizeB = numColsA * numColsB * sizeof(float);
size_t byteSizeC = numRowsA * numColsB * sizeof(float);
size_t byteSizeA = numRowsA * numColsA * sizeof(T);
size_t byteSizeB = numColsA * numColsB * sizeof(T);
size_t byteSizeC = numRowsA * numColsB * sizeof(T);

// Declare pointers for device (GPU) memory
float *deviceMatrixA, *deviceMatrixB, *deviceResultMatrix;
T *deviceMatrixA, *deviceMatrixB, *deviceResultMatrix;

// Allocate memory on the GPU
CUDA_CHECK(cudaMalloc(&deviceMatrixA, byteSizeA));
Expand All @@ -20,30 +19,44 @@ void multiplyMatricesOnGPU(const float* hostMatrixA, const float* hostMatrixB, f
CUDA_CHECK(cudaMemcpy(deviceMatrixA, hostMatrixA, byteSizeA, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(deviceMatrixB, hostMatrixB, byteSizeB, cudaMemcpyHostToDevice));

// Create cuBLAS handle
cublasHandle_t cublasHandle;
CUBLAS_CHECK(cublasCreate(&cublasHandle));

// Set up parameters for cublasSgemm
const float alpha = 1.0f;
const float beta = 0.0f;
const T alpha = 1.0;
const T beta = 0.0;

// Perform matrix multiplication using cuBLAS
CUBLAS_CHECK(cublasSgemm(cublasHandle,
CUBLAS_OP_N, CUBLAS_OP_N,
numColsB, numRowsA, numColsA,
&alpha,
deviceMatrixB, numColsB,
deviceMatrixA, numColsA,
&beta,
deviceResultMatrix, numColsB));

// Copy the result back to host memory
if constexpr (std::is_same_v<T, float>) {
CUBLAS_CHECK(cublasSgemm(cublasHandle,
CUBLAS_OP_N, CUBLAS_OP_N,
numColsB, numRowsA, numColsA,
&alpha,
deviceMatrixB, numColsB,
deviceMatrixA, numColsA,
&beta,
deviceResultMatrix, numColsB));
} else if constexpr (std::is_same_v<T, double>) {
CUBLAS_CHECK(cublasDgemm(cublasHandle,
CUBLAS_OP_N, CUBLAS_OP_N,
numColsB, numRowsA, numColsA,
&alpha,
deviceMatrixB, numColsB,
deviceMatrixA, numColsA,
&beta,
deviceResultMatrix, numColsB));
} else {
static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
"Only float and double types are supported");
}

CUDA_CHECK(cudaMemcpy(hostResultMatrix, deviceResultMatrix, byteSizeC, cudaMemcpyDeviceToHost));

// Clean up: Free GPU memory and destroy cuBLAS handle
CUBLAS_CHECK(cublasDestroy(cublasHandle));
CUDA_CHECK(cudaFree(deviceMatrixA));
CUDA_CHECK(cudaFree(deviceMatrixB));
CUDA_CHECK(cudaFree(deviceResultMatrix));
CUBLAS_CHECK(cublasDestroy(cublasHandle));
}

// Explicit instantiations
template void multiplyMatricesOnGPU<float>(const float*, const float*, float*, int, int, int);
template void multiplyMatricesOnGPU<double>(const double*, const double*, double*, int, int, int);
79 changes: 61 additions & 18 deletions template/cuda/{{cookiecutter.project_slug}}/tests/test_matrix.cpp
Original file line number Diff line number Diff line change
@@ -1,42 +1,85 @@
#include <gtest/gtest.h>
#include "matrix_add.h"
#include "matrix_mult.h"
#include <vector>

template <typename T>
class MatrixOperationsTest : public ::testing::Test {
protected:
static constexpr int kMatrixSize = 2;
static constexpr int kMatrixElements = kMatrixSize * kMatrixSize;

float matrixA[kMatrixElements];
float matrixB[kMatrixElements];
float resultMatrix[kMatrixElements];
std::vector<T> matrixA;
std::vector<T> matrixB;
std::vector<T> resultMatrix;

void SetUp() override {
// Initialize matrices A and B with test data
float dataA[kMatrixElements] = {1, 2, 3, 4};
float dataB[kMatrixElements] = {5, 6, 7, 8};
std::copy(std::begin(dataA), std::end(dataA), std::begin(matrixA));
std::copy(std::begin(dataB), std::end(dataB), std::begin(matrixB));
matrixA = {1, 2, 3, 4};
matrixB = {5, 6, 7, 8};
resultMatrix.resize(kMatrixElements);
}

void verifyResult(const float expected[kMatrixElements]) {
for (int i = 0; i < kMatrixElements; i++) {
EXPECT_FLOAT_EQ(resultMatrix[i], expected[i])
void verifyResult(const std::vector<T>& expected) {
ASSERT_EQ(resultMatrix.size(), expected.size());
for (size_t i = 0; i < expected.size(); i++) {
EXPECT_NEAR(resultMatrix[i], expected[i], 1e-5)
<< "Mismatch at index " << i;
}
}
};

TEST_F(MatrixOperationsTest, AdditionTest) {
addMatricesOnGPU(matrixA, matrixB, resultMatrix, kMatrixSize, kMatrixSize);
using TestTypes = ::testing::Types<float, double>;
TYPED_TEST_SUITE(MatrixOperationsTest, TestTypes);

float expectedSum[kMatrixElements] = {6, 8, 10, 12};
verifyResult(expectedSum);
TYPED_TEST(MatrixOperationsTest, AdditionTest) {
addMatricesOnGPU(this->matrixA.data(), this->matrixB.data(), this->resultMatrix.data(),
this->kMatrixSize, this->kMatrixSize);

std::vector<TypeParam> expectedSum = {6, 8, 10, 12};
this->verifyResult(expectedSum);
}

TYPED_TEST(MatrixOperationsTest, MultiplicationTest) {
multiplyMatricesOnGPU(this->matrixA.data(), this->matrixB.data(), this->resultMatrix.data(),
this->kMatrixSize, this->kMatrixSize, this->kMatrixSize);

std::vector<TypeParam> expectedProduct = {19, 22, 43, 50};
this->verifyResult(expectedProduct);
}

TEST_F(MatrixOperationsTest, MultiplicationTest) {
multiplyMatricesOnGPU(matrixA, matrixB, resultMatrix, kMatrixSize, kMatrixSize, kMatrixSize);
TYPED_TEST(MatrixOperationsTest, NonSquareAdditionTest) {
const int rows = 2;
const int cols = 3;
std::vector<TypeParam> nonSquareA = {1, 2, 3, 4, 5, 6};
std::vector<TypeParam> nonSquareB = {7, 8, 9, 10, 11, 12};
std::vector<TypeParam> nonSquareResult(rows * cols);

addMatricesOnGPU(nonSquareA.data(), nonSquareB.data(), nonSquareResult.data(), rows, cols);

float expectedProduct[kMatrixElements] = {19, 22, 43, 50};
verifyResult(expectedProduct);
std::vector<TypeParam> expectedSum = {8, 10, 12, 14, 16, 18};
ASSERT_EQ(nonSquareResult.size(), expectedSum.size());
for (size_t i = 0; i < expectedSum.size(); i++) {
EXPECT_NEAR(nonSquareResult[i], expectedSum[i], 1e-5)
<< "Mismatch at index " << i;
}
}

TYPED_TEST(MatrixOperationsTest, NonSquareMultiplicationTest) {
const int rowsA = 2;
const int colsA = 3;
const int colsB = 2;
std::vector<TypeParam> nonSquareA = {1, 2, 3, 4, 5, 6};
std::vector<TypeParam> nonSquareB = {7, 8, 9, 10, 11, 12};
std::vector<TypeParam> nonSquareResult(rowsA * colsB);

multiplyMatricesOnGPU(nonSquareA.data(), nonSquareB.data(), nonSquareResult.data(),
rowsA, colsA, colsB);

std::vector<TypeParam> expectedProduct = {58, 64, 139, 154};
ASSERT_EQ(nonSquareResult.size(), expectedProduct.size());
for (size_t i = 0; i < expectedProduct.size(); i++) {
EXPECT_NEAR(nonSquareResult[i], expectedProduct[i], 1e-5)
<< "Mismatch at index " << i;
}
}

0 comments on commit 3f65611

Please sign in to comment.