Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
Zixin Zhang committed Sep 21, 2021
1 parent e9cafcc commit 4580221
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 14 deletions.
12 changes: 6 additions & 6 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#include "testing_helpers.hpp"

// The stock test default is an array of size 1 << 8 = 256; SIZE below sets the size actually used
const int SIZE = 1 << 16; // feel free to change the size of array
const int SIZE = 1 << 20; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
Expand Down Expand Up @@ -101,7 +101,7 @@ int main(int argc, char* argv[]) {
printCmpResult(NPOT, b, c);

printf("\n");

#if 0
zeroArray(SIZE, c);
printDesc("work-efficient scan, power-of-two");
StreamCompaction::Efficient::scan(SIZE, c, a);
Expand All @@ -115,12 +115,12 @@ int main(int argc, char* argv[]) {
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);

#endif
zeroArray(SIZE, c);
printDesc("naive scan, power-of-two");
StreamCompaction::Naive::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
// printArray(SIZE, c, true);
printArray(SIZE, c, false);
printCmpResult(SIZE, b, c);

/* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
Expand All @@ -141,14 +141,14 @@ int main(int argc, char* argv[]) {
printDesc("thrust scan, power-of-two");
StreamCompaction::Thrust::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
printArray(SIZE, c, true);
// printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("thrust scan, non-power-of-two");
StreamCompaction::Thrust::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
printArray(NPOT, c, true);
// printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);


Expand Down
4 changes: 2 additions & 2 deletions stream_compaction/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
#include <stdexcept>

/*! Block size used for CUDA kernel launch. */
#define blockSize 1024
#define sectionSize 1024
#define blockSize 256
#define sectionSize 256

#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
// usage: checkCUDAError("a descriptive name of this error")
Expand Down
57 changes: 51 additions & 6 deletions stream_compaction/naive.cu
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,8 @@ namespace StreamCompaction {
}
}



/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
Expand All @@ -157,6 +159,7 @@ namespace StreamCompaction {
int* d_OutputExclusiveData;
int* d_SumArray;
int* d_SumArrayOutput;
int* d_SumArrayAx;

cudaMalloc((void**)&d_InputData, size);
checkCUDAError("cudaMalloc d_InputData failed!");
Expand All @@ -173,6 +176,9 @@ namespace StreamCompaction {
cudaMalloc((void**)&d_SumArrayOutput, sumArraySize);
checkCUDAError("cudaMalloc d_SumArrayOutput failed!");

cudaMalloc((void**)&d_SumArrayAx, sumArraySize);
checkCUDAError("cudaMalloc d_SumArrayOutput failed!");

cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice);

dim3 dimGridArray((n + blockSize - 1) / blockSize, 1, 1);
Expand All @@ -181,20 +187,61 @@ namespace StreamCompaction {
dim3 dimGridSumArray((sumArrayNumEle + blockSize - 1) / blockSize, 1, 1);
dim3 dimBlockSumArray(blockSize, 1, 1);

// for testing
int* sumArray = new int[sumArrayNumEle];
int* sumArrayOutput = new int[sumArrayNumEle];

timer().startGpuTimer();
// First step: compute the scan result for individual sections
// then, store their block sum to sumArray
kernNaiveGPUScanFirstStep << <dimGridArray, dimBlockArray >> > (d_InputData,
d_OutputData, d_SumArray, n);
checkCUDAError("kernNaiveGPUScanFirstStep failed!");

//();
// cudaDeviceSynchronize();

cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost);
checkCUDAError("memCpy back failed!");

cudaMemcpy(sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost);
checkCUDAError("memCpy back failed!");

std::cout << '\n';
for (int i = 0; i < n; i++)
{
std::cout << odata[i] << ' ';
if ((i + 1) % 8 == 0) {
std::cout << std::endl;
}
}

std::cout << '\n';
for (int i = 0; i < sumArrayNumEle; i++)
{
std::cout << sumArray[i] << ' ';
}

std::cout << '\n';

// Second step: scan block sums
kernNaiveGPUScanSecondStep << <dimGridSumArray, dimBlockSumArray >> > (
d_SumArray, d_SumArrayOutput, sumArrayNumEle);
checkCUDAError("kernNaiveGPUScanSecondStep failed!");
kernNaiveGPUScanFirstStep << <dimGridSumArray, dimBlockSumArray >> > (d_SumArray,
d_SumArrayOutput, d_SumArrayAx, n);

kernNaiveGPUScanThirdStep << <dimGridSumArray, dimBlockSumArray >> > (
d_SumArrayAx, d_SumArrayOutput, n);

cudaMemcpy(sumArrayOutput, d_SumArrayOutput, sumArraySize,
cudaMemcpyDeviceToHost);
checkCUDAError("memCpy back failed!");

printf("\n");

for (int i = 0; i < sumArrayNumEle; i++)
{
std::cout << sumArrayOutput[i] << ' ';
}

printf("\n");
//cudaDeviceSynchronize();

// Third step: add scanned block sum i to all values of scanned block
Expand All @@ -203,8 +250,6 @@ namespace StreamCompaction {
d_SumArrayOutput, d_OutputData, n);
checkCUDAError("kernNaiveGPUScanThirdStep failed!");

// cudaDeviceSynchronize();

// Last step: convert the inclusive scan result into an exclusive scan
convertFromInclusiveToExclusive << <dimGridArray, dimBlockArray >> > (
d_OutputData, d_OutputExclusiveData, n);
Expand Down

0 comments on commit 4580221

Please sign in to comment.