diff --git a/src/main.cpp b/src/main.cpp index 79c54fb..e511d7d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,7 +16,7 @@ #include "testing_helpers.hpp" // The tests default to an array of size 1 << 8 = 256 -const int SIZE = 1 << 16; // feel free to change the size of array +const int SIZE = 1 << 20; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -101,7 +101,7 @@ int main(int argc, char* argv[]) { printCmpResult(NPOT, b, c); printf("\n"); - +#if 0 zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); @@ -115,12 +115,12 @@ int main(int argc, char* argv[]) { printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); - +#endif zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - // printArray(SIZE, c, true); + printArray(SIZE, c, false); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan @@ -141,14 +141,14 @@ int main(int argc, char* argv[]) { printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - printArray(SIZE, c, true); + // printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - printArray(NPOT, c, true); + // printArray(NPOT, c, true); printCmpResult(NPOT, b, c); diff --git a/stream_compaction/common.h b/stream_compaction/common.h index b456f2f..b784980 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -11,8 +11,8 @@ #include /*! Block size used for CUDA kernel launch. */ -#define blockSize 1024 -#define sectionSize 1024 +#define blockSize 256 +#define sectionSize 256 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) // usage: checkCUDAError("a descriptive name of this error") diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 8c038c4..25c6880 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -144,6 +144,8 @@ namespace StreamCompaction { } } + + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ @@ -157,6 +159,7 @@ namespace StreamCompaction { int* d_OutputExclusiveData; int* d_SumArray; int* d_SumArrayOutput; + int* d_SumArrayAx; cudaMalloc((void**)&d_InputData, size); checkCUDAError("cudaMalloc d_InputData failed!"); @@ -173,6 +176,9 @@ namespace StreamCompaction { cudaMalloc((void**)&d_SumArrayOutput, sumArraySize); checkCUDAError("cudaMalloc d_SumArrayOutput failed!"); + cudaMalloc((void**)&d_SumArrayAx, sumArraySize); + checkCUDAError("cudaMalloc d_SumArrayOutput failed!"); + cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice); dim3 dimGridArray((n + blockSize - 1) / blockSize, 1, 1); @@ -181,6 +187,10 @@ namespace StreamCompaction { dim3 dimGridSumArray((sumArrayNumEle + blockSize - 1) / blockSize, 1, 1); dim3 dimBlockSumArray(blockSize, 1, 1); + // for testing + int* sumArray = new int[sumArrayNumEle]; + int* sumArrayOutput = new int[sumArrayNumEle]; + timer().startGpuTimer(); // First step: compute the scan result for individual sections // then, store their block sum to sumArray @@ -188,13 +198,50 @@ namespace StreamCompaction { d_OutputData, d_SumArray, n); checkCUDAError("kernNaiveGPUScanFirstStep failed!"); - //(); + // cudaDeviceSynchronize(); + + cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost); + checkCUDAError("memCpy back failed!"); + + cudaMemcpy(sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost); + checkCUDAError("memCpy back failed!"); + + std::cout << '\n'; + for (int i = 0; i < n; i++) + { + std::cout << odata[i] << ' '; + if ((i + 1) % 8 == 0) { + std::cout << std::endl; + } + } + + std::cout << '\n'; + for (int i = 0; i < sumArrayNumEle; i++) + { + std::cout << sumArray[i] << ' '; + } + + std::cout << '\n'; // Second step: scan block sums - kernNaiveGPUScanSecondStep << > > ( - d_SumArray, d_SumArrayOutput, sumArrayNumEle); - checkCUDAError("kernNaiveGPUScanSecondStep failed!"); + kernNaiveGPUScanFirstStep << > > (d_SumArray, + d_SumArrayOutput, d_SumArrayAx, n); + + kernNaiveGPUScanThirdStep << > > ( + d_SumArrayAx, d_SumArrayOutput, n); + + cudaMemcpy(sumArrayOutput, d_SumArrayOutput, sumArraySize, + cudaMemcpyDeviceToHost); + checkCUDAError("memCpy back failed!"); + printf("\n"); + + for (int i = 0; i < sumArrayNumEle; i++) + { + std::cout << sumArrayOutput[i] << ' '; + } + + printf("\n"); //cudaDeviceSynchronize(); // Third step: add scanned block sum i to all values of scanned block @@ -203,8 +250,6 @@ namespace StreamCompaction { d_SumArrayOutput, d_OutputData, n); checkCUDAError("kernNaiveGPUScanThirdStep failed!"); - // cudaDeviceSynchronize(); - // Last step: convertFromInclusiveToExclusive << > > ( d_OutputData, d_OutputExclusiveData, n);