diff --git a/README.md b/README.md
index 0e38ddb..9a949a7 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,48 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Shubham Sharma
+  * [LinkedIn](https://www.linkedin.com/in/codeshubham), [personal website](https://shubhvr.com/).
+* Tested on: Windows 10, i7-9750H @ 2.26GHz, 16GB, GTX 1660ti 6GB (Personal Computer).
+* GPU Compute Capability: 7.5
 
-### (TODO: Your README)
+## Stream Compaction
+This project implements:
+- a CPU version of Scan,
+- a CPU version of Compact without using the Scan algorithm,
+- a CPU version of Compact with Scan,
+- a Naive version of Scan,
+- a Work-Efficient version of Scan, and
+- a Work-Efficient version of Compact that reuses the Work-Efficient Scan's code.
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+The three CPU implementations are serialized; no multi-threading is used. The simple CPU scan and compaction serve as baselines for the parallel GPU implementations, and results from CUDA's Thrust library are included so the execution times of all implementations can be compared.
+
+## Performance Analysis
+The project implements both CPU and GPU timing functions as a performance-timer class to conveniently measure time cost: `std::chrono` provides high-precision CPU timing, and CUDA events measure the GPU performance.
+I collected data across 8 executions with different array sizes. The program generates a new array of random values on each execution, and the size of the array is customisable. I varied the array sizes by powers of two, from 2^6^ all the way to 2^28^. The program also runs each algorithm on "non-power-of-two" arrays, which are generated by truncating the "power-of-two" arrays.
+![Performance Analysis](img/18.PNG)
+
+## Scan Runtime Analysis
+The performance of the four scan functions is graphed below.
+![Scan Runtime Analysis](img/ScanAlgorithmAnalysis.png)
+
+- CPU Scan vs. the rest: The graph shows that for smaller arrays the CPU implementation is far faster than the GPU scan implementations, but the gap narrows as the array size increases. This is expected from a serialized CPU implementation competing against parallel GPU ones. At some array size the GPU implementations would certainly cross over the CPU's, but unfortunately my GPU ran out of memory before I could reach that point.
+- Naive vs. Work-Efficient: Across every array size tested, the Naive implementation consistently outperformed the Work-Efficient one (see the sketch after this list). This is because:
+  - even though the number of active threads is limited during the upsweep and downsweep phases, the threads that do no work still have to wait for the active threads in their warp to finish before they become available again, and
+  - those idle threads cannot be used by the GPU for any other task at the same depth of an upsweep or downsweep, which reduces the effective parallelism.
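+
+A minimal sketch of a depth-`d` upsweep step (illustrative, not the exact kernel from this repository) makes the problem visible: only every 2^(d+1)-th thread takes the branch, yet the early-returning threads still occupy slots in their warps at that depth.
+
+```cuda
+// Upsweep step at depth d: only indices that end a 2^(d+1)-wide block do work,
+// so with n threads launched, just n / 2^(d+1) of them take the branch.
+__global__ void upsweepStep(int d, int* data, int n) {
+    int i = (blockIdx.x * blockDim.x) + threadIdx.x;
+    int stride = 1 << (d + 1);                // 2^(d+1)
+    if (i < n && (i % stride) == stride - 1) {
+        data[i] += data[i - (stride >> 1)];   // add the partial sum 2^d behind
+    }
+}
+```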
+
+## Compaction Runtime Analysis
+The performance of the three compact algorithms is graphed below.
+![Compaction Runtime Analysis](img/Compaction.png)
+
+We see a similar trend to the Scan results above, since each compact algorithm builds on the corresponding Scan algorithm.
+
+## Radix Sort (Extra Credit)
+I have implemented a parallel radix sort, which works on the binary representation of the numbers: it partitions the array by the least significant bit and repeats, one bit at a time, until the most significant bit has been processed.
+To verify my implementation, I compared its results against CUDA's `thrust::sort`. I tested array sizes from 2^6^ up to 2^26^, and it passed in every scenario, validating the implementation. A screenshot of the result for an array of size 2^18^ is shown below.
+
+![Radix Sort](img/RadixSort.PNG)
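+
+Each pass is a stable split on one bit, built from a scan. A worked example of a single pass on bit 0, using my own illustrative input `{4, 7, 2, 6, 3}`:
+
+```cuda
+// b = bit 0 of each element:       {0,1,0,0,1}
+// e = !b:                          {1,0,1,1,0}
+// f = exclusive scan of e:         {0,1,1,2,3}
+// totalFalses = e[n-1] + f[n-1] = 0 + 3 = 3
+// t[i] = i - f[i] + totalFalses:   {3,3,4,4,4}
+// d[i] = b[i] ? t[i] : f[i]:       {0,3,1,2,4}
+// scatter out[d[i]] = in[i]:       {4,2,6,7,3}  // zero bits first, order kept
+```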
+
+## Blooper
+Apparently, in device code, `pow(2, 12)` was returning a value of 4027, which is absurd. I fixed it by using the bitwise operation `1 << 12`, which gave the expected result of 4096.
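+
+The likely cause is that `pow` computes in floating point, so its result can land just below the exact power and get truncated when converted to `int`, while integer shifts are exact. A minimal sketch of the fix:
+
+```cuda
+// Floating-point pow can round just below the exact power of two,
+// and the int conversion then truncates. The shift is exact integer math.
+int bad  = (int)pow(2.0, 12.0); // may truncate to 4095 on some devices
+int good = 1 << 12;             // always 4096
+```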
checkCUDAErrorWithLine("cudaMalloc dev_buf failed!"); + cudaMalloc((void**)&bufBit, n * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_bufloader failed!"); + cudaMalloc((void**)&falseBuf, n * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_bufB failed!"); + cudaMalloc((void**)&trueBuf, n * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_bufS failed!"); + cudaMalloc((void**)&bufNotBits, n * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_bufAnswers failed!"); + cudaMalloc((void**)&bufScatter, n * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_bufAnswers failed!"); + cudaMalloc((void**)&bufAnswer, n * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_bufAnswers failed!"); + cudaDeviceSynchronize(); + } + + void FreeMemory() { + cudaFree(dev_buf); + cudaFree(bufBit); + cudaFree(falseBuf); + cudaFree(trueBuf); + cudaFree(bufNotBits); + cudaFree(bufScatter); + cudaFree(bufAnswer); + } + + + __global__ void PopulateBits(int bitOrder, int* bufInputData, int* bufBit, int N) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index > N - 1) + { + return; + } + int mask = 1 << bitOrder; + int masked_num = bufInputData[index] & mask; + int thebit = masked_num >> bitOrder; + bufBit[index] = thebit; + } + + __global__ void PopulateNotBits(int *bitNotBits, const int* bufBits, int N) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index > N - 1) + { + return; + } + if (bufBits[index] == 0) + { + bitNotBits[index] = 1; + return; + } + bitNotBits[index] = 0; + } + + __global__ void CopyAnswerToInputBuf(int* BufAnswer, int* ScatterBuffer, int* InputBuf, int N) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index > N - 1) + { + return; + } + BufAnswer[ScatterBuffer[index]] = InputBuf[index]; + } + + + __global__ void ComputeTArray(int* outputBuf, int *falseBuf, int totalFalses, int N) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index > N - 1) + { + return; + } + outputBuf[index] = index - falseBuf[index] + totalFalses; + } + + __global__ void PerformScatter(int* outputBuf, int* inputBuf, int* bitBuf, int*falseBuf, int *trueBuf, int N) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index > N - 1) + { + return; + } + if (bitBuf[index]) + { + outputBuf[index] = trueBuf[index]; + return; + } + outputBuf[index] = falseBuf[index]; + + } + + + void PerformThrustSort(int n, int* odata, const int* idata) + { + thrust::host_vectorhstIn(idata, idata + n); + timer().startGpuTimer(); + // TODO use `thrust::exclusive_scan` + // example: for device_vectors dv_in and dv_out: + // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::sort(hstIn.begin(), hstIn.end()); + + thrust::copy(hstIn.begin(), hstIn.end(), odata); + + timer().endGpuTimer(); + } + + + + void PerformGPUSort(int n, int* odata, const int* idata) + { + AllocateMemory(n); + cudaMemcpy(dev_buf, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + timer().startGpuTimer(); + for (int i = 0; i < 6; i++) + { + PopulateBits << < fullBlocksPerGrid, blockSize >> > (i, dev_buf, bufBit, n); + cudaDeviceSynchronize(); + PopulateNotBits << < fullBlocksPerGrid, blockSize >> > (bufNotBits, bufBit, n); + cudaDeviceSynchronize(); + + int* inputNotBits= new int[n]; + cudaMemcpy(inputNotBits, bufNotBits, n * sizeof(int), cudaMemcpyDeviceToHost); + Efficient::scan(n, odata, inputNotBits); + cudaMemcpy(falseBuf, odata, n * sizeof(int), cudaMemcpyHostToDevice); + + int 
+
+        void PerformThrustSort(int n, int* odata, const int* idata)
+        {
+            thrust::host_vector<int> hstIn(idata, idata + n);
+            timer().startGpuTimer();
+            thrust::sort(hstIn.begin(), hstIn.end());
+            thrust::copy(hstIn.begin(), hstIn.end(), odata);
+            timer().endGpuTimer();
+        }
+
+        void PerformGPUSort(int n, int* odata, const int* idata)
+        {
+            AllocateMemory(n);
+            cudaMemcpy(dev_buf, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
+            dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+            timer().startGpuTimer();
+            // Only the low 6 bits are processed, which is enough for the small
+            // random values the tests generate.
+            for (int i = 0; i < 6; i++)
+            {
+                PopulateBits << < fullBlocksPerGrid, blockSize >> > (i, dev_buf, bufBit, n);
+                cudaDeviceSynchronize();
+                PopulateNotBits << < fullBlocksPerGrid, blockSize >> > (bufNotBits, bufBit, n);
+                cudaDeviceSynchronize();
+
+                // Scan the e array with the work-efficient scan.
+                int* inputNotBits = new int[n];
+                cudaMemcpy(inputNotBits, bufNotBits, n * sizeof(int), cudaMemcpyDeviceToHost);
+                Efficient::scan(n, odata, inputNotBits);
+                cudaMemcpy(falseBuf, odata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+                int TotalFalses = inputNotBits[n - 1] + odata[n - 1];
+                ComputeTArray << < fullBlocksPerGrid, blockSize >> > (trueBuf, falseBuf, TotalFalses, n);
+                cudaDeviceSynchronize();
+                PerformScatter << < fullBlocksPerGrid, blockSize >> > (bufScatter, dev_buf, bufBit, falseBuf, trueBuf, n);
+                cudaDeviceSynchronize();
+                CopyAnswerToInputBuf << < fullBlocksPerGrid, blockSize >> > (bufAnswer, bufScatter, dev_buf, n);
+                cudaDeviceSynchronize();
+                std::swap(dev_buf, bufAnswer);
+                delete[] inputNotBits;
+            }
+            timer().endGpuTimer();
+            cudaMemcpy(odata, dev_buf, sizeof(int) * n, cudaMemcpyDeviceToHost);
+            FreeMemory();
+        }
+    }
+}
\ No newline at end of file
diff --git a/stream_compaction/RadixSort.h b/stream_compaction/RadixSort.h
new file mode 100644
index 0000000..ae50e37
--- /dev/null
+++ b/stream_compaction/RadixSort.h
@@ -0,0 +1,13 @@
+#pragma once
+#include "common.h"
+#include "efficient.h"
+
+namespace StreamCompaction {
+    namespace RadixSort {
+        StreamCompaction::Common::PerformanceTimer& timer();
+
+        void PerformThrustSort(int n, int* odata, const int* idata);
+        void PerformGPUSort(int n, int* odata, const int* idata);
+    }
+}
\ No newline at end of file
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..29fbc33 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -24,6 +24,17 @@ namespace StreamCompaction {
          */
         __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
             // TODO
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index > n - 1)
+            {
+                return;
+            }
+            if (idata[index] == 0)
+            {
+                bools[index] = 0;
+                return;
+            }
+            bools[index] = 1;
         }
 
         /**
@@ -33,6 +44,17 @@ namespace StreamCompaction {
         __global__ void kernScatter(int n, int *odata, const int *idata,
                 const int *bools, const int *indices) {
             // TODO
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index > n - 1)
+            {
+                return;
+            }
+            if (bools[index] == 0)
+            {
+                return;
+            }
+            odata[indices[index]] = idata[index];
         }
 
     }
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..1643c05 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -18,8 +18,16 @@ namespace StreamCompaction {
          * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
          */
         void scan(int n, int *odata, const int *idata) {
+            timer().startCpuTimer();
             // TODO
+            // Exclusive prefix sum: odata[i] = sum of idata[0 .. i-1].
+            odata[0] = 0;
+            for (int i = 1; i < n; i++)
+            {
+                odata[i] = odata[i - 1] + idata[i - 1];
+            }
+            timer().endCpuTimer();
         }
 
         /**
@@ -31,8 +39,18 @@ namespace StreamCompaction {
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            int outIndex = 0;
+            for (int i = 0; i < n; i++)
+            {
+                if (idata[i] != 0)
+                {
+                    odata[outIndex] = idata[i];
+                    outIndex++;
+                }
+            }
             timer().endCpuTimer();
-            return -1;
+            return outIndex;
         }
 
         /**
@@ -41,10 +59,70 @@ namespace StreamCompaction {
          * @returns the number of elements remaining after compaction.
          */
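+        // Worked example (illustrative data): idata = {1, 0, 2, 0, 3}
+        //   bools   = {1, 0, 1, 0, 1}
+        //   indices = {0, 1, 1, 2, 2}   (exclusive scan of bools)
+        //   scatter: odata[indices[i]] = idata[i] wherever bools[i] == 1
+        //   odata   = {1, 2, 3}, count = indices[n-1] + bools[n-1] = 3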
         int compactWithScan(int n, int *odata, const int *idata) {
+            int nearesttwo = 1 << ilog2ceil(n); // pad to a power of two, mirroring the GPU version
+            int *arr_b = new int[nearesttwo];   // boolean flags
+            int *arr_c = new int[nearesttwo];   // exclusive scan of the flags
+            int *arr_z = new int[nearesttwo];   // zero-padded copy of the input
             timer().startCpuTimer();
-            // TODO
+#pragma region NonPower2
+            // Pad the front with zeroes; compaction discards them anyway.
+            int difference = nearesttwo - n;
+            for (int i = 0; i < difference; i++)
+            {
+                arr_z[i] = 0;
+            }
+            for (int i = 0; i < n; i++)
+            {
+                arr_z[i + difference] = idata[i];
+            }
+            n = n + difference;
+
+            // Map to booleans.
+            for (int i = 0; i < n; i++)
+            {
+                arr_b[i] = (arr_z[i] == 0) ? 0 : 1;
+            }
+
+            // Exclusive scan of the booleans gives each kept element its index.
+            arr_c[0] = 0;
+            for (int i = 1; i < n; i++)
+            {
+                arr_c[i] = arr_b[i - 1] + arr_c[i - 1];
+            }
+
+            // Scatter the non-zero elements to their scanned positions.
+            for (int i = 0; i < n; i++)
+            {
+                if (arr_b[i] == 1)
+                {
+                    odata[arr_c[i]] = arr_z[i];
+                }
+            }
+            int count = arr_c[n - 1] + arr_b[n - 1];
+#pragma endregion
             timer().endCpuTimer();
-            return -1;
+            delete[] arr_b;
+            delete[] arr_c;
+            delete[] arr_z;
+            return count;
         }
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..caf8c8f 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -4,37 +4,253 @@
 #include "efficient.h"
 
 namespace StreamCompaction {
-    namespace Efficient {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
-        }
-
-        /**
-         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
-         */
-        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
-        }
-
-        /**
-         * Performs stream compaction on idata, storing the result into odata.
-         * All zeroes are discarded.
-         *
-         * @param n The number of elements in idata.
-         * @param odata The array into which to store elements.
-         * @param idata The array of elements to compact.
-         * @returns The number of elements remaining after compaction.
-         */
-        int compact(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
-            return -1;
-        }
-    }
+    namespace Efficient {
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
+        }
+#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__) // We can use defines provided in this project
+
+        int* dev_buf;
+        int* dev_bufloader;
+        int* dev_bufB;
+        int* dev_bufS;
+        int* dev_bufAnswers;
+
+#define blockSize 512
+
+        __global__ void performUpSweep(int d, int* buf, int N)
+        {
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            int pow2_d = 1 << d;
+            int pow2_dplus1 = 1 << (d + 1);
+            if (index + pow2_dplus1 - 1 > N - 1)
+            {
+                return;
+            }
+            if ((index + pow2_dplus1 - 1) % pow2_dplus1 == (pow2_dplus1 - 1))
+            {
+                buf[index + pow2_dplus1 - 1] += buf[index + pow2_d - 1];
+            }
+            // Zero the root eagerly: after the final depth this leaves
+            // buf[N - 1] == 0, ready for the downsweep.
+            if (index + pow2_dplus1 - 1 == N - 1)
+            {
+                buf[index + pow2_dplus1 - 1] = 0;
+                return;
+            }
+        }
+
+        __global__ void performDownSweep(int d, int* buf, int N)
+        {
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            int pow2_d = 1 << d;
+            int pow2_dplus1 = 1 << (d + 1);
+            if (index + pow2_dplus1 - 1 > N - 1)
+            {
+                return;
+            }
+            if ((index + pow2_dplus1 - 1) % pow2_dplus1 == (pow2_dplus1 - 1))
+            {
+                int t = buf[index + pow2_d - 1];
+                buf[index + pow2_d - 1] = buf[index + pow2_dplus1 - 1];
+                buf[index + pow2_dplus1 - 1] += t;
+            }
+        }
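+
+        // Worked trace for N = 8, buf = {1,2,3,4,5,6,7,8} (illustrative data).
+        // Because the root is zeroed eagerly at every upsweep depth, it is
+        // already 0 when the downsweep starts:
+        //   after upsweep d=0:   {1,3,3,7,5,11,7,0}
+        //   after upsweep d=1:   {1,3,3,10,5,11,7,0}
+        //   after upsweep d=2:   {1,3,3,10,5,11,7,0}
+        //   after downsweep d=2: {1,3,3,0,5,11,7,10}
+        //   after downsweep d=1: {1,0,3,3,5,10,7,21}
+        //   after downsweep d=0: {0,1,3,6,10,15,21,28}   (exclusive scan)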
+
+        void AllocateMemory(int n)
+        {
+            cudaMalloc((void**)&dev_buf, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_buf failed!");
+            cudaMalloc((void**)&dev_bufloader, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_bufloader failed!");
+            cudaMalloc((void**)&dev_bufB, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_bufB failed!");
+            cudaMalloc((void**)&dev_bufS, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_bufS failed!");
+            cudaMalloc((void**)&dev_bufAnswers, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_bufAnswers failed!");
+            cudaDeviceSynchronize();
+        }
+
+        void FreeMemory() {
+            cudaFree(dev_buf);
+            cudaFree(dev_bufloader);
+            cudaFree(dev_bufB);
+            cudaFree(dev_bufS);
+            cudaFree(dev_bufAnswers);
+        }
+
+        // Pad the front of the array with `difference` zeroes.
+        __global__ void RightShiftAddZeros(int* buf, int *buf_loader, int N, int difference)
+        {
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index > N - 1)
+            {
+                return;
+            }
+            if (index < difference)
+            {
+                buf[index] = 0;
+                return;
+            }
+            buf[index] = buf_loader[index - difference];
+        }
+
+        // Drop the `difference` leading entries after the padded scan.
+        __global__ void RightShiftDeleteZeroes(int* buf, int* buf_loader, int N, int difference)
+        {
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index > N - 1)
+            {
+                return;
+            }
+            buf[index] = buf_loader[index + difference];
+        }
+
+        /**
+         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+         */
+        void scan(int n, int* odata, const int* idata) {
+            timer().startGpuTimer();
+            int nearesttwo = 1 << ilog2ceil(n);
+            int difference = nearesttwo - n;
+            int finalMemSize = nearesttwo;
+
+            AllocateMemory(finalMemSize);
+            dim3 fullBlocksPerGrid((finalMemSize + blockSize - 1) / blockSize);
+            cudaMemcpy(dev_bufloader, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            RightShiftAddZeros << < fullBlocksPerGrid, blockSize >> > (dev_buf, dev_bufloader, finalMemSize, difference);
+
+            int d = ilog2ceil(finalMemSize);
+
+            for (int i = 0; i <= d - 1; i++)
+            {
+                performUpSweep << < fullBlocksPerGrid, blockSize >> > (i, dev_buf, finalMemSize);
+            }
+
+            for (int i = d - 1; i >= 0; i--)
+            {
+                performDownSweep << < fullBlocksPerGrid, blockSize >> > (i, dev_buf, finalMemSize);
+            }
+
+            RightShiftDeleteZeroes << < fullBlocksPerGrid, blockSize >> > (dev_bufloader, dev_buf, n, difference);
+            cudaDeviceSynchronize();
+            cudaMemcpy(odata, dev_bufloader, sizeof(int) * n, cudaMemcpyDeviceToHost);
+
+            timer().endGpuTimer();
+
+            FreeMemory();
+            cudaDeviceSynchronize();
+        }
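+
+        // compact() below needs a scan while its GPU timer is already
+        // running; scan() above would start the same namespace-wide timer a
+        // second time, so this untimed variant is used there instead.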
+
+        int* dev_buf2;
+        int* dev_bufloader2;
+
+        void AllocateMemory2(int n)
+        {
+            cudaMalloc((void**)&dev_buf2, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_buf2 failed!");
+            cudaMalloc((void**)&dev_bufloader2, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_bufloader2 failed!");
+        }
+
+        void FreeMemory2() {
+            cudaFree(dev_buf2);
+            cudaFree(dev_bufloader2);
+        }
+
+        void scanWithoutTimer(int n, int* odata, const int* idata) {
+            int nearesttwo = 1 << ilog2ceil(n);
+            int difference = nearesttwo - n;
+            int finalMemSize = nearesttwo;
+
+            AllocateMemory2(finalMemSize);
+            dim3 fullBlocksPerGrid((finalMemSize + blockSize - 1) / blockSize);
+            cudaMemcpy(dev_bufloader2, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            RightShiftAddZeros << < fullBlocksPerGrid, blockSize >> > (dev_buf2, dev_bufloader2, finalMemSize, difference);
+
+            int d = ilog2ceil(finalMemSize);
+
+            for (int i = 0; i <= d - 1; i++)
+            {
+                performUpSweep << < fullBlocksPerGrid, blockSize >> > (i, dev_buf2, finalMemSize);
+            }
+
+            for (int i = d - 1; i >= 0; i--)
+            {
+                performDownSweep << < fullBlocksPerGrid, blockSize >> > (i, dev_buf2, finalMemSize);
+            }
+
+            RightShiftDeleteZeroes << < fullBlocksPerGrid, blockSize >> > (dev_bufloader2, dev_buf2, n, difference);
+            cudaMemcpy(odata, dev_bufloader2, sizeof(int) * n, cudaMemcpyDeviceToHost);
+            FreeMemory2();
+        }
+
+        /**
+         * Performs stream compaction on idata, storing the result into odata.
+         * All zeroes are discarded.
+         *
+         * @param n The number of elements in idata.
+         * @param odata The array into which to store elements.
+         * @param idata The array of elements to compact.
+         * @returns The number of elements remaining after compaction.
+         */
+        int compact(int n, int* odata, const int* idata) {
+            int finalMemSize = n;
+            AllocateMemory(finalMemSize);
+
+            timer().startGpuTimer();
+            cudaMemcpy(dev_buf, idata, sizeof(int) * finalMemSize, cudaMemcpyHostToDevice);
+
+            dim3 fullBlocksPerGrid((finalMemSize + blockSize - 1) / blockSize);
+
+            Common::kernMapToBoolean << < fullBlocksPerGrid, blockSize >> > (finalMemSize, dev_bufB, dev_buf);
+
+            // Copy the boolean flags to the host to run the scan.
+            int* arr_boolean = new int[finalMemSize];
+            cudaMemcpy(arr_boolean, dev_bufB, sizeof(int) * finalMemSize, cudaMemcpyDeviceToHost);
+
+            // Scan the flags without touching the timer, since ours is running.
+            int* arr_scanResult = new int[finalMemSize];
+            scanWithoutTimer(finalMemSize, arr_scanResult, arr_boolean);
+
+            // Copy the scan results into dev_bufS for the scatter step.
+            cudaMemcpy(dev_bufS, arr_scanResult, sizeof(int) * finalMemSize, cudaMemcpyHostToDevice);
+
+            int numElements = arr_scanResult[finalMemSize - 1];
+
+            Common::kernScatter << < fullBlocksPerGrid, blockSize >> > (finalMemSize, dev_bufAnswers, dev_buf,
+                dev_bufB, dev_bufS);
+
+            cudaMemcpy(odata, dev_bufAnswers, sizeof(int) * finalMemSize, cudaMemcpyDeviceToHost);
+
+            timer().endGpuTimer();
+            FreeMemory();
+
+            // The exclusive scan's last entry omits the final element, so add
+            // one if the last element survives compaction.
+            int count = numElements;
+            if (arr_boolean[finalMemSize - 1] == 1)
+            {
+                count = numElements + 1;
+            }
+            delete[] arr_boolean;
+            delete[] arr_scanResult;
+            return count;
+        }
+    }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..3b5d4c0 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -2,24 +2,148 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "naive.h"
-
 namespace StreamCompaction {
-    namespace Naive {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
-        }
-        // TODO: __global__
-
-        /**
-         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
-         */
-        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
-        }
-    }
+    namespace Naive {
+
+#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__) // We can use defines provided in this project
+
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
+        }
+
+        int* dev_buf1;
+        int* dev_buf2;
+        int* dev_bufLoader;
+#define blockSize 512
+
+        // Hillis-Steele step: each element at index >= 2^(d-1) adds the value
+        // 2^(d-1) positions behind it; buf1 is read, buf2 is written.
+        __global__ void performScan(int d, int* buf1, int* buf2, int N)
+        {
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index > N - 1)
+            {
+                return;
+            }
+
+            //int pow2_dminus1 = std::round(pow(2, d - 1)); // see the Blooper note in the README
+            int pow2_dminus1 = 1 << (d - 1);
+            if (index >= pow2_dminus1)
+            {
+                buf2[index] = buf1[index - pow2_dminus1] + buf1[index];
+            }
+            else
+            {
+                buf2[index] = buf1[index];
+            }
+        }
+
+        // Shift everything right by one and insert the identity (0) in front,
+        // turning the inclusive scan into an exclusive one.
+        __global__ void ShiftRight(int* buf1, int* buf2, int N, int difference)
+        {
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index > N - 1)
+            {
+                return;
+            }
+            if (index == 0)
+            {
+                buf2[index] = 0;
+                return;
+            }
+            buf2[index] = buf1[index - 1];
+        }
+
+        void FreeMemory() {
+            cudaFree(dev_buf1);
+            cudaFree(dev_buf2);
+            cudaFree(dev_bufLoader);
+        }
+
+        void AllocateMemory(int n)
+        {
+            cudaMalloc((void**)&dev_buf1, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_buf1 failed!");
+            cudaMalloc((void**)&dev_buf2, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_buf2 failed!");
+            cudaMalloc((void**)&dev_bufLoader, n * sizeof(int));
+            checkCUDAErrorWithLine("cudaMalloc dev_bufLoader failed!");
+            cudaDeviceSynchronize();
+        }
+
+        // Pad the tail of the array with `difference` zeroes up to the next
+        // power of two.
+        __global__ void RightShiftAddZeros(int* buf, int* buf_loader, int N, int difference)
+        {
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index > N - 1)
+            {
+                return;
+            }
+            if (index > (N - 1) - difference)
+            {
+                buf[index] = 0;
+                return;
+            }
+            buf[index] = buf_loader[index];
+        }
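+
+        // Worked trace of the whole pipeline for N = 8 and input
+        // {1,2,3,4,5,6,7,8} (illustrative data):
+        //   performScan d=1 (offset 1): {1,3,5,7,9,11,13,15}
+        //   performScan d=2 (offset 2): {1,3,6,10,14,18,22,26}
+        //   performScan d=3 (offset 4): {1,3,6,10,15,21,28,36}   (inclusive scan)
+        //   ShiftRight:                 {0,1,3,6,10,15,21,28}    (exclusive scan)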
+
+        /**
+         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+         */
+        void scan(int n, int* odata, const int* idata) {
+            timer().startGpuTimer();
+            int nearesttwo = 1 << ilog2ceil(n);
+            int difference = nearesttwo - n;
+            int finalMemSize = nearesttwo;
+
+            AllocateMemory(finalMemSize);
+
+            dim3 fullBlocksPerGrid((finalMemSize + blockSize - 1) / blockSize);
+            dim3 threadsperblockSize(blockSize);
+
+            cudaMemcpy(dev_bufLoader, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            RightShiftAddZeros << < fullBlocksPerGrid, threadsperblockSize >> > (dev_buf1, dev_bufLoader, finalMemSize, difference);
+
+            int d = ilog2(finalMemSize);
+            for (int i = 1; i <= d; i++)
+            {
+                performScan << < fullBlocksPerGrid, threadsperblockSize >> > (i, dev_buf1, dev_buf2, finalMemSize);
+                std::swap(dev_buf1, dev_buf2);
+            }
+
+            ShiftRight << < fullBlocksPerGrid, blockSize >> > (dev_buf1, dev_buf2, finalMemSize, difference);
+            cudaMemcpy(odata, dev_buf2, sizeof(int) * n, cudaMemcpyDeviceToHost);
+
+            timer().endGpuTimer();
+            cudaDeviceSynchronize();
+            FreeMemory();
+        }
+    }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..fc2d00e 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,10 +18,18 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            thrust::host_vector<int> hstIn(idata, idata + n);
+            thrust::device_vector<int> dv_in = hstIn;
+            thrust::device_vector<int> dv_out(n);
             timer().startGpuTimer();
             // TODO use `thrust::exclusive_scan`
             // example: for device_vectors dv_in and dv_out:
             // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+
+            thrust::copy(dv_out.begin(), dv_out.end(), odata);
+
             timer().endGpuTimer();
         }
     }