diff --git a/README.md b/README.md index b71c458..c5b8d8b 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,138 @@ -CUDA Stream Compaction -====================== +**University of Pennsylvania, CIS 565: GPU Programming and Architecture, +Project 2 - Strean Compaction** -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** +* Trung Le +* Windows 10 Home, i7-4790 CPU @ 3.60GHz 12GB, GTX 980 Ti (Person desktop) -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +### Stream compaction -### (TODO: Your README) +**---- General information for CUDA device ----** +- Device name: GeForce GTX 980 Ti +- Compute capability: 5.2 +- Compute mode: Default +- Clock rate: 1076000 +- Integrated: 0 +- Device copy overlap: Enabled +- Kernel execution timeout: Enabled + +**---- Memory information for CUDA device ----** -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +- Total global memory: 6442450944 +- Total constant memory: 65536 +- Multiprocessor count: 22 +- Shared memory per multiprocessor: 98304 +- Registers per multiprocessor: 65536 +- Max threads per multiprocessor: 2048 +- Max grid dimensions: [2147483647, 65535, 65535] +- Max threads per block: 1024 +- Max registers per block: 65536 +- Max thread dimensions: [1024, 1024, 64] +- Threads per block: 512 +# Analysis + +Roughly optimize the block sizes of each of your implementations for minimal run time on your GPU. + +(You shouldn't compare unoptimized implementations to each other!) +Compare all of these GPU Scan implementations (Naive, Work-Efficient, and Thrust) to the serial CPU version of Scan. Plot a graph of the comparison (with array size on the independent axis). + +For timing GPU, I wrapped cuda events between kernel launches and for timing CPU, I used the C++11 std::chrono API. Each configuration is run 1000 times, then taken the average as displayed below: + +![Scan performance](https://github.com/trungtle/Project2-Stream-Compaction/blob/master/images/ScanPerformaceAnalysis.png "Scan performance") + + +![Compaction performance](https://github.com/trungtle/Project2-Stream-Compaction/blob/master/images/CompactPerformaceAnalysis.png "Compaction performance") + +As we can see, the CPU version is outperformed by the rest. Thrust is clearly a winner here (probably due to the fact that it was implemented properly). It occurs to me that the 'efficient' version is in fact a bit slower than the naive but is still faster than the CPU version. There are a couple reasons for this: +- We're not taking advantage of shared memory inside each block to store the partial sum results. +- Each level of upsweep/downsweep currently launches a new kernel. It would be ideal to use the same kernel and compute the next level there without having to transfer the control back to the CPU. +- At deeper level in the upsweep/downsweep calls, there are a lot of idle threads not doing work. This is wasting a lot of GPU cycles. +- There are quite a bit of memory transfering between GPU & CPU, which initially slowed the application down alot. So I rewrote my scan and compaction functions to minimize this memory transfer. + +When testing with different block sizes, I found it pretty interesting that at size 128, it seems to be the most optimal. So I decided to use this block size for the rest of profiling + +![Block sizes performance](https://github.com/trungtle/Project2-Stream-Compaction/blob/master/images/BlockSizePerformanceAnalysis.png "Block sizes performance") + +For more details on the data collected, see [link](https://docs.google.com/spreadsheets/d/1mtohoQ4BtD_RamWI2KeV-HhkSYDMmendWos7sQgdVR8/edit?usp=sharing). + +I also used NSight to profile thrust performance. It seems that thrust does take advantage of shared memory (24,528 bytes per block). It's occupancy is also lower (50.0%) and it uses more registers per threads compare to my efficient implementation. + +![Thrust performance](https://github.com/trungtle/Project2-Stream-Compaction/blob/master/images/ThrustCapture.PNG "Thrust performance") + + +# Test output + +``` +==== PROFILING ON ==== +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 35 0 ] +==== cpu scan, power-of-two ==== +Runtime: 0.1365 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ] +==== cpu scan, non-power-of-two ==== +Runtime: 0.1402 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604305 1604316 ] + passed +==== naive scan, power-of-two ==== +Runtime: 0.0925244 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ] + passed +==== naive scan, non-power-of-two ==== +Runtime: 0.0927348 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 0 0 ] + passed +==== work-efficient scan, power-of-two ==== +Runtime: 1.72386 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ] + passed +==== work-efficient scan, non-power-of-two ==== +Runtime: 1.79924 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604305 1604316 ] + passed +==== thrust scan, power-of-two ==== +Runtime: 0.0006529 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ] + passed +==== thrust scan, non-power-of-two ==== +Runtime: 0.0006317 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604305 1604316 ] + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 1 0 ] +==== cpu compact without scan, power-of-two ==== +Runtime: 0.1463 ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== +Runtime: 0.1484 ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 3 1 ] + passed +==== cpu compact with scan ==== +Runtime: 0.47 ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ] + passed +==== work-efficient compact, power-of-two ==== +Runtime: 2.01726 ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ] + passed +==== work-efficient compact, non-power-of-two ==== +Runtime: 2.01408 ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 3 1 ] + passed +``` + +## Note + +### Modified test +I added a #define PROFILE and #define PROFILE_ITERATIONS flags in a new header file "profilingcommon.h". When this is on, running main() will also iterate through each function call PROFILE_ITERATIONS number of times, then measure the execution time and average it for profiling analysis. + +I also increased the reserved stack size in VS to 0x40000000 to prevent stack overflow for larger array size + +### Modified CMakeList.txt +- Added "ProfilingCommon.h" +- Changed to -arch=sm_52 diff --git a/images/BlockSizePerformanceAnalysis.png b/images/BlockSizePerformanceAnalysis.png new file mode 100644 index 0000000..44952ea Binary files /dev/null and b/images/BlockSizePerformanceAnalysis.png differ diff --git a/images/CompactPerformaceAnalysis.png b/images/CompactPerformaceAnalysis.png new file mode 100644 index 0000000..f597ed5 Binary files /dev/null and b/images/CompactPerformaceAnalysis.png differ diff --git a/images/ScanPerformaceAnalysis.png b/images/ScanPerformaceAnalysis.png new file mode 100644 index 0000000..58503ac Binary files /dev/null and b/images/ScanPerformaceAnalysis.png differ diff --git a/images/ThrustCapture.PNG b/images/ThrustCapture.PNG new file mode 100644 index 0000000..862faba Binary files /dev/null and b/images/ThrustCapture.PNG differ diff --git a/src/main.cpp b/src/main.cpp index 675da35..3cb598c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,12 +11,24 @@ #include #include #include +#include #include "testing_helpers.hpp" +#include int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; + const int SIZE = 1 << 16; const int NPOT = SIZE - 3; - int a[SIZE], b[SIZE], c[SIZE]; + int a[SIZE]; + int b[SIZE]; + int c[SIZE]; + +#ifdef PROFILE + float timeElapsedMs = 0; + float totalTimeElapsedMs = 0; + + printDesc("PROFILING ON"); + printf("\n\n"); +#endif // Scan tests @@ -29,51 +41,138 @@ int main(int argc, char* argv[]) { a[SIZE - 1] = 0; printArray(SIZE, a, true); - zeroArray(SIZE, b); + zeroArray(SIZE, b); printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printArray(SIZE, b, true); + +#ifdef PROFILE + auto begin = std::chrono::high_resolution_clock::now(); + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { +#endif + + StreamCompaction::CPU::scan(SIZE, b, a); + +#ifdef PROFILE + } + auto end = std::chrono::high_resolution_clock::now(); + std::cout << "Runtime: " << std::chrono::duration_cast((end - begin)/PROFILE_ITERATIONS).count() / 1000000.0f << " ms" << std::endl; +#endif + + printArray(SIZE, b, true); zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printArray(NPOT, b, true); + +#ifdef PROFILE + begin = std::chrono::high_resolution_clock::now(); + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { +#endif + StreamCompaction::CPU::scan(NPOT, c, a); +#ifdef PROFILE + } + end = std::chrono::high_resolution_clock::now(); + std::cout << "Runtime: " << std::chrono::duration_cast((end - begin) / PROFILE_ITERATIONS).count() / 1000000.0f << " ms" << std::endl; +#endif + + printArray(NPOT, b, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - //printArray(SIZE, c, true); + +#ifdef PROFILE + totalTimeElapsedMs = 0; + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { + timeElapsedMs = 0; +#endif + StreamCompaction::Naive::scan(SIZE, c, a, &timeElapsedMs); +#ifdef PROFILE + totalTimeElapsedMs += timeElapsedMs; + } + std::cout << "Runtime: " << totalTimeElapsedMs / PROFILE_ITERATIONS << " ms" << std::endl; +#endif + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - //printArray(SIZE, c, true); +#ifdef PROFILE + totalTimeElapsedMs = 0; + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { + timeElapsedMs = 0; +#endif + StreamCompaction::Naive::scan(NPOT, c, a, &timeElapsedMs); +#ifdef PROFILE + totalTimeElapsedMs += timeElapsedMs; + } + std::cout << "Runtime: " << totalTimeElapsedMs / PROFILE_ITERATIONS << " ms" << std::endl; +#endif + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - //printArray(SIZE, c, true); + +#ifdef PROFILE + totalTimeElapsedMs = 0; + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { + timeElapsedMs = 0; +#endif + StreamCompaction::Efficient::scan(SIZE, c, a, &timeElapsedMs); +#ifdef PROFILE + totalTimeElapsedMs += timeElapsedMs; + } + std::cout << "Runtime: " << totalTimeElapsedMs / PROFILE_ITERATIONS << " ms" << std::endl; +#endif + + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - //printArray(NPOT, c, true); +#ifdef PROFILE + totalTimeElapsedMs = 0; + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { + timeElapsedMs = 0; +#endif + StreamCompaction::Efficient::scan(NPOT, c, a, &timeElapsedMs); +#ifdef PROFILE + totalTimeElapsedMs += timeElapsedMs; + } + std::cout << "Runtime: " << totalTimeElapsedMs / PROFILE_ITERATIONS << " ms" << std::endl; +#endif + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - //printArray(SIZE, c, true); +#ifdef PROFILE + totalTimeElapsedMs = 0; + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { + timeElapsedMs = 0; +#endif + StreamCompaction::Thrust::scan(SIZE, c, a, &timeElapsedMs); +#ifdef PROFILE + totalTimeElapsedMs += timeElapsedMs; + } + std::cout << "Runtime: " << totalTimeElapsedMs / PROFILE_ITERATIONS << " ms" << std::endl; +#endif + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - //printArray(NPOT, c, true); +#ifdef PROFILE + totalTimeElapsedMs = 0; + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { + timeElapsedMs = 0; +#endif + StreamCompaction::Thrust::scan(NPOT, c, a, &timeElapsedMs); +#ifdef PROFILE + totalTimeElapsedMs += timeElapsedMs; + } + std::cout << "Runtime: " << totalTimeElapsedMs / PROFILE_ITERATIONS << " ms" << std::endl; +#endif + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -87,37 +186,84 @@ int main(int argc, char* argv[]) { a[SIZE - 1] = 0; printArray(SIZE, a, true); - int count, expectedCount, expectedNPOT; + int count = 0, expectedCount, expectedNPOT; zeroArray(SIZE, b); printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - expectedCount = count; +#ifdef PROFILE + begin = std::chrono::high_resolution_clock::now(); + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { +#endif + count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); +#ifdef PROFILE + } + end = std::chrono::high_resolution_clock::now(); + std::cout << "Runtime: " << std::chrono::duration_cast((end - begin) / PROFILE_ITERATIONS).count() / 1000000.0f << " ms" << std::endl; +#endif + expectedCount = count; printArray(count, b, true); printCmpLenResult(count, expectedCount, b, b); zeroArray(SIZE, c); printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - expectedNPOT = count; +#ifdef PROFILE + begin = std::chrono::high_resolution_clock::now(); + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { +#endif + count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); +#ifdef PROFILE + } + end = std::chrono::high_resolution_clock::now(); + std::cout << "Runtime: " << std::chrono::duration_cast((end - begin) / PROFILE_ITERATIONS).count() / 1000000.0f << " ms" << std::endl; +#endif + expectedNPOT = count; printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); zeroArray(SIZE, c); printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printArray(count, c, true); +#ifdef PROFILE + begin = std::chrono::high_resolution_clock::now(); + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { +#endif + count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); +#ifdef PROFILE + } + end = std::chrono::high_resolution_clock::now(); + std::cout << "Runtime: " << std::chrono::duration_cast((end - begin) / PROFILE_ITERATIONS).count() / 1000000.0f << " ms" << std::endl; +#endif + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - //printArray(count, c, true); +#ifdef PROFILE + totalTimeElapsedMs = 0; + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { + timeElapsedMs = 0; +#endif + count = StreamCompaction::Efficient::compact(SIZE, c, a, &timeElapsedMs); +#ifdef PROFILE + totalTimeElapsedMs += timeElapsedMs; + } + std::cout << "Runtime: " << totalTimeElapsedMs / PROFILE_ITERATIONS << " ms" << std::endl; +#endif + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - //printArray(count, c, true); +#ifdef PROFILE + totalTimeElapsedMs = 0; + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { + timeElapsedMs = 0; +#endif + count = StreamCompaction::Efficient::compact(NPOT, c, a, &timeElapsedMs); +#ifdef PROFILE + totalTimeElapsedMs += timeElapsedMs; + } + std::cout << "Runtime: " << totalTimeElapsedMs / PROFILE_ITERATIONS << " ms" << std::endl; +#endif + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); } diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..a97c3e4 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -1,6 +1,7 @@ set(SOURCE_FILES "common.h" "common.cu" + "profilingcommon.h" "cpu.h" "cpu.cu" "naive.h" @@ -13,5 +14,5 @@ set(SOURCE_FILES cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_52 ) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index fe872d4..33080b3 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -14,16 +14,40 @@ void checkCUDAErrorFn(const char *msg, const char *file, int line) { exit(EXIT_FAILURE); } - namespace StreamCompaction { namespace Common { + /** + * Convert an inclusice scan result to an exclusive scan result + * + */ +__global__ void inclusiveToExclusiveScanResult(int n, int* odata, const int* idata) { + int tid = threadIdx.x + (blockIdx.x * blockDim.x); + if (tid >= n) { + return; + } + + if (tid == 0) { + odata[0] = 0; + return; + } + + odata[tid] = idata[tid - 1]; +} + + + /** * Maps an array to an array of 0s and 1s for stream compaction. Elements * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int tid = threadIdx.x + blockDim.x * blockIdx.x; + if (tid >= n) { + return; + } + + bools[tid] = (bool)idata[tid]; } /** @@ -33,6 +57,14 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int tid = threadIdx.x + blockDim.x * blockIdx.x; + if (tid >= n) { + return; + } + + if (bools[tid] == 1) { + odata[indices[tid]] = idata[tid]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 4f52663..e3eedd3 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -7,6 +7,15 @@ #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define imin(a, b) (((a) < (b)) ? (a) : (b)) +#define imax(a, b) (((a) > (b)) ? (a) : (b)) + +#define BLOCK_SIZE 128 +#define BLOCK_COUNT(n) (((n) + BLOCK_SIZE - 1) / BLOCK_SIZE) + +// Milliseconds to nanoseconds +#define MS_TO_NS(ms) ((ms) * 1000000) + /** * Check for CUDA errors; print and exit if there was a problem. */ @@ -24,9 +33,10 @@ inline int ilog2ceil(int x) { return ilog2(x - 1) + 1; } - namespace StreamCompaction { namespace Common { + __global__ void inclusiveToExclusiveScanResult(int n, int* odata, const int* idata); + __global__ void kernMapToBoolean(int n, int *bools, const int *idata); __global__ void kernScatter(int n, int *odata, diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..294f06d 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,5 +1,6 @@ #include #include "cpu.h" +#include namespace StreamCompaction { namespace CPU { @@ -9,7 +10,10 @@ namespace CPU { */ void scan(int n, int *odata, const int *idata) { // TODO - printf("TODO\n"); + odata[0] = 0; + for (int i = 1; i < n; ++i) { + odata[i] = odata[i - 1] + idata[i - 1]; + } } /** @@ -19,7 +23,13 @@ void scan(int n, int *odata, const int *idata) { */ int compactWithoutScan(int n, int *odata, const int *idata) { // TODO - return -1; + int oIndex = 0; + for (int iIndex = 0; iIndex < n; ++iIndex) { + if (idata[iIndex] != 0) { + odata[oIndex++] = idata[iIndex]; + } + } + return oIndex; } /** @@ -29,7 +39,27 @@ int compactWithoutScan(int n, int *odata, const int *idata) { */ int compactWithScan(int n, int *odata, const int *idata) { // TODO - return -1; + memset(odata, 0, n * sizeof(int)); + + for (int i = 0; i < n; ++i) { + if (idata[i] != 0) { + odata[i] = 1; + } + } + + int* scanResult = new int[n]; + scan(n, scanResult, odata); + + int remainingNumberOfElements = 0; + for (int i = 0; i < n; ++i) { + if (odata[i] == 1) { + odata[scanResult[i]] = idata[i]; + remainingNumberOfElements = scanResult[i] + 1; + } + } + + delete[] scanResult; + return remainingNumberOfElements; } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b2f739b..123309a 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,17 +3,112 @@ #include "common.h" #include "efficient.h" +#define PROFILE + namespace StreamCompaction { namespace Efficient { // TODO: __global__ +__global__ void upsweep(int n, int level, int* odata) { + int tid = threadIdx.x + (blockIdx.x * blockDim.x); + if (tid >= n) { + return; + } + + int twoToLevel = 1 << level; + int twoToLevelPlusOne = 1 << (level + 1); + if (tid % twoToLevelPlusOne == 0) { + odata[tid + twoToLevelPlusOne - 1] += odata[tid + twoToLevel - 1]; + } +} + +__global__ void downsweep(int n, int level, int* odata) { + int tid = threadIdx.x + (blockIdx.x * blockDim.x); + if (tid >= n) { + return; + } + int twoToLevel = 1 << level; + int twoToLevelPlusOne = 1 << (level + 1); + + if (tid % twoToLevelPlusOne == 0) { + int t = odata[tid + twoToLevel - 1]; + odata[tid + twoToLevel - 1] = odata[tid + twoToLevelPlusOne - 1]; + odata[tid + twoToLevelPlusOne - 1] += t; + } +} + +// Should only be launched with 1 thread? +__global__ void kernRemainingElementsCountForCompact(const int n, int* dev_indices, const int* dev_bools, size_t* remainingElementsCount) { + *remainingElementsCount = dev_bools[n - 1] + dev_indices[n - 1]; +} + +void deviceScan(int n, int* dev_odata) { + + int height = ilog2ceil(n); + int ceilPower2 = 1 << height; + + for (int level = 0; level < height; ++level) { + upsweep << > >(ceilPower2, level, dev_odata); + cudaThreadSynchronize(); + } + + // Set the root to zero + cudaMemset(dev_odata + (ceilPower2 - 1), 0, sizeof(int)); + + // Downsweep + for (int level = height - 1; level >= 0; --level) { + downsweep << > >(ceilPower2, level, dev_odata); + cudaThreadSynchronize(); + } +} + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ -void scan(int n, int *odata, const int *idata) { +void scan(int n, int *odata, const int *idata, float* timeElapsedMs) { // TODO - printf("TODO\n"); + int* dev_odata; + int height = ilog2ceil(n); + int ceilPower2 = 1 << height; + cudaMalloc((void**)&dev_odata, ceilPower2 * sizeof(int)); + + // Reset to zeros + cudaMemset(dev_odata, 0, ceilPower2 * sizeof(int)); + + // Copy idata to device memory + cudaMemcpy(dev_odata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + +#ifdef PROFILE + // CUDA events for profiling + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); +#endif + +#ifdef PROFILE + cudaEventRecord(start); + // -- Start code to profile +#endif + deviceScan(n, dev_odata); +#ifdef PROFILE + // -- End code to profile + cudaEventRecord(stop); +#endif + + +#ifdef PROFILE + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + *timeElapsedMs = milliseconds; +#endif + // Transfer data back to host + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + + // Cleanup + cudaFree(dev_odata); + } /** @@ -25,9 +120,73 @@ void scan(int n, int *odata, const int *idata) { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. */ -int compact(int n, int *odata, const int *idata) { +int compact(int n, int *odata, const int *idata, float* timeElapsedMs) { // TODO - return -1; + + int height = ilog2ceil(n); + int ceilPower2 = 1 << height; + int *dev_bools, *dev_indices, *dev_odata, *dev_idata; + cudaMalloc((void**)&dev_bools, sizeof(int) * ceilPower2); + cudaMalloc((void**)&dev_idata, sizeof(int) * ceilPower2); + cudaMalloc((void**)&dev_indices, sizeof(int) * ceilPower2); + cudaMalloc((void**)&dev_odata, sizeof(int) * ceilPower2); + + // Transfer idata from host to device + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + +#ifdef PROFILE + // CUDA events for profiling + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); +#endif + +#ifdef PROFILE + // -- Start code block to profile + cudaEventRecord(start); +#endif + + // Set all non-zeros to 1s and zeros to 0s. This is our pass condition for an element to remain/discard after compaction + Common::kernMapToBoolean << > >(ceilPower2, dev_bools, dev_idata); + + // Compute indices of the out compacted stream + // Reset to zeros + cudaMemset(dev_indices, 0, ceilPower2 * sizeof(int)); + // Copy dev_bools to dev_indices to device memory + cudaMemcpy(dev_indices, dev_bools, n * sizeof(int), cudaMemcpyDeviceToDevice); + StreamCompaction::Efficient::deviceScan(ceilPower2, dev_indices); + + // Move elements that are not discarded into appropriate slots based on scan result + Common::kernScatter << > >(ceilPower2, dev_odata, dev_idata, dev_bools, dev_indices); + + // The max value of all the valid indices for the compacted stream is the number of remaining elements + size_t* dev_remainingElementCount; + cudaMalloc((void**)&dev_remainingElementCount, sizeof(size_t)); + kernRemainingElementsCountForCompact<<<1, 1>>>(ceilPower2, dev_indices, dev_bools, dev_remainingElementCount); + size_t remainingElementCount = 0; + cudaMemcpy(&remainingElementCount, dev_remainingElementCount, sizeof(size_t), cudaMemcpyDeviceToHost); + +#ifdef PROFILE + // -- End code block to profile + cudaEventRecord(stop); +#endif + + // Transfer output back to host + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + + // Cleanup + cudaFree(dev_idata); + cudaFree(dev_indices); + cudaFree(dev_odata); + +#ifdef PROFILE + // Print runtime result + cudaEventSynchronize(stop); + float milliseconds; + cudaEventElapsedTime(&milliseconds, start, stop); + *timeElapsedMs = milliseconds; +#endif + return remainingElementCount; } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 395ba10..35d9b46 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -2,8 +2,8 @@ namespace StreamCompaction { namespace Efficient { - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int *idata, float* timeElapsedMs); - int compact(int n, int *odata, const int *idata); + int compact(int n, int *odata, const int *idata, float* timeElapsedMs); } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..bf3790a 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -8,12 +8,66 @@ namespace Naive { // TODO: __global__ +__global__ void naiveScan(int n, int offset, int* odata, const int *idata) { + int tid = threadIdx.x + (blockIdx.x * blockDim.x); + if (tid >= n) { + return; + } + if (tid >= offset) { + odata[tid] = idata[tid - offset] + idata[tid] ; + } else { + odata[tid] = idata[tid]; + } +} + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ -void scan(int n, int *odata, const int *idata) { +void scan(int n, int *odata, const int *idata, float* timeElapsedMs) { // TODO - printf("TODO\n"); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + int* dev_odata1; + int* dev_odata2; + + cudaMalloc((void**)&dev_odata1, n * sizeof(int)); + cudaMalloc((void**)&dev_odata2, n * sizeof(int)); + + cudaMemcpy(dev_odata1, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(dev_odata2, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + cudaEventRecord(start); + int height = ilog2ceil(n); + for (int level = 1; level <= height; ++level) { + int offset = 1 << (level - 1); + naiveScan << > >( + n, + offset, + (level % 2) == 0 ? dev_odata1 : dev_odata2, + (level % 2) == 0 ? dev_odata2 : dev_odata1 + ); + } + + if (height % 2 == 0) { + Common::inclusiveToExclusiveScanResult << > >(n, dev_odata2, dev_odata1); + cudaEventRecord(stop); + cudaMemcpy(odata, dev_odata2, n * sizeof(int), cudaMemcpyDeviceToHost); + } else { + Common::inclusiveToExclusiveScanResult << > >(n, dev_odata1, dev_odata2); + cudaEventRecord(stop); + cudaMemcpy(odata, dev_odata1, n * sizeof(int), cudaMemcpyDeviceToHost); + } + + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + *timeElapsedMs = milliseconds; + + cudaFree(dev_odata1); + cudaFree(dev_odata2); } } diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 21152d6..25b7c4e 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -2,6 +2,6 @@ namespace StreamCompaction { namespace Naive { - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int *idata, float* timeElapsedMs); } } diff --git a/stream_compaction/profilingcommon.h b/stream_compaction/profilingcommon.h new file mode 100644 index 0000000..ede143c --- /dev/null +++ b/stream_compaction/profilingcommon.h @@ -0,0 +1,10 @@ +#pragma once + +#include + +#define PROFILE + +#ifdef PROFILE +#include +#define PROFILE_ITERATIONS 1000 +#endif \ No newline at end of file diff --git a/stream_compaction/radixsort.cu b/stream_compaction/radixsort.cu new file mode 100644 index 0000000..7f7aad7 --- /dev/null +++ b/stream_compaction/radixsort.cu @@ -0,0 +1,18 @@ +#include +#include +#include "common.h" +#include "efficient.h" + +namespace StreamCompaction { + namespace RadixSort { + + // stream compaction on 1s bits + + // stream compaction on 0s bits + + // puts 0s in the front and + int sort(int n, int *odata, const int *idata) { + return -1; + } + } +} \ No newline at end of file diff --git a/stream_compaction/radixsort.h b/stream_compaction/radixsort.h new file mode 100644 index 0000000..5dadbf9 --- /dev/null +++ b/stream_compaction/radixsort.h @@ -0,0 +1,7 @@ +#pragma once + +namespace StreamCompaction { + namespace RadixSort { + int sort(int n, int *odata, const int *idata); + } +} diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..a109198 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -5,6 +5,7 @@ #include #include "common.h" #include "thrust.h" +#include "profilingcommon.h" namespace StreamCompaction { namespace Thrust { @@ -12,10 +13,28 @@ namespace Thrust { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ -void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata, float* timeElapsedMs) { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + // Convert to device vector + thrust::device_vector dev_idata(idata, idata + n); + thrust::device_vector dev_odata(odata, odata + n); + +#ifdef PROFILE + auto begin = std::chrono::high_resolution_clock::now(); +#endif + + thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin()); + +#ifdef PROFILE + auto end = std::chrono::high_resolution_clock::now(); + *timeElapsedMs = std::chrono::duration_cast((end - begin) / PROFILE_ITERATIONS).count() / 1000000.0f; +#endif + + thrust::host_vector host_odata = dev_odata; + cudaMemcpy(odata, host_odata.data(), n * sizeof(int), cudaMemcpyHostToHost); } } diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h index 06707f3..44d7fcf 100644 --- a/stream_compaction/thrust.h +++ b/stream_compaction/thrust.h @@ -2,6 +2,6 @@ namespace StreamCompaction { namespace Thrust { - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int *idata, float* timeElapsedMs); } }