From 65770b78f207b43799aa2aae26bf37b5bf881103 Mon Sep 17 00:00:00 2001 From: pratchpak Date: Sun, 13 Sep 2015 23:42:07 -0400 Subject: [PATCH 1/7] efficient done --- src/main.cpp | 4 + stream_compaction/common.cu | 15 +++- stream_compaction/common.h | 30 +++++++ stream_compaction/cpu.cu | 39 +++++++-- stream_compaction/efficient.cu | 142 +++++++++++++++++++++++++++++++-- stream_compaction/naive.cu | 63 +++++++++++++-- stream_compaction/thrust.cu | 1 - 7 files changed, 277 insertions(+), 17 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 675da35..b2fa9bb 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -6,6 +6,7 @@ * @copyright University of Pennsylvania */ +#include #include #include #include @@ -120,4 +121,7 @@ int main(int argc, char* argv[]) { count = StreamCompaction::Efficient::compact(NPOT, c, a); //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + + int exit; + std::cin >> exit; } diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index fe872d4..9431045 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,4 +1,5 @@ #include "common.h" +#include "device_launch_parameters.h" void checkCUDAErrorFn(const char *msg, const char *file, int line) { cudaError_t err = cudaGetLastError(); @@ -24,6 +25,13 @@ namespace Common { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int idx = (blockDim.x * blockIdx.x) + threadIdx.x; + if (idx > n) return; + + if (idata[idx] == 0) + bools[idx] = 0; + else + bools[idx] = 1; } /** @@ -33,7 +41,12 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO -} + int idx = (blockDim.x * blockIdx.x) + threadIdx.x; + if (idx > n) return; + if (bools[idx] == 1) + odata[indices[idx]] = idata[idx]; } + } +} \ No newline at end of file diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 4f52663..c57c04f 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -3,6 +3,7 @@ #include #include #include +#include #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) @@ -33,3 +34,32 @@ namespace Common { const int *idata, const int *bools, const int *indices); } } + +inline void findOptimizedSize(const int n, int& gridDim, int& blockDim){ + //assuming that cc >= 3.0, max gridDim = 16. + + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, 0); + + int prev_t_diff, curr_t_diff; + + blockDim = prop.maxThreadsPerBlock; + gridDim = ceil(n / (float)blockDim); + curr_t_diff = (blockDim * gridDim) - n; + + do{ + prev_t_diff = curr_t_diff; + + blockDim /= 2; + gridDim = ceil(n / (float)blockDim); + curr_t_diff = (blockDim * gridDim) - n; + + } + while (prev_t_diff > curr_t_diff && + gridDim < 16 && + blockDim > 32); + + blockDim *= 2; + gridDim = ceil(n / (float)blockDim); + +} \ No newline at end of file diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..9fe20ad 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,15 +1,16 @@ #include #include "cpu.h" +#include namespace StreamCompaction { namespace CPU { - /** * CPU scan (prefix sum). */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + odata[0] = 0; + for (int i = 1; i < n; i++) + odata[i] = odata[i - 1] + idata[i - 1]; } /** @@ -19,7 +20,15 @@ void scan(int n, int *odata, const int *idata) { */ int compactWithoutScan(int n, int *odata, const int *idata) { // TODO - return -1; + int elemNo = 0; + + for (int i = 0; i < n; i++) + if (idata[i] != 0){ + odata[elemNo] = idata[i]; + elemNo++; + } + + return elemNo; } /** @@ -29,7 +38,27 @@ int compactWithoutScan(int n, int *odata, const int *idata) { */ int compactWithScan(int n, int *odata, const int *idata) { // TODO - return -1; + int *tmp = (int*) malloc(sizeof(int) * n); + + //compute temp array + for (int i = 0; i < n; i++) { + if (idata[i] == 0) + tmp[i] = 0; + else + tmp[i] = 1; + } + + //scan + int *s = (int*)malloc(sizeof(int) * n); + scan(n, s, tmp); + + //scatter + for (int i = 0; i < n; i++) + if (tmp[i] == 1){ + odata[s[i]] = idata[i]; + } + + return s[n-1]; } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b2f739b..6b938f5 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -2,18 +2,113 @@ #include #include "common.h" #include "efficient.h" +#include "device_launch_parameters.h" + +#include namespace StreamCompaction { namespace Efficient { + +__device__ __host__ int pow(int n, int p){ + int out = 1; + for (int i = 0; i < p; i++) + out *= n; + return out; +}; + +__global__ void upSweep(int* arr, int n, int iter){ + int k = (blockDim.x * blockIdx.x) + threadIdx.x; + if (k >= n) return; + + for (int d = 0; d <= iter; d++) { + int p = pow(2, d + 1); + int ans = 0; + + if (k % p == 0) + ans = arr[k + pow(2, d) - 1]; + + __syncthreads(); + + if (k % p == 0) + arr[k + p - 1] += ans; + + __syncthreads(); + } +} + +__global__ void downSweep(int* arr, int n, int iter){ + int k = (blockDim.x * blockIdx.x) + threadIdx.x; + if (k >= n) return; + if (k == n - 1){ + arr[k] = 0; + } + __syncthreads(); -// TODO: __global__ + for (int d = iter; d >= 0; d--) { + int p = pow(2, d); + int p_plusOne = pow(2, d + 1); + + int t = 0; + int t2 = 0; + + if (k % p_plusOne == 0) { + t = arr[k + p - 1]; + t2 = arr[k + p_plusOne - 1]; + } + + __syncthreads(); + + if (k % p_plusOne == 0){ + arr[k + p - 1] = t2; + arr[k + p_plusOne - 1] += t; + } + + __syncthreads(); + } +} /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + int log2ceil = ilog2ceil(n); + int log2 = ilog2(n); + int n_p2 = pow(2, log2ceil); + + int size = sizeof(int) * n; + int size_n_p2 = sizeof(int) * n_p2; + + int *d_idata; + cudaMalloc(&d_idata, size_n_p2); + cudaMemcpy(d_idata, idata, size, cudaMemcpyHostToDevice); + cudaMemset(d_idata + size, 0, size_n_p2 - size); + + int blockDim, gridDim; + findOptimizedSize(n_p2, gridDim, blockDim); + + upSweep <<< gridDim, blockDim >>>(d_idata, n_p2, log2ceil - 1); + downSweep <<< gridDim, blockDim >>>(d_idata, n_p2, log2ceil - 1); + + //test + cudaMemcpy(odata, d_idata, size, cudaMemcpyDeviceToHost); + + /* + std::cout << "idata: "; + int total = 0; + for (int i = 0; i < n; i++) { + std::cout << idata[i] << " "; + total += idata[i]; + } + std::cout << "Total : " << total << std::endl; + std::cout << std::endl; + std::cout << std::endl; + std::cout << "odata: "; + for (int i = 0; i < n; i++) { + std::cout << odata[i] << " "; + } + std::cout << std::endl; + */ + cudaFree(d_idata); } /** @@ -25,10 +120,47 @@ void scan(int n, int *odata, const int *idata) { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. */ + int compact(int n, int *odata, const int *idata) { // TODO - return -1; -} + int size = sizeof(int) * n; + + ////////////////////// + + int *d_idata; + cudaMalloc(&d_idata, size); + cudaMemcpy(d_idata, idata, size, cudaMemcpyHostToDevice); + + int* h_indices = (int*)malloc(sizeof(int) * n); + int* d_indices; + cudaMalloc(&d_indices, size); + int* h_bools = (int*)malloc(sizeof(int) * n); + int* d_bools; + cudaMalloc(&d_bools, size); + + int *d_odata; + cudaMalloc(&d_odata, size); + + ////////////////////////// + + int blockDim, gridDim; + findOptimizedSize(n, gridDim, blockDim); + + StreamCompaction::Common::kernMapToBoolean << < gridDim, blockDim >> >(n, d_bools, d_idata); + cudaMemcpy(h_bools, d_bools, size, cudaMemcpyDeviceToHost); + StreamCompaction::Efficient::scan(n, h_indices, h_bools); + cudaMemcpy(d_indices, h_indices, size, cudaMemcpyHostToDevice); + StreamCompaction::Common::kernScatter << < gridDim, blockDim >> >(n, d_odata, d_idata, d_bools, d_indices); + + cudaMemcpy(odata, d_odata, size, cudaMemcpyDeviceToHost); + + cudaFree(d_idata); + cudaFree(d_odata); + cudaFree(d_indices); + cudaFree(d_bools); + + return h_indices[n - 1]; +} } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..42b4968 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -2,18 +2,71 @@ #include #include "common.h" #include "naive.h" +#include "device_launch_parameters.h" + +#include namespace StreamCompaction { namespace Naive { -// TODO: __global__ +__device__ __host__ int pow(int n, int p){ + int out = 1; + for (int i = 0; i < p; i++) + out *= n; + return out; +}; + +__device__ __host__ void swap(int** x, int** y){ + int *tmp = *x; + *x = *y; + *y = tmp; +}; + +__global__ void gpuScan(int* out, int* in, int n, int iter){ + int k = (blockDim.x * blockIdx.x) + threadIdx.x; + if (k > n) return; + + for (int d = 1; d <= iter; d++) { + int p = pow(2, d - 1); + if (k >= p) + out[k] = in[k - p] + in[k]; + else + out[k] = in[k]; + + __syncthreads(); + swap(&in, &out); + } + + out[k] = k > 0 ? in[k - 1] : 0; +} /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ +* Performs prefix-sum (aka scan) on idata, storing the result into odata. +*/ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + + int log2ceil = ilog2ceil(n); + int log2 = ilog2(n); + int n_p2 = pow(2, log2ceil); + + int size = sizeof(int) * n; + int size_n_p2 = sizeof(int) * n_p2; + + int *d_idata, *d_odata; + cudaMalloc(&d_idata, size_n_p2); + cudaMalloc(&d_odata, size_n_p2); + cudaMemcpy(d_idata, idata, size, cudaMemcpyHostToDevice); + cudaMemset(d_idata + size, 0, size_n_p2 - size); + + int blockDim, gridDim; + findOptimizedSize(n_p2, gridDim, blockDim); + + gpuScan << < gridDim, blockDim >> >(d_odata, d_idata, n_p2, log2ceil); + + cudaMemcpy(odata, d_odata, size, cudaMemcpyDeviceToHost); + + cudaFree(d_idata); + cudaFree(d_odata); } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..cdd95b1 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -8,7 +8,6 @@ namespace StreamCompaction { namespace Thrust { - /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ From 52588e16cd92d1dc6b0e27cfb91af6b3981cd5f7 Mon Sep 17 00:00:00 2001 From: pratchpak Date: Sun, 13 Sep 2015 23:49:48 -0400 Subject: [PATCH 2/7] thrust done --- stream_compaction/thrust.cu | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index cdd95b1..ac94e96 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -11,11 +11,12 @@ namespace Thrust { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ -void scan(int n, int *odata, const int *idata) { - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); -} + void scan(int n, int *odata, const int *idata) { + // TODO use `thrust::exclusive_scan` + // example: for device_vectors dv_in and dv_out: + // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(idata, idata + n, odata); + } } } From 59de4e5aac1dfa73feaf05b781d4c0a51671586b Mon Sep 17 00:00:00 2001 From: pratchpak Date: Sun, 13 Sep 2015 23:51:30 -0400 Subject: [PATCH 3/7] readme --- README.md | 192 +----------------------------------------------------- 1 file changed, 2 insertions(+), 190 deletions(-) diff --git a/README.md b/README.md index 4535eea..07599c8 100644 --- a/README.md +++ b/README.md @@ -3,181 +3,8 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) - -### (TODO: Your README) - -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) - -Instructions (delete me) -======================== - -This is due Sunday, September 13 at midnight. - -**Summary:** In this project, you'll implement GPU stream compaction in CUDA, -from scratch. This algorithm is widely used, and will be important for -accelerating your path tracer project. - -Your stream compaction implementations in this project will simply remove `0`s -from an array of `int`s. In the path tracer, you will remove terminated paths -from an array of rays. - -In addition to being useful for your path tracer, this project is meant to -reorient your algorithmic thinking to the way of the GPU. On GPUs, many -algorithms can benefit from massive parallelism and, in particular, data -parallelism: executing the same code many times simultaneously with different -data. - -You'll implement a few different versions of the *Scan* (*Prefix Sum*) -algorithm. First, you'll implement a CPU version of the algorithm to reinforce -your understanding. Then, you'll write a few GPU implementations: "naive" and -"work-efficient." Finally, you'll use some of these to implement GPU stream -compaction. - -**Algorithm overview & details:** There are two primary references for details -on the implementation of scan and stream compaction. - -* The [slides on Parallel Algorithms](https://github.com/CIS565-Fall-2015/cis565-fall-2015.github.io/raw/master/lectures/2-Parallel-Algorithms.pptx) - for Scan, Stream Compaction, and Work-Efficient Parallel Scan. -* GPU Gems 3, Chapter 39 - [Parallel Prefix Sum (Scan) with CUDA](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html). - -Your GPU stream compaction implementation will live inside of the -`stream_compaction` subproject. This way, you will be able to easily copy it -over for use in your GPU path tracer. - - -## Part 0: The Usual - -This project (and all other CUDA projects in this course) requires an NVIDIA -graphics card with CUDA capability. Any card with Compute Capability 2.0 -(`sm_20`) or greater will work. Check your GPU on this -[compatibility table](https://developer.nvidia.com/cuda-gpus). -If you do not have a personal machine with these specs, you may use those -computers in the Moore 100B/C which have supported GPUs. - -**HOWEVER**: If you need to use the lab computer for your development, you will -not presently be able to do GPU performance profiling. This will be very -important for debugging performance bottlenecks in your program. - -### Useful existing code - -* `stream_compaction/common.h` - * `checkCUDAError` macro: checks for CUDA errors and exits if there were any. - * `ilog2ceil(x)`: computes the ceiling of log2(x), as an integer. -* `main.cpp` - * Some testing code for your implementations. - -**Note 1:** The tests will simply compare against your CPU implementation -Do it first! - -**Note 2:** The tests default to an array of size 256. -Test with something larger (10,000?), too! - - -## Part 1: CPU Scan & Stream Compaction - -This stream compaction method will remove `0`s from an array of `int`s. - -Do this first, and double check the output! It will be used as the expected -value for the other tests. - -In `stream_compaction/cpu.cu`, implement: - -* `StreamCompaction::CPU::scan`: compute an exclusive prefix sum. -* `StreamCompaction::CPU::compactWithoutScan`: stream compaction without using - the `scan` function. -* `StreamCompaction::CPU::compactWithScan`: stream compaction using the `scan` - function. Map the input array to an array of 0s and 1s, scan it, and use - scatter to produce the output. You will need a **CPU** scatter implementation - for this (see slides or GPU Gems chapter for an explanation). - -These implementations should only be a few lines long. - - -## Part 2: Naive GPU Scan Algorithm - -In `stream_compaction/naive.cu`, implement `StreamCompaction::Naive::scan` - -This uses the "Naive" algorithm from GPU Gems 3, Section 39.2.1. We haven't yet -taught shared memory, and you **shouldn't use it yet**. Example 39-1 uses -shared memory, but is limited to operating on very small arrays! Instead, write -this using global memory only. As a result of this, you will have to do -`ilog2ceil(n)` separate kernel invocations. - -Beware of errors in Example 39-1 in the book; both the pseudocode and the CUDA -code in the online version of Chapter 39 are known to have a few small errors -(in superscripting, missing braces, bad indentation, etc.) - -Since the parallel scan algorithm operates on a binary tree structure, it works -best with arrays with power-of-two length. Make sure your implementation works -on non-power-of-two sized arrays (see `ilog2ceil`). This requires extra memory -- your intermediate array sizes will need to be rounded to the next power of -two. - - -## Part 3: Work-Efficient GPU Scan & Stream Compaction - -### 3.1. Scan - -In `stream_compaction/efficient.cu`, implement -`StreamCompaction::Efficient::scan` - -All of the text in Part 2 applies. - -* This uses the "Work-Efficient" algorithm from GPU Gems 3, Section 39.2.2. -* Beware of errors in Example 39-2. -* Test non-power-of-two sized arrays. - -### 3.2. Stream Compaction - -This stream compaction method will remove `0`s from an array of `int`s. - -In `stream_compaction/efficient.cu`, implement -`StreamCompaction::Efficient::compact` - -For compaction, you will also need to implement the scatter algorithm presented -in the slides and the GPU Gems chapter. - -In `stream_compaction/common.cu`, implement these for use in `compact`: - -* `StreamCompaction::Common::kernMapToBoolean` -* `StreamCompaction::Common::kernScatter` - - -## Part 4: Using Thrust's Implementation - -In `stream_compaction/thrust.cu`, implement: - -* `StreamCompaction::Thrust::scan` - -This should be a very short function which wraps a call to the Thrust library -function `thrust::exclusive_scan(first, last, result)`. - -To measure timing, be sure to exclude memory operations by passing -`exclusive_scan` a `thrust::device_vector` (which is already allocated on the -GPU). You can create a `thrust::device_vector` by creating a -`thrust::host_vector` from the given pointer, then casting it. - - -## Part 5: Radix Sort (Extra Credit) (+10) - -Add an additional module to the `stream_compaction` subproject. Implement radix -sort using one of your scan implementations. Add tests to check its correctness. - - -## Write-up - -1. Update all of the TODOs at the top of this README. -2. Add a description of this project including a list of its features. -3. Add your performance analysis (see below). - -All extra credit features must be documented in your README, explaining its -value (with performance comparison, if applicable!) and showing an example how -it works. For radix sort, show how it is called and an example of its output. - -Always profile with Release mode builds and run without debugging. +* Ratchpak (Dome) Pongmongkol +* Tested on: OSX Yosemite 10.10.5, i7 @ 2.4GHz 16GB, GT 650M 1024MB (rMBP Early 2013) ### Questions @@ -205,18 +32,3 @@ Always profile with Release mode builds and run without debugging. These questions should help guide you in performance analysis on future assignments, as well. - -## Submit - -If you have modified any of the `CMakeLists.txt` files at all (aside from the -list of `SOURCE_FILES`), you must test that your project can build in Moore -100B/C. Beware of any build issues discussed on the Google Group. - -1. Open a GitHub pull request so that we can see that you have finished. - The title should be "Submission: YOUR NAME". -2. Send an email to the TA (gmail: kainino1+cis565@) with: - * **Subject**: in the form of `[CIS565] Project 2: PENNKEY` - * Direct link to your pull request on GitHub - * In the form of a grade (0-100+) with comments, evaluate your own - performance on the project. - * Feedback on the project itself, if any. From 6c67f2e476c9fb82fe9ade32e1c7739bc23861c2 Mon Sep 17 00:00:00 2001 From: pratchpak Date: Mon, 14 Sep 2015 00:07:04 -0400 Subject: [PATCH 4/7] result for readme --- README.md | 66 +++++++++++++++++++++++++++----------- stream_compaction/common.h | 6 ++-- 2 files changed, 51 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 07599c8..5e02f79 100644 --- a/README.md +++ b/README.md @@ -8,27 +8,57 @@ CUDA Stream Compaction ### Questions -* Roughly optimize the block sizes of each of your implementations for minimal - run time on your GPU. - * (You shouldn't compare unoptimized implementations to each other!) - -* Compare all of these GPU Scan implementations (Naive, Work-Efficient, and - Thrust) to the serial CPU version of Scan. Plot a graph of the comparison - (with array size on the independent axis). - * You should use CUDA events for timing. Be sure **not** to include any - explicit memory operations in your performance measurements, for - comparability. - * To guess at what might be happening inside the Thrust implementation, take - a look at the Nsight timeline for its execution. +* For block sizing, I implemented a function "findOptimizedSize" in common.h, The strategy is to find +a block size that, in total, has the least number of unused threads. This start from decreasing +prop.maxThreadsPerBlock by prop.warpSize (32), while keeping gridDim < 16 and blockDim > prop.warpSize (32) + +# Graph * Write a brief explanation of the phenomena you see here. * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is it different for each implementation? -* Paste the output of the test program into a triple-backtick block in your - README. - * If you add your own tests (e.g. for radix sort or to test additional corner - cases), be sure to mention it explicitly. +## Example Output + +``` +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 26 0 ] +==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ] +==== cpu scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6146 6190 ] + passed +==== naive scan, power-of-two ==== + passed +==== naive scan, non-power-of-two ==== + passed +==== work-efficient scan, power-of-two ==== + passed +==== work-efficient scan, non-power-of-two ==== + passed +==== thrust scan, power-of-two ==== + passed +==== thrust scan, non-power-of-two ==== + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 3 2 ] + passed +==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 1 ] + passed +==== work-efficient compact, power-of-two ==== + passed +==== work-efficient compact, non-power-of-two ==== + passed -These questions should help guide you in performance analysis on future -assignments, as well. +``` \ No newline at end of file diff --git a/stream_compaction/common.h b/stream_compaction/common.h index c57c04f..0d767ec 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -50,16 +50,16 @@ inline void findOptimizedSize(const int n, int& gridDim, int& blockDim){ do{ prev_t_diff = curr_t_diff; - blockDim /= 2; + blockDim -= prop.warpSize; gridDim = ceil(n / (float)blockDim); curr_t_diff = (blockDim * gridDim) - n; } while (prev_t_diff > curr_t_diff && gridDim < 16 && - blockDim > 32); + blockDim > prop.warpSize); - blockDim *= 2; + blockDim += prop.warpSize; gridDim = ceil(n / (float)blockDim); } \ No newline at end of file From 354dded9d1a4cb835900d587de327d26f34f7a51 Mon Sep 17 00:00:00 2001 From: pratchpak Date: Mon, 14 Sep 2015 03:20:20 -0400 Subject: [PATCH 5/7] Finished readme. --- README.md | 25 ++++++--- src/main.cpp | 2 +- stream_compaction/common.h | 20 +++---- stream_compaction/cpu.cu | 8 +++ stream_compaction/efficient.cu | 96 ++++++++++++++++++++-------------- stream_compaction/naive.cu | 20 ++++++- stream_compaction/thrust.cu | 13 +++++ 7 files changed, 124 insertions(+), 60 deletions(-) diff --git a/README.md b/README.md index 5e02f79..ec1d509 100644 --- a/README.md +++ b/README.md @@ -6,17 +6,26 @@ CUDA Stream Compaction * Ratchpak (Dome) Pongmongkol * Tested on: OSX Yosemite 10.10.5, i7 @ 2.4GHz 16GB, GT 650M 1024MB (rMBP Early 2013) -### Questions - * For block sizing, I implemented a function "findOptimizedSize" in common.h, The strategy is to find -a block size that, in total, has the least number of unused threads. This start from decreasing -prop.maxThreadsPerBlock by prop.warpSize (32), while keeping gridDim < 16 and blockDim > prop.warpSize (32) +a block size that, in total, has the least number of unused threads. +This start by... +1. setting gridSize = 16 and blockDim = ceil(n / (float)gridDim); +2. decreasing blockDim by prop.warpSize (32), while keeping blockDim > prop.warpSize (32) && gridDim <= 16 + - gridDimis calculated from : ceil(n / blockDim); + +# Analysis + +For N = 256, the execution time of each methods are as follows +Thrust < Naive < Work-Efficient +Which is quite unexpected at first, as the 'work-efficient' one is supposed to be faster than +the 'naive' one. -# Graph +My speculation is that the 'Work-Efficient' one will only shine when N is larger than the maximum +concurrent thread the graphics card can handle (which means, for the naive one, we would need to divide +N threads into N/maxThread batches for every step. Meanwhile, for 'Work-Efficient', the number of +threads for most step will be comparatively, and substantially, lower than its counterpart. -* Write a brief explanation of the phenomena you see here. - * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is - it different for each implementation? +Please note that I did not implement the Naive algorithm to support N > totalMaxThread ## Example Output diff --git a/src/main.cpp b/src/main.cpp index b2fa9bb..d86ea40 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -15,7 +15,7 @@ #include "testing_helpers.hpp" int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; + const int SIZE = 256; const int NPOT = SIZE - 3; int a[SIZE], b[SIZE], c[SIZE]; diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 0d767ec..61bae7b 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -43,21 +43,23 @@ inline void findOptimizedSize(const int n, int& gridDim, int& blockDim){ int prev_t_diff, curr_t_diff; - blockDim = prop.maxThreadsPerBlock; - gridDim = ceil(n / (float)blockDim); + gridDim = 16; + blockDim = ceil(n / (float)gridDim); + if (blockDim < 32) blockDim = 32; curr_t_diff = (blockDim * gridDim) - n; + prev_t_diff = curr_t_diff; - do{ - prev_t_diff = curr_t_diff; - + do + { blockDim -= prop.warpSize; gridDim = ceil(n / (float)blockDim); curr_t_diff = (blockDim * gridDim) - n; - } - while (prev_t_diff > curr_t_diff && - gridDim < 16 && - blockDim > prop.warpSize); + prev_t_diff = curr_t_diff; + } + while (prev_t_diff >= curr_t_diff && + blockDim > prop.warpSize && + blockDim <= prop.maxThreadsPerBlock); blockDim += prop.warpSize; gridDim = ceil(n / (float)blockDim); diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 9fe20ad..ffdb197 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,6 +1,8 @@ #include #include "cpu.h" #include +#include +#include namespace StreamCompaction { namespace CPU { @@ -8,9 +10,15 @@ namespace CPU { * CPU scan (prefix sum). */ void scan(int n, int *odata, const int *idata) { + auto begin = std::chrono::high_resolution_clock::now(); odata[0] = 0; for (int i = 1; i < n; i++) odata[i] = odata[i - 1] + idata[i - 1]; + + auto end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end - begin).count(); + + std::cout << "CPU - time : " << duration << " ms" << std::endl; } /** diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 6b938f5..b748e01 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -16,55 +16,39 @@ __device__ __host__ int pow(int n, int p){ return out; }; -__global__ void upSweep(int* arr, int n, int iter){ - int k = (blockDim.x * blockIdx.x) + threadIdx.x; +__global__ void upSweep(int* arr, int n, int d, int p){ + int k = ((blockDim.x * blockIdx.x) + threadIdx.x) * p; if (k >= n) return; - for (int d = 0; d <= iter; d++) { - int p = pow(2, d + 1); - int ans = 0; - - if (k % p == 0) - ans = arr[k + pow(2, d) - 1]; + int ans = arr[k + pow(2, d) - 1]; - __syncthreads(); + __syncthreads(); - if (k % p == 0) - arr[k + p - 1] += ans; + arr[k + p - 1] += ans; +} - __syncthreads(); - } +__global__ void setValArray(int* arr, int index, int value){ + arr[index] = value; } -__global__ void downSweep(int* arr, int n, int iter){ - int k = (blockDim.x * blockIdx.x) + threadIdx.x; +__global__ void downSweep(int* arr, int n, int d, int p){ + int k = ((blockDim.x * blockIdx.x) + threadIdx.x) * p; if (k >= n) return; - if (k == n - 1){ - arr[k] = 0; - } - __syncthreads(); - - for (int d = iter; d >= 0; d--) { - int p = pow(2, d); - int p_plusOne = pow(2, d + 1); - int t = 0; - int t2 = 0; + int t = arr[k + pow(2, d) - 1]; + int t2 = arr[k + p - 1]; - if (k % p_plusOne == 0) { - t = arr[k + p - 1]; - t2 = arr[k + p_plusOne - 1]; - } + __syncthreads(); - __syncthreads(); + arr[k + pow(2, d) - 1] = t2; + arr[k + p - 1] += t; +} - if (k % p_plusOne == 0){ - arr[k + p - 1] = t2; - arr[k + p_plusOne - 1] += t; - } +__global__ void setExclusive(int* out, int* in, int n){ + int k = (blockDim.x * blockIdx.x) + threadIdx.x; + if (k >= n) return; - __syncthreads(); - } + out[k] = k > 0 ? in[k - 1] : 0; } /** @@ -78,17 +62,48 @@ void scan(int n, int *odata, const int *idata) { int size = sizeof(int) * n; int size_n_p2 = sizeof(int) * n_p2; - int *d_idata; + int *d_idata, *d_odata; cudaMalloc(&d_idata, size_n_p2); + cudaMalloc(&d_odata, size_n_p2); cudaMemcpy(d_idata, idata, size, cudaMemcpyHostToDevice); - cudaMemset(d_idata + size, 0, size_n_p2 - size); + cudaMemset(&d_idata[n], 0, size_n_p2 - size); int blockDim, gridDim; findOptimizedSize(n_p2, gridDim, blockDim); + //std::cout << "blockDim : " << blockDim << std::endl; + + ////////////////////////// + float time = 0; + cudaEvent_t beginEvent_acc; + cudaEvent_t endEvent_acc; + + cudaEventCreate(&beginEvent_acc); + cudaEventCreate(&endEvent_acc); + cudaEventRecord(beginEvent_acc, 0); - upSweep <<< gridDim, blockDim >>>(d_idata, n_p2, log2ceil - 1); - downSweep <<< gridDim, blockDim >>>(d_idata, n_p2, log2ceil - 1); + for (int d = 0; d <= log2ceil - 1; d++){ + int p = pow(2, d + 1); + findOptimizedSize(n_p2 / p, gridDim, blockDim); + upSweep << < gridDim, blockDim >> >(d_idata, n_p2, d, p); + } + setValArray << < 1, 1 >> >(d_idata, n_p2 - 1, 0); + + for (int d = log2ceil - 1; d >= 0; d--){ + int p = pow(2, d + 1); + findOptimizedSize(n_p2 / p, gridDim, blockDim); + downSweep << < gridDim, blockDim >> >(d_idata, n_p2, d, p); + } + + findOptimizedSize(n_p2, gridDim, blockDim); + setExclusive << < blockDim, gridDim >> >(d_odata, d_idata, n); + + cudaEventRecord(endEvent_acc, 0); + cudaEventSynchronize(endEvent_acc); + cudaEventElapsedTime(&time, beginEvent_acc, endEvent_acc); + + std::cout << "Work Efficient - time : " << time << " ms" << std::endl; + ////////////////////////// //test cudaMemcpy(odata, d_idata, size, cudaMemcpyDeviceToHost); @@ -108,6 +123,7 @@ void scan(int n, int *odata, const int *idata) { } std::cout << std::endl; */ + cudaFree(d_idata); } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 42b4968..abd798d 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -24,7 +24,7 @@ __device__ __host__ void swap(int** x, int** y){ __global__ void gpuScan(int* out, int* in, int n, int iter){ int k = (blockDim.x * blockIdx.x) + threadIdx.x; - if (k > n) return; + if (k >= n) return; for (int d = 1; d <= iter; d++) { int p = pow(2, d - 1); @@ -38,7 +38,9 @@ __global__ void gpuScan(int* out, int* in, int n, int iter){ swap(&in, &out); } - out[k] = k > 0 ? in[k - 1] : 0; + if (iter % 2 == 1) swap(&in, &out); + + out[k] = k > 0 ? in[k-1] : 0; } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. @@ -60,9 +62,23 @@ void scan(int n, int *odata, const int *idata) { int blockDim, gridDim; findOptimizedSize(n_p2, gridDim, blockDim); + //std::cout << "blockDim : " << blockDim << std::endl; + + ////////////////////////// + float time = 0; + cudaEvent_t beginEvent_acc; + cudaEvent_t endEvent_acc; + cudaEventCreate(&beginEvent_acc); + cudaEventCreate(&endEvent_acc); + cudaEventRecord(beginEvent_acc, 0); gpuScan << < gridDim, blockDim >> >(d_odata, d_idata, n_p2, log2ceil); + cudaEventRecord(endEvent_acc, 0); + cudaEventSynchronize(endEvent_acc); + cudaEventElapsedTime(&time, beginEvent_acc, endEvent_acc); + std::cout << "Naive - time : " << time << " ms" << std::endl; + ////////////////////////// cudaMemcpy(odata, d_odata, size, cudaMemcpyDeviceToHost); cudaFree(d_idata); diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index ac94e96..efec95a 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -15,8 +15,21 @@ namespace Thrust { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + float time = 0; + cudaEvent_t beginEvent_acc; + cudaEvent_t endEvent_acc; + + cudaEventCreate(&beginEvent_acc); + cudaEventCreate(&endEvent_acc); + cudaEventRecord(beginEvent_acc, 0); thrust::exclusive_scan(idata, idata + n, odata); + + cudaEventRecord(endEvent_acc, 0); + cudaEventSynchronize(endEvent_acc); + cudaEventElapsedTime(&time, beginEvent_acc, endEvent_acc); + + std::cout << "thrust - time : " << time << " ms" << std::endl; } } } From 699396c74b984a2af929faa2a035f99fbdc33933 Mon Sep 17 00:00:00 2001 From: pratchpak Date: Mon, 14 Sep 2015 03:56:50 -0400 Subject: [PATCH 6/7] fixed minor bugs --- README.md | 10 ++-------- src/main.cpp | 2 +- stream_compaction/common.h | 26 ++++++++------------------ stream_compaction/efficient.cu | 7 ++++++- stream_compaction/naive.cu | 2 +- 5 files changed, 18 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index ec1d509..d3a70a6 100644 --- a/README.md +++ b/README.md @@ -6,12 +6,8 @@ CUDA Stream Compaction * Ratchpak (Dome) Pongmongkol * Tested on: OSX Yosemite 10.10.5, i7 @ 2.4GHz 16GB, GT 650M 1024MB (rMBP Early 2013) -* For block sizing, I implemented a function "findOptimizedSize" in common.h, The strategy is to find -a block size that, in total, has the least number of unused threads. -This start by... -1. setting gridSize = 16 and blockDim = ceil(n / (float)gridDim); -2. decreasing blockDim by prop.warpSize (32), while keeping blockDim > prop.warpSize (32) && gridDim <= 16 - - gridDimis calculated from : ceil(n / blockDim); +* For block sizing, I implemented a function "findOptimizedSize" in common.h. +The strategy is to spread out the thread to several blocks as much as possible (gridDim < 16) # Analysis @@ -25,8 +21,6 @@ concurrent thread the graphics card can handle (which means, for the naive one, N threads into N/maxThread batches for every step. Meanwhile, for 'Work-Efficient', the number of threads for most step will be comparatively, and substantially, lower than its counterpart. -Please note that I did not implement the Naive algorithm to support N > totalMaxThread - ## Example Output ``` diff --git a/src/main.cpp b/src/main.cpp index d86ea40..b2fa9bb 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -15,7 +15,7 @@ #include "testing_helpers.hpp" int main(int argc, char* argv[]) { - const int SIZE = 256; + const int SIZE = 1 << 8; const int NPOT = SIZE - 3; int a[SIZE], b[SIZE], c[SIZE]; diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 61bae7b..ac6e662 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -41,27 +41,17 @@ inline void findOptimizedSize(const int n, int& gridDim, int& blockDim){ cudaDeviceProp prop; cudaGetDeviceProperties(&prop, 0); - int prev_t_diff, curr_t_diff; - - gridDim = 16; + int diff; + + gridDim = 16 * prop.multiProcessorCount; blockDim = ceil(n / (float)gridDim); if (blockDim < 32) blockDim = 32; - curr_t_diff = (blockDim * gridDim) - n; - prev_t_diff = curr_t_diff; - do + if (blockDim > prop.maxThreadsPerBlock) { - blockDim -= prop.warpSize; - gridDim = ceil(n / (float)blockDim); - curr_t_diff = (blockDim * gridDim) - n; - - prev_t_diff = curr_t_diff; + diff = (blockDim - prop.maxThreadsPerBlock) * gridDim; + int additionalGrid = diff / prop.maxThreadsPerBlock; + gridDim += additionalGrid; + blockDim = prop.maxThreadsPerBlock; } - while (prev_t_diff >= curr_t_diff && - blockDim > prop.warpSize && - blockDim <= prop.maxThreadsPerBlock); - - blockDim += prop.warpSize; - gridDim = ceil(n / (float)blockDim); - } \ No newline at end of file diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b748e01..cc341f5 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -95,6 +95,10 @@ void scan(int n, int *odata, const int *idata) { downSweep << < gridDim, blockDim >> >(d_idata, n_p2, d, p); } + cudaEventRecord(endEvent_acc, 0); + cudaEventSynchronize(endEvent_acc); + cudaEventElapsedTime(&time, beginEvent_acc, endEvent_acc); + findOptimizedSize(n_p2, gridDim, blockDim); setExclusive << < blockDim, gridDim >> >(d_odata, d_idata, n); @@ -166,6 +170,7 @@ int compact(int n, int *odata, const int *idata) { StreamCompaction::Common::kernMapToBoolean << < gridDim, blockDim >> >(n, d_bools, d_idata); cudaMemcpy(h_bools, d_bools, size, cudaMemcpyDeviceToHost); StreamCompaction::Efficient::scan(n, h_indices, h_bools); + cudaMemcpy(d_indices, h_indices, size, cudaMemcpyHostToDevice); StreamCompaction::Common::kernScatter << < gridDim, blockDim >> >(n, d_odata, d_idata, d_bools, d_indices); @@ -176,7 +181,7 @@ int compact(int n, int *odata, const int *idata) { cudaFree(d_indices); cudaFree(d_bools); - return h_indices[n - 1]; + return h_indices[n - 1] + h_bools[n - 1]; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index abd798d..1bb96ad 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -62,7 +62,7 @@ void scan(int n, int *odata, const int *idata) { int blockDim, gridDim; findOptimizedSize(n_p2, gridDim, blockDim); - //std::cout << "blockDim : " << blockDim << std::endl; + std::cout << "blockDim : " << blockDim << std::endl; ////////////////////////// float time = 0; From c69892dbcbc70503c77843d466ecd477a5e8ce22 Mon Sep 17 00:00:00 2001 From: pratchpak Date: Mon, 14 Sep 2015 23:46:13 -0400 Subject: [PATCH 7/7] tweaked? --- README.md | 15 ++++++--- src/main.cpp | 2 +- stream_compaction/common.h | 12 +++++-- stream_compaction/efficient.cu | 61 ++++++++++++++++++++++------------ stream_compaction/naive.cu | 35 +++++++++---------- 5 files changed, 79 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index d3a70a6..f1a21c0 100644 --- a/README.md +++ b/README.md @@ -14,12 +14,17 @@ The strategy is to spread out the thread to several blocks as much as possible ( For N = 256, the execution time of each methods are as follows Thrust < Naive < Work-Efficient Which is quite unexpected at first, as the 'work-efficient' one is supposed to be faster than -the 'naive' one. +the 'naive' one. -My speculation is that the 'Work-Efficient' one will only shine when N is larger than the maximum -concurrent thread the graphics card can handle (which means, for the naive one, we would need to divide -N threads into N/maxThread batches for every step. Meanwhile, for 'Work-Efficient', the number of -threads for most step will be comparatively, and substantially, lower than its counterpart. +My speculation is that the 'Work-Efficient' one will only shine when N is larger than the +maximum concurrent thread the graphics card can handle (which means, for the naive one, +we would need to divide N threads into N/maxThread batches for every step. Meanwhile, for 'Work-Efficient', +the number of threads for most step will be comparatively, and substantially, lower than its counterpart. + +Also, the current implementation of 'work-efficient' requires a lot of global +memory access, which substantially build up the access delay. Given that its calculation shows the +locality property, the speed of the calculation should be substantially lowered if the calculations +happen on the shared memory instead. ## Example Output diff --git a/src/main.cpp b/src/main.cpp index b2fa9bb..0ead633 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -15,7 +15,7 @@ #include "testing_helpers.hpp" int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; + const int SIZE = 1 << 16; const int NPOT = SIZE - 3; int a[SIZE], b[SIZE], c[SIZE]; diff --git a/stream_compaction/common.h b/stream_compaction/common.h index ac6e662..a0b76d6 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -41,11 +41,19 @@ inline void findOptimizedSize(const int n, int& gridDim, int& blockDim){ cudaDeviceProp prop; cudaGetDeviceProperties(&prop, 0); + if (n > prop.multiProcessorCount * prop.maxThreadsPerMultiProcessor) { + blockDim = prop.maxThreadsPerMultiProcessor / 16; + gridDim = ceil(n / (float)blockDim); + return; +} + int diff; - gridDim = 16 * prop.multiProcessorCount; blockDim = ceil(n / (float)gridDim); - if (blockDim < 32) blockDim = 32; + if (blockDim < 32) { + blockDim = 32; + gridDim = ceil(n / (float)blockDim); + } if (blockDim > prop.maxThreadsPerBlock) { diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index cc341f5..1b1cf82 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -16,32 +16,48 @@ __device__ __host__ int pow(int n, int p){ return out; }; -__global__ void upSweep(int* arr, int n, int d, int p){ +__device__ __host__ void swap(int** x, int** y){ + int *tmp = *x; + *x = *y; + *y = tmp; +}; + +__global__ void copyModifiedUpSweep(int* out, int* arr, int n, int p){ int k = ((blockDim.x * blockIdx.x) + threadIdx.x) * p; if (k >= n) return; - int ans = arr[k + pow(2, d) - 1]; + out[k + p - 1] = arr[k + p - 1]; +} - __syncthreads(); +__global__ void upSweep(int* out, int* arr, int n, int d, int p){ + int k = ((blockDim.x * blockIdx.x) + threadIdx.x) * p; + if (k >= n) return; - arr[k + p - 1] += ans; + out[k + p - 1] = arr[k + p - 1] + arr[k + pow(2, d) - 1]; } __global__ void setValArray(int* arr, int index, int value){ arr[index] = value; } -__global__ void downSweep(int* arr, int n, int d, int p){ +__global__ void copyModifiedDownSweep(int* out, int* arr, int n, int d, int p){ int k = ((blockDim.x * blockIdx.x) + threadIdx.x) * p; if (k >= n) return; - int t = arr[k + pow(2, d) - 1]; - int t2 = arr[k + p - 1]; + int p_d = pow(2, d); - __syncthreads(); + out[k + p - 1] = arr[k + p - 1]; + out[k + p_d - 1] = arr[k + p_d - 1]; +} - arr[k + pow(2, d) - 1] = t2; - arr[k + p - 1] += t; +__global__ void downSweep(int* out, int* arr, int n, int d, int p){ + int k = ((blockDim.x * blockIdx.x) + threadIdx.x) * p; + if (k >= n) return; + + int p_d = pow(2, d); + + out[k + p - 1] = arr[k + p - 1] + arr[k + p_d - 1]; + out[k + p_d - 1] = arr[k + p - 1]; } __global__ void setExclusive(int* out, int* in, int n){ @@ -51,6 +67,12 @@ __global__ void setExclusive(int* out, int* in, int n){ out[k] = k > 0 ? in[k - 1] : 0; } + +__global__ void shiftBit(int* out, int* in){ + int k = (blockDim.x * blockIdx.x) + threadIdx.x; + out[k] = k > 0 ? in[k - 1] : 0; +} + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ @@ -84,33 +106,30 @@ void scan(int n, int *odata, const int *idata) { for (int d = 0; d <= log2ceil - 1; d++){ int p = pow(2, d + 1); findOptimizedSize(n_p2 / p, gridDim, blockDim); - upSweep << < gridDim, blockDim >> >(d_idata, n_p2, d, p); + upSweep << < gridDim, blockDim >> >(d_odata, d_idata, n_p2, d, p); + copyModifiedUpSweep << < gridDim, blockDim >> >(d_idata, d_odata, n_p2, p); } - + setValArray << < 1, 1 >> >(d_idata, n_p2 - 1, 0); for (int d = log2ceil - 1; d >= 0; d--){ int p = pow(2, d + 1); findOptimizedSize(n_p2 / p, gridDim, blockDim); - downSweep << < gridDim, blockDim >> >(d_idata, n_p2, d, p); + downSweep << < gridDim, blockDim >> >(d_odata, d_idata, n_p2, d, p); + copyModifiedDownSweep << < gridDim, blockDim >> >(d_idata, d_odata, n_p2, d, p); } cudaEventRecord(endEvent_acc, 0); cudaEventSynchronize(endEvent_acc); cudaEventElapsedTime(&time, beginEvent_acc, endEvent_acc); - findOptimizedSize(n_p2, gridDim, blockDim); - setExclusive << < blockDim, gridDim >> >(d_odata, d_idata, n); + //setExclusive << < blockDim, gridDim >> >(d_odata, d_idata, n); - cudaEventRecord(endEvent_acc, 0); - cudaEventSynchronize(endEvent_acc); - cudaEventElapsedTime(&time, beginEvent_acc, endEvent_acc); - std::cout << "Work Efficient - time : " << time << " ms" << std::endl; + ////////////////////////// //test cudaMemcpy(odata, d_idata, size, cudaMemcpyDeviceToHost); - /* std::cout << "idata: "; int total = 0; @@ -127,7 +146,7 @@ void scan(int n, int *odata, const int *idata) { } std::cout << std::endl; */ - + cudaFree(d_odata); cudaFree(d_idata); } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 1bb96ad..e36fcd1 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -22,25 +22,19 @@ __device__ __host__ void swap(int** x, int** y){ *y = tmp; }; -__global__ void gpuScan(int* out, int* in, int n, int iter){ +__global__ void gpuScan(int* out, int* in, int n, int p){ int k = (blockDim.x * blockIdx.x) + threadIdx.x; if (k >= n) return; - for (int d = 1; d <= iter; d++) { - int p = pow(2, d - 1); - - if (k >= p) - out[k] = in[k - p] + in[k]; - else - out[k] = in[k]; - - __syncthreads(); - swap(&in, &out); - } - - if (iter % 2 == 1) swap(&in, &out); + if (k >= p) + out[k] = in[k - p] + in[k]; + else + out[k] = in[k]; +} - out[k] = k > 0 ? in[k-1] : 0; +__global__ void shiftBit(int* out, int* in){ + int k = (blockDim.x * blockIdx.x) + threadIdx.x; + out[k] = k > 0 ? in[k - 1] : 0; } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. @@ -62,7 +56,7 @@ void scan(int n, int *odata, const int *idata) { int blockDim, gridDim; findOptimizedSize(n_p2, gridDim, blockDim); - std::cout << "blockDim : " << blockDim << std::endl; + //std::cout << "blockDim : " << blockDim << std::endl; ////////////////////////// float time = 0; @@ -72,7 +66,14 @@ void scan(int n, int *odata, const int *idata) { cudaEventCreate(&beginEvent_acc); cudaEventCreate(&endEvent_acc); cudaEventRecord(beginEvent_acc, 0); - gpuScan << < gridDim, blockDim >> >(d_odata, d_idata, n_p2, log2ceil); + + for (int d = 1; d <= log2ceil; d++) { + int p = pow(2, d - 1); + gpuScan << < gridDim, blockDim >> >(d_odata, d_idata, n_p2, p); + swap(&d_odata, &d_idata); + } + shiftBit<< < gridDim, blockDim >> >(d_odata, d_idata); + cudaEventRecord(endEvent_acc, 0); cudaEventSynchronize(endEvent_acc); cudaEventElapsedTime(&time, beginEvent_acc, endEvent_acc);