From 65770b78f207b43799aa2aae26bf37b5bf881103 Mon Sep 17 00:00:00 2001
From: pratchpak <rpong@seas.upenn.edu>
Date: Sun, 13 Sep 2015 23:42:07 -0400
Subject: [PATCH 1/7] efficient done

---
 src/main.cpp                   |   4 +
 stream_compaction/common.cu    |  15 +++-
 stream_compaction/common.h     |  30 +++++++
 stream_compaction/cpu.cu       |  39 +++++++--
 stream_compaction/efficient.cu | 142 +++++++++++++++++++++++++++++++--
 stream_compaction/naive.cu     |  63 +++++++++++++--
 stream_compaction/thrust.cu    |   1 -
 7 files changed, 277 insertions(+), 17 deletions(-)
diff --git a/src/main.cpp b/src/main.cpp
index 675da35..b2fa9bb 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -6,6 +6,7 @@
  * @copyright University of Pennsylvania
  */
 
+#include <iostream>
 #include <cstdio>
 #include <stream_compaction/cpu.h>
 #include <stream_compaction/naive.h>
@@ -120,4 +121,7 @@ int main(int argc, char* argv[]) {
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
     //printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
+
+	int exit;
+	std::cin >> exit;
 }
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index fe872d4..9431045 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -1,4 +1,5 @@
 #include "common.h"
+#include "device_launch_parameters.h"
 
 void checkCUDAErrorFn(const char *msg, const char *file, int line) {
     cudaError_t err = cudaGetLastError();
@@ -24,6 +25,13 @@ namespace Common {
  */
 __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
     // TODO
+	int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
+	if (idx > n) return;
+
+	if (idata[idx] == 0)
+		bools[idx] = 0;
+	else
+		bools[idx] = 1;
 }
 
 /**
@@ -33,7 +41,12 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
 __global__ void kernScatter(int n, int *odata,
         const int *idata, const int *bools, const int *indices) {
     // TODO
-}
+	int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
+	if (idx > n) return;
 
+	if (bools[idx] == 1)
+		odata[indices[idx]] = idata[idx];
 }
+
 }
+}
\ No newline at end of file
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index 4f52663..c57c04f 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -3,6 +3,7 @@
 #include <cstdio>
 #include <cstring>
 #include <cmath>
+#include <cuda_runtime.h>
 
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
@@ -33,3 +34,32 @@ namespace Common {
             const int *idata, const int *bools, const int *indices);
 }
 }
+
+inline void findOptimizedSize(const int n, int& gridDim, int& blockDim){
+	//assuming that cc >= 3.0, max gridDim = 16.
+
+	cudaDeviceProp prop;
+	cudaGetDeviceProperties(&prop, 0);
+
+	int prev_t_diff, curr_t_diff;
+
+	blockDim = prop.maxThreadsPerBlock;
+	gridDim = ceil(n / (float)blockDim);
+	curr_t_diff = (blockDim * gridDim) - n;
+	
+	do{
+		prev_t_diff = curr_t_diff;
+
+		blockDim /= 2;
+		gridDim = ceil(n / (float)blockDim);
+		curr_t_diff = (blockDim * gridDim) - n;
+
+	}
+	while (prev_t_diff > curr_t_diff &&
+		gridDim < 16 &&
+		blockDim > 32);
+
+	blockDim *= 2;
+	gridDim = ceil(n / (float)blockDim);
+
+}
\ No newline at end of file
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index e600c29..9fe20ad 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,15 +1,16 @@
 #include <cstdio>
 #include "cpu.h"
+#include <stdlib.h> 
 
 namespace StreamCompaction {
 namespace CPU {
-
 /**
  * CPU scan (prefix sum).
  */
 void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
+	odata[0] = 0;
+	for (int i = 1; i < n; i++)
+		odata[i] = odata[i - 1] + idata[i - 1];
 }
 
 /**
@@ -19,7 +20,15 @@ void scan(int n, int *odata, const int *idata) {
  */
 int compactWithoutScan(int n, int *odata, const int *idata) {
     // TODO
-    return -1;
+	int elemNo = 0;
+
+	for (int i = 0; i < n; i++)
+		if (idata[i] != 0){
+			odata[elemNo] = idata[i];
+			elemNo++;
+		}
+
+	return elemNo;
 }
 
 /**
@@ -29,7 +38,27 @@ int compactWithoutScan(int n, int *odata, const int *idata) {
  */
 int compactWithScan(int n, int *odata, const int *idata) {
     // TODO
-    return -1;
+	int *tmp = (int*) malloc(sizeof(int) * n);
+
+	//compute temp array
+	for (int i = 0; i < n; i++) {
+		if (idata[i] == 0)
+			tmp[i] = 0;
+		else
+			tmp[i] = 1;
+	}
+
+	//scan
+	int *s = (int*)malloc(sizeof(int) * n);
+	scan(n, s, tmp);
+
+	//scatter
+	for (int i = 0; i < n; i++)
+		if (tmp[i] == 1){
+			odata[s[i]] = idata[i];
+		}
+
+	return s[n-1];
 }
 
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index b2f739b..6b938f5 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -2,18 +2,113 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "efficient.h"
+#include "device_launch_parameters.h"
+
+#include <iostream>
 
 namespace StreamCompaction {
 namespace Efficient {
+	
+__device__  __host__ int pow(int n, int p){
+	int out = 1;
+	for (int i = 0; i < p; i++)
+		out *= n;
+	return out;
+};
+
+__global__ void upSweep(int* arr, int n, int iter){
+	int k = (blockDim.x * blockIdx.x) + threadIdx.x;
+	if (k >= n) return;
+
+	for (int d = 0; d <= iter; d++) {
+		int p = pow(2, d + 1);
+		int ans = 0;
+
+		if (k % p == 0)
+			ans = arr[k + pow(2, d) - 1];
+
+		__syncthreads();
+
+		if (k % p == 0)
+			arr[k + p - 1] += ans;
+
+		__syncthreads();
+	}
+}
+
+__global__ void downSweep(int* arr, int n, int iter){
+	int k = (blockDim.x * blockIdx.x) + threadIdx.x;
+	if (k >= n) return;
+	if (k == n - 1){
+		arr[k] = 0;
+	}
+	__syncthreads();
 
-// TODO: __global__
+	for (int d = iter; d >= 0; d--) {
+		int p = pow(2, d);
+		int p_plusOne = pow(2, d + 1);
+
+		int t = 0;
+		int t2 = 0;
+
+		if (k % p_plusOne == 0) {
+			t = arr[k + p - 1];
+			t2 = arr[k + p_plusOne - 1];
+		}
+
+		__syncthreads();
+
+		if (k % p_plusOne == 0){
+			arr[k + p - 1] = t2;
+			arr[k + p_plusOne - 1] += t;
+		}
+
+		__syncthreads();
+	}
+}
 
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
 void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
+	int log2ceil = ilog2ceil(n);
+	int log2 = ilog2(n);
+	int n_p2 = pow(2, log2ceil);
+
+	int size = sizeof(int) * n;
+	int size_n_p2 = sizeof(int) * n_p2;
+
+	int *d_idata;
+	cudaMalloc(&d_idata, size_n_p2);
+	cudaMemcpy(d_idata, idata, size, cudaMemcpyHostToDevice);
+	cudaMemset(d_idata + size, 0, size_n_p2 - size);
+
+	int blockDim, gridDim;
+	findOptimizedSize(n_p2, gridDim, blockDim);
+
+	upSweep <<< gridDim, blockDim >>>(d_idata, n_p2, log2ceil - 1);
+	downSweep <<< gridDim, blockDim >>>(d_idata, n_p2, log2ceil - 1);
+
+	//test
+	cudaMemcpy(odata, d_idata, size, cudaMemcpyDeviceToHost);
+
+	/*
+	std::cout << "idata: ";
+	int total = 0;
+	for (int i = 0; i < n; i++) {
+		std::cout << idata[i] << " ";
+		total += idata[i];
+	}
+	std::cout << "Total : " << total << std::endl;
+	std::cout << std::endl;
+	std::cout << std::endl;
+	std::cout << "odata: ";
+	for (int i = 0; i < n; i++) {
+		std::cout << odata[i] << " ";
+	}
+	std::cout << std::endl;
+	*/
+	cudaFree(d_idata);
 }
 
 /**
@@ -25,10 +120,47 @@ void scan(int n, int *odata, const int *idata) {
  * @param idata  The array of elements to compact.
  * @returns      The number of elements remaining after compaction.
  */
+
 int compact(int n, int *odata, const int *idata) {
     // TODO
-    return -1;
-}
+	int size = sizeof(int) * n;
+
+	//////////////////////
+
+	int *d_idata;
+	cudaMalloc(&d_idata, size);
+	cudaMemcpy(d_idata, idata, size, cudaMemcpyHostToDevice);
+
+	int* h_indices = (int*)malloc(sizeof(int) * n);
+	int* d_indices;
+	cudaMalloc(&d_indices, size);
 
+	int* h_bools = (int*)malloc(sizeof(int) * n);
+	int* d_bools;
+	cudaMalloc(&d_bools, size);
+
+	int *d_odata;
+	cudaMalloc(&d_odata, size);
+
+	//////////////////////////
+
+	int blockDim, gridDim;
+	findOptimizedSize(n, gridDim, blockDim);
+
+	StreamCompaction::Common::kernMapToBoolean << < gridDim, blockDim >> >(n, d_bools, d_idata);
+	cudaMemcpy(h_bools, d_bools, size, cudaMemcpyDeviceToHost);
+	StreamCompaction::Efficient::scan(n, h_indices, h_bools);
+	cudaMemcpy(d_indices, h_indices, size, cudaMemcpyHostToDevice);
+	StreamCompaction::Common::kernScatter << < gridDim, blockDim >> >(n, d_odata, d_idata, d_bools, d_indices);
+
+	cudaMemcpy(odata, d_odata, size, cudaMemcpyDeviceToHost);
+
+	cudaFree(d_idata);
+	cudaFree(d_odata);
+	cudaFree(d_indices);
+	cudaFree(d_bools);
+
+	return h_indices[n - 1];
+}
 }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 3d86b60..42b4968 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -2,18 +2,71 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "naive.h"
+#include "device_launch_parameters.h"
+
+#include <iostream>
 
 namespace StreamCompaction {
 namespace Naive {
 
-// TODO: __global__
+__device__  __host__ int pow(int n, int p){
+	int out = 1;
+	for (int i = 0; i < p; i++)
+		out *= n;
+	return out;
+};
+
+__device__  __host__ void swap(int** x, int** y){
+	int *tmp = *x;
+	*x = *y;
+	*y = tmp;
+};
+
+__global__ void gpuScan(int* out, int* in, int n, int iter){
+	int k = (blockDim.x * blockIdx.x) + threadIdx.x;
+	if (k > n) return;
+
+	for (int d = 1; d <= iter; d++) {
+		int p = pow(2, d - 1);
 
+		if (k >= p)
+			out[k] = in[k - p] + in[k];
+		else
+			out[k] = in[k];
+
+		__syncthreads();
+		swap(&in, &out);
+	}
+
+	out[k] = k > 0 ? in[k - 1] : 0;
+}
 /**
- * Performs prefix-sum (aka scan) on idata, storing the result into odata.
- */
+* Performs prefix-sum (aka scan) on idata, storing the result into odata.
+*/
 void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
+
+	int log2ceil = ilog2ceil(n);
+	int log2 = ilog2(n);
+	int n_p2 = pow(2, log2ceil);
+
+	int size = sizeof(int) * n;
+	int size_n_p2 = sizeof(int) * n_p2;
+
+	int *d_idata, *d_odata;
+	cudaMalloc(&d_idata, size_n_p2);
+	cudaMalloc(&d_odata, size_n_p2);
+	cudaMemcpy(d_idata, idata, size, cudaMemcpyHostToDevice);
+	cudaMemset(d_idata + size, 0, size_n_p2 - size);
+
+	int blockDim, gridDim;
+	findOptimizedSize(n_p2, gridDim, blockDim);
+
+	gpuScan << < gridDim, blockDim >> >(d_odata, d_idata, n_p2, log2ceil);
+
+	cudaMemcpy(odata, d_odata, size, cudaMemcpyDeviceToHost);
+
+	cudaFree(d_idata);
+	cudaFree(d_odata);
 }
 
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index d8dbb32..cdd95b1 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -8,7 +8,6 @@
 
 namespace StreamCompaction {
 namespace Thrust {
-
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */

From 52588e16cd92d1dc6b0e27cfb91af6b3981cd5f7 Mon Sep 17 00:00:00 2001
From: pratchpak <rpong@seas.upenn.edu>
Date: Sun, 13 Sep 2015 23:49:48 -0400
Subject: [PATCH 2/7] thrust done

---
 stream_compaction/thrust.cu | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index cdd95b1..ac94e96 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -11,11 +11,12 @@ namespace Thrust {
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
-void scan(int n, int *odata, const int *idata) {
-    // TODO use `thrust::exclusive_scan`
-    // example: for device_vectors dv_in and dv_out:
-    // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
-}
+	void scan(int n, int *odata, const int *idata) {
+		// TODO use `thrust::exclusive_scan`
+		// example: for device_vectors dv_in and dv_out:
+		// thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
 
+		thrust::exclusive_scan(idata, idata + n, odata);
+	}
 }
 }

From 59de4e5aac1dfa73feaf05b781d4c0a51671586b Mon Sep 17 00:00:00 2001
From: pratchpak <rpong@seas.upenn.edu>
Date: Sun, 13 Sep 2015 23:51:30 -0400
Subject: [PATCH 3/7] readme

---
 README.md | 192 +-----------------------------------------------------
 1 file changed, 2 insertions(+), 190 deletions(-)

diff --git a/README.md b/README.md
index 4535eea..07599c8 100644
--- a/README.md
+++ b/README.md
@@ -3,181 +3,8 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
-
-### (TODO: Your README)
-
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
-
-Instructions (delete me)
-========================
-
-This is due Sunday, September 13 at midnight.
-
-**Summary:** In this project, you'll implement GPU stream compaction in CUDA,
-from scratch. This algorithm is widely used, and will be important for
-accelerating your path tracer project.
-
-Your stream compaction implementations in this project will simply remove `0`s
-from an array of `int`s. In the path tracer, you will remove terminated paths
-from an array of rays.
-
-In addition to being useful for your path tracer, this project is meant to
-reorient your algorithmic thinking to the way of the GPU. On GPUs, many
-algorithms can benefit from massive parallelism and, in particular, data
-parallelism: executing the same code many times simultaneously with different
-data.
-
-You'll implement a few different versions of the *Scan* (*Prefix Sum*)
-algorithm. First, you'll implement a CPU version of the algorithm to reinforce
-your understanding. Then, you'll write a few GPU implementations: "naive" and
-"work-efficient." Finally, you'll use some of these to implement GPU stream
-compaction.
-
-**Algorithm overview & details:** There are two primary references for details
-on the implementation of scan and stream compaction.
-
-* The [slides on Parallel Algorithms](https://github.com/CIS565-Fall-2015/cis565-fall-2015.github.io/raw/master/lectures/2-Parallel-Algorithms.pptx)
-  for Scan, Stream Compaction, and Work-Efficient Parallel Scan.
-* GPU Gems 3, Chapter 39 - [Parallel Prefix Sum (Scan) with CUDA](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html).
-
-Your GPU stream compaction implementation will live inside of the
-`stream_compaction` subproject. This way, you will be able to easily copy it
-over for use in your GPU path tracer.
-
-
-## Part 0: The Usual
-
-This project (and all other CUDA projects in this course) requires an NVIDIA
-graphics card with CUDA capability. Any card with Compute Capability 2.0
-(`sm_20`) or greater will work. Check your GPU on this
-[compatibility table](https://developer.nvidia.com/cuda-gpus).
-If you do not have a personal machine with these specs, you may use those
-computers in the Moore 100B/C which have supported GPUs.
-
-**HOWEVER**: If you need to use the lab computer for your development, you will
-not presently be able to do GPU performance profiling. This will be very
-important for debugging performance bottlenecks in your program.
-
-### Useful existing code
-
-* `stream_compaction/common.h`
-  * `checkCUDAError` macro: checks for CUDA errors and exits if there were any.
-  * `ilog2ceil(x)`: computes the ceiling of log2(x), as an integer.
-* `main.cpp`
-  * Some testing code for your implementations.
-
-**Note 1:** The tests will simply compare against your CPU implementation
-Do it first!
-
-**Note 2:** The tests default to an array of size 256.
-Test with something larger (10,000?), too!
-
-
-## Part 1: CPU Scan & Stream Compaction
-
-This stream compaction method will remove `0`s from an array of `int`s.
-
-Do this first, and double check the output! It will be used as the expected
-value for the other tests.
-
-In `stream_compaction/cpu.cu`, implement:
-
-* `StreamCompaction::CPU::scan`: compute an exclusive prefix sum.
-* `StreamCompaction::CPU::compactWithoutScan`: stream compaction without using
-  the `scan` function.
-* `StreamCompaction::CPU::compactWithScan`: stream compaction using the `scan`
-  function. Map the input array to an array of 0s and 1s, scan it, and use
-  scatter to produce the output. You will need a **CPU** scatter implementation
-  for this (see slides or GPU Gems chapter for an explanation).
-
-These implementations should only be a few lines long.
-
-
-## Part 2: Naive GPU Scan Algorithm
-
-In `stream_compaction/naive.cu`, implement `StreamCompaction::Naive::scan`
-
-This uses the "Naive" algorithm from GPU Gems 3, Section 39.2.1. We haven't yet
-taught shared memory, and you **shouldn't use it yet**. Example 39-1 uses
-shared memory, but is limited to operating on very small arrays! Instead, write
-this using global memory only. As a result of this, you will have to do
-`ilog2ceil(n)` separate kernel invocations.
-
-Beware of errors in Example 39-1 in the book; both the pseudocode and the CUDA
-code in the online version of Chapter 39 are known to have a few small errors
-(in superscripting, missing braces, bad indentation, etc.)
-
-Since the parallel scan algorithm operates on a binary tree structure, it works
-best with arrays with power-of-two length. Make sure your implementation works
-on non-power-of-two sized arrays (see `ilog2ceil`). This requires extra memory
-- your intermediate array sizes will need to be rounded to the next power of
-two.
-
-
-## Part 3: Work-Efficient GPU Scan & Stream Compaction
-
-### 3.1. Scan
-
-In `stream_compaction/efficient.cu`, implement
-`StreamCompaction::Efficient::scan`
-
-All of the text in Part 2 applies.
-
-* This uses the "Work-Efficient" algorithm from GPU Gems 3, Section 39.2.2.
-* Beware of errors in Example 39-2.
-* Test non-power-of-two sized arrays.
-
-### 3.2. Stream Compaction
-
-This stream compaction method will remove `0`s from an array of `int`s.
-
-In `stream_compaction/efficient.cu`, implement
-`StreamCompaction::Efficient::compact`
-
-For compaction, you will also need to implement the scatter algorithm presented
-in the slides and the GPU Gems chapter.
-
-In `stream_compaction/common.cu`, implement these for use in `compact`:
-
-* `StreamCompaction::Common::kernMapToBoolean`
-* `StreamCompaction::Common::kernScatter`
-
-
-## Part 4: Using Thrust's Implementation
-
-In `stream_compaction/thrust.cu`, implement:
-
-* `StreamCompaction::Thrust::scan`
-
-This should be a very short function which wraps a call to the Thrust library
-function `thrust::exclusive_scan(first, last, result)`.
-
-To measure timing, be sure to exclude memory operations by passing
-`exclusive_scan` a `thrust::device_vector` (which is already allocated on the
-GPU).  You can create a `thrust::device_vector` by creating a
-`thrust::host_vector` from the given pointer, then casting it.
-
-
-## Part 5: Radix Sort (Extra Credit) (+10)
-
-Add an additional module to the `stream_compaction` subproject. Implement radix
-sort using one of your scan implementations. Add tests to check its correctness.
-
-
-## Write-up
-
-1. Update all of the TODOs at the top of this README.
-2. Add a description of this project including a list of its features.
-3. Add your performance analysis (see below).
-
-All extra credit features must be documented in your README, explaining its
-value (with performance comparison, if applicable!) and showing an example how
-it works. For radix sort, show how it is called and an example of its output.
-
-Always profile with Release mode builds and run without debugging.
+* Ratchpak (Dome) Pongmongkol
+* Tested on: OSX Yosemite 10.10.5, i7 @ 2.4GHz 16GB, GT 650M 1024MB (rMBP Early 2013)
 
 ### Questions
 
@@ -205,18 +32,3 @@ Always profile with Release mode builds and run without debugging.
 
 These questions should help guide you in performance analysis on future
 assignments, as well.
-
-## Submit
-
-If you have modified any of the `CMakeLists.txt` files at all (aside from the
-list of `SOURCE_FILES`), you must test that your project can build in Moore
-100B/C. Beware of any build issues discussed on the Google Group.
-
-1. Open a GitHub pull request so that we can see that you have finished.
-   The title should be "Submission: YOUR NAME".
-2. Send an email to the TA (gmail: kainino1+cis565@) with:
-   * **Subject**: in the form of `[CIS565] Project 2: PENNKEY`
-   * Direct link to your pull request on GitHub
-   * In the form of a grade (0-100+) with comments, evaluate your own
-     performance on the project.
-   * Feedback on the project itself, if any.

From 6c67f2e476c9fb82fe9ade32e1c7739bc23861c2 Mon Sep 17 00:00:00 2001
From: pratchpak <rpong@seas.upenn.edu>
Date: Mon, 14 Sep 2015 00:07:04 -0400
Subject: [PATCH 4/7] result for readme

---
 README.md                  | 66 +++++++++++++++++++++++++++-----------
 stream_compaction/common.h |  6 ++--
 2 files changed, 51 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 07599c8..5e02f79 100644
--- a/README.md
+++ b/README.md
@@ -8,27 +8,57 @@ CUDA Stream Compaction
 
 ### Questions
 
-* Roughly optimize the block sizes of each of your implementations for minimal
-  run time on your GPU.
-  * (You shouldn't compare unoptimized implementations to each other!)
-
-* Compare all of these GPU Scan implementations (Naive, Work-Efficient, and
-  Thrust) to the serial CPU version of Scan. Plot a graph of the comparison
-  (with array size on the independent axis).
-  * You should use CUDA events for timing. Be sure **not** to include any
-    explicit memory operations in your performance measurements, for
-    comparability.
-  * To guess at what might be happening inside the Thrust implementation, take
-    a look at the Nsight timeline for its execution.
+* For block sizing, I implemented a function "findOptimizedSize" in common.h, The strategy is to find 
+a block size that, in total, has the least number of unused threads. This start from decreasing 
+prop.maxThreadsPerBlock by prop.warpSize (32), while keeping gridDim < 16 and blockDim > prop.warpSize (32)
+
+# Graph
 
 * Write a brief explanation of the phenomena you see here.
   * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is
     it different for each implementation?
 
-* Paste the output of the test program into a triple-backtick block in your
-  README.
-  * If you add your own tests (e.g. for radix sort or to test additional corner
-    cases), be sure to mention it explicitly.
+## Example Output
+
+```
+****************
+** SCAN TESTS **
+****************
+    [  38  19  38  37   5  47  15  35   0  12   3   0  42 ...   26   0 ]
+==== cpu scan, power-of-two ====
+    [   0  38  57  95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ]
+==== cpu scan, non-power-of-two ====
+    [   0  38  57  95 132 137 184 199 234 234 246 249 249 ... 6146 6190 ]
+    passed
+==== naive scan, power-of-two ====
+    passed
+==== naive scan, non-power-of-two ====
+    passed
+==== work-efficient scan, power-of-two ====
+    passed
+==== work-efficient scan, non-power-of-two ====
+    passed
+==== thrust scan, power-of-two ====
+    passed
+==== thrust scan, non-power-of-two ====
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   2   3   2   1   3   1   1   1   2   0   1   0   2 ...   0   0 ]
+==== cpu compact without scan, power-of-two ====
+    [   2   3   2   1   3   1   1   1   2   1   2   1   1 ...   2   1 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+    [   2   3   2   1   3   1   1   1   2   1   2   1   1 ...   3   2 ]
+    passed
+==== cpu compact with scan ====
+    [   2   3   2   1   3   1   1   1   2   1   2   1   1 ...   2   1 ]
+    passed
+==== work-efficient compact, power-of-two ====
+    passed
+==== work-efficient compact, non-power-of-two ====
+    passed
 
-These questions should help guide you in performance analysis on future
-assignments, as well.
+```
\ No newline at end of file
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index c57c04f..0d767ec 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -50,16 +50,16 @@ inline void findOptimizedSize(const int n, int& gridDim, int& blockDim){
 	do{
 		prev_t_diff = curr_t_diff;
 
-		blockDim /= 2;
+		blockDim -= prop.warpSize;
 		gridDim = ceil(n / (float)blockDim);
 		curr_t_diff = (blockDim * gridDim) - n;
 
 	}
 	while (prev_t_diff > curr_t_diff &&
 		gridDim < 16 &&
-		blockDim > 32);
+		blockDim > prop.warpSize);
 
-	blockDim *= 2;
+	blockDim += prop.warpSize;
 	gridDim = ceil(n / (float)blockDim);
 
 }
\ No newline at end of file

From 354dded9d1a4cb835900d587de327d26f34f7a51 Mon Sep 17 00:00:00 2001
From: pratchpak <rpong@seas.upenn.edu>
Date: Mon, 14 Sep 2015 03:20:20 -0400
Subject: [PATCH 5/7] Finished readme.

---
 README.md                      | 25 ++++++---
 src/main.cpp                   |  2 +-
 stream_compaction/common.h     | 20 +++----
 stream_compaction/cpu.cu       |  8 +++
 stream_compaction/efficient.cu | 96 ++++++++++++++++++++--------------
 stream_compaction/naive.cu     | 20 ++++++-
 stream_compaction/thrust.cu    | 13 +++++
 7 files changed, 124 insertions(+), 60 deletions(-)

diff --git a/README.md b/README.md
index 5e02f79..ec1d509 100644
--- a/README.md
+++ b/README.md
@@ -6,17 +6,26 @@ CUDA Stream Compaction
 * Ratchpak (Dome) Pongmongkol
 * Tested on: OSX Yosemite 10.10.5, i7 @ 2.4GHz 16GB, GT 650M 1024MB (rMBP Early 2013)
 
-### Questions
-
 * For block sizing, I implemented a function "findOptimizedSize" in common.h, The strategy is to find 
-a block size that, in total, has the least number of unused threads. This start from decreasing 
-prop.maxThreadsPerBlock by prop.warpSize (32), while keeping gridDim < 16 and blockDim > prop.warpSize (32)
+a block size that, in total, has the least number of unused threads. 
+This start by... 
+1. setting gridSize = 16 and blockDim = ceil(n / (float)gridDim);
+2. decreasing blockDim by prop.warpSize (32), while keeping blockDim > prop.warpSize (32) && gridDim <= 16
+	- gridDimis calculated from : ceil(n / blockDim);
+
+# Analysis
+
+For N = 256, the execution time of each methods are as follows
+Thrust < Naive < Work-Efficient
+Which is quite unexpected at first, as the 'work-efficient' one is supposed to be faster than
+the 'naive' one.
 
-# Graph
+My speculation is that the 'Work-Efficient' one will only shine when N is larger than the maximum
+concurrent thread the graphics card can handle (which means, for the naive one, we would need to divide
+N threads into N/maxThread batches for every step. Meanwhile, for 'Work-Efficient', the number of 
+threads for most step will be comparatively, and substantially, lower than its counterpart.
 
-* Write a brief explanation of the phenomena you see here.
-  * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is
-    it different for each implementation?
+Please note that I did not implement the Naive algorithm to support N > totalMaxThread
 
 ## Example Output
 
diff --git a/src/main.cpp b/src/main.cpp
index b2fa9bb..d86ea40 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -15,7 +15,7 @@
 #include "testing_helpers.hpp"
 
 int main(int argc, char* argv[]) {
-    const int SIZE = 1 << 8;
+    const int SIZE = 256;
     const int NPOT = SIZE - 3;
     int a[SIZE], b[SIZE], c[SIZE];
 
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index 0d767ec..61bae7b 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -43,21 +43,23 @@ inline void findOptimizedSize(const int n, int& gridDim, int& blockDim){
 
 	int prev_t_diff, curr_t_diff;
 
-	blockDim = prop.maxThreadsPerBlock;
-	gridDim = ceil(n / (float)blockDim);
+	gridDim = 16;
+	blockDim = ceil(n / (float)gridDim);
+	if (blockDim < 32) blockDim = 32;
 	curr_t_diff = (blockDim * gridDim) - n;
+	prev_t_diff = curr_t_diff;
 	
-	do{
-		prev_t_diff = curr_t_diff;
-
+	do
+	{
 		blockDim -= prop.warpSize;
 		gridDim = ceil(n / (float)blockDim);
 		curr_t_diff = (blockDim * gridDim) - n;
 
-	}
-	while (prev_t_diff > curr_t_diff &&
-		gridDim < 16 &&
-		blockDim > prop.warpSize);
+		prev_t_diff = curr_t_diff;
+	} 
+	while (prev_t_diff >= curr_t_diff &&
+		blockDim > prop.warpSize &&
+		blockDim <= prop.maxThreadsPerBlock);
 
 	blockDim += prop.warpSize;
 	gridDim = ceil(n / (float)blockDim);
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 9fe20ad..ffdb197 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,6 +1,8 @@
 #include <cstdio>
 #include "cpu.h"
 #include <stdlib.h> 
+#include <chrono> 
+#include <iostream>
 
 namespace StreamCompaction {
 namespace CPU {
@@ -8,9 +10,15 @@ namespace CPU {
  * CPU scan (prefix sum).
  */
 void scan(int n, int *odata, const int *idata) {
+	auto begin = std::chrono::high_resolution_clock::now();
 	odata[0] = 0;
 	for (int i = 1; i < n; i++)
 		odata[i] = odata[i - 1] + idata[i - 1];
+
+	auto end = std::chrono::high_resolution_clock::now();
+	auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
+
+	std::cout << "CPU - time : " << duration << " ms" << std::endl;
 }
 
 /**
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 6b938f5..b748e01 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -16,55 +16,39 @@ __device__  __host__ int pow(int n, int p){
 	return out;
 };
 
-__global__ void upSweep(int* arr, int n, int iter){
-	int k = (blockDim.x * blockIdx.x) + threadIdx.x;
+__global__ void upSweep(int* arr, int n, int d, int p){
+	int k = ((blockDim.x * blockIdx.x) + threadIdx.x) * p;
 	if (k >= n) return;
 
-	for (int d = 0; d <= iter; d++) {
-		int p = pow(2, d + 1);
-		int ans = 0;
-
-		if (k % p == 0)
-			ans = arr[k + pow(2, d) - 1];
+	int ans = arr[k + pow(2, d) - 1];
 
-		__syncthreads();
+	__syncthreads();
 
-		if (k % p == 0)
-			arr[k + p - 1] += ans;
+	arr[k + p - 1] += ans;
+}
 
-		__syncthreads();
-	}
+__global__ void setValArray(int* arr, int index, int value){
+	arr[index] = value;
 }
 
-__global__ void downSweep(int* arr, int n, int iter){
-	int k = (blockDim.x * blockIdx.x) + threadIdx.x;
+__global__ void downSweep(int* arr, int n, int d, int p){
+	int k = ((blockDim.x * blockIdx.x) + threadIdx.x) * p;
 	if (k >= n) return;
-	if (k == n - 1){
-		arr[k] = 0;
-	}
-	__syncthreads();
-
-	for (int d = iter; d >= 0; d--) {
-		int p = pow(2, d);
-		int p_plusOne = pow(2, d + 1);
 
-		int t = 0;
-		int t2 = 0;
+	int t = arr[k + pow(2, d) - 1];
+	int t2 = arr[k + p - 1];
 
-		if (k % p_plusOne == 0) {
-			t = arr[k + p - 1];
-			t2 = arr[k + p_plusOne - 1];
-		}
+	__syncthreads();
 
-		__syncthreads();
+	arr[k + pow(2, d) - 1] = t2;
+	arr[k + p - 1] += t;
+}
 
-		if (k % p_plusOne == 0){
-			arr[k + p - 1] = t2;
-			arr[k + p_plusOne - 1] += t;
-		}
+__global__ void setExclusive(int* out, int* in, int n){
+	int k = (blockDim.x * blockIdx.x) + threadIdx.x;
+	if (k >= n) return;
 
-		__syncthreads();
-	}
+	out[k] = k > 0 ? in[k - 1] : 0;
 }
 
 /**
@@ -78,17 +62,48 @@ void scan(int n, int *odata, const int *idata) {
 	int size = sizeof(int) * n;
 	int size_n_p2 = sizeof(int) * n_p2;
 
-	int *d_idata;
+	int *d_idata, *d_odata;
 	cudaMalloc(&d_idata, size_n_p2);
+	cudaMalloc(&d_odata, size_n_p2);
 	cudaMemcpy(d_idata, idata, size, cudaMemcpyHostToDevice);
-	cudaMemset(d_idata + size, 0, size_n_p2 - size);
+	cudaMemset(&d_idata[n], 0, size_n_p2 - size);
 
 	int blockDim, gridDim;
 	findOptimizedSize(n_p2, gridDim, blockDim);
+	//std::cout << "blockDim : " << blockDim << std::endl;
+
+	//////////////////////////
+	float time = 0;
+	cudaEvent_t beginEvent_acc;
+	cudaEvent_t endEvent_acc;
+
+	cudaEventCreate(&beginEvent_acc);
+	cudaEventCreate(&endEvent_acc);
+	cudaEventRecord(beginEvent_acc, 0);
 
-	upSweep <<< gridDim, blockDim >>>(d_idata, n_p2, log2ceil - 1);
-	downSweep <<< gridDim, blockDim >>>(d_idata, n_p2, log2ceil - 1);
+	for (int d = 0; d <= log2ceil - 1; d++){
+		int p = pow(2, d + 1);
+		findOptimizedSize(n_p2 / p, gridDim, blockDim);
+		upSweep << < gridDim, blockDim >> >(d_idata, n_p2, d, p);
+	}
 
+	setValArray << < 1, 1 >> >(d_idata, n_p2 - 1, 0);
+
+	for (int d = log2ceil - 1; d >= 0; d--){
+		int p = pow(2, d + 1);
+		findOptimizedSize(n_p2 / p, gridDim, blockDim);
+		downSweep << < gridDim, blockDim >> >(d_idata, n_p2, d, p);
+	}
+
+	findOptimizedSize(n_p2, gridDim, blockDim);
+	setExclusive << < blockDim, gridDim >> >(d_odata, d_idata, n);
+
+	cudaEventRecord(endEvent_acc, 0);
+	cudaEventSynchronize(endEvent_acc);
+	cudaEventElapsedTime(&time, beginEvent_acc, endEvent_acc);
+	
+	std::cout << "Work Efficient - time : " << time << " ms" << std::endl;
+	//////////////////////////
 	//test
 	cudaMemcpy(odata, d_idata, size, cudaMemcpyDeviceToHost);
 
@@ -108,6 +123,7 @@ void scan(int n, int *odata, const int *idata) {
 	}
 	std::cout << std::endl;
 	*/
+
 	cudaFree(d_idata);
 }
 
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 42b4968..abd798d 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -24,7 +24,7 @@ __device__  __host__ void swap(int** x, int** y){
 
 __global__ void gpuScan(int* out, int* in, int n, int iter){
 	int k = (blockDim.x * blockIdx.x) + threadIdx.x;
-	if (k > n) return;
+	if (k >= n) return;
 
 	for (int d = 1; d <= iter; d++) {
 		int p = pow(2, d - 1);
@@ -38,7 +38,9 @@ __global__ void gpuScan(int* out, int* in, int n, int iter){
 		swap(&in, &out);
 	}
 
-	out[k] = k > 0 ? in[k - 1] : 0;
+	if (iter % 2 == 1) swap(&in, &out);
+
+	out[k] = k > 0 ? in[k-1] : 0;
 }
 /**
 * Performs prefix-sum (aka scan) on idata, storing the result into odata.
@@ -60,9 +62,23 @@ void scan(int n, int *odata, const int *idata) {
 
 	int blockDim, gridDim;
 	findOptimizedSize(n_p2, gridDim, blockDim);
+	//std::cout << "blockDim : " << blockDim << std::endl;
+
+	//////////////////////////
+	float time = 0;
+	cudaEvent_t beginEvent_acc;
+	cudaEvent_t endEvent_acc;
 
+	cudaEventCreate(&beginEvent_acc);
+	cudaEventCreate(&endEvent_acc);
+	cudaEventRecord(beginEvent_acc, 0);
 	gpuScan << < gridDim, blockDim >> >(d_odata, d_idata, n_p2, log2ceil);
+	cudaEventRecord(endEvent_acc, 0);
+	cudaEventSynchronize(endEvent_acc);
+	cudaEventElapsedTime(&time, beginEvent_acc, endEvent_acc);
 
+	std::cout << "Naive - time : " << time << " ms" << std::endl;
+	//////////////////////////
 	cudaMemcpy(odata, d_odata, size, cudaMemcpyDeviceToHost);
 
 	cudaFree(d_idata);
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index ac94e96..efec95a 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -15,8 +15,21 @@ namespace Thrust {
 		// TODO use `thrust::exclusive_scan`
 		// example: for device_vectors dv_in and dv_out:
 		// thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+		float time = 0;
+		cudaEvent_t beginEvent_acc;
+		cudaEvent_t endEvent_acc;
+
+		cudaEventCreate(&beginEvent_acc);
+		cudaEventCreate(&endEvent_acc);
+		cudaEventRecord(beginEvent_acc, 0);
 
 		thrust::exclusive_scan(idata, idata + n, odata);
+
+		cudaEventRecord(endEvent_acc, 0);
+		cudaEventSynchronize(endEvent_acc);
+		cudaEventElapsedTime(&time, beginEvent_acc, endEvent_acc);
+
+		std::cout << "thrust - time : " << time << " ms" << std::endl;
 	}
 }
 }

From 699396c74b984a2af929faa2a035f99fbdc33933 Mon Sep 17 00:00:00 2001
From: pratchpak <rpong@seas.upenn.edu>
Date: Mon, 14 Sep 2015 03:56:50 -0400
Subject: [PATCH 6/7] fixed minor bugs

---
 README.md                      | 10 ++--------
 src/main.cpp                   |  2 +-
 stream_compaction/common.h     | 26 ++++++++------------------
 stream_compaction/efficient.cu |  7 ++++++-
 stream_compaction/naive.cu     |  2 +-
 5 files changed, 18 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index ec1d509..d3a70a6 100644
--- a/README.md
+++ b/README.md
@@ -6,12 +6,8 @@ CUDA Stream Compaction
 * Ratchpak (Dome) Pongmongkol
 * Tested on: OSX Yosemite 10.10.5, i7 @ 2.4GHz 16GB, GT 650M 1024MB (rMBP Early 2013)
 
-* For block sizing, I implemented a function "findOptimizedSize" in common.h, The strategy is to find 
-a block size that, in total, has the least number of unused threads. 
-This start by... 
-1. setting gridSize = 16 and blockDim = ceil(n / (float)gridDim);
-2. decreasing blockDim by prop.warpSize (32), while keeping blockDim > prop.warpSize (32) && gridDim <= 16
-	- gridDimis calculated from : ceil(n / blockDim);
+* For block sizing, I implemented a function "findOptimizedSize" in common.h. 
+The strategy is to spread out the thread to several blocks as much as possible (gridDim < 16)
 
 # Analysis
 
@@ -25,8 +21,6 @@ concurrent thread the graphics card can handle (which means, for the naive one,
 N threads into N/maxThread batches for every step. Meanwhile, for 'Work-Efficient', the number of 
 threads for most step will be comparatively, and substantially, lower than its counterpart.
 
-Please note that I did not implement the Naive algorithm to support N > totalMaxThread
-
 ## Example Output
 
 ```
diff --git a/src/main.cpp b/src/main.cpp
index d86ea40..b2fa9bb 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -15,7 +15,7 @@
 #include "testing_helpers.hpp"
 
 int main(int argc, char* argv[]) {
-    const int SIZE = 256;
+    const int SIZE = 1 << 8;
     const int NPOT = SIZE - 3;
     int a[SIZE], b[SIZE], c[SIZE];
 
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index 61bae7b..ac6e662 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -41,27 +41,17 @@ inline void findOptimizedSize(const int n, int& gridDim, int& blockDim){
 	cudaDeviceProp prop;
 	cudaGetDeviceProperties(&prop, 0);
 
-	int prev_t_diff, curr_t_diff;
-
-	gridDim = 16;
+	int diff;
+	
+	gridDim = 16 * prop.multiProcessorCount;
 	blockDim = ceil(n / (float)gridDim);
 	if (blockDim < 32) blockDim = 32;
-	curr_t_diff = (blockDim * gridDim) - n;
-	prev_t_diff = curr_t_diff;
 	
-	do
+	if (blockDim > prop.maxThreadsPerBlock)
 	{
-		blockDim -= prop.warpSize;
-		gridDim = ceil(n / (float)blockDim);
-		curr_t_diff = (blockDim * gridDim) - n;
-
-		prev_t_diff = curr_t_diff;
+		diff = (blockDim - prop.maxThreadsPerBlock) * gridDim;
+		int additionalGrid = diff / prop.maxThreadsPerBlock;
+		gridDim += additionalGrid;
+		blockDim = prop.maxThreadsPerBlock;
 	} 
-	while (prev_t_diff >= curr_t_diff &&
-		blockDim > prop.warpSize &&
-		blockDim <= prop.maxThreadsPerBlock);
-
-	blockDim += prop.warpSize;
-	gridDim = ceil(n / (float)blockDim);
-
 }
\ No newline at end of file
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index b748e01..cc341f5 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -95,6 +95,10 @@ void scan(int n, int *odata, const int *idata) {
 		downSweep << < gridDim, blockDim >> >(d_idata, n_p2, d, p);
 	}
 
+	cudaEventRecord(endEvent_acc, 0);
+	cudaEventSynchronize(endEvent_acc);
+	cudaEventElapsedTime(&time, beginEvent_acc, endEvent_acc);
+
 	findOptimizedSize(n_p2, gridDim, blockDim);
 	setExclusive << < blockDim, gridDim >> >(d_odata, d_idata, n);
 
@@ -166,6 +170,7 @@ int compact(int n, int *odata, const int *idata) {
 	StreamCompaction::Common::kernMapToBoolean << < gridDim, blockDim >> >(n, d_bools, d_idata);
 	cudaMemcpy(h_bools, d_bools, size, cudaMemcpyDeviceToHost);
 	StreamCompaction::Efficient::scan(n, h_indices, h_bools);
+
 	cudaMemcpy(d_indices, h_indices, size, cudaMemcpyHostToDevice);
 	StreamCompaction::Common::kernScatter << < gridDim, blockDim >> >(n, d_odata, d_idata, d_bools, d_indices);
 
@@ -176,7 +181,7 @@ int compact(int n, int *odata, const int *idata) {
 	cudaFree(d_indices);
 	cudaFree(d_bools);
 
-	return h_indices[n - 1];
+	return h_indices[n - 1] + h_bools[n - 1];
 }
 }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index abd798d..1bb96ad 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -62,7 +62,7 @@ void scan(int n, int *odata, const int *idata) {
 
 	int blockDim, gridDim;
 	findOptimizedSize(n_p2, gridDim, blockDim);
-	//std::cout << "blockDim : " << blockDim << std::endl;
+	std::cout << "blockDim : " << blockDim << std::endl;
 
 	//////////////////////////
 	float time = 0;

From c69892dbcbc70503c77843d466ecd477a5e8ce22 Mon Sep 17 00:00:00 2001
From: pratchpak <rpong@seas.upenn.edu>
Date: Mon, 14 Sep 2015 23:46:13 -0400
Subject: [PATCH 7/7] tweaked?

---
 README.md                      | 15 ++++++---
 src/main.cpp                   |  2 +-
 stream_compaction/common.h     | 12 +++++--
 stream_compaction/efficient.cu | 61 ++++++++++++++++++++++------------
 stream_compaction/naive.cu     | 35 +++++++++----------
 5 files changed, 79 insertions(+), 46 deletions(-)

diff --git a/README.md b/README.md
index d3a70a6..f1a21c0 100644
--- a/README.md
+++ b/README.md
@@ -14,12 +14,17 @@ The strategy is to spread out the thread to several blocks as much as possible (
 For N = 256, the execution time of each methods are as follows
 Thrust < Naive < Work-Efficient
 Which is quite unexpected at first, as the 'work-efficient' one is supposed to be faster than
-the 'naive' one.
+the 'naive' one. 
 
-My speculation is that the 'Work-Efficient' one will only shine when N is larger than the maximum
-concurrent thread the graphics card can handle (which means, for the naive one, we would need to divide
-N threads into N/maxThread batches for every step. Meanwhile, for 'Work-Efficient', the number of 
-threads for most step will be comparatively, and substantially, lower than its counterpart.
+My speculation is that the 'Work-Efficient' one will only shine when N is larger than the 
+maximum concurrent thread the graphics card can handle (which means, for the naive one, 
+we would need to divide N threads into N/maxThread batches for every step. Meanwhile, for 'Work-Efficient', 
+the number of threads for most step will be comparatively, and substantially, lower than its counterpart.
+
+Also, the current implementation of 'work-efficient' requires a lot of global
+memory access, which substantially build up the access delay. Given that its calculation shows the
+locality property, the speed of the calculation should be substantially lowered if the calculations
+happen on the shared memory instead.
 
 ## Example Output
 
diff --git a/src/main.cpp b/src/main.cpp
index b2fa9bb..0ead633 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -15,7 +15,7 @@
 #include "testing_helpers.hpp"
 
 int main(int argc, char* argv[]) {
-    const int SIZE = 1 << 8;
+    const int SIZE = 1 << 16;
     const int NPOT = SIZE - 3;
     int a[SIZE], b[SIZE], c[SIZE];
 
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index ac6e662..a0b76d6 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -41,11 +41,19 @@ inline void findOptimizedSize(const int n, int& gridDim, int& blockDim){
 	cudaDeviceProp prop;
 	cudaGetDeviceProperties(&prop, 0);
 
+	if (n > prop.multiProcessorCount * prop.maxThreadsPerMultiProcessor) {
+		blockDim = prop.maxThreadsPerMultiProcessor / 16;
+		gridDim = ceil(n / (float)blockDim);
+		return;
+}	
+
 	int diff;
-	
 	gridDim = 16 * prop.multiProcessorCount;
 	blockDim = ceil(n / (float)gridDim);
-	if (blockDim < 32) blockDim = 32;
+	if (blockDim < 32) {
+		blockDim = 32;
+		gridDim = ceil(n / (float)blockDim);
+	}
 	
 	if (blockDim > prop.maxThreadsPerBlock)
 	{
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index cc341f5..1b1cf82 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -16,32 +16,48 @@ __device__  __host__ int pow(int n, int p){
 	return out;
 };
 
-__global__ void upSweep(int* arr, int n, int d, int p){
+__device__  __host__ void swap(int** x, int** y){
+	int *tmp = *x;
+	*x = *y;
+	*y = tmp;
+};
+
+__global__ void copyModifiedUpSweep(int* out, int* arr, int n, int p){
 	int k = ((blockDim.x * blockIdx.x) + threadIdx.x) * p;
 	if (k >= n) return;
 
-	int ans = arr[k + pow(2, d) - 1];
+	out[k + p - 1] = arr[k + p - 1];
+}
 
-	__syncthreads();
+__global__ void upSweep(int* out, int* arr, int n, int d, int p){
+	int k = ((blockDim.x * blockIdx.x) + threadIdx.x) * p;
+	if (k >= n) return;
 
-	arr[k + p - 1] += ans;
+	out[k + p - 1] = arr[k + p - 1] + arr[k + pow(2, d) - 1];
 }
 
 __global__ void setValArray(int* arr, int index, int value){
 	arr[index] = value;
 }
 
-__global__ void downSweep(int* arr, int n, int d, int p){
+__global__ void copyModifiedDownSweep(int* out, int* arr, int n, int d, int p){
 	int k = ((blockDim.x * blockIdx.x) + threadIdx.x) * p;
 	if (k >= n) return;
 
-	int t = arr[k + pow(2, d) - 1];
-	int t2 = arr[k + p - 1];
+	int p_d = pow(2, d);
 
-	__syncthreads();
+	out[k + p - 1] = arr[k + p - 1];
+	out[k + p_d - 1] = arr[k + p_d - 1];
+}
 
-	arr[k + pow(2, d) - 1] = t2;
-	arr[k + p - 1] += t;
+__global__ void downSweep(int* out, int* arr, int n, int d, int p){
+	int k = ((blockDim.x * blockIdx.x) + threadIdx.x) * p;
+	if (k >= n) return;
+
+	int p_d = pow(2, d);
+
+	out[k + p - 1] = arr[k + p - 1] + arr[k + p_d - 1];
+	out[k + p_d - 1] = arr[k + p - 1];
 }
 
 __global__ void setExclusive(int* out, int* in, int n){
@@ -51,6 +67,12 @@ __global__ void setExclusive(int* out, int* in, int n){
 	out[k] = k > 0 ? in[k - 1] : 0;
 }
 
+
+__global__ void shiftBit(int* out, int* in){
+	int k = (blockDim.x * blockIdx.x) + threadIdx.x;
+	out[k] = k > 0 ? in[k - 1] : 0;
+}
+
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
@@ -84,33 +106,30 @@ void scan(int n, int *odata, const int *idata) {
 	for (int d = 0; d <= log2ceil - 1; d++){
 		int p = pow(2, d + 1);
 		findOptimizedSize(n_p2 / p, gridDim, blockDim);
-		upSweep << < gridDim, blockDim >> >(d_idata, n_p2, d, p);
+		upSweep << < gridDim, blockDim >> >(d_odata, d_idata, n_p2, d, p);
+		copyModifiedUpSweep << < gridDim, blockDim >> >(d_idata, d_odata, n_p2, p);
 	}
-
+	
 	setValArray << < 1, 1 >> >(d_idata, n_p2 - 1, 0);
 
 	for (int d = log2ceil - 1; d >= 0; d--){
 		int p = pow(2, d + 1);
 		findOptimizedSize(n_p2 / p, gridDim, blockDim);
-		downSweep << < gridDim, blockDim >> >(d_idata, n_p2, d, p);
+		downSweep << < gridDim, blockDim >> >(d_odata, d_idata, n_p2, d, p);
+		copyModifiedDownSweep << < gridDim, blockDim >> >(d_idata, d_odata, n_p2, d, p);
 	}
 
 	cudaEventRecord(endEvent_acc, 0);
 	cudaEventSynchronize(endEvent_acc);
 	cudaEventElapsedTime(&time, beginEvent_acc, endEvent_acc);
 
-	findOptimizedSize(n_p2, gridDim, blockDim);
-	setExclusive << < blockDim, gridDim >> >(d_odata, d_idata, n);
+	//setExclusive << < blockDim, gridDim >> >(d_odata, d_idata, n);
 
-	cudaEventRecord(endEvent_acc, 0);
-	cudaEventSynchronize(endEvent_acc);
-	cudaEventElapsedTime(&time, beginEvent_acc, endEvent_acc);
-	
 	std::cout << "Work Efficient - time : " << time << " ms" << std::endl;
+
 	//////////////////////////
 	//test
 	cudaMemcpy(odata, d_idata, size, cudaMemcpyDeviceToHost);
-
 	/*
 	std::cout << "idata: ";
 	int total = 0;
@@ -127,7 +146,7 @@ void scan(int n, int *odata, const int *idata) {
 	}
 	std::cout << std::endl;
 	*/
-
+	cudaFree(d_odata);
 	cudaFree(d_idata);
 }
 
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 1bb96ad..e36fcd1 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -22,25 +22,19 @@ __device__  __host__ void swap(int** x, int** y){
 	*y = tmp;
 };
 
-__global__ void gpuScan(int* out, int* in, int n, int iter){
+__global__ void gpuScan(int* out, int* in, int n, int p){
 	int k = (blockDim.x * blockIdx.x) + threadIdx.x;
 	if (k >= n) return;
 
-	for (int d = 1; d <= iter; d++) {
-		int p = pow(2, d - 1);
-
-		if (k >= p)
-			out[k] = in[k - p] + in[k];
-		else
-			out[k] = in[k];
-
-		__syncthreads();
-		swap(&in, &out);
-	}
-
-	if (iter % 2 == 1) swap(&in, &out);
+	if (k >= p)
+		out[k] = in[k - p] + in[k];
+	else
+		out[k] = in[k];
+}
 
-	out[k] = k > 0 ? in[k-1] : 0;
+__global__ void shiftBit(int* out, int* in){
+	int k = (blockDim.x * blockIdx.x) + threadIdx.x;
+	out[k] = k > 0 ? in[k - 1] : 0;
 }
 /**
 * Performs prefix-sum (aka scan) on idata, storing the result into odata.
@@ -62,7 +56,7 @@ void scan(int n, int *odata, const int *idata) {
 
 	int blockDim, gridDim;
 	findOptimizedSize(n_p2, gridDim, blockDim);
-	std::cout << "blockDim : " << blockDim << std::endl;
+	//std::cout << "blockDim : " << blockDim << std::endl;
 
 	//////////////////////////
 	float time = 0;
@@ -72,7 +66,14 @@ void scan(int n, int *odata, const int *idata) {
 	cudaEventCreate(&beginEvent_acc);
 	cudaEventCreate(&endEvent_acc);
 	cudaEventRecord(beginEvent_acc, 0);
-	gpuScan << < gridDim, blockDim >> >(d_odata, d_idata, n_p2, log2ceil);
+
+	for (int d = 1; d <= log2ceil; d++) {
+		int p = pow(2, d - 1);
+		gpuScan << < gridDim, blockDim >> >(d_odata, d_idata, n_p2, p);
+		swap(&d_odata, &d_idata);
+	}
+	shiftBit<< < gridDim, blockDim >> >(d_odata, d_idata);
+
 	cudaEventRecord(endEvent_acc, 0);
 	cudaEventSynchronize(endEvent_acc);
 	cudaEventElapsedTime(&time, beginEvent_acc, endEvent_acc);