diff --git a/src/main.cpp b/src/main.cpp
index 79c54fb..e511d7d 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -16,7 +16,7 @@
 #include "testing_helpers.hpp"
 
 // The tests default to an array of size 1 << 8 = 256
-const int SIZE = 1 << 16; // feel free to change the size of array
+const int SIZE = 1 << 20; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -101,7 +101,7 @@ int main(int argc, char* argv[]) {
     printCmpResult(NPOT, b, c);
 
     printf("\n");
-
+#if 0
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
@@ -115,12 +115,12 @@ int main(int argc, char* argv[]) {
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
     //printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
-
+#endif
     zeroArray(SIZE, c);
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    // printArray(SIZE, c, true);
+    printArray(SIZE, c, false);
     printCmpResult(SIZE, b, c);
 
     /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
@@ -141,14 +141,14 @@ int main(int argc, char* argv[]) {
     printDesc("thrust scan, power-of-two");
     StreamCompaction::Thrust::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    printArray(SIZE, c, true);
+    // printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, non-power-of-two");
     StreamCompaction::Thrust::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    printArray(NPOT, c, true);
+    // printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
 
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index b456f2f..b784980 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -11,8 +11,8 @@
 #include <stdexcept>
 
 /*! Block size used for CUDA kernel launch. */
-#define blockSize 1024
-#define sectionSize 1024
+#define blockSize 256
+#define sectionSize 256
 
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 // usage: checkCUDAError("a descriptive name of this error")
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 8c038c4..25c6880 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -144,6 +144,8 @@ namespace StreamCompaction {
             }
         }
 
+        
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
@@ -157,6 +159,7 @@ namespace StreamCompaction {
             int* d_OutputExclusiveData;
             int* d_SumArray;
             int* d_SumArrayOutput;
+            int* d_SumArrayAx;
 
             cudaMalloc((void**)&d_InputData, size);
             checkCUDAError("cudaMalloc d_InputData failed!");
@@ -173,6 +176,9 @@ namespace StreamCompaction {
             cudaMalloc((void**)&d_SumArrayOutput, sumArraySize);
             checkCUDAError("cudaMalloc d_SumArrayOutput failed!");
 
+            cudaMalloc((void**)&d_SumArrayAx, sumArraySize); // auxiliary buffer used by the block-sum scan
+            checkCUDAError("cudaMalloc d_SumArrayAx failed!");
+
             cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice);
 
             dim3 dimGridArray((n + blockSize - 1) / blockSize, 1, 1);
@@ -181,6 +187,10 @@ namespace StreamCompaction {
             dim3 dimGridSumArray((sumArrayNumEle + blockSize - 1) / blockSize, 1, 1);
             dim3 dimBlockSumArray(blockSize, 1, 1);
 
+            // Debug only: host-side copies of d_SumArray / d_SumArrayOutput so they can be printed
+            int* sumArray = new int[sumArrayNumEle];
+            int* sumArrayOutput = new int[sumArrayNumEle];
+
             timer().startGpuTimer();
             // First step: compute the scan result for individual sections
             // then, store their block sum to sumArray
@@ -188,13 +198,50 @@ namespace StreamCompaction {
                 d_OutputData, d_SumArray, n);
             checkCUDAError("kernNaiveGPUScanFirstStep failed!");
 
-            //();
+            // cudaDeviceSynchronize();
+
+            cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost);
+            checkCUDAError("memCpy back failed!");
+
+            cudaMemcpy(sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost);
+            checkCUDAError("memCpy back failed!");
+
+            std::cout << '\n';
+            for (int i = 0; i < n; i++)
+            {
+                std::cout << odata[i] << ' ';
+                if ((i + 1) % 8 == 0) {
+                    std::cout << std::endl;
+                }
+            }
+
+            std::cout << '\n';
+            for (int i = 0; i < sumArrayNumEle; i++)
+            {
+                std::cout << sumArray[i] << ' ';
+            }
+
+            std::cout << '\n';
 
             // Second step: scan block sums
-            kernNaiveGPUScanSecondStep << <dimGridSumArray, dimBlockSumArray >> > (
-                d_SumArray, d_SumArrayOutput, sumArrayNumEle);
-            checkCUDAError("kernNaiveGPUScanSecondStep failed!");
+            kernNaiveGPUScanFirstStep << <dimGridSumArray, dimBlockSumArray >> > (d_SumArray,
+                d_SumArrayOutput, d_SumArrayAx, sumArrayNumEle); // element count is the number of block sums, not n
+            checkCUDAError("kernNaiveGPUScanFirstStep (block sums) failed!");
+            kernNaiveGPUScanThirdStep << <dimGridSumArray, dimBlockSumArray >> > (
+                d_SumArrayAx, d_SumArrayOutput, sumArrayNumEle); // NOTE(review): d_SumArrayAx is not scanned before this add - confirm
+            checkCUDAError("kernNaiveGPUScanThirdStep (block sums) failed!");
+            cudaMemcpy(sumArrayOutput, d_SumArrayOutput, sumArraySize,
+                cudaMemcpyDeviceToHost);
+            checkCUDAError("memCpy back failed!");
 
+            printf("\n");
+
+            for (int i = 0; i < sumArrayNumEle; i++)
+            {
+                std::cout << sumArrayOutput[i] << ' ';
+            }
+
+            printf("\n");
             //cudaDeviceSynchronize();
 
             // Third step: add scanned block sum i to all values of scanned block
@@ -203,8 +250,6 @@ namespace StreamCompaction {
                 d_SumArrayOutput, d_OutputData, n);
             checkCUDAError("kernNaiveGPUScanThirdStep failed!");
 
-           // cudaDeviceSynchronize();
-
             // Last step:
             convertFromInclusiveToExclusive << <dimGridArray, dimBlockArray >> > (
                 d_OutputData, d_OutputExclusiveData, n);