diff --git a/CMakeLists.txt b/CMakeLists.txt
index 62c0e59..88a0e0c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -73,6 +73,8 @@ set(headers
     src/sceneStructs.h
     src/preview.h
     src/utilities.h
+    src/common.h
+    src/efficient.h
     )
 
 set(sources
@@ -84,6 +86,8 @@ set(sources
     src/scene.cpp
     src/preview.cpp
     src/utilities.cpp
+    src/common.cu
+    src/efficient.cu
     )
 
 list(SORT headers)
diff --git a/README.md b/README.md
index 110697c..10ef6c5 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,128 @@
 CUDA Path Tracer
 ================
 
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 3**
+**University of Pennsylvania, CIS 565: GPU Programming and Architecture,
+Project 3 - CUDA Path Tracer**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Srinath Rajagopalan
+  * [LinkedIn](https://www.linkedin.com/in/srinath-rajagopalan-07a43155), [twitter](https://twitter.com/srinath132)
+* Tested on: Windows 10, i7-6700 @ 3.4GHz 16GB, Nvidia Quadro P1000 4GB (Moore 100B Lab)
 
-### (TODO: Your README)
+## Path Tracer
+<p align='center'>
+<img src="data/final_render.png">
+</p>
 
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
+### Introduction
 
+Given the fundamental building blocks of a scene, how do we create it? That is the basic purpose of a path tracer. The building blocks will be the various objects present in the scene, along with their material properties. Is it a reflective material? refractive? diffusive? Or both? Is the material a light source? If so what is its illuminance? These properties are explicitly specified in the scene file.
+
+We assume there is a camera present in the scene and project the 3D scene onto the 2D image screen of the camera. A basic path tracing algorithm is as follows:
+
+1) Initialize a HxW image and construct Hxw rays generating from the camera _through_ the image. We have one ray per pixel. The rays serve as messenger vehicles that figures out the color of its corresponding pixel. The rays are initialized to RGB(1,1,1). So if you render this image, we expect to get a white screen. 
+2) For each ray, we find which object in the scene they intersect. If the ray does not intersect any object, the ray is assigned a black color.Since each ray of a pixel is independent of one another, we can perform this computation for all the rays in parallel.
+3) For a given ray, if the object of interesction is a light source, the ray is assigned the color of the light source. If the object is not a light source, then we check if we can either reflect or refract from the object. The nature of reflection/refraction is determined by the material properties of the object. From this, a _new_ ray is constructed which absorbs the color of the object. This process is done for all rays in parallel.
+4) We can recursively perform steps 2) and 3) for a fixed number of bounces. After each bounce, the ray absorbs the color of the material it bounced from. A ray "dies" after a fixed number of bounces OR if it hits a light source before. The final color of the ray is the product of colors of all the material it intersected. The color of the pixel in our final 2D image is the color of its corresponding ray.
+
+The above process is a 10000-ft view of a basic path-tracer. In reality, when we see an object, a light ray from some light source strikes the object, reflects a million times, and finally enters the retina. A path tracer tries to simulate this but in the _reverse_ direction. For each final pixel in our retina/camera, we ask where did this come from by shooting a ray from the pixel and observing its life.
+
+
+## BSDF Computation
+
+We explore three different possibilities for calculation of the new ray.
+1) Ideal Diffuse Reflection - the new ray does not depend on the incident ray, but can scatter in all the directions. We approximate this by probabilistically choosing a ray from the hemisphere enclosing the surface and point of intersection. The probability of a ray in a given direction is weighted by the cosine of the angle between that ray and the normal to the surface. So most of the diffused rays will be in the direction closer to the normal.
+2) Ideal Specular Reflection - The new ray perfectly reflects from the surface.
+3) Refraction - A ray can pass _through_ the object and we can calculate it from Snell's law. 
+
+For the Cornell scene, three different cases illustrating the above are included below
+
+Diffusion                    |  Reflection               | Transmission
+:-------------------------:|:-------------------------:|:-------------------------:
+![](data/full_diffuse.png)| ![](data/full_reflect.png) |![](data/full_refract.png)
+
+We can also have a material to be a _combination_ of reflection, refraction, and diffusion. If reflection coeffecient is `p1`, refraction coefficient is `p2`, then for a given a ray we sample a specular reflected ray with probability `p1`, refracted ray with probability `p2`, and diffused ray with probability `1 - p1 - p2`.
+
+<p align='center'>
+	<img src="data/reflect_refract.png" width=500>
+</p>
+
+We can see the red wall and the green wall on the corresponding side of the sphere. This highlights the reflective property of the material. But, more importantly, notice the shiny surface at the bottom of the sphere and the shadow sprinkled with a speck of white light. This is _because_ of refraction. The white light passed through the sphere by refracting from air to glass and came out by refracting from glass to air. 
+
+## Deeper the better
+
+We can control the the number of bounces for each ray using a `depth` parameter.  Higher the depth, more realistic we can expect our image to be but, like everything else in life, that comes at the tradeoff of expensive computation. 
+
+Depth 1                    |  Depth 3               | Depth 8
+:-------------------------:|:-------------------------:|:-------------------------:
+![](data/depth_0.png)| 		![](data/depth_1.png) 	|![](data/full_refract.png)
+
+
+The effect on realism can be better visualized by completely removing the light source. If there is no light source, everything should be black. But if depth is low enough, it won't be.
+
+No Light Depth 0                    |  No Light Depth 8               | No Light Depth 15
+:-------------------------:|:-------------------------:|:-------------------------:
+![](data/no_light_depth_0.png)| 		![](data/no_light_depth_8.png) 	|![](data/no_light_depth_15.png)
+
+## Anti-Aliasing
+
+As explained before, we use one ray per pixel, and the ray passes through the center of the pixel. This can lead to zagged edges which are clearly evident when we zoom in. We can limit this by jittering the ray through the pixel. Instead of the ray always passing through the center, we randomize the ray location within the pixel. This helps in diminishing the discontinuity when jumping between pixels and therefore renders an overall smoother image. The effects are illustrated below:
+
+
+Aliased                    |  Anti-Aliased               
+:-------------------------:|:-------------------------:
+![](data/aliased.png)| 		![](data/anti_aliased.png)
+
+The jagged edges bordering the green reflection is smoother in the anti-aliased image.
+
+## Stream Compaction
+
+Since we are parallelizing by rays, and after each depth, some rays will terminate because they didn't intersect any object or terminated at a light source, it is better to launch threads for number of rays still alive. By stream compacting on alive rays and bringing them ttogeher, we can reduce warp divergence as we are only dealing with the active threads _that are grouped together_.  
+
+Performance for a complete shader call after each depth is included below
+
+<p align="center">
+<img src="data/perf_depth.png">
+</p>
+
+We can see that the time taken reduces, unsurprisingly. But this by itself doesn't justify why stream compacttion is better. For that we analyze the time taken for an entire iteration (across all depth). The performance graph is given below.
+
+<p align="center">
+<img src="data/perf_stream.png">
+</p>
+
+For smaller image resolutions, the effect of stream compaction is neglible. In fact, it might even be slower because the overhead might not be worth it. But as we scale to larger image resolutions, we have a clear winner.
+
+For a relatively open scene, more rays die out aftter each bounce, so the effectt of stream compaction on performance will not be as significant as it is on a closed scene. The following plot, taken on an image of `1500x1500` resolution, illustrates this.
+
+<p align="center">
+<img src="data/closed_open.png">
+</p>
+
+## Overall performance analysis
+
+Sorting by material id, though a good idea to limit warp divergence, did not turn out to be a good idea in the present case because our scenes are not big enough to justify the sorting overhead. The performance was for a screen resolution `700x700`. As discussed above, the image resolution is not big enough for the stream compactiton to do its magic, which is why we get comparable performance to the implementation without one. 
+
+<p align="center">
+<img src="data/per_bar.png">
+</p>
+
+The work-efficient stream compaction which I implemented uses shared memory to perform the scan over varying number of blocks. Though this passed the test cases provided in Project 2 (working upto array sizes of `2^27`), I could not scale it for the path tracer beyond image resolution of `720x720`. The performance is comparable to thrust's implementatiton though a bit slower. I atttribute this to not accounting for bank-conflicts within shared-memory and sloppy use of `cudaMemcpy` to transfer data to-and-fro so as to fit in well with the previously written API.
+
+## Bloopers
+
+While attempting motion blur, if the updates were too quick, or if the transform matrix was not updated properly, I got some funny results
+
+
+
+Motion Blur Rip Through                    |  Motion Blur Shaved               
+:-------------------------:|:-------------------------:
+![](data/blooper_motion_blur.png)| 		![](data/blooper_motion_blur_2.png)
+
+
+
+While calculatitng the refracted ray using Snell's law, if the normal to the surface was not flipped correctly, we get a doughnout. 
+
+
+<p align="center">
+<img src="data/blooper_refract.png" width=500>
+</p>
\ No newline at end of file
diff --git a/data/aliased.png b/data/aliased.png
new file mode 100644
index 0000000..eeb609c
Binary files /dev/null and b/data/aliased.png differ
diff --git a/data/anti_aliased.png b/data/anti_aliased.png
new file mode 100644
index 0000000..2abe15b
Binary files /dev/null and b/data/anti_aliased.png differ
diff --git a/data/blooper_motion_blur.png b/data/blooper_motion_blur.png
new file mode 100644
index 0000000..8f46a97
Binary files /dev/null and b/data/blooper_motion_blur.png differ
diff --git a/data/blooper_motion_blur_2.png b/data/blooper_motion_blur_2.png
new file mode 100644
index 0000000..60102bc
Binary files /dev/null and b/data/blooper_motion_blur_2.png differ
diff --git a/data/blooper_refract.png b/data/blooper_refract.png
new file mode 100644
index 0000000..4670fd2
Binary files /dev/null and b/data/blooper_refract.png differ
diff --git a/data/closed_open.png b/data/closed_open.png
new file mode 100644
index 0000000..f9380fe
Binary files /dev/null and b/data/closed_open.png differ
diff --git a/data/depth_0.png b/data/depth_0.png
new file mode 100644
index 0000000..bce134c
Binary files /dev/null and b/data/depth_0.png differ
diff --git a/data/depth_1.png b/data/depth_1.png
new file mode 100644
index 0000000..d139fa2
Binary files /dev/null and b/data/depth_1.png differ
diff --git a/data/final_render.png b/data/final_render.png
new file mode 100644
index 0000000..82a18b7
Binary files /dev/null and b/data/final_render.png differ
diff --git a/data/full_diffuse.png b/data/full_diffuse.png
new file mode 100644
index 0000000..f286fce
Binary files /dev/null and b/data/full_diffuse.png differ
diff --git a/data/full_reflect.png b/data/full_reflect.png
new file mode 100644
index 0000000..a593bc9
Binary files /dev/null and b/data/full_reflect.png differ
diff --git a/data/full_refract.png b/data/full_refract.png
new file mode 100644
index 0000000..acf45b0
Binary files /dev/null and b/data/full_refract.png differ
diff --git a/data/no_light_depth_0.png b/data/no_light_depth_0.png
new file mode 100644
index 0000000..9911b5a
Binary files /dev/null and b/data/no_light_depth_0.png differ
diff --git a/data/no_light_depth_15.png b/data/no_light_depth_15.png
new file mode 100644
index 0000000..a33c4c0
Binary files /dev/null and b/data/no_light_depth_15.png differ
diff --git a/data/no_light_depth_8.png b/data/no_light_depth_8.png
new file mode 100644
index 0000000..468f9d0
Binary files /dev/null and b/data/no_light_depth_8.png differ
diff --git a/data/path_tracer_book.xlsx b/data/path_tracer_book.xlsx
new file mode 100644
index 0000000..078dd32
Binary files /dev/null and b/data/path_tracer_book.xlsx differ
diff --git a/data/per_bar.png b/data/per_bar.png
new file mode 100644
index 0000000..9ae9aaa
Binary files /dev/null and b/data/per_bar.png differ
diff --git a/data/perf_depth.png b/data/perf_depth.png
new file mode 100644
index 0000000..cdd31d5
Binary files /dev/null and b/data/perf_depth.png differ
diff --git a/data/perf_stream.png b/data/perf_stream.png
new file mode 100644
index 0000000..b39ba74
Binary files /dev/null and b/data/perf_stream.png differ
diff --git a/data/reflect_refract.png b/data/reflect_refract.png
new file mode 100644
index 0000000..ff6fc28
Binary files /dev/null and b/data/reflect_refract.png differ
diff --git a/scenes/cornell_mod.txt b/scenes/cornell_mod.txt
new file mode 100644
index 0000000..06578b3
--- /dev/null
+++ b/scenes/cornell_mod.txt
@@ -0,0 +1,165 @@
+// Emissive material (light)
+MATERIAL 0
+RGB         1 1 1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   3
+
+// Diffuse white
+MATERIAL 1
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse red
+MATERIAL 2
+RGB         .85 .35 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse green
+MATERIAL 3
+RGB         .35 .85 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Specular white
+MATERIAL 4
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     .98 .98 .98
+REFL        0.8
+REFR        0.2
+REFRIOR     1.33
+EMITTANCE   0
+
+MATERIAL 5
+RGB         .98 .98 0
+SPECEX      0
+SPECRGB     0 .9 0.9
+REFL        0.3
+REFR        0.8
+REFRIOR     1.5
+EMITTANCE   0
+
+MATERIAL 6
+RGB         .98 .98 0
+SPECEX      0
+SPECRGB     .98 .98 0
+REFL        0.5
+REFR        0.5
+REFRIOR     1.5
+EMITTANCE   0
+
+MATERIAL 7
+RGB         1 0.576 0.160
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   3
+
+// Camera
+CAMERA
+RES         800 800
+FOVY        45
+ITERATIONS  5000
+DEPTH       10
+FILE        cornell
+EYE         0.0 5 10.5
+LOOKAT      0 5 0
+UP          0 1 0
+
+
+// Ceiling light
+OBJECT 0
+cube
+material 0
+TRANS       0 10 0
+ROTAT       0 0 0
+SCALE       3 .3 3
+
+// Floor
+OBJECT 1
+cube
+material 1
+TRANS       0 0 0
+ROTAT       0 0 0
+SCALE       10 .01 10
+
+// Ceiling
+OBJECT 2
+cube
+material 1
+TRANS       0 10 0
+ROTAT       0 0 90
+SCALE       .01 10 10
+
+// Back wall
+OBJECT 3
+cube
+material 1
+TRANS       0 5 -5
+ROTAT       0 90 0
+SCALE       .01 10 10
+
+// Left wall
+OBJECT 4
+cube
+material 2
+TRANS       -5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS       5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Sphere
+OBJECT 6
+sphere
+material 4
+TRANS       -2 4 0
+ROTAT       0 0 0
+SCALE       3 3 3
+
+OBJECT 7
+sphere
+material 5
+TRANS       2 4 0
+ROTAT       0 0 0
+SCALE       3 3 3
+
+OBJECT 8
+cube
+material 6
+TRANS       2 7 0
+ROTAT       45 45 0
+SCALE       1 1 1
+
+OBJECT 9
+cube
+material 7
+TRANS       4 0 3
+ROTAT       0 0 0
+SCALE       0.5 .3 0.5
diff --git a/scenes/cornell_orig.txt b/scenes/cornell_orig.txt
new file mode 100644
index 0000000..40ade59
--- /dev/null
+++ b/scenes/cornell_orig.txt
@@ -0,0 +1,117 @@
+// Emissive material (light)
+MATERIAL 0
+RGB         1 1 1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   5
+
+// Diffuse white
+MATERIAL 1
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse red
+MATERIAL 2
+RGB         .85 .35 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse green
+MATERIAL 3
+RGB         .35 .85 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Specular white
+MATERIAL 4
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     .98 .98 .98
+REFL        1
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Camera
+CAMERA
+RES         1500 1500
+FOVY        45
+ITERATIONS  5000
+DEPTH       8
+FILE        cornell
+EYE         0.0 5 10.5
+LOOKAT      0 5 0
+UP          0 1 0
+
+
+// Ceiling light
+OBJECT 0
+cube
+material 0
+TRANS       0 10 0
+ROTAT       0 0 0
+SCALE       3 .3 3
+
+// Floor
+OBJECT 1
+cube
+material 1
+TRANS       0 0 0
+ROTAT       0 0 0
+SCALE       10 .01 10
+
+// Ceiling
+OBJECT 2
+cube
+material 1
+TRANS       0 10 0
+ROTAT       0 0 90
+SCALE       .01 10 10
+
+// Back wall
+OBJECT 3
+cube
+material 1
+TRANS       0 5 -5
+ROTAT       0 90 0
+SCALE       .01 10 10
+
+// Left wall
+OBJECT 4
+cube
+material 2
+TRANS       -5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS       5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Sphere
+OBJECT 6
+sphere
+material 4
+TRANS       -1 4 -1
+ROTAT       0 0 0
+SCALE       3 3 3
diff --git a/scenes/final_scene.txt b/scenes/final_scene.txt
new file mode 100644
index 0000000..95d81eb
--- /dev/null
+++ b/scenes/final_scene.txt
@@ -0,0 +1,193 @@
+// Emissive material (light)
+MATERIAL 0
+RGB         1 1 1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   5
+
+// Diffuse white
+MATERIAL 1
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// reflective beige
+MATERIAL 2
+RGB         0.960 0.960 0.862
+SPECEX      0
+SPECRGB     0.960 0.960 0.862
+REFL        1
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// diffuse beige
+MATERIAL 3
+RGB         0.960 0.960 0.862
+SPECEX      0
+SPECRGB     0.960 0.960 0.862
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Specular white
+MATERIAL 4
+RGB         .55 .1 .15
+SPECEX      0
+SPECRGB     .55 .1 .15
+REFL        0.2
+REFR        0.7
+REFRIOR     1.5
+EMITTANCE   0
+
+// Bottom floor material
+MATERIAL 5
+RGB         0.658 0.466 0.352
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// couch color
+MATERIAL 6
+RGB         0 0.501 0.501
+SPECEX      0
+SPECRGB     0 0.501 0.501
+REFL        0.4
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+MATERIAL 7
+RGB         0.95 0.20 0.0
+SPECEX      0
+SPECRGB     0.20 0.95 0.0
+REFL        0.1
+REFR        0.9
+REFRIOR     1.5
+EMITTANCE   0
+
+MATERIAL 8
+RGB         0.364 0.227 0.101
+SPECEX      0
+SPECRGB     0.364 0.227 0.101
+REFL        0.7
+REFR        0.3
+REFRIOR     0
+EMITTANCE   0
+
+MATERIAL 9
+RGB         0.9 0.2 0.0
+SPECEX      0
+SPECRGB     0.96 0.40 0.0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   5
+
+// Camera
+CAMERA
+RES         800 800
+FOVY        45
+ITERATIONS  5000
+DEPTH       8
+FILE        cornell
+EYE         0.0 5 10.5
+LOOKAT      0 5 0
+UP          0 1 0
+
+
+// Ceiling light
+OBJECT 0
+cube
+material 0
+TRANS       0 10 0
+ROTAT       0 0 0
+SCALE       3 .3 3
+
+// Floor
+OBJECT 1
+cube
+material 5
+TRANS       0 0 0
+ROTAT       0 0 0
+SCALE       10 .01 10
+
+// Ceiling
+OBJECT 2
+cube
+material 1
+TRANS       0 10 0
+ROTAT       0 0 90
+SCALE       .01 10 10
+
+// Back wall
+OBJECT 3
+cube
+material 1
+TRANS       0 5 -5
+ROTAT       0 90 0
+SCALE       .01 10 10
+
+// Left wall
+OBJECT 4
+cube
+material 2
+TRANS       -5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS       5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Sphere
+OBJECT 6
+sphere
+material 4
+TRANS       -2 5 -2
+ROTAT       0 0 0
+SCALE       2 2 2
+
+OBJECT 7
+cube
+material 6
+TRANS       3.5 1 0
+ROTAT       0 0 0
+SCALE       2 2 10
+
+// Sphere
+OBJECT 8
+sphere
+material 7
+TRANS       -1 5 4
+ROTAT       0 0 0
+SCALE       2 2 3
+
+OBJECT 9
+cube
+material 8
+TRANS       5 2 0
+ROTAT       0 0 0
+SCALE       .5 3 3
+
+OBJECT 10
+cube
+material 9
+TRANS       4 2 -5
+ROTAT       0 0 0
+SCALE       1 5 1
\ No newline at end of file
diff --git a/src/common.cu b/src/common.cu
new file mode 100644
index 0000000..01e57d4
--- /dev/null
+++ b/src/common.cu
@@ -0,0 +1,49 @@
+#include "common.h"
+
+//void checkCUDAErrorFn(const char *msg, const char *file, int line) {
+//    cudaError_t err = cudaGetLastError();
+//    if (cudaSuccess == err) {
+//        return;
+//    }
+//
+//    fprintf(stderr, "CUDA error");
+//    if (file) {
+//        fprintf(stderr, " (%s:%d)", file, line);
+//    }
+//    fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
+//    exit(EXIT_FAILURE);
+//}
+
+
+namespace StreamCompaction {
+    namespace Common {
+
+        /**
+         * Maps an array to an array of 0s and 1s for stream compaction. Elements
+         * which map to 0 will be removed, and elements which map to 1 will be kept.
+         */
+        __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
+			int index = (blockDim.x * blockIdx.x) + threadIdx.x;
+			if (index >= n) return;
+			
+			bools[index] = idata[index] >= 0;
+        }
+
+        /**
+         * Performs scatter on an array. That is, for each element in idata,
+         * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
+         */
+        __global__ void kernScatter(int n, int *odata,
+                const int *idata, const int *bools, const int *indices) {
+
+			int index = (blockDim.x * blockIdx.x) + threadIdx.x;
+			if (index >= n) return;
+
+			if (bools[index]) {
+				odata[indices[index]] = idata[index];
+			}
+
+        }
+
+    }
+}
diff --git a/src/common.h b/src/common.h
new file mode 100644
index 0000000..996997e
--- /dev/null
+++ b/src/common.h
@@ -0,0 +1,132 @@
+#pragma once
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <cstdio>
+#include <cstring>
+#include <cmath>
+#include <algorithm>
+#include <chrono>
+#include <stdexcept>
+
+#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
+#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+
+/**
+ * Check for CUDA errors; print and exit if there was a problem.
+ */
+void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1);
+
+inline int ilog2(int x) {
+    int lg = 0;
+    while (x >>= 1) {
+        ++lg;
+    }
+    return lg;
+}
+
+inline int ilog2ceil(int x) {
+    return x == 1 ? 0 : ilog2(x - 1) + 1;
+}
+
+namespace StreamCompaction {
+    namespace Common {
+        __global__ void kernMapToBoolean(int n, int *bools, const int *idata);
+
+        __global__ void kernScatter(int n, int *odata,
+                const int *idata, const int *bools, const int *indices);
+
+	    /**
+	    * This class is used for timing the performance
+	    * Uncopyable and unmovable
+        *
+        * Adapted from WindyDarian(https://github.com/WindyDarian)
+	    */
+	    class PerformanceTimer
+	    {
+	    public:
+		    PerformanceTimer()
+		    {
+			    cudaEventCreate(&event_start);
+			    cudaEventCreate(&event_end);
+		    }
+
+		    ~PerformanceTimer()
+		    {
+			    cudaEventDestroy(event_start);
+			    cudaEventDestroy(event_end);
+		    }
+
+		    void startCpuTimer()
+		    {
+			    if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); }
+			    cpu_timer_started = true;
+
+			    time_start_cpu = std::chrono::high_resolution_clock::now();
+		    }
+
+		    void endCpuTimer()
+		    {
+			    time_end_cpu = std::chrono::high_resolution_clock::now();
+
+			    if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); }
+
+			    std::chrono::duration<double, std::milli> duro = time_end_cpu - time_start_cpu;
+			    prev_elapsed_time_cpu_milliseconds =
+				    static_cast<decltype(prev_elapsed_time_cpu_milliseconds)>(duro.count());
+
+			    cpu_timer_started = false;
+		    }
+
+		    void startGpuTimer()
+		    {
+			    if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); }
+			    gpu_timer_started = true;
+
+			    cudaEventRecord(event_start);
+		    }
+
+		    void endGpuTimer()
+		    {
+			    cudaEventRecord(event_end);
+			    cudaEventSynchronize(event_end);
+
+			    if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); }
+
+			    cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end);
+			    gpu_timer_started = false;
+		    }
+
+		    float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015
+		    {
+			    return prev_elapsed_time_cpu_milliseconds;
+		    }
+
+		    float getGpuElapsedTimeForPreviousOperation() //noexcept
+		    {
+			    return prev_elapsed_time_gpu_milliseconds;
+		    }
+
+		    // remove copy and move functions
+		    PerformanceTimer(const PerformanceTimer&) = delete;
+		    PerformanceTimer(PerformanceTimer&&) = delete;
+		    PerformanceTimer& operator=(const PerformanceTimer&) = delete;
+		    PerformanceTimer& operator=(PerformanceTimer&&) = delete;
+
+	    private:
+		    cudaEvent_t event_start = nullptr;
+		    cudaEvent_t event_end = nullptr;
+
+		    using time_point_t = std::chrono::high_resolution_clock::time_point;
+		    time_point_t time_start_cpu;
+		    time_point_t time_end_cpu;
+
+		    bool cpu_timer_started = false;
+		    bool gpu_timer_started = false;
+
+		    float prev_elapsed_time_cpu_milliseconds = 0.f;
+		    float prev_elapsed_time_gpu_milliseconds = 0.f;
+	    };
+    }
+}
diff --git a/src/efficient.cu b/src/efficient.cu
new file mode 100644
index 0000000..63d59e0
--- /dev/null
+++ b/src/efficient.cu
@@ -0,0 +1,597 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "common.h"
+#include "efficient.h"
+//#include "thrust.h"
+#include <thrust/scan.h>
+#include <thrust/device_vector.h>
+
+#define checkCUDAErrorWithLine(msg) checkCUDAError(msg)
+
+namespace StreamCompaction {
+    namespace Efficient {
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
+        }
+
+		void printxxx(int n, const int *a) {
+			for (int i = 0; i < n; i++) {
+				printf("%d ", a[i]);
+			}
+			printf("\n\n\n");
+		}
+
+		__global__ void resetZeros(int n, int *a) {
+			int index = (blockDim.x*blockIdx.x) + threadIdx.x;
+			if (index >= n) return;
+			a[index] = 0;
+		}
+
+
+
+		__global__ void kernscanBlock(int n, int *odata, int* out_last, const int *idata) {
+			
+			extern __shared__ int temp[];
+
+			int idx = threadIdx.x;
+			int tid = (blockDim.x*blockIdx.x) + threadIdx.x;
+			int numPerBlock = 2 * blockDim.x;
+
+			if (tid >= n) return;
+
+			// copy the data this idx boi has to work with to shared memory
+			temp[2*idx] = idata[2*tid];
+			temp[2*idx + 1] = idata[2*tid + 1];
+
+			int offset = 1;
+			for (int d = numPerBlock>> 1; d > 0; d >>=1) {
+				__syncthreads();
+
+				if (idx < d) {
+
+					int k1 = offset * (2 * idx + 1) - 1;
+					int k2 = offset * (2 * idx + 2) - 1;
+					temp[k2] += temp[k1];
+				}
+
+				offset = 2 * offset;
+			}
+
+			if (idx == 0) { temp[numPerBlock - 1] = 0; }
+
+			for (int d = 1; d < numPerBlock; d *= 2) {
+				offset >>= 1;
+				__syncthreads();
+				if (idx < d) {
+
+					int k1 = offset * (2 * idx + 1) - 1;
+					int k2 = offset * (2 * idx + 2) - 1;
+
+					int tmp = temp[k1];
+					temp[k1] = temp[k2];
+					temp[k2] += tmp;
+				}
+			}
+
+			__syncthreads();
+			
+			odata[2 * tid] = temp[2 * idx]; // has to updated with block number
+			odata[2 * tid + 1] = temp[2 * idx + 1];
+
+			if (idx == 0) {
+				int last = numPerBlock * blockIdx.x + numPerBlock - 1;
+				out_last[blockIdx.x] = temp[numPerBlock - 1] + idata[last];
+			}
+		}
+
+		__global__ void copyLastElements(int n, int blockSize, int *odata, const int *idata) {
+			int tid = (blockDim.x*blockIdx.x) + threadIdx.x;
+			if (tid >= n) return;
+
+			odata[tid] = idata[tid*blockSize + blockSize - 1];
+		}
+
+		__global__ void addLastElement(int n, int blockSize, int *odata, const int *scanSum, const int *idata) {
+			int tid = (blockDim.x*blockIdx.x) + threadIdx.x;
+			if (tid >= n) return;
+
+			odata[tid] = scanSum[tid] + idata[tid*blockSize + blockSize - 1];
+			//odata[tid] = scanSum[tid];
+		}
+
+		__global__ void addScanMain(int n, int *odata, const int *scanSum, const int *scanSumBlock) {
+			int tid = (blockDim.x*blockIdx.x) + threadIdx.x;
+			if (tid >= n) return;
+
+			odata[tid] = scanSumBlock[tid] + scanSum[blockIdx.x];
+		}
+
+
+		void scanShared(int n, int *odata, const int *idata) {
+			bool exception = false;
+
+			int *dev_idata, *dev_scanSumBlock, *dev_addLastElements, *dev_scanSum, *dev_odata;
+
+			int d_max = ilog2ceil(n);
+
+			int twoPowN = 1 << d_max;
+			if (n != twoPowN) {
+
+				int diff = twoPowN - n;
+
+				cudaMalloc((void **)&dev_idata, (n + diff) * sizeof(int));
+				checkCUDAErrorWithLine("cudaMalloc dev_odata1 failed!");
+
+				int threadsPerBlock = 1024;
+				int blocksToLaunch = (n + diff + threadsPerBlock - 1) / threadsPerBlock;
+				resetZeros << <blocksToLaunch, threadsPerBlock >> > (n + diff, dev_idata);
+
+				cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+				n = n + diff;
+			}
+			else {
+				cudaMalloc((void **)&dev_idata, n * sizeof(int));
+				checkCUDAErrorWithLine("cudaMalloc dev_idata failed!");
+
+				cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+			}
+
+			int blockSize = 1024;
+			int numBlocks = (n + blockSize - 1) / blockSize;
+			int numElements = numBlocks;
+
+			cudaMalloc((void **)&dev_scanSumBlock, n * sizeof(int));
+			cudaMalloc((void **)&dev_addLastElements, numElements * sizeof(int));
+			cudaMalloc((void **)&dev_scanSum, numElements * sizeof(int));
+			cudaMalloc((void **)&dev_odata, n * sizeof(int));
+			checkCUDAErrorWithLine("cudaMalloc failed!");
+
+			cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+			//thrust::device_ptr<int> dev_idataItr(dev_addLastElements);
+			//thrust::device_ptr<int> dev_odataItr(dev_scanSum);
+
+			try {
+				timer().startGpuTimer();
+			}
+			catch (const std::runtime_error& ex) {
+				exception = true;
+			}
+
+			kernscanBlock << <numBlocks, blockSize/2, (blockSize) * sizeof(int) >> > (n, dev_scanSumBlock, dev_addLastElements, dev_idata);
+
+			scanCompact(numElements, dev_scanSum, dev_addLastElements);
+			//thrust::exclusive_scan(dev_idataItr, dev_idataItr + numElements, dev_odataItr);
+
+			addScanMain<<<numBlocks, blockSize >>>(n, dev_odata, dev_scanSum, dev_scanSumBlock);
+
+			if (!exception)
+				timer().endGpuTimer();
+
+			cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+			cudaFree(dev_idata);
+			cudaFree(dev_scanSum);
+			cudaFree(dev_scanSumBlock);
+			cudaFree(dev_addLastElements);
+			cudaFree(dev_odata);
+		}
+
+		void scanSharedGPU(int n, int *dev_odata, const int *idata) {
+			bool exception = false;
+
+			int *dev_idata, *dev_scanSumBlock, *dev_addLastElements, *dev_scanSum;
+
+			int d_max = ilog2ceil(n);
+
+			int twoPowN = 1 << d_max;
+			if (n != twoPowN) {
+
+				int diff = twoPowN - n;
+
+				cudaMalloc((void **)&dev_idata, (n + diff) * sizeof(int));
+				checkCUDAErrorWithLine("cudaMalloc dev_odata1 failed!");
+
+				int threadsPerBlock = 1024;
+				int blocksToLaunch = (n + diff + threadsPerBlock - 1) / threadsPerBlock;
+				resetZeros << <blocksToLaunch, threadsPerBlock >> > (n + diff, dev_idata);
+
+				cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyDeviceToDevice);
+				n = n + diff;
+			}
+			else {
+				cudaMalloc((void **)&dev_idata, n * sizeof(int));
+				checkCUDAErrorWithLine("cudaMalloc dev_idata failed!");
+
+				cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyDeviceToDevice);
+			}
+
+			int blockSize = 512;
+			int numBlocks = (n + blockSize - 1) / blockSize;
+			int numElements = numBlocks;
+
+			cudaMalloc((void **)&dev_scanSumBlock, n * sizeof(int));
+			cudaMalloc((void **)&dev_addLastElements, numElements * sizeof(int));
+			cudaMalloc((void **)&dev_scanSum, numElements * sizeof(int));
+			checkCUDAErrorWithLine("cudaMalloc failed!");
+
+			//thrust::device_ptr<int> dev_idataItr(dev_addLastElements);
+			//thrust::device_ptr<int> dev_odataItr(dev_scanSum);
+
+			try {
+				timer().startGpuTimer();
+			}
+			catch (const std::runtime_error& ex) {
+				exception = true;
+			}
+
+			kernscanBlock << <numBlocks, blockSize / 2, (blockSize) * sizeof(int) >> > (n, dev_scanSumBlock, dev_addLastElements, dev_idata);
+
+			//int *a = new int[n];
+			//cudaMemcpy(a, dev_scanSumBlock, n * sizeof(int), cudaMemcpyDeviceToHost);
+			//printxxx(n, a);
+
+			scanCompact(numElements, dev_scanSum, dev_addLastElements);
+			//thrust::exclusive_scan(dev_idataItr, dev_idataItr + numElements, dev_odataItr);
+
+			addScanMain << <numBlocks, blockSize >> > (n, dev_odata, dev_scanSum, dev_scanSumBlock);
+
+			if (!exception)
+				timer().endGpuTimer();
+
+			cudaFree(dev_idata);
+			cudaFree(dev_scanSum);
+			cudaFree(dev_scanSumBlock);
+			cudaFree(dev_addLastElements);
+		}
+
+		__global__ void upSweep(int n, int d, int *idata) {
+			int index = (blockDim.x*blockIdx.x) + threadIdx.x;
+
+			int twoPowd1 = 1 << (d + 1);
+			int twoPowd = 1 << d;
+
+
+			if ((index % twoPowd1 != twoPowd1-1) || index >= n) return;
+
+			int k = index - twoPowd1 + 1;
+			idata[index] += idata[k + twoPowd - 1];
+		}
+
+		__global__ void downSweep(int n, int d, int *idata) {
+			int index = (blockDim.x*blockIdx.x) + threadIdx.x;
+
+			int twoPowd1 = 1 << (d + 1);
+			int twoPowd = 1 << d;
+
+
+			if ((index % twoPowd1 != twoPowd1 - 1) || index >= n) return;
+
+			int k = index - twoPowd1 + 1;
+			int t = idata[k + twoPowd - 1];
+			idata[k + twoPowd - 1] = idata[index];
+			idata[index] += t;
+		}
+
+
+        /**
+         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+         */
+        void scan(int n, int *odata, const int *idata) {
+			bool exception = false;
+
+			int *dev_idata;
+
+			int numThreads = 128;
+			int numBlocks = (n + numThreads - 1) / numThreads;
+
+			int d_max = ilog2ceil(n);
+
+			int twoPowN = 1 << d_max;
+			if (n != twoPowN) {
+
+				int diff = twoPowN - n;
+
+				cudaMalloc((void **)&dev_idata, (n + diff) * sizeof(int));
+				checkCUDAErrorWithLine("cudaMalloc dev_odata1 failed!");
+
+				resetZeros << <numBlocks, numThreads >> > (n + diff, dev_idata);
+
+				cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+				n = n + diff;
+			} else {
+				cudaMalloc((void **)&dev_idata, n * sizeof(int));
+				checkCUDAErrorWithLine("cudaMalloc dev_idata failed!");
+
+				cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+			}
+
+			try {
+				timer().startGpuTimer();
+			}
+			catch (const std::runtime_error& ex) {
+				exception = true;
+			}
+
+
+			for (int d = 0; d < d_max; d++) {
+				upSweep<<<numBlocks, numThreads>>>(n, d, dev_idata);
+			}
+
+			// reset last element to zero
+			//int* zero = new int[1];
+			//zero[0] = 0;
+			//cudaMemcpy(dev_idata + n - 1, zero, sizeof(int), cudaMemcpyHostToDevice);
+			cudaMemset(dev_idata + n - 1, 0, sizeof(int));
+
+			
+			for(int d = d_max-1; d >= 0; d--) {
+				downSweep << <numBlocks, numThreads >> > (n, d, dev_idata);
+			}
+
+
+			if (!exception)
+				timer().endGpuTimer();
+
+
+			cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+
+			
+			cudaFree(dev_idata);
+
+
+        }
+
+		void scanCompact(int n, int *odata, const int *idata) {
+			bool exception = false;
+
+			int *dev_idata;
+
+			int numThreads = 128;
+			int numBlocks = (n + numThreads - 1) / numThreads;
+
+			int d_max = ilog2ceil(n);
+
+			int twoPowN = 1 << d_max;
+			if (n != twoPowN) {
+
+				int diff = twoPowN - n;
+
+				cudaMalloc((void **)&dev_idata, (n + diff) * sizeof(int));
+				checkCUDAErrorWithLine("cudaMalloc dev_odata1 failed!");
+
+				resetZeros << <numBlocks, numThreads >> > (n + diff, dev_idata);
+
+				cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyDeviceToDevice);
+				n = n + diff;
+			}
+			else {
+				cudaMalloc((void **)&dev_idata, n * sizeof(int));
+				checkCUDAErrorWithLine("cudaMalloc dev_idata failed!");
+
+				cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyDeviceToDevice);
+			}
+
+			try {
+				timer().startGpuTimer();
+			}
+			catch (const std::runtime_error& ex) {
+				exception = true;
+			}
+
+
+			for (int d = 0; d < d_max; d++) {
+				upSweep << <numBlocks, numThreads >> > (n, d, dev_idata);
+			}
+
+			// reset last element to zero
+			//int* zero = new int[1];
+			//zero[0] = 0;
+			//cudaMemcpy(dev_idata + n - 1, zero, sizeof(int), cudaMemcpyHostToDevice);
+			cudaMemset(dev_idata + n - 1, 0, sizeof(int));
+
+
+			for (int d = d_max - 1; d >= 0; d--) {
+				downSweep << <numBlocks, numThreads >> > (n, d, dev_idata);
+			}
+
+			if (!exception)
+				timer().endGpuTimer();
+
+			cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToDevice);
+
+
+
+			cudaFree(dev_idata);
+		}
+
+
+        /**
+         * Performs stream compaction on idata, storing the result into odata.
+         * All zeroes are discarded.
+         *
+         * @param n      The number of elements in idata.
+         * @param odata  The array into which to store elements.
+         * @param idata  The array of elements to compact.
+         * @returns      The number of elements remaining after compaction.
+         */
+        int compact(int n, int *odata, const int *idata) {
+            
+
+			int numThreads = 128;
+			int numBlocks = (n + numThreads - 1) / numThreads;
+
+			int *dev_checkZeros, *dev_sumIndices, *dev_odata, *dev_idata;
+
+			cudaMalloc((void **) &dev_checkZeros, n * sizeof(int));
+			checkCUDAErrorWithLine("cudaMalloc dev_checkZeros failed!");
+			cudaMalloc((void **) &dev_sumIndices, n * sizeof(int));
+			checkCUDAErrorWithLine("cudaMalloc dev_sumIndices failed!");
+			cudaMalloc((void **)&dev_odata, n * sizeof(int));
+			checkCUDAErrorWithLine("cudaMalloc dev_odata failed!");
+			cudaMalloc((void **)&dev_idata, n * sizeof(int));
+			checkCUDAErrorWithLine("cudaMalloc dev_idata failed!");
+
+			cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+			timer().startGpuTimer();
+
+			StreamCompaction::Common::kernMapToBoolean<<<numBlocks, numThreads>>>(n, dev_checkZeros, dev_idata);
+			
+			int *checkZeros = new int[n];
+			cudaMemcpy(checkZeros, dev_checkZeros, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+
+			int *sumIndices = new int[n];
+			scan(n, sumIndices, checkZeros);
+
+			cudaMemcpy(dev_sumIndices, sumIndices , n * sizeof(int), cudaMemcpyHostToDevice);
+
+			StreamCompaction::Common::kernScatter<<<numBlocks, numThreads>>>(n, dev_odata, dev_idata, dev_checkZeros, dev_sumIndices);
+
+			
+			timer().endGpuTimer();
+
+			cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+			
+
+			int count = checkZeros[n - 1] == 0 ? sumIndices[n - 1] : sumIndices[n - 1] + 1;
+
+			//delete[] checkZeros;
+			//delete[] sumIndices;
+
+			//printf("hey\n");
+
+			cudaFree(dev_idata);
+			cudaFree(dev_odata);
+			cudaFree(dev_checkZeros);
+			cudaFree(dev_sumIndices);
+
+           
+            return count;
+        }
+
+
+		int compactShared(int n, int *dev_idata) {
+
+
+			int numThreads = 128;
+			int numBlocks = (n + numThreads - 1) / numThreads;
+
+			int *dev_checkZeros, *dev_sumIndices, *dev_odata;
+
+			cudaMalloc((void **)&dev_checkZeros, n * sizeof(int));
+			cudaMalloc((void **)&dev_sumIndices, n * sizeof(int));
+			cudaMalloc((void **)&dev_odata, n * sizeof(int));
+			checkCUDAErrorWithLine("cudaMalloc dev_odata failed!");
+
+			//cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+			timer().startGpuTimer();
+
+			StreamCompaction::Common::kernMapToBoolean << <numBlocks, numThreads >> > (n, dev_checkZeros, dev_idata);
+
+			//int *a = new int[n];
+			//cudaMemcpy(a, dev_checkZeros, n * sizeof(int), cudaMemcpyDeviceToHost);
+			//printxxx(n, a);
+
+			scanSharedGPU(n, dev_sumIndices, dev_checkZeros);
+			//thrust::device_ptr<int> i1(dev_checkZeros);
+			//thrust::device_ptr<int> o1(dev_sumIndices);
+			//thrust::exclusive_scan(i1, i1 + n, o1);
+
+			//int *a = new int[n];
+			//cudaMemcpy(a, dev_sumIndices, n * sizeof(int), cudaMemcpyDeviceToHost);
+			//printxxx(n, a);
+
+			StreamCompaction::Common::kernScatter << <numBlocks, numThreads >> > (n, dev_odata, dev_idata, dev_checkZeros, dev_sumIndices);
+
+			timer().endGpuTimer();
+
+			cudaMemcpy(dev_idata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToDevice);
+
+			int *sumIndices = new int[1];
+			cudaMemcpy(sumIndices, dev_sumIndices + n-1, 1 * sizeof(int), cudaMemcpyDeviceToHost);
+			int *checkZeros = new int[1];
+			cudaMemcpy(checkZeros, dev_checkZeros + n-1, 1 * sizeof(int), cudaMemcpyDeviceToHost);
+
+			int count = checkZeros[0] == 0 ? sumIndices[0] : sumIndices[0] + 1;
+
+			delete[] checkZeros;
+			delete[] sumIndices;
+
+			cudaFree(dev_odata);
+			cudaFree(dev_checkZeros);
+			cudaFree(dev_sumIndices);
+
+			return count;
+		}
+
+
+		//int compact(int n, int *odata, const int *idata) {
+
+
+		//	int numThreads = 128;
+		//	int numBlocks = (n + numThreads - 1) / numThreads;
+
+		//	int *dev_checkZeros, *dev_sumIndices, *dev_odata, *dev_idata;
+
+		//	cudaMalloc((void **)&dev_checkZeros, n * sizeof(int));
+		//	checkCUDAErrorWithLine("cudaMalloc dev_checkZeros failed!");
+		//	cudaMalloc((void **)&dev_sumIndices, n * sizeof(int));
+		//	checkCUDAErrorWithLine("cudaMalloc dev_sumIndices failed!");
+		//	cudaMalloc((void **)&dev_odata, n * sizeof(int));
+		//	checkCUDAErrorWithLine("cudaMalloc dev_odata failed!");
+		//	cudaMalloc((void **)&dev_idata, n * sizeof(int));
+		//	checkCUDAErrorWithLine("cudaMalloc dev_idata failed!");
+
+		//	cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+		//	timer().startGpuTimer();
+
+		//	StreamCompaction::Common::kernMapToBoolean << <numBlocks, numThreads >> > (n, dev_checkZeros, dev_idata);
+
+		//	//int *checkZeros = new int[n];
+		//	//cudaMemcpy(checkZeros, dev_checkZeros, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+
+		//	//int *sumIndices = new int[n];
+		//	scanCompact(n, dev_sumIndices, dev_checkZeros);
+
+		//	//cudaMemcpy(dev_sumIndices, sumIndices, n * sizeof(int), cudaMemcpyHostToDevice);
+
+		//	StreamCompaction::Common::kernScatter << <numBlocks, numThreads >> > (n, dev_odata, dev_idata, dev_checkZeros, dev_sumIndices);
+
+
+		//	timer().endGpuTimer();
+
+		//	cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+		//	int *sumIndices = new int[n];
+		//	int *checkZeros = new int[n];
+		//	cudaMemcpy(checkZeros, dev_checkZeros, n * sizeof(int), cudaMemcpyDeviceToHost);
+		//	cudaMemcpy(sumIndices, dev_sumIndices, n * sizeof(int), cudaMemcpyDeviceToHost);
+		//	int count = checkZeros[n - 1] == 0 ? sumIndices[n - 1] : sumIndices[n - 1] + 1;
+
+		//	//delete[] checkZeros;
+		//	//delete[] sumIndices;
+
+		//	//printf("hey\n");
+
+		//	cudaFree(dev_idata);
+		//	cudaFree(dev_odata);
+		//	cudaFree(dev_checkZeros);
+		//	cudaFree(dev_sumIndices);
+
+		//	delete[] sumIndices;
+		//	delete[] checkZeros;
+
+		//	return count;
+		//}
+    }
+}
diff --git a/src/efficient.h b/src/efficient.h
new file mode 100644
index 0000000..cfa739d
--- /dev/null
+++ b/src/efficient.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include "common.h"
+
+namespace StreamCompaction {
+    namespace Efficient {
+        StreamCompaction::Common::PerformanceTimer& timer();
+
+		void scanShared(int n, int *odata, const int *idata);
+
+        void scan(int n, int *odata, const int *idata);
+
+        int compact(int n, int *odata, const int *idata);
+
+		int compactShared(int n, int *idata);
+
+		void scanCompact(int n, int *odata, const int *idata);
+    }
+}
diff --git a/src/interactions.h b/src/interactions.h
index 5ce3628..4bebf68 100644
--- a/src/interactions.h
+++ b/src/interactions.h
@@ -68,12 +68,61 @@ glm::vec3 calculateRandomDirectionInHemisphere(
  */
 __host__ __device__
 void scatterRay(
-		PathSegment & pathSegment,
-        glm::vec3 intersect,
-        glm::vec3 normal,
-        const Material &m,
-        thrust::default_random_engine &rng) {
-    // TODO: implement this.
-    // A basic implementation of pure-diffuse shading will just call the
-    // calculateRandomDirectionInHemisphere defined above.
+	PathSegment & pathSegment,
+	glm::vec3 intersect,
+	glm::vec3 normal,
+	const Material &m,
+	thrust::default_random_engine &rng) {
+
+	glm::vec3 newRayDir;
+
+	// Toss a coin to decide between reflect, refract, and diffuse.
+	// reflect if p < reflect_prob, refract if relect_prob <= p < reflect_prob + refract_prob and diffuse otherwise
+	thrust::uniform_real_distribution<float> u01(0, 1);
+	float tossCoin = u01(rng);
+	if (tossCoin < m.hasReflective) {
+		//reflect
+		newRayDir = glm::reflect(pathSegment.ray.direction, normal);
+		pathSegment.color *= m.specular.color;
+	}
+	else if (tossCoin >= m.hasReflective && tossCoin < m.hasReflective + m.hasRefractive) {
+		//refract
+		if (glm::dot(pathSegment.ray.direction, normal) < 0) {
+			// From air to something else
+			newRayDir = glm::refract(pathSegment.ray.direction, normal, 1/m.indexOfRefraction);
+		}
+		else {
+			// From something else to air
+			newRayDir = glm::refract(pathSegment.ray.direction, normal * glm::vec3(-1.0), m.indexOfRefraction);
+			//total internal reflection;
+			if (glm::length2(newRayDir) < 1e-6) {
+				newRayDir = glm::reflect(pathSegment.ray.direction, normal * glm::vec3(-1.0));
+			}
+		}
+
+		pathSegment.color *= m.specular.color;
+	}
+	else {
+		newRayDir = calculateRandomDirectionInHemisphere(normal, rng);
+		pathSegment.color *= m.color;
+	}
+
+	pathSegment.ray.direction = newRayDir;
+	pathSegment.ray.origin = intersect + pathSegment.ray.direction*0.01f;
 }
+
+struct isTerminated {
+	//__host__ __device__ bool operator() (const PathSegment& p) {
+	//	return (p.remainingBounces > 0);
+	//}
+
+	__host__ __device__ bool operator() (int p) {
+		return (p >= 0);
+	}
+};
+
+struct compareIntersections {
+	__host__ __device__ bool operator() (const ShadeableIntersection &lhs, const ShadeableIntersection &rhs) { 
+		return (lhs.materialId < rhs.materialId); 
+	};
+};
\ No newline at end of file
diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index c1ec122..e951af8 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -4,67 +4,113 @@
 #include <thrust/execution_policy.h>
 #include <thrust/random.h>
 #include <thrust/remove.h>
+#include <thrust/partition.h>
 
 #include "sceneStructs.h"
 #include "scene.h"
 #include "glm/glm.hpp"
 #include "glm/gtx/norm.hpp"
+#include <glm/gtc/matrix_inverse.hpp>
+#include <glm/gtc/matrix_transform.hpp>
 #include "utilities.h"
 #include "pathtrace.h"
 #include "intersections.h"
 #include "interactions.h"
+#include "efficient.h"
 
 #define ERRORCHECK 1
 
+
+#define MOTION_BLUR 0
+#define ANTI_ALIAS 1
+#define STREAM_COMPACT 0
+#define STREAM_COMPACT_THRUST 1 // toggle either stream compact
+#define SORT_BY_MATERIAL 0
+
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
 void checkCUDAErrorFn(const char *msg, const char *file, int line) {
 #if ERRORCHECK
-    cudaDeviceSynchronize();
-    cudaError_t err = cudaGetLastError();
-    if (cudaSuccess == err) {
-        return;
-    }
-
-    fprintf(stderr, "CUDA error");
-    if (file) {
-        fprintf(stderr, " (%s:%d)", file, line);
-    }
-    fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
+	cudaDeviceSynchronize();
+	cudaError_t err = cudaGetLastError();
+	if (cudaSuccess == err) {
+		return;
+	}
+
+	fprintf(stderr, "CUDA error");
+	if (file) {
+		fprintf(stderr, " (%s:%d)", file, line);
+	}
+	fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
 #  ifdef _WIN32
-    getchar();
+	getchar();
 #  endif
-    exit(EXIT_FAILURE);
+	exit(EXIT_FAILURE);
 #endif
 }
 
+__device__ glm::mat4 buildTransformationMatrix(glm::vec3 translation, glm::vec3 rotation, glm::vec3 scale) {
+	glm::mat4 translationMat = glm::translate(glm::mat4(), translation);
+	glm::mat4 rotationMat = glm::rotate(glm::mat4(), rotation.x * (float)PI / 180, glm::vec3(1, 0, 0));
+	rotationMat = rotationMat * glm::rotate(glm::mat4(), rotation.y * (float)PI / 180, glm::vec3(0, 1, 0));
+	rotationMat = rotationMat * glm::rotate(glm::mat4(), rotation.z * (float)PI / 180, glm::vec3(0, 0, 1));
+	glm::mat4 scaleMat = glm::scale(glm::mat4(), scale);
+	return translationMat * rotationMat * scaleMat;
+}
+
 __host__ __device__
 thrust::default_random_engine makeSeededRandomEngine(int iter, int index, int depth) {
-    int h = utilhash((1 << 31) | (depth << 22) | iter) ^ utilhash(index);
-    return thrust::default_random_engine(h);
+	int h = utilhash((1 << 31) | (depth << 22) | iter) ^ utilhash(index);
+	return thrust::default_random_engine(h);
 }
 
+__global__ void kernMotionBlur(int n, Geom* dev_geoms, int iter) {
+
+	int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+	if (idx > n) return;
+
+	
+	if (dev_geoms[idx].type == SPHERE) {
+		//printf("%f %f %f\n", dev_geoms[idx].translation.x, dev_geoms[idx].translation.y, dev_geoms[idx].translation.z);
+		float vel = 0.001;
+		dev_geoms[idx].translation -= glm::vec3(vel);
+		dev_geoms[idx].transform = buildTransformationMatrix(dev_geoms[idx].translation, dev_geoms[idx].rotation, dev_geoms[idx].scale);
+		
+		//float vel = 0.01;
+		//dev_geoms[idx].transform = dev_geoms[idx].initialTransform + glm::mat4(
+		//	1.0, 0.0, 0.0, iter*vel,
+		//	0.0, 1.0, 0.0, iter*vel,
+		//	0.0, 0.0, 1.0, 0.0,
+		//	0.0, 0.0, 0.0, 1.0) * dev_geoms[idx].transform;
+
+		dev_geoms[idx].inverseTransform = glm::inverse(dev_geoms[idx].transform);
+		dev_geoms[idx].invTranspose = glm::inverseTranspose(dev_geoms[idx].transform);
+	}
+}
+
+
 //Kernel that writes the image to the OpenGL PBO directly.
 __global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution,
-        int iter, glm::vec3* image) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-
-    if (x < resolution.x && y < resolution.y) {
-        int index = x + (y * resolution.x);
-        glm::vec3 pix = image[index];
-
-        glm::ivec3 color;
-        color.x = glm::clamp((int) (pix.x / iter * 255.0), 0, 255);
-        color.y = glm::clamp((int) (pix.y / iter * 255.0), 0, 255);
-        color.z = glm::clamp((int) (pix.z / iter * 255.0), 0, 255);
-
-        // Each thread writes one pixel location in the texture (textel)
-        pbo[index].w = 0;
-        pbo[index].x = color.x;
-        pbo[index].y = color.y;
-        pbo[index].z = color.z;
-    }
+		int iter, glm::vec3* image) {
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+	if (x < resolution.x && y < resolution.y) {
+		int index = x + (y * resolution.x);
+		glm::vec3 pix = image[index];
+
+		glm::ivec3 color;
+		color.x = glm::clamp((int) (pix.x / iter * 255.0), 0, 255);
+		color.y = glm::clamp((int) (pix.y / iter * 255.0), 0, 255);
+		color.z = glm::clamp((int) (pix.z / iter * 255.0), 0, 255);
+
+		// Each thread writes one pixel location in the texture (textel)
+		pbo[index].w = 0;
+		pbo[index].x = color.x;
+		pbo[index].y = color.y;
+		pbo[index].z = color.z;
+	}
 }
 
 static Scene * hst_scene = NULL;
@@ -72,43 +118,49 @@ static glm::vec3 * dev_image = NULL;
 static Geom * dev_geoms = NULL;
 static Material * dev_materials = NULL;
 static PathSegment * dev_paths = NULL;
+static int* dev_alive_paths = NULL;
 static ShadeableIntersection * dev_intersections = NULL;
+static ShadeableIntersection * dev_first_intersections = NULL;
+static ShadeableIntersection * dev_intersections_orig = NULL;
 // TODO: static variables for device memory, any extra info you need, etc
 // ...
 
 void pathtraceInit(Scene *scene) {
-    hst_scene = scene;
-    const Camera &cam = hst_scene->state.camera;
-    const int pixelcount = cam.resolution.x * cam.resolution.y;
+	hst_scene = scene;
+	const Camera &cam = hst_scene->state.camera;
+	const int pixelcount = cam.resolution.x * cam.resolution.y;
 
-    cudaMalloc(&dev_image, pixelcount * sizeof(glm::vec3));
-    cudaMemset(dev_image, 0, pixelcount * sizeof(glm::vec3));
+	cudaMalloc(&dev_image, pixelcount * sizeof(glm::vec3));
+	cudaMemset(dev_image, 0, pixelcount * sizeof(glm::vec3));
 
-  	cudaMalloc(&dev_paths, pixelcount * sizeof(PathSegment));
+	cudaMalloc(&dev_paths, pixelcount * sizeof(PathSegment));
+	cudaMalloc(&dev_alive_paths, pixelcount * sizeof(int));
 
-  	cudaMalloc(&dev_geoms, scene->geoms.size() * sizeof(Geom));
-  	cudaMemcpy(dev_geoms, scene->geoms.data(), scene->geoms.size() * sizeof(Geom), cudaMemcpyHostToDevice);
+	cudaMalloc(&dev_geoms, scene->geoms.size() * sizeof(Geom));
+	cudaMemcpy(dev_geoms, scene->geoms.data(), scene->geoms.size() * sizeof(Geom), cudaMemcpyHostToDevice);
 
-  	cudaMalloc(&dev_materials, scene->materials.size() * sizeof(Material));
-  	cudaMemcpy(dev_materials, scene->materials.data(), scene->materials.size() * sizeof(Material), cudaMemcpyHostToDevice);
+	cudaMalloc(&dev_materials, scene->materials.size() * sizeof(Material));
+	cudaMemcpy(dev_materials, scene->materials.data(), scene->materials.size() * sizeof(Material), cudaMemcpyHostToDevice);
+
+	cudaMalloc(&dev_intersections, pixelcount * sizeof(ShadeableIntersection));
+	cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
 
-  	cudaMalloc(&dev_intersections, pixelcount * sizeof(ShadeableIntersection));
-  	cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
 
-    // TODO: initialize any extra device memeory you need
+	cudaMalloc(&dev_first_intersections, pixelcount * sizeof(ShadeableIntersection));
+	cudaMemset(dev_first_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
 
-    checkCUDAError("pathtraceInit");
+	checkCUDAError("pathtraceInit");
 }
 
 void pathtraceFree() {
-    cudaFree(dev_image);  // no-op if dev_image is null
-  	cudaFree(dev_paths);
-  	cudaFree(dev_geoms);
-  	cudaFree(dev_materials);
-  	cudaFree(dev_intersections);
-    // TODO: clean up any extra device memory you created
-
-    checkCUDAError("pathtraceFree");
+	cudaFree(dev_image);  // no-op if dev_image is null
+	cudaFree(dev_paths);
+	cudaFree(dev_geoms);
+	cudaFree(dev_materials);
+	cudaFree(dev_intersections);
+	cudaFree(dev_first_intersections);
+
+	checkCUDAError("pathtraceFree");
 }
 
 /**
@@ -119,30 +171,48 @@ void pathtraceFree() {
 * motion blur - jitter rays "in time"
 * lens effect - jitter ray origin positions based on a lens
 */
-__global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, PathSegment* pathSegments)
+__global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, PathSegment* pathSegments, int* alive_paths)
 {
 	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
 	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
 
+
+
 	if (x < cam.resolution.x && y < cam.resolution.y) {
 		int index = x + (y * cam.resolution.x);
+
+
 		PathSegment & segment = pathSegments[index];
+		alive_paths[index] = index;
+
+		thrust::default_random_engine rng = makeSeededRandomEngine(iter, index, segment.remainingBounces);
+		thrust::uniform_real_distribution<float> u01(-0.5f, 0.5f);
 
 		segment.ray.origin = cam.position;
-    segment.color = glm::vec3(1.0f, 1.0f, 1.0f);
+		segment.color = glm::vec3(1.0f, 1.0f, 1.0f);
+
+		// antialiasing by jittering
+		//float x_shift = x + u01(rng);
+		//float y_shift = y + u01(rng);
+		float x_shift = x;
+		float y_shift = y;
+
+#if ANTI_ALIAS
+		x_shift = x + u01(rng);
+		y_shift = y + u01(rng);
+#endif
+		
 
-		// TODO: implement antialiasing by jittering the ray
 		segment.ray.direction = glm::normalize(cam.view
-			- cam.right * cam.pixelLength.x * ((float)x - (float)cam.resolution.x * 0.5f)
-			- cam.up * cam.pixelLength.y * ((float)y - (float)cam.resolution.y * 0.5f)
-			);
+			- cam.right * cam.pixelLength.x * (x_shift - (float)cam.resolution.x * 0.5f)
+			- cam.up * cam.pixelLength.y * (y_shift - (float)cam.resolution.y * 0.5f)
+		);
 
 		segment.pixelIndex = index;
 		segment.remainingBounces = traceDepth;
 	}
 }
 
-// TODO:
 // computeIntersections handles generating ray intersections ONLY.
 // Generating new rays is handled in your shader(s).
 // Feel free to modify the code below.
@@ -150,15 +220,18 @@ __global__ void computeIntersections(
 	int depth
 	, int num_paths
 	, PathSegment * pathSegments
+	, int* alive_paths
 	, Geom * geoms
 	, int geoms_size
-	, ShadeableIntersection * intersections
-	)
+	, ShadeableIntersection * intersections,
+	int iter)
 {
-	int path_index = blockIdx.x * blockDim.x + threadIdx.x;
+	int alive_idx = blockIdx.x * blockDim.x + threadIdx.x;
 
-	if (path_index < num_paths)
+	if (alive_idx < num_paths)
 	{
+		int path_index = alive_paths[alive_idx];
+		if (path_index < 0) return;
 		PathSegment pathSegment = pathSegments[path_index];
 
 		float t;
@@ -183,7 +256,20 @@ __global__ void computeIntersections(
 			}
 			else if (geom.type == SPHERE)
 			{
+				// Motion Blur atttempt
+				//float vel = 0.0001;
+				//geom.transform = 0.9f*geom.initialTransform + 0.1f*(float)iter * glm::mat4(
+				//	1.0, 0.0, 0.0, 0.0,
+				//	0.0, 1.0, 0.0, vel * iter,
+				//	0.0, 0.0, 1.0, 0.0,
+				//	0.0, 0.0, 0.0, 1.0) * geom.initialTransform;
+
+				//geom.inverseTransform = glm::inverse(geom.transform);
+				//geom.invTranspose = glm::inverseTranspose(geom.transform);
+
+			
 				t = sphereIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
+
 			}
 			// TODO: add more intersection tests here... triangle? metaball? CSG?
 
@@ -208,6 +294,7 @@ __global__ void computeIntersections(
 			intersections[path_index].t = t_min;
 			intersections[path_index].materialId = geoms[hit_geom_index].materialid;
 			intersections[path_index].surfaceNormal = normal;
+			intersections[path_index].intersectPoint = intersect_point;
 		}
 	}
 }
@@ -232,39 +319,94 @@ __global__ void shadeFakeMaterial (
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < num_paths)
   {
-    ShadeableIntersection intersection = shadeableIntersections[idx];
-    if (intersection.t > 0.0f) { // if the intersection exists...
-      // Set up the RNG
-      // LOOK: this is how you use thrust's RNG! Please look at
-      // makeSeededRandomEngine as well.
-      thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, 0);
-      thrust::uniform_real_distribution<float> u01(0, 1);
-
-      Material material = materials[intersection.materialId];
-      glm::vec3 materialColor = material.color;
-
-      // If the material indicates that the object was a light, "light" the ray
-      if (material.emittance > 0.0f) {
-        pathSegments[idx].color *= (materialColor * material.emittance);
-      }
-      // Otherwise, do some pseudo-lighting computation. This is actually more
-      // like what you would expect from shading in a rasterizer like OpenGL.
-      // TODO: replace this! you should be able to start with basically a one-liner
-      else {
-        float lightTerm = glm::dot(intersection.surfaceNormal, glm::vec3(0.0f, 1.0f, 0.0f));
-        pathSegments[idx].color *= (materialColor * lightTerm) * 0.3f + ((1.0f - intersection.t * 0.02f) * materialColor) * 0.7f;
-        pathSegments[idx].color *= u01(rng); // apply some noise because why not
-      }
-    // If there was no intersection, color the ray black.
-    // Lots of renderers use 4 channel color, RGBA, where A = alpha, often
-    // used for opacity, in which case they can indicate "no opacity".
-    // This can be useful for post-processing and image compositing.
-    } else {
-      pathSegments[idx].color = glm::vec3(0.0f);
-    }
+	ShadeableIntersection intersection = shadeableIntersections[idx];
+	if (intersection.t > 0.0f) { // if the intersection exists...
+	  // Set up the RNG
+	  // LOOK: this is how you use thrust's RNG! Please look at
+	  // makeSeededRandomEngine as well.
+	  thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, 0);
+	  thrust::uniform_real_distribution<float> u01(0, 1);
+
+	  Material material = materials[intersection.materialId];
+	  glm::vec3 materialColor = material.color;
+
+	  // If the material indicates that the object was a light, "light" the ray
+	  if (material.emittance > 0.0f) {
+		pathSegments[idx].color *= (materialColor * material.emittance);
+	  }
+	  // Otherwise, do some pseudo-lighting computation. This is actually more
+	  // like what you would expect from shading in a rasterizer like OpenGL.
+	  // TODO: replace this! you should be able to start with basically a one-liner
+	  else {
+		float lightTerm = glm::dot(intersection.surfaceNormal, glm::vec3(0.0f, 1.0f, 0.0f));
+		pathSegments[idx].color *= (materialColor * lightTerm) * 0.3f + ((1.0f - intersection.t * 0.02f) * materialColor) * 0.7f;
+		pathSegments[idx].color *= u01(rng); // apply some noise because why not
+	  }
+	// If there was no intersection, color the ray black.
+	// Lots of renderers use 4 channel color, RGBA, where A = alpha, often
+	// used for opacity, in which case they can indicate "no opacity".
+	// This can be useful for post-processing and image compositing.
+	} else {
+	  pathSegments[idx].color = glm::vec3(0.0f);
+	}
   }
 }
 
+
+__global__ void diffuseShader(
+	int iter
+	, int num_paths
+	, ShadeableIntersection * shadeableIntersections
+	, PathSegment * pathSegments
+	, int* alive_paths
+	, Material * materials, int depth
+)
+{
+	int alive_idx = blockIdx.x * blockDim.x + threadIdx.x;
+	if (alive_idx < num_paths)
+	{
+		int idx = alive_paths[alive_idx];
+		if (idx < 0) return;
+
+		ShadeableIntersection intersection = shadeableIntersections[idx];
+		if (intersection.t > 0.0f) { // if the intersection exists...
+		  // Set up the RNG
+		  // LOOK: this is how you use thrust's RNG! Please look at
+		  // makeSeededRandomEngine as well.
+			thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, pathSegments[idx].remainingBounces);
+			thrust::uniform_real_distribution<float> u01(0, 1);
+
+			Material material = materials[intersection.materialId];
+			glm::vec3 materialColor = material.color;
+			glm::vec3 intersectionPoint = intersection.intersectPoint;
+
+			// If the material indicates that the object was a light, "light" the ray
+			if (material.emittance > 0.0f) {
+				pathSegments[idx].color *= (materialColor * material.emittance);
+				pathSegments[idx].remainingBounces = 0;
+			}
+			// Otherwise, do some pseudo-lighting computation. This is actually more
+			// like what you would expect from shading in a rasterizer like OpenGL.
+			else {
+				if (pathSegments[idx].remainingBounces > 0) {
+					scatterRay(pathSegments[idx], intersectionPoint, intersection.surfaceNormal, material, rng);
+					pathSegments[idx].remainingBounces -= 1;
+				}
+			}
+			// If there was no intersection, color the ray black.
+			// Lots of renderers use 4 channel color, RGBA, where A = alpha, often
+			// used for opacity, in which case they can indicate "no opacity".
+			// This can be useful for post-processing and image compositing.
+		}
+		else {
+			pathSegments[idx].color = glm::vec3(0.0f);
+			pathSegments[idx].remainingBounces = 0;
+		}
+
+		if (pathSegments[idx].remainingBounces == 0) alive_paths[alive_idx] = -1;
+	}
+}
+
 // Add the current iteration's output to the overall image
 __global__ void finalGather(int nPaths, glm::vec3 * image, PathSegment * iterationPaths)
 {
@@ -282,51 +424,49 @@ __global__ void finalGather(int nPaths, glm::vec3 * image, PathSegment * iterati
  * of memory management
  */
 void pathtrace(uchar4 *pbo, int frame, int iter) {
-    const int traceDepth = hst_scene->state.traceDepth;
-    const Camera &cam = hst_scene->state.camera;
-    const int pixelcount = cam.resolution.x * cam.resolution.y;
+	const int traceDepth = hst_scene->state.traceDepth;
+	Camera &cam = hst_scene->state.camera;
+	const int pixelcount = cam.resolution.x * cam.resolution.y;
 
 	// 2D block for generating ray from camera
-    const dim3 blockSize2d(8, 8);
-    const dim3 blocksPerGrid2d(
-            (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
-            (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
+	const dim3 blockSize2d(8, 8);
+	const dim3 blocksPerGrid2d(
+			(cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
+			(cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
 
 	// 1D block for path tracing
 	const int blockSize1d = 128;
 
-    ///////////////////////////////////////////////////////////////////////////
-
-    // Recap:
-    // * Initialize array of path rays (using rays that come out of the camera)
-    //   * You can pass the Camera object to that kernel.
-    //   * Each path ray must carry at minimum a (ray, color) pair,
-    //   * where color starts as the multiplicative identity, white = (1, 1, 1).
-    //   * This has already been done for you.
-    // * For each depth:
-    //   * Compute an intersection in the scene for each path ray.
-    //     A very naive version of this has been implemented for you, but feel
-    //     free to add more primitives and/or a better algorithm.
-    //     Currently, intersection distance is recorded as a parametric distance,
-    //     t, or a "distance along the ray." t = -1.0 indicates no intersection.
-    //     * Color is attenuated (multiplied) by reflections off of any object
-    //   * TODO: Stream compact away all of the terminated paths.
-    //     You may use either your implementation or `thrust::remove_if` or its
-    //     cousins.
-    //     * Note that you can't really use a 2D kernel launch any more - switch
-    //       to 1D.
-    //   * TODO: Shade the rays that intersected something or didn't bottom out.
-    //     That is, color the ray by performing a color computation according
-    //     to the shader, then generate a new ray to continue the ray path.
-    //     We recommend just updating the ray's PathSegment in place.
-    //     Note that this step may come before or after stream compaction,
-    //     since some shaders you write may also cause a path to terminate.
-    // * Finally, add this iteration's results to the image. This has been done
-    //   for you.
-
-    // TODO: perform one iteration of path tracing
-
-	generateRayFromCamera <<<blocksPerGrid2d, blockSize2d >>>(cam, iter, traceDepth, dev_paths);
+	///////////////////////////////////////////////////////////////////////////
+
+	// Recap:
+	// * Initialize array of path rays (using rays that come out of the camera)
+	//   * You can pass the Camera object to that kernel.
+	//   * Each path ray must carry at minimum a (ray, color) pair,
+	//   * where color starts as the multiplicative identity, white = (1, 1, 1).
+	//   * This has already been done for you.
+	// * For each depth:
+	//   * Compute an intersection in the scene for each path ray.
+	//     A very naive version of this has been implemented for you, but feel
+	//     free to add more primitives and/or a better algorithm.
+	//     Currently, intersection distance is recorded as a parametric distance,
+	//     t, or a "distance along the ray." t = -1.0 indicates no intersection.
+	//     * Color is attenuated (multiplied) by reflections off of any object
+	//   * TODO: Stream compact away all of the terminated paths.
+	//     You may use either your implementation or `thrust::remove_if` or its
+	//     cousins.
+	//     * Note that you can't really use a 2D kernel launch any more - switch
+	//       to 1D.
+	//   * TODO: Shade the rays that intersected something or didn't bottom out.
+	//     That is, color the ray by performing a color computation according
+	//     to the shader, then generate a new ray to continue the ray path.
+	//     We recommend just updating the ray's PathSegment in place.
+	//     Note that this step may come before or after stream compaction,
+	//     since some shaders you write may also cause a path to terminate.
+	// * Finally, add this iteration's results to the image. This has been done
+	//   for you.
+
+	generateRayFromCamera <<<blocksPerGrid2d, blockSize2d >>>(cam, iter, traceDepth, dev_paths, dev_alive_paths);
 	checkCUDAError("generate camera ray");
 
 	int depth = 0;
@@ -336,58 +476,111 @@ void pathtrace(uchar4 *pbo, int frame, int iter) {
 	// --- PathSegment Tracing Stage ---
 	// Shoot ray into scene, bounce between objects, push shading chunks
 
-  bool iterationComplete = false;
+	// calculate first intersections
+	dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d;
+	if (iter == 1) {
+		computeIntersections << <numblocksPathSegmentTracing, blockSize1d >> > (
+			depth
+			, num_paths
+			, dev_paths
+			, dev_alive_paths
+			, dev_geoms
+			, hst_scene->geoms.size()
+			, dev_first_intersections
+			,iter);
+		checkCUDAError("trace one bounce");
+		cudaDeviceSynchronize();
+	}
+
+#if MOTION_BLUR
+	kernMotionBlur << <1, hst_scene->geoms.size() >>> (hst_scene->geoms.size(), dev_geoms, iter);
+#endif
+
+	cudaEvent_t start, stop;
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+
+	cudaEventRecord(start);
+
+	bool iterationComplete = false;
 	while (!iterationComplete) {
 
-	// clean shading chunks
-	cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
 
-	// tracing
-	dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d;
-	computeIntersections <<<numblocksPathSegmentTracing, blockSize1d>>> (
-		depth
-		, num_paths
-		, dev_paths
-		, dev_geoms
-		, hst_scene->geoms.size()
-		, dev_intersections
-		);
-	checkCUDAError("trace one bounce");
-	cudaDeviceSynchronize();
-	depth++;
-
-
-	// TODO:
-	// --- Shading Stage ---
-	// Shade path segments based on intersections and generate new rays by
-  // evaluating the BSDF.
-  // Start off with just a big kernel that handles all the different
-  // materials you have in the scenefile.
-  // TODO: compare between directly shading the path segments and shading
-  // path segments that have been reshuffled to be contiguous in memory.
-
-  shadeFakeMaterial<<<numblocksPathSegmentTracing, blockSize1d>>> (
-    iter,
-    num_paths,
-    dev_intersections,
-    dev_paths,
-    dev_materials
-  );
-  iterationComplete = true; // TODO: should be based off stream compaction results.
+
+
+		// clean shading chunks
+		cudaMemset(dev_intersections, 0, num_paths * sizeof(ShadeableIntersection));
+		// tracing
+		if (depth == 0) {
+			cudaMemcpy(dev_intersections, dev_first_intersections, num_paths * sizeof(ShadeableIntersection), cudaMemcpyDeviceToDevice);
+		}
+		else {
+			numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d;
+			computeIntersections << <numblocksPathSegmentTracing, blockSize1d >> > (
+				depth
+				, num_paths
+				, dev_paths
+				, dev_alive_paths
+				, dev_geoms
+				, hst_scene->geoms.size()
+				, dev_intersections
+				,iter);
+			checkCUDAError("trace one bounce");
+			cudaDeviceSynchronize();
+		}
+
+		depth++;
+
+
+		// --- Shading Stage ---
+		// Shade path segments based on intersections and generate new rays by
+		// evaluating the BSDF.
+		// Start off with just a big kernel that handles all the different
+		// materials you have in the scenefile.
+
+#if SORT_BY_MATERIAL
+		thrust::sort_by_key(thrust::device, dev_intersections, dev_intersections + num_paths, dev_paths, compareIntersections());
+#endif 
+
+		diffuseShader<<<numblocksPathSegmentTracing, blockSize1d>>>(iter, 
+			num_paths, dev_intersections, dev_paths, dev_alive_paths, dev_materials, depth);
+
+#if STREAM_COMPACT
+		num_paths = StreamCompaction::Efficient::compactShared(num_paths, dev_alive_paths);
+#endif
+
+#if STREAM_COMPACT_THRUST
+		int* dev_alive_paths_end = thrust::partition(thrust::device, dev_alive_paths, dev_alive_paths + num_paths, isTerminated());
+		num_paths = dev_alive_paths_end - dev_alive_paths;
+#endif
+		//if (iter == 1) {
+		//	printf("%d\n", num_paths);
+		//}
+
+		iterationComplete = (num_paths == 0) || depth > traceDepth;
 	}
+	
+	cudaEventRecord(stop);
+	cudaEventSynchronize(stop);
+	float milliseconds = 0;
+	cudaEventElapsedTime(&milliseconds, start, stop);
+	
+	//printf("%.4f\n", milliseconds);
+
+	num_paths = pixelcount;
 
-  // Assemble this iteration and apply it to the image
-  dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d;
-	finalGather<<<numBlocksPixels, blockSize1d>>>(num_paths, dev_image, dev_paths);
+	// Assemble this iteration and apply it to the image
+	dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d;
+	finalGather << <numBlocksPixels, blockSize1d >> > (num_paths, dev_image, dev_paths);
 
-    ///////////////////////////////////////////////////////////////////////////
+	///////////////////////////////////////////////////////////////////////////
 
-    // Send results to OpenGL buffer for rendering
-    sendImageToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, iter, dev_image);
+	// Send results to OpenGL buffer for rendering
+	sendImageToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, iter, dev_image);
 
-    // Retrieve image from GPU
-    cudaMemcpy(hst_scene->state.image.data(), dev_image,
-            pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost);
+	// Retrieve image from GPU
+	cudaMemcpy(hst_scene->state.image.data(), dev_image,
+			pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost);
 
-    checkCUDAError("pathtrace");
+	checkCUDAError("pathtrace");
 }
diff --git a/src/scene.cpp b/src/scene.cpp
index cbae043..d96f59b 100644
--- a/src/scene.cpp
+++ b/src/scene.cpp
@@ -83,6 +83,8 @@ int Scene::loadGeom(string objectid) {
                 newGeom.translation, newGeom.rotation, newGeom.scale);
         newGeom.inverseTransform = glm::inverse(newGeom.transform);
         newGeom.invTranspose = glm::inverseTranspose(newGeom.transform);
+		newGeom.initialTransform = utilityCore::buildTransformationMatrix(
+			newGeom.translation, newGeom.rotation, newGeom.scale);
 
         geoms.push_back(newGeom);
         return 1;
diff --git a/src/sceneStructs.h b/src/sceneStructs.h
index b38b820..34a824b 100644
--- a/src/sceneStructs.h
+++ b/src/sceneStructs.h
@@ -26,6 +26,7 @@ struct Geom {
     glm::mat4 transform;
     glm::mat4 inverseTransform;
     glm::mat4 invTranspose;
+	glm::mat4 initialTransform;
 };
 
 struct Material {
@@ -73,4 +74,5 @@ struct ShadeableIntersection {
   float t;
   glm::vec3 surfaceNormal;
   int materialId;
+  glm::vec3 intersectPoint;
 };
diff --git a/src/utilities.cpp b/src/utilities.cpp
index 9c06c68..8dea8f0 100644
--- a/src/utilities.cpp
+++ b/src/utilities.cpp
@@ -109,4 +109,4 @@ std::istream& utilityCore::safeGetline(std::istream& is, std::string& t) {
             t += (char)c;
         }
     }
-}
+}
\ No newline at end of file