diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000..12949a7
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,20 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "CUDA C++: Launch",
+            "type": "cuda-gdb",
+            "request": "launch",
+            "program": ""
+        },
+        {
+            "name": "CUDA C++: Attach",
+            "type": "cuda-gdb",
+            "request": "attach",
+            "processId": "${command:cuda.pickProcess}"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..17ae526
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,82 @@
+{
+    "files.associations": {
+        "*.sdf": "xml",
+        "*.world": "xml",
+        "iostream": "cpp",
+        "cctype": "cpp",
+        "clocale": "cpp",
+        "cmath": "cpp",
+        "csignal": "cpp",
+        "cstdarg": "cpp",
+        "cstddef": "cpp",
+        "cstdio": "cpp",
+        "cstdlib": "cpp",
+        "cstring": "cpp",
+        "ctime": "cpp",
+        "cwchar": "cpp",
+        "cwctype": "cpp",
+        "any": "cpp",
+        "array": "cpp",
+        "atomic": "cpp",
+        "strstream": "cpp",
+        "bit": "cpp",
+        "*.tcc": "cpp",
+        "bitset": "cpp",
+        "chrono": "cpp",
+        "codecvt": "cpp",
+        "compare": "cpp",
+        "complex": "cpp",
+        "concepts": "cpp",
+        "condition_variable": "cpp",
+        "cstdint": "cpp",
+        "deque": "cpp",
+        "forward_list": "cpp",
+        "list": "cpp",
+        "map": "cpp",
+        "set": "cpp",
+        "string": "cpp",
+        "unordered_map": "cpp",
+        "unordered_set": "cpp",
+        "vector": "cpp",
+        "exception": "cpp",
+        "algorithm": "cpp",
+        "functional": "cpp",
+        "iterator": "cpp",
+        "memory": "cpp",
+        "memory_resource": "cpp",
+        "numeric": "cpp",
+        "optional": "cpp",
+        "random": "cpp",
+        "ratio": "cpp",
+        "string_view": "cpp",
+        "system_error": "cpp",
+        "tuple": "cpp",
+        "type_traits": "cpp",
+        "utility": "cpp",
+        "fstream": "cpp",
+        "future": "cpp",
+        "initializer_list": "cpp",
+        "iomanip": "cpp",
+        "iosfwd": "cpp",
+        "istream": "cpp",
+        "limits": "cpp",
+        "mutex": "cpp",
+        "new": "cpp",
+        "numbers": "cpp",
+        "ostream": "cpp",
+        "semaphore": "cpp",
+        "shared_mutex": "cpp",
+        "sstream": "cpp",
+        "stdexcept": "cpp",
+        "stop_token": "cpp",
+        "streambuf": "cpp",
+        "thread": "cpp",
+        "cfenv": "cpp",
+        "cinttypes": "cpp",
+        "typeindex": "cpp",
+        "typeinfo": "cpp",
+        "valarray": "cpp",
+        "variant": "cpp"
+    },
+    "ros.distro": "humble"
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index d63a6a1..e4fb9bb 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,107 @@
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture,
-Project 1 - Flocking**
+# CIS 565: GPU Programming and Architecture
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+## Project 1 - Flocking
 
-### (TODO: Your README)
+![Boids 1.3](images/boids13.gif)
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+**Jason Xie**
+
+[🤓 LinkedIn](https://linkedin.com/in/jia-chun-xie)
+
+[😇 my website](https://jchunx.dev)
+
+[🥵 X (formerly 🐦)](https://x.com/codemonke_)
+
+Tested on: Ubuntu 22.04, i5-8400, RTX 3060Ti, personal machine
+
+## Performance Analysis
+
+### FPS vs. Number of Boids
+
+@ block size 128
+
+![FPS vs. Number of Boids](images/boids-perf.png)
+
+#### Naive
+
+|# Boids| FPS    |
+|-------|--------|
+| 1000  | 1900 |
+| 10000 |  220   |
+| 100000|  6   |
+
+#### Scattered Uniform
+
+|# Boids| FPS    |
+|-------|--------|
+| 1000  |  4000  |
+| 10000 |  2100  |
+| 100000|  270   |
+| 1000000|  6   |
+
+#### Coherent Uniform
+
+|# Boids| FPS    |
+|-------|--------|
+| 1000  |  3800  |
+| 10000 |  2800  |
+| 100000|  280   |
+| 1000000|  1    |
+
+### FPS vs. Block Size
+
+@ 10000 boids
+
+![FPS vs. Block Size](images/boids-block-perf.png)
+
+#### Naive
+
+|Block Size| FPS    |
+|-------|--------|
+| 32  |  220  |
+| 64 |  225  |
+| 128|  220  |
+| 256|  225  |
+
+#### Scattered Uniform
+
+|Block Size| FPS    |
+|-------|--------|
+| 32  |  2100  |
+| 64 |  2050  |
+| 128|  2100  |
+| 256|  2250  |
+
+#### Coherent Uniform
+
+|Block Size| FPS    |
+|-------|--------|
+| 32  |  2600  |
+| 64 |  2760  |
+| 128|  2800  |
+| 256|  2780  |
+
+## Q&A
+
+### Boids vs. performance
+
+Increasing the number of boids dramatically reduces the framerate. This is due to the quadratic nature of neighbor checking. Using uniform grids reduces the number of checks, resulting in $O(NlogN)$-like performance.
+
+### Blocks vs. performance
+
+For lower block sizes, increasing the number of blocks resulted in higher performance. However, for higher block sizes, increasing the number of blocks resulted in lower performance. For lower block sizes, the number of blocks is not limited by the number of SMs, so increasing the number of blocks increases the number of threads that can be run in parallel. But increasing block sizes further likely introduces more overhead than the performance gained from parallelization.
+
+### Coherent vs. scattered uniform grids
+
+Coherent uniform grids performed generally the same as scattered uniform grids. This is not a very surprising outcome, as the primitive rearrangement kernel simply performed the same swapping operation done on the scattered uniform kernel.
+
+### 27 vs. 8 neighbors
+
+@ 10000 boids, uniform grid, block size 128
+
+|Neighbor Count| FPS    |
+|-------|--------|
+| 27  |  3140  |
+| 8 |  2100  |
+
+Changing cell width and increasing the number of neighbors to 27 resulted in an increase of performance. This is due to the fact that a finer grid results in fewer neighbors outside of the checking radius, resulting in fewer checks.
\ No newline at end of file
diff --git a/build.sh b/build.sh
new file mode 100755
index 0000000..a74cbcd
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# Create build directory if it doesn't exist
+mkdir -p build
+
+# Change into the build directory
+cd build
+
+# Default build type to Release
+build_type="Release"
+
+# Check for argument "debug" to change build type
+if [ "$1" == "debug" ]; then
+  echo "Building in debug mode"
+  build_type="Debug"
+fi
+
+# Run cmake with the specified build type
+cmake -DCMAKE_BUILD_TYPE=$build_type ..
+
+# Build the project with dbg if debug was specified
+if [ "$1" == "debug" ]; then
+  make dbg=1
+else
+  make
+fi
diff --git a/images/boids-block-perf.png b/images/boids-block-perf.png
new file mode 100644
index 0000000..641454d
Binary files /dev/null and b/images/boids-block-perf.png differ
diff --git a/images/boids-perf.png b/images/boids-perf.png
new file mode 100644
index 0000000..5b743fc
Binary files /dev/null and b/images/boids-perf.png differ
diff --git a/images/boids13.gif b/images/boids13.gif
new file mode 100644
index 0000000..1d723ae
Binary files /dev/null and b/images/boids13.gif differ
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..d1915fd 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -21,14 +21,14 @@
 * Check for CUDA errors; print and exit if there was a problem.
 */
 void checkCUDAError(const char *msg, int line = -1) {
-  cudaError_t err = cudaGetLastError();
-  if (cudaSuccess != err) {
-    if (line >= 0) {
-      fprintf(stderr, "Line %d: ", line);
+    cudaError_t err = cudaGetLastError();
+    if (cudaSuccess != err) {
+        if (line >= 0) {
+            fprintf(stderr, "Line %d: ", line);
+        }
+        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
     }
-    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
-    exit(EXIT_FAILURE);
-  }
 }
 
 
@@ -37,7 +37,7 @@ void checkCUDAError(const char *msg, int line = -1) {
 *****************/
 
 /*! Block size used for CUDA kernel launch. */
-#define blockSize 128
+#define blockSize 256
 
 // LOOK-1.2 Parameters for the boids algorithm.
 // These worked well in our reference implementation.
@@ -81,10 +81,12 @@ thrust::device_ptr<int> dev_thrust_particleArrayIndices;
 thrust::device_ptr<int> dev_thrust_particleGridIndices;
 
 int *dev_gridCellStartIndices; // What part of dev_particleArrayIndices belongs
-int *dev_gridCellEndIndices;   // to this cell?
+int *dev_gridCellEndIndices;     // to this cell?
 
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
+glm::vec3 *dev_pos_rearranged;
+glm::vec3 *dev_vel_rearranged;
 
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
@@ -99,13 +101,13 @@ glm::vec3 gridMinimum;
 ******************/
 
 __host__ __device__ unsigned int hash(unsigned int a) {
-  a = (a + 0x7ed55d16) + (a << 12);
-  a = (a ^ 0xc761c23c) ^ (a >> 19);
-  a = (a + 0x165667b1) + (a << 5);
-  a = (a + 0xd3a2646c) ^ (a << 9);
-  a = (a + 0xfd7046c5) + (a << 3);
-  a = (a ^ 0xb55a4f09) ^ (a >> 16);
-  return a;
+    a = (a + 0x7ed55d16) + (a << 12);
+    a = (a ^ 0xc761c23c) ^ (a >> 19);
+    a = (a + 0x165667b1) + (a << 5);
+    a = (a + 0xd3a2646c) ^ (a << 9);
+    a = (a + 0xfd7046c5) + (a << 3);
+    a = (a ^ 0xb55a4f09) ^ (a >> 16);
+    return a;
 }
 
 /**
@@ -113,10 +115,10 @@ __host__ __device__ unsigned int hash(unsigned int a) {
 * Function for generating a random vec3.
 */
 __host__ __device__ glm::vec3 generateRandomVec3(float time, int index) {
-  thrust::default_random_engine rng(hash((int)(index * time)));
-  thrust::uniform_real_distribution<float> unitDistrib(-1, 1);
+    thrust::default_random_engine rng(hash((int)(index * time)));
+    thrust::uniform_real_distribution<float> unitDistrib(-1, 1);
 
-  return glm::vec3((float)unitDistrib(rng), (float)unitDistrib(rng), (float)unitDistrib(rng));
+    return glm::vec3((float)unitDistrib(rng), (float)unitDistrib(rng), (float)unitDistrib(rng));
 }
 
 /**
@@ -124,52 +126,67 @@ __host__ __device__ glm::vec3 generateRandomVec3(float time, int index) {
 * CUDA kernel for generating boids with a specified mass randomly around the star.
 */
 __global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, float scale) {
-  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
-  if (index < N) {
-    glm::vec3 rand = generateRandomVec3(time, index);
-    arr[index].x = scale * rand.x;
-    arr[index].y = scale * rand.y;
-    arr[index].z = scale * rand.z;
-  }
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index < N) {
+        glm::vec3 rand = generateRandomVec3(time, index);
+        arr[index].x = scale * rand.x;
+        arr[index].y = scale * rand.y;
+        arr[index].z = scale * rand.z;
+    }
 }
 
 /**
 * Initialize memory, update some globals
 */
 void Boids::initSimulation(int N) {
-  numObjects = N;
-  dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
-
-  // LOOK-1.2 - This is basic CUDA memory management and error checking.
-  // Don't forget to cudaFree in  Boids::endSimulation.
-  cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3));
-  checkCUDAErrorWithLine("cudaMalloc dev_pos failed!");
-
-  cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3));
-  checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!");
-
-  cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3));
-  checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!");
-
-  // LOOK-1.2 - This is a typical CUDA kernel invocation.
-  kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(1, numObjects,
-    dev_pos, scene_scale);
-  checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
-
-  // LOOK-2.1 computing grid params
-  gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
-  int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
-  gridSideCount = 2 * halfSideCount;
-
-  gridCellCount = gridSideCount * gridSideCount * gridSideCount;
-  gridInverseCellWidth = 1.0f / gridCellWidth;
-  float halfGridWidth = gridCellWidth * halfSideCount;
-  gridMinimum.x -= halfGridWidth;
-  gridMinimum.y -= halfGridWidth;
-  gridMinimum.z -= halfGridWidth;
-
-  // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
-  cudaDeviceSynchronize();
+    numObjects = N;
+    dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
+
+    //  - This is basic CUDA memory management and error checking.
+    // Don't forget to cudaFree in    Boids::endSimulation.
+    cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_pos failed!");
+
+    cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!");
+
+    cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!");
+
+    // LOOK-1.2 - This is a typical CUDA kernel invocation.
+    kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(1, numObjects,
+        dev_pos, scene_scale);
+    checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
+
+    // LOOK-2.1 computing grid params
+    gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
+    int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
+    gridSideCount = 2 * halfSideCount;
+
+    gridCellCount = gridSideCount * gridSideCount * gridSideCount;
+    gridInverseCellWidth = 1.0f / gridCellWidth;
+    float halfGridWidth = gridCellWidth * halfSideCount;
+    gridMinimum.x -= halfGridWidth;
+    gridMinimum.y -= halfGridWidth;
+    gridMinimum.z -= halfGridWidth;
+
+    // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+    cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+    cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+    cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+    cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+    cudaMalloc((void**)&dev_pos_rearranged, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_pos_rearranged failed!");
+    cudaMalloc((void**)&dev_vel_rearranged, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_vel_rearranged failed!");
+
+    cudaDeviceSynchronize();
 }
 
 
@@ -181,41 +198,41 @@ void Boids::initSimulation(int N) {
 * Copy the boid positions into the VBO so that they can be drawn by OpenGL.
 */
 __global__ void kernCopyPositionsToVBO(int N, glm::vec3 *pos, float *vbo, float s_scale) {
-  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
 
-  float c_scale = -1.0f / s_scale;
+    float c_scale = -1.0f / s_scale;
 
-  if (index < N) {
-    vbo[4 * index + 0] = pos[index].x * c_scale;
-    vbo[4 * index + 1] = pos[index].y * c_scale;
-    vbo[4 * index + 2] = pos[index].z * c_scale;
-    vbo[4 * index + 3] = 1.0f;
-  }
+    if (index < N) {
+        vbo[4 * index + 0] = pos[index].x * c_scale;
+        vbo[4 * index + 1] = pos[index].y * c_scale;
+        vbo[4 * index + 2] = pos[index].z * c_scale;
+        vbo[4 * index + 3] = 1.0f;
+    }
 }
 
 __global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float s_scale) {
-  int index = threadIdx.x + (blockIdx.x * blockDim.x);
-
-  if (index < N) {
-    vbo[4 * index + 0] = vel[index].x + 0.3f;
-    vbo[4 * index + 1] = vel[index].y + 0.3f;
-    vbo[4 * index + 2] = vel[index].z + 0.3f;
-    vbo[4 * index + 3] = 1.0f;
-  }
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+    if (index < N) {
+        vbo[4 * index + 0] = vel[index].x + 0.3f;
+        vbo[4 * index + 1] = vel[index].y + 0.3f;
+        vbo[4 * index + 2] = vel[index].z + 0.3f;
+        vbo[4 * index + 3] = 1.0f;
+    }
 }
 
 /**
 * Wrapper for call to the kernCopyboidsToVBO CUDA kernel.
 */
 void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) {
-  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
 
-  kernCopyPositionsToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_pos, vbodptr_positions, scene_scale);
-  kernCopyVelocitiesToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_vel1, vbodptr_velocities, scene_scale);
+    kernCopyPositionsToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_pos, vbodptr_positions, scene_scale);
+    kernCopyVelocitiesToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_vel1, vbodptr_velocities, scene_scale);
 
-  checkCUDAErrorWithLine("copyBoidsToVBO failed!");
+    checkCUDAErrorWithLine("copyBoidsToVBO failed!");
 
-  cudaDeviceSynchronize();
+    cudaDeviceSynchronize();
 }
 
 
@@ -230,21 +247,62 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
 * in the `pos` and `vel` arrays.
 */
 __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
-  // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
-  // Rule 2: boids try to stay a distance d away from each other
-  // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index >= N) {
+        return glm::vec3(0.0f, 0.0f, 0.0f);
+    }
+    // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+    glm::vec3 perceived_center = glm::vec3(0.0f, 0.0f, 0.0f);
+    int rule1Count = 0;
+    for (int i = 0; i < N; i++) {
+        if (i != iSelf && glm::distance(pos[i], pos[iSelf]) < rule1Distance) {
+            perceived_center += pos[i];
+            rule1Count++;
+        }
+    }
+    perceived_center /= rule1Count;
+    glm::vec3 v1 = (perceived_center - pos[iSelf]) * rule1Scale;
+
+    // Rule 2: boids try to stay a distance d away from each other
+    glm::vec3 v2 = glm::vec3(0.0f, 0.0f, 0.0f);
+    for (int i = 0; i < N; i++) {
+        if (i != iSelf && glm::distance(pos[i], pos[iSelf]) < rule2Distance) {
+            v2 -= (pos[i] - pos[iSelf]);
+        }
+    }
+    v2 = v2 * rule2Scale;
+
+    // Rule 3: boids try to match the speed of surrounding boids
+    glm::vec3 v3 = glm::vec3(0.0f, 0.0f, 0.0f);
+
+    for (int i = 0; i < N; i++) {
+        if (i != iSelf && glm::distance(pos[i], pos[iSelf]) < rule3Distance) {
+            v3 += vel[i];
+        }
+    }
+    v3 = v3 * rule3Scale;
+
+    return v1 + v2 + v3;
 }
 
 /**
 * TODO-1.2 implement basic flocking
 * For each of the `N` bodies, update its position based on its current velocity.
 */
-__global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
-  glm::vec3 *vel1, glm::vec3 *vel2) {
-  // Compute a new velocity based on pos and vel1
-  // Clamp the speed
-  // Record the new velocity into vel2. Question: why NOT vel1?
+__global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index >= N) {
+        return;
+    }
+    // Compute a new velocity based on pos and vel1
+    glm::vec3 deltaV = computeVelocityChange(N, index, pos, vel1);
+    glm::vec3 newV = vel1[index] + deltaV;
+    // Clamp the speed
+    newV = glm::vec3(glm::clamp(newV.x, -maxSpeed, maxSpeed),
+                     glm::clamp(newV.y, -maxSpeed, maxSpeed),
+                     glm::clamp(newV.z, -maxSpeed, maxSpeed));
+    // Record the new velocity into vel2. Question: why NOT vel1?
+    vel2[index] = newV;
 }
 
 /**
@@ -252,206 +310,488 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
 * For each of the `N` bodies, update its position based on its current velocity.
 */
 __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) {
-  // Update position by velocity
-  int index = threadIdx.x + (blockIdx.x * blockDim.x);
-  if (index >= N) {
-    return;
-  }
-  glm::vec3 thisPos = pos[index];
-  thisPos += vel[index] * dt;
+    // Update position by velocity
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index >= N) {
+        return;
+    }
+    glm::vec3 thisPos = pos[index];
+    thisPos += vel[index] * dt;
 
-  // Wrap the boids around so we don't lose them
-  thisPos.x = thisPos.x < -scene_scale ? scene_scale : thisPos.x;
-  thisPos.y = thisPos.y < -scene_scale ? scene_scale : thisPos.y;
-  thisPos.z = thisPos.z < -scene_scale ? scene_scale : thisPos.z;
+    // Wrap the boids around so we don't lose them
+    thisPos.x = thisPos.x < -scene_scale ? scene_scale : thisPos.x;
+    thisPos.y = thisPos.y < -scene_scale ? scene_scale : thisPos.y;
+    thisPos.z = thisPos.z < -scene_scale ? scene_scale : thisPos.z;
 
-  thisPos.x = thisPos.x > scene_scale ? -scene_scale : thisPos.x;
-  thisPos.y = thisPos.y > scene_scale ? -scene_scale : thisPos.y;
-  thisPos.z = thisPos.z > scene_scale ? -scene_scale : thisPos.z;
+    thisPos.x = thisPos.x > scene_scale ? -scene_scale : thisPos.x;
+    thisPos.y = thisPos.y > scene_scale ? -scene_scale : thisPos.y;
+    thisPos.z = thisPos.z > scene_scale ? -scene_scale : thisPos.z;
 
-  pos[index] = thisPos;
+    pos[index] = thisPos;
 }
 
 // LOOK-2.1 Consider this method of computing a 1D index from a 3D grid index.
 // LOOK-2.3 Looking at this method, what would be the most memory efficient
-//          order for iterating over neighboring grid cells?
-//          for(x)
-//            for(y)
-//             for(z)? Or some other order?
+//                    order for iterating over neighboring grid cells?
+//                    for(x)
+//                        for(y)
+//                         for(z)? Or some other order?
 __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) {
-  return x + y * gridResolution + z * gridResolution * gridResolution;
+    return x + y * gridResolution + z * gridResolution * gridResolution;
 }
 
 __global__ void kernComputeIndices(int N, int gridResolution,
-  glm::vec3 gridMin, float inverseCellWidth,
-  glm::vec3 *pos, int *indices, int *gridIndices) {
-    // TODO-2.1
-    // - Label each boid with the index of its grid cell.
-    // - Set up a parallel array of integer indices as pointers to the actual
-    //   boid data in pos and vel1/vel2
+    glm::vec3 gridMin, float inverseCellWidth,
+    glm::vec3 *pos, int *indices, int *gridIndices) {
+        // TODO-2.1
+        // - Label each boid with the index of its grid cell.
+        // - Set up a parallel array of integer indices as pointers to the actual
+        //     boid data in pos and vel1/vel2
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+    glm::vec3 gridIndex3D = glm::floor((pos[index] - gridMin) * inverseCellWidth);
+    int gridIndex1D = gridIndex3Dto1D(gridIndex3D.x, gridIndex3D.y, gridIndex3D.z, gridResolution);
+    gridIndices[index] = gridIndex1D;
+    indices[index] = index;
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
-//          does not enclose any boids
+//                    does not enclose any boids
 __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) {
-  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
-  if (index < N) {
-    intBuffer[index] = value;
-  }
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index < N) {
+        intBuffer[index] = value;
+    }
 }
 
 __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
-  int *gridCellStartIndices, int *gridCellEndIndices) {
-  // TODO-2.1
-  // Identify the start point of each cell in the gridIndices array.
-  // This is basically a parallel unrolling of a loop that goes
-  // "this index doesn't match the one before it, must be a new cell!"
+    int *gridCellStartIndices, int *gridCellEndIndices) {
+    // TODO-2.1
+    // Identify the start point of each cell in the gridIndices array.
+    // This is basically a parallel unrolling of a loop that goes
+    // "this index doesn't match the one before it, must be a new cell!"
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+
+    int gridIndex = particleGridIndices[index];
+    int prevGridIndex = gridIndex == 0 ? -1 : particleGridIndices[index - 1];
+
+    if (gridIndex != prevGridIndex) {
+        gridCellStartIndices[gridIndex] = index;
+        if (gridIndex != 0) {
+            gridCellEndIndices[prevGridIndex] = index - 1;
+        }
+    }
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
-  int N, int gridResolution, glm::vec3 gridMin,
-  float inverseCellWidth, float cellWidth,
-  int *gridCellStartIndices, int *gridCellEndIndices,
-  int *particleArrayIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
-  // the number of boids that need to be checked.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+    int N, int gridResolution, glm::vec3 gridMin,
+    float inverseCellWidth, float cellWidth,
+    int *gridCellStartIndices, int *gridCellEndIndices,
+    int *particleArrayIndices,
+    glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+    // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
+    // the number of boids that need to be checked.
+    // - Identify the grid cell that this particle is in
+    // - Identify which cells may contain neighbors. This isn't always 8.
+    // - For each cell, read the start/end indices in the boid pointer array.
+    // - Access each boid in the cell and compute velocity change from
+    //     the boids rules, if this boid is within the neighborhood distance.
+    // - Clamp the speed change before putting the new speed in vel2
+
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+    int particleIndex = particleArrayIndices[index];
+    glm::vec3 boidPos = pos[particleIndex];
+    glm::vec3 boidVel = vel1[particleIndex];
+
+    int boidXIndex = glm::round((boidPos.x - gridMin.x) * inverseCellWidth);
+    int boidYIndex = glm::round((boidPos.y - gridMin.y) * inverseCellWidth);
+    int boidZIndex = glm::round((boidPos.z - gridMin.z) * inverseCellWidth);
+
+    int boidXNeighbor[2] = { boidXIndex - 1, boidXIndex };
+    int boidYNeighbor[2] = { boidYIndex - 1, boidYIndex };
+    int boidZNeighbor[2] = { boidZIndex - 1, boidZIndex };
+
+    int neighborGridIndices[8] = { -1, -1, -1, -1, -1, -1, -1, -1 };
+    int neighborIndex = 0;
+
+    #pragma unroll
+    for (int i = 0; i < 2; i++) {
+        int x = boidXNeighbor[i];
+        bool validX = x >= 0 && x < gridResolution;
+
+        #pragma unroll
+        for (int j = 0; j < 2; j++) {
+            int y = boidYNeighbor[j];
+            bool validY = y >= 0 && y < gridResolution;
+
+            #pragma unroll
+            for (int k = 0; k < 2; k++) {
+                int z = boidZNeighbor[k];
+                bool validZ = z >= 0 && z < gridResolution;
+
+                if (!validX || !validY || !validZ) continue;
+                neighborGridIndices[neighborIndex++] = gridIndex3Dto1D(x, y, z, gridResolution);
+            }
+        }
+    }
+
+    glm::vec3 perceived_center = glm::vec3(0.0f, 0.0f, 0.0f);
+    int rule1Count = 0;
+    glm::vec3 v2 = glm::vec3(0.0f, 0.0f, 0.0f);
+    glm::vec3 v3 = glm::vec3(0.0f, 0.0f, 0.0f);
+
+    #pragma unroll
+    for (int i = 0; i < 8; i++) {
+        int neighborGridIndex = neighborGridIndices[i];
+        if (neighborGridIndex == -1) continue;
+
+        int neighborStartIndex = gridCellStartIndices[neighborGridIndex];
+        int neighborEndIndex = gridCellEndIndices[neighborGridIndex];
+
+        if (neighborStartIndex == -1 || neighborEndIndex == -1) continue;
+
+        for (int j = neighborStartIndex; j <= neighborEndIndex; j++) {
+            int neighborBoidIndex = particleArrayIndices[j];
+            if (neighborBoidIndex == particleIndex) continue;
+
+            glm::vec3 neighborPos = pos[neighborBoidIndex];
+            float distance = glm::distance(neighborPos, boidPos);
+
+            if (distance < rule1Distance) {
+                perceived_center += neighborPos;
+                rule1Count++;
+            }
+            if (distance < rule2Distance) {
+                v2 -= (neighborPos - boidPos);
+            }
+            if (distance < rule3Distance) {
+                v3 += vel1[neighborBoidIndex];
+            }
+        }
+    }
+
+    perceived_center /= rule1Count;
+    glm::vec3 v1 = (perceived_center - boidPos) * rule1Scale;
+    v2 = v2 * rule2Scale;
+    v3 = v3 * rule3Scale;
+
+    glm::vec3 newV = boidVel + v1 + v2 + v3;
+    
+    // Clamp the speed
+    newV = glm::vec3(glm::clamp(newV.x, -maxSpeed, maxSpeed),
+                     glm::clamp(newV.y, -maxSpeed, maxSpeed),
+                     glm::clamp(newV.z, -maxSpeed, maxSpeed));
+
+    // Record the new velocity into vel2. Question: why NOT vel1?
+    vel2[particleIndex] = newV;
+
+}
+
+__global__ void kernReorderPosVel(int N, int *particleArrayIndices, 
+    glm::vec3 *pos, glm::vec3 *vel, 
+    glm::vec3 *dev_pos_rearranged, glm::vec3 *dev_vel_rearranged) {
+
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+    
+    int particleIndex = particleArrayIndices[index];
+    dev_pos_rearranged[index] = pos[particleIndex];
+    dev_vel_rearranged[index] = vel[particleIndex];
 }
 
 __global__ void kernUpdateVelNeighborSearchCoherent(
-  int N, int gridResolution, glm::vec3 gridMin,
-  float inverseCellWidth, float cellWidth,
-  int *gridCellStartIndices, int *gridCellEndIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
-  // except with one less level of indirection.
-  // This should expect gridCellStartIndices and gridCellEndIndices to refer
-  // directly to pos and vel1.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  //   DIFFERENCE: For best results, consider what order the cells should be
-  //   checked in to maximize the memory benefits of reordering the boids data.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+    int N, int gridResolution, glm::vec3 gridMin,
+    float inverseCellWidth, float cellWidth,
+    int *gridCellStartIndices, int *gridCellEndIndices,
+    glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+    // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
+    // except with one less level of indirection.
+    // This should expect gridCellStartIndices and gridCellEndIndices to refer
+    // directly to pos and vel1.
+    // - Identify the grid cell that this particle is in
+    // - Identify which cells may contain neighbors. This isn't always 8.
+    // - For each cell, read the start/end indices in the boid pointer array.
+    //     DIFFERENCE: For best results, consider what order the cells should be
+    //     checked in to maximize the memory benefits of reordering the boids data.
+    // - Access each boid in the cell and compute velocity change from
+    //     the boids rules, if this boid is within the neighborhood distance.
+    // - Clamp the speed change before putting the new speed in vel2
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+    glm::vec3 boidPos = pos[index];
+    glm::vec3 boidVel = vel1[index];
+
+    int boidXIndex = glm::round((boidPos.x - gridMin.x) * inverseCellWidth);
+    int boidYIndex = glm::round((boidPos.y - gridMin.y) * inverseCellWidth);
+    int boidZIndex = glm::round((boidPos.z - gridMin.z) * inverseCellWidth);
+
+    int boidXNeighbor[2] = { boidXIndex - 1, boidXIndex };
+    int boidYNeighbor[2] = { boidYIndex - 1, boidYIndex };
+    int boidZNeighbor[2] = { boidZIndex - 1, boidZIndex };
+
+    int neighborGridIndices[8] = { -1, -1, -1, -1, -1, -1, -1, -1 };
+    int neighborIndex = 0;
+
+    #pragma unroll
+    for (int i = 0; i < 2; i++) {
+        int x = boidXNeighbor[i];
+        bool validX = x >= 0 && x < gridResolution;
+
+        #pragma unroll
+        for (int j = 0; j < 2; j++) {
+            int y = boidYNeighbor[j];
+            bool validY = y >= 0 && y < gridResolution;
+
+            #pragma unroll
+            for (int k = 0; k < 2; k++) {
+                int z = boidZNeighbor[k];
+                bool validZ = z >= 0 && z < gridResolution;
+
+                if (!validX || !validY || !validZ) continue;
+                neighborGridIndices[neighborIndex++] = gridIndex3Dto1D(x, y, z, gridResolution);
+            }
+        }
+    }
+
+    glm::vec3 perceived_center = glm::vec3(0.0f, 0.0f, 0.0f);
+    int rule1Count = 0;
+    glm::vec3 v2 = glm::vec3(0.0f, 0.0f, 0.0f);
+    glm::vec3 v3 = glm::vec3(0.0f, 0.0f, 0.0f);
+
+    #pragma unroll
+    for (int i = 0; i < 8; i++) {
+        int neighborGridIndex = neighborGridIndices[i];
+        if (neighborGridIndex == -1) continue;
+
+        int neighborStartIndex = gridCellStartIndices[neighborGridIndex];
+        int neighborEndIndex = gridCellEndIndices[neighborGridIndex];
+
+        if (neighborStartIndex == -1 || neighborEndIndex == -1) continue;
+
+        for (int j = neighborStartIndex; j <= neighborEndIndex; j++) {
+            if (j == index) continue;
+
+            glm::vec3 neighborPos = pos[j];
+            float distance = glm::distance(neighborPos, boidPos);
+
+            if (distance < rule1Distance) {
+                perceived_center += neighborPos;
+                rule1Count++;
+            }
+            if (distance < rule2Distance) {
+                v2 -= (neighborPos - boidPos);
+            }
+            if (distance < rule3Distance) {
+                v3 += vel1[j];
+            }
+        }
+    }
+
+    perceived_center /= rule1Count;
+    glm::vec3 v1 = (perceived_center - boidPos) * rule1Scale;
+    v2 = v2 * rule2Scale;
+    v3 = v3 * rule3Scale;
+
+    glm::vec3 newV = boidVel + v1 + v2 + v3;
+    
+    // Clamp the speed
+    newV = glm::vec3(glm::clamp(newV.x, -maxSpeed, maxSpeed),
+                     glm::clamp(newV.y, -maxSpeed, maxSpeed),
+                     glm::clamp(newV.z, -maxSpeed, maxSpeed));
+
+    // Record the new velocity into vel2. Question: why NOT vel1?
+    vel2[index] = newV;
 }
 
 /**
 * Step the entire N-body simulation by `dt` seconds.
 */
 void Boids::stepSimulationNaive(float dt) {
-  // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
-  // TODO-1.2 ping-pong the velocity buffers
+    // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+    kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+
+    kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel1);
+    // TODO-1.2 ping-pong the velocity buffers
+    glm::vec3 *tmp = dev_vel1;
+    dev_vel1 = dev_vel2;
+    dev_vel2 = tmp;
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
-  // TODO-2.1
-  // Uniform Grid Neighbor search using Thrust sort.
-  // In Parallel:
-  // - label each particle with its array index as well as its grid index.
-  //   Use 2x width grids.
-  // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
-  //   are welcome to do a performance comparison.
-  // - Naively unroll the loop for finding the start and end indices of each
-  //   cell's data pointers in the array of boid indices
-  // - Perform velocity updates using neighbor search
-  // - Update positions
-  // - Ping-pong buffers as needed
+    // TODO-2.1
+    // Uniform Grid Neighbor search using Thrust sort.
+    // In Parallel:
+    // - label each particle with its array index as well as its grid index.
+    //     Use 2x width grids.
+    // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
+    //     are welcome to do a performance comparison.
+    // - Naively unroll the loop for finding the start and end indices of each
+    //     cell's data pointers in the array of boid indices
+    // - Perform velocity updates using neighbor search
+    // - Update positions
+    // - Ping-pong buffers as needed
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+    kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+    dev_thrust_particleArrayIndices = thrust::device_pointer_cast(dev_particleArrayIndices);
+    dev_thrust_particleGridIndices = thrust::device_pointer_cast(dev_particleGridIndices);
+
+    thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+    kernResetIntBuffer<<<fullBlocksPerGrid, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+    kernResetIntBuffer<<<fullBlocksPerGrid, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+
+    kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+    kernUpdateVelNeighborSearchScattered<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+
+    kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel1);
+
+    glm::vec3 *tmp = dev_vel1;
+    dev_vel1 = dev_vel2;
+    dev_vel2 = tmp;
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
-  // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
-  // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
-  // In Parallel:
-  // - Label each particle with its array index as well as its grid index.
-  //   Use 2x width grids
-  // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
-  //   are welcome to do a performance comparison.
-  // - Naively unroll the loop for finding the start and end indices of each
-  //   cell's data pointers in the array of boid indices
-  // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
-  //   the particle data in the simulation array.
-  //   CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED
-  // - Perform velocity updates using neighbor search
-  // - Update positions
-  // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+    // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
+    // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
+    // In Parallel:
+    // - Label each particle with its array index as well as its grid index.
+    //     Use 2x width grids
+    // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
+    //     are welcome to do a performance comparison.
+    // - Naively unroll the loop for finding the start and end indices of each
+    //     cell's data pointers in the array of boid indices
+    // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
+    //     the particle data in the simulation array.
+    //     CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED
+    // - Perform velocity updates using neighbor search
+    // - Update positions
+    // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+    kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+    dev_thrust_particleArrayIndices = thrust::device_pointer_cast(dev_particleArrayIndices);
+    dev_thrust_particleGridIndices = thrust::device_pointer_cast(dev_particleGridIndices);
+
+    thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+    kernResetIntBuffer<<<fullBlocksPerGrid, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+    kernResetIntBuffer<<<fullBlocksPerGrid, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+
+    kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+    kernReorderPosVel<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleArrayIndices, dev_pos, dev_vel1, dev_pos_rearranged, dev_vel_rearranged);
+
+    glm::vec3 *tmp_pos = dev_pos;
+    dev_pos = dev_pos_rearranged;
+    dev_pos_rearranged = tmp_pos; // need to swap here so we dont get both pointers to buffer B
+
+    glm::vec3 *tmp_vel = dev_vel1;
+    dev_vel1 = dev_vel_rearranged;
+    dev_vel_rearranged = tmp_vel;
+
+    kernUpdateVelNeighborSearchCoherent<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_pos, dev_vel1, dev_vel2);
+
+    kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+
+    glm::vec3 *tmp = dev_vel1;
+    dev_vel1 = dev_vel2;
+    dev_vel2 = tmp;
 }
 
 void Boids::endSimulation() {
-  cudaFree(dev_vel1);
-  cudaFree(dev_vel2);
-  cudaFree(dev_pos);
-
-  // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+    cudaFree(dev_vel1);
+    cudaFree(dev_vel2);
+    cudaFree(dev_pos);
+
+    // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+    cudaFree(dev_particleArrayIndices);
+    cudaFree(dev_particleGridIndices);
+    cudaFree(dev_gridCellStartIndices);
+    cudaFree(dev_gridCellEndIndices);
+
+    cudaFree(dev_pos_rearranged);
+    cudaFree(dev_vel_rearranged);
 }
 
 void Boids::unitTest() {
-  // LOOK-1.2 Feel free to write additional tests here.
-
-  // test unstable sort
-  int *dev_intKeys;
-  int *dev_intValues;
-  int N = 10;
-
-  std::unique_ptr<int[]>intKeys{ new int[N] };
-  std::unique_ptr<int[]>intValues{ new int[N] };
-
-  intKeys[0] = 0; intValues[0] = 0;
-  intKeys[1] = 1; intValues[1] = 1;
-  intKeys[2] = 0; intValues[2] = 2;
-  intKeys[3] = 3; intValues[3] = 3;
-  intKeys[4] = 0; intValues[4] = 4;
-  intKeys[5] = 2; intValues[5] = 5;
-  intKeys[6] = 2; intValues[6] = 6;
-  intKeys[7] = 0; intValues[7] = 7;
-  intKeys[8] = 5; intValues[8] = 8;
-  intKeys[9] = 6; intValues[9] = 9;
-
-  cudaMalloc((void**)&dev_intKeys, N * sizeof(int));
-  checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!");
-
-  cudaMalloc((void**)&dev_intValues, N * sizeof(int));
-  checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!");
-
-  dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
-
-  std::cout << "before unstable sort: " << std::endl;
-  for (int i = 0; i < N; i++) {
-    std::cout << "  key: " << intKeys[i];
-    std::cout << " value: " << intValues[i] << std::endl;
-  }
-
-  // How to copy data to the GPU
-  cudaMemcpy(dev_intKeys, intKeys.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
-  cudaMemcpy(dev_intValues, intValues.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
-
-  // Wrap device vectors in thrust iterators for use with thrust.
-  thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
-  thrust::device_ptr<int> dev_thrust_values(dev_intValues);
-  // LOOK-2.1 Example for using thrust::sort_by_key
-  thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
-
-  // How to copy data back to the CPU side from the GPU
-  cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost);
-  cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost);
-  checkCUDAErrorWithLine("memcpy back failed!");
-
-  std::cout << "after unstable sort: " << std::endl;
-  for (int i = 0; i < N; i++) {
-    std::cout << "  key: " << intKeys[i];
-    std::cout << " value: " << intValues[i] << std::endl;
-  }
-
-  // cleanup
-  cudaFree(dev_intKeys);
-  cudaFree(dev_intValues);
-  checkCUDAErrorWithLine("cudaFree failed!");
-  return;
+    // LOOK-1.2 Feel free to write additional tests here.
+
+    // test unstable sort
+    int *dev_intKeys;
+    int *dev_intValues;
+    int N = 10;
+
+    std::unique_ptr<int[]>intKeys{ new int[N] };
+    std::unique_ptr<int[]>intValues{ new int[N] };
+
+    intKeys[0] = 0; intValues[0] = 0;
+    intKeys[1] = 1; intValues[1] = 1;
+    intKeys[2] = 0; intValues[2] = 2;
+    intKeys[3] = 3; intValues[3] = 3;
+    intKeys[4] = 0; intValues[4] = 4;
+    intKeys[5] = 2; intValues[5] = 5;
+    intKeys[6] = 2; intValues[6] = 6;
+    intKeys[7] = 0; intValues[7] = 7;
+    intKeys[8] = 5; intValues[8] = 8;
+    intKeys[9] = 6; intValues[9] = 9;
+
+    cudaMalloc((void**)&dev_intKeys, N * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!");
+
+    cudaMalloc((void**)&dev_intValues, N * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!");
+
+    dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
+
+    std::cout << "before unstable sort: " << std::endl;
+    for (int i = 0; i < N; i++) {
+        std::cout << "    key: " << intKeys[i];
+        std::cout << " value: " << intValues[i] << std::endl;
+    }
+
+    // How to copy data to the GPU
+    cudaMemcpy(dev_intKeys, intKeys.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
+    cudaMemcpy(dev_intValues, intValues.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
+
+    // Wrap device vectors in thrust iterators for use with thrust.
+    thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
+    thrust::device_ptr<int> dev_thrust_values(dev_intValues);
+    // LOOK-2.1 Example for using thrust::sort_by_key
+    thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
+
+    // How to copy data back to the CPU side from the GPU
+    cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost);
+    cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost);
+    checkCUDAErrorWithLine("memcpy back failed!");
+
+    std::cout << "after unstable sort: " << std::endl;
+    for (int i = 0; i < N; i++) {
+        std::cout << "    key: " << intKeys[i];
+        std::cout << " value: " << intValues[i] << std::endl;
+    }
+
+    // cleanup
+    cudaFree(dev_intKeys);
+    cudaFree(dev_intValues);
+    checkCUDAErrorWithLine("cudaFree failed!");
+    return;
 }
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..6ba6d95 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,37 +1,47 @@
 /**
-* @file      main.cpp
-* @brief     Example Boids flocking simulation for CIS 565
-* @authors   Liam Boone, Kai Ninomiya, Kangning (Gary) Li
-* @date      2013-2017
-* @copyright University of Pennsylvania
-*/
+ * @file      main.cpp
+ * @brief     Example Boids flocking simulation for CIS 565
+ * @authors   Liam Boone, Kai Ninomiya, Kangning (Gary) Li
+ * @date      2013-2017
+ * @copyright University of Pennsylvania
+ */
 
 #include "main.hpp"
+#include <iostream>
+#include <unistd.h>
 
 // ================
 // Configuration
 // ================
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
-#define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define VISUALIZE 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 5000;
+const int N_FOR_VIS = 10000;
 const float DT = 0.2f;
 
 /**
-* C main function.
-*/
-int main(int argc, char* argv[]) {
+ * C main function.
+ */
+int main(int argc, char *argv[])
+{
   projectName = "565 CUDA Intro: Boids";
 
-  if (init(argc, argv)) {
+  std::cout << "sleeping..." << std::endl;
+  sleep(5);
+  std::cout << "awake!" << std::endl;
+
+  if (init(argc, argv))
+  {
     mainLoop();
     Boids::endSimulation();
     return 0;
-  } else {
+  }
+  else
+  {
     return 1;
   }
 }
@@ -44,19 +54,21 @@ std::string deviceName;
 GLFWwindow *window;
 
 /**
-* Initialization of CUDA and GLFW.
-*/
-bool init(int argc, char **argv) {
+ * Initialization of CUDA and GLFW.
+ */
+bool init(int argc, char **argv)
+{
   // Set window title to "Student Name: [SM 2.0] GPU Name"
   cudaDeviceProp deviceProp;
   int gpuDevice = 0;
   int device_count = 0;
   cudaGetDeviceCount(&device_count);
-  if (gpuDevice > device_count) {
+  if (gpuDevice > device_count)
+  {
     std::cout
-    << "Error: GPU device number is greater than the number of devices!"
-    << " Perhaps a CUDA-capable GPU is not installed?"
-    << std::endl;
+        << "Error: GPU device number is greater than the number of devices!"
+        << " Perhaps a CUDA-capable GPU is not installed?"
+        << std::endl;
     return false;
   }
   cudaGetDeviceProperties(&deviceProp, gpuDevice);
@@ -70,11 +82,12 @@ bool init(int argc, char **argv) {
   // Window setup stuff
   glfwSetErrorCallback(errorCallback);
 
-  if (!glfwInit()) {
+  if (!glfwInit())
+  {
     std::cout
-    << "Error: Could not initialize GLFW!"
-    << " Perhaps OpenGL 3.3 isn't available?"
-    << std::endl;
+        << "Error: Could not initialize GLFW!"
+        << " Perhaps OpenGL 3.3 isn't available?"
+        << std::endl;
     return false;
   }
 
@@ -84,7 +97,8 @@ bool init(int argc, char **argv) {
   glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
 
   window = glfwCreateWindow(width, height, deviceName.c_str(), NULL, NULL);
-  if (!window) {
+  if (!window)
+  {
     glfwTerminate();
     return false;
   }
@@ -94,7 +108,8 @@ bool init(int argc, char **argv) {
   glfwSetMouseButtonCallback(window, mouseButtonCallback);
 
   glewExperimental = GL_TRUE;
-  if (glewInit() != GLEW_OK) {
+  if (glewInit() != GLEW_OK)
+  {
     return false;
   }
 
@@ -120,15 +135,17 @@ bool init(int argc, char **argv) {
   return true;
 }
 
-void initVAO() {
+void initVAO()
+{
 
-  std::unique_ptr<GLfloat[]> bodies{ new GLfloat[4 * (N_FOR_VIS)] };
-  std::unique_ptr<GLuint[]> bindices{ new GLuint[N_FOR_VIS] };
+  std::unique_ptr<GLfloat[]> bodies{new GLfloat[4 * (N_FOR_VIS)]};
+  std::unique_ptr<GLuint[]> bindices{new GLuint[N_FOR_VIS]};
 
   glm::vec4 ul(-1.0, -1.0, 1.0, 1.0);
   glm::vec4 lr(1.0, 1.0, 0.0, 0.0);
 
-  for (int i = 0; i < N_FOR_VIS; i++) {
+  for (int i = 0; i < N_FOR_VIS; i++)
+  {
     bodies[4 * i + 0] = 0.0f;
     bodies[4 * i + 1] = 0.0f;
     bodies[4 * i + 2] = 0.0f;
@@ -136,7 +153,6 @@ void initVAO() {
     bindices[i] = i;
   }
 
-
   glGenVertexArrays(1, &boidVAO); // Attach everything needed to draw a particle to this
   glGenBuffers(1, &boidVBO_positions);
   glGenBuffers(1, &boidVBO_velocities);
@@ -145,7 +161,7 @@ void initVAO() {
   glBindVertexArray(boidVAO);
 
   // Bind the positions array to the boidVAO by way of the boidVBO_positions
-  glBindBuffer(GL_ARRAY_BUFFER, boidVBO_positions); // bind the buffer
+  glBindBuffer(GL_ARRAY_BUFFER, boidVBO_positions);                                                // bind the buffer
   glBufferData(GL_ARRAY_BUFFER, 4 * (N_FOR_VIS) * sizeof(GLfloat), bodies.get(), GL_DYNAMIC_DRAW); // transfer data
 
   glEnableVertexAttribArray(positionLocation);
@@ -163,151 +179,166 @@ void initVAO() {
   glBindVertexArray(0);
 }
 
-void initShaders(GLuint * program) {
+void initShaders(GLuint *program)
+{
   GLint location;
 
   program[PROG_BOID] = glslUtility::createProgram(
-    "shaders/boid.vert.glsl",
-    "shaders/boid.geom.glsl",
-    "shaders/boid.frag.glsl", attributeLocations, 2);
-    glUseProgram(program[PROG_BOID]);
-
-    if ((location = glGetUniformLocation(program[PROG_BOID], "u_projMatrix")) != -1) {
-      glUniformMatrix4fv(location, 1, GL_FALSE, &projection[0][0]);
-    }
-    if ((location = glGetUniformLocation(program[PROG_BOID], "u_cameraPos")) != -1) {
-      glUniform3fv(location, 1, &cameraPosition[0]);
-    }
+      "shaders/boid.vert.glsl",
+      "shaders/boid.geom.glsl",
+      "shaders/boid.frag.glsl", attributeLocations, 2);
+  glUseProgram(program[PROG_BOID]);
+
+  if ((location = glGetUniformLocation(program[PROG_BOID], "u_projMatrix")) != -1)
+  {
+    glUniformMatrix4fv(location, 1, GL_FALSE, &projection[0][0]);
   }
-
-  //====================================
-  // Main loop
-  //====================================
-  void runCUDA() {
-    // Map OpenGL buffer object for writing from CUDA on a single GPU
-    // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not
-    // use this buffer
-
-    float4 *dptr = NULL;
-    float *dptrVertPositions = NULL;
-    float *dptrVertVelocities = NULL;
-
-    cudaGLMapBufferObject((void**)&dptrVertPositions, boidVBO_positions);
-    cudaGLMapBufferObject((void**)&dptrVertVelocities, boidVBO_velocities);
-
-    // execute the kernel
-    #if UNIFORM_GRID && COHERENT_GRID
-    Boids::stepSimulationCoherentGrid(DT);
-    #elif UNIFORM_GRID
-    Boids::stepSimulationScatteredGrid(DT);
-    #else
-    Boids::stepSimulationNaive(DT);
-    #endif
-
-    #if VISUALIZE
-    Boids::copyBoidsToVBO(dptrVertPositions, dptrVertVelocities);
-    #endif
-    // unmap buffer object
-    cudaGLUnmapBufferObject(boidVBO_positions);
-    cudaGLUnmapBufferObject(boidVBO_velocities);
+  if ((location = glGetUniformLocation(program[PROG_BOID], "u_cameraPos")) != -1)
+  {
+    glUniform3fv(location, 1, &cameraPosition[0]);
   }
+}
 
-  void mainLoop() {
-    double fps = 0;
-    double timebase = 0;
-    int frame = 0;
-
-    Boids::unitTest(); // LOOK-1.2 We run some basic example code to make sure
-                       // your CUDA development setup is ready to go.
+//====================================
+// Main loop
+//====================================
+void runCUDA()
+{
+  // Map OpenGL buffer object for writing from CUDA on a single GPU
+  // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not
+  // use this buffer
+
+  float4 *dptr = NULL;
+  float *dptrVertPositions = NULL;
+  float *dptrVertVelocities = NULL;
+
+  cudaGLMapBufferObject((void **)&dptrVertPositions, boidVBO_positions);
+  cudaGLMapBufferObject((void **)&dptrVertVelocities, boidVBO_velocities);
+
+// execute the kernel
+#if UNIFORM_GRID && COHERENT_GRID
+  Boids::stepSimulationCoherentGrid(DT);
+#elif UNIFORM_GRID
+  Boids::stepSimulationScatteredGrid(DT);
+#else
+  Boids::stepSimulationNaive(DT);
+#endif
+
+#if VISUALIZE
+  Boids::copyBoidsToVBO(dptrVertPositions, dptrVertVelocities);
+#endif
+  // unmap buffer object
+  cudaGLUnmapBufferObject(boidVBO_positions);
+  cudaGLUnmapBufferObject(boidVBO_velocities);
+}
 
-    while (!glfwWindowShouldClose(window)) {
-      glfwPollEvents();
+void mainLoop()
+{
+  double fps = 0;
+  double timebase = 0;
+  int frame = 0;
 
-      frame++;
-      double time = glfwGetTime();
+  Boids::unitTest(); // LOOK-1.2 We run some basic example code to make sure
+                     // your CUDA development setup is ready to go.
 
-      if (time - timebase > 1.0) {
-        fps = frame / (time - timebase);
-        timebase = time;
-        frame = 0;
-      }
+  while (!glfwWindowShouldClose(window))
+  {
+    glfwPollEvents();
 
-      runCUDA();
+    frame++;
+    double time = glfwGetTime();
 
-      std::ostringstream ss;
-      ss << "[";
-      ss.precision(1);
-      ss << std::fixed << fps;
-      ss << " fps] " << deviceName;
-      glfwSetWindowTitle(window, ss.str().c_str());
+    if (time - timebase > 1.0)
+    {
+      fps = frame / (time - timebase);
+      timebase = time;
+      frame = 0;
+    }
 
-      glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+    runCUDA();
 
-      #if VISUALIZE
-      glUseProgram(program[PROG_BOID]);
-      glBindVertexArray(boidVAO);
-      glPointSize((GLfloat)pointSize);
-      glDrawElements(GL_POINTS, N_FOR_VIS + 1, GL_UNSIGNED_INT, 0);
-      glPointSize(1.0f);
+    std::ostringstream ss;
+    ss << "[";
+    ss.precision(1);
+    ss << std::fixed << fps;
+    ss << " fps] " << deviceName;
+    glfwSetWindowTitle(window, ss.str().c_str());
 
-      glUseProgram(0);
-      glBindVertexArray(0);
+    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
 
-      glfwSwapBuffers(window);
-      #endif
-    }
-    glfwDestroyWindow(window);
-    glfwTerminate();
-  }
+#if VISUALIZE
+    glUseProgram(program[PROG_BOID]);
+    glBindVertexArray(boidVAO);
+    glPointSize((GLfloat)pointSize);
+    glDrawElements(GL_POINTS, N_FOR_VIS + 1, GL_UNSIGNED_INT, 0);
+    glPointSize(1.0f);
 
+    glUseProgram(0);
+    glBindVertexArray(0);
 
-  void errorCallback(int error, const char *description) {
-    fprintf(stderr, "error %d: %s\n", error, description);
+    glfwSwapBuffers(window);
+#endif
   }
+  glfwDestroyWindow(window);
+  glfwTerminate();
+}
 
-  void keyCallback(GLFWwindow* window, int key, int scancode, int action, int mods) {
-    if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS) {
-      glfwSetWindowShouldClose(window, GL_TRUE);
-    }
-  }
+void errorCallback(int error, const char *description)
+{
+  fprintf(stderr, "error %d: %s\n", error, description);
+}
 
-  void mouseButtonCallback(GLFWwindow* window, int button, int action, int mods) {
-    leftMousePressed = (button == GLFW_MOUSE_BUTTON_LEFT && action == GLFW_PRESS);
-    rightMousePressed = (button == GLFW_MOUSE_BUTTON_RIGHT && action == GLFW_PRESS);
+void keyCallback(GLFWwindow *window, int key, int scancode, int action, int mods)
+{
+  if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS)
+  {
+    glfwSetWindowShouldClose(window, GL_TRUE);
   }
+}
 
-  void mousePositionCallback(GLFWwindow* window, double xpos, double ypos) {
-    if (leftMousePressed) {
-      // compute new camera parameters
-      phi += (xpos - lastX) / width;
-      theta -= (ypos - lastY) / height;
-      theta = std::fmax(0.01f, std::fmin(theta, 3.14f));
-      updateCamera();
-    }
-    else if (rightMousePressed) {
-      zoom += (ypos - lastY) / height;
-      zoom = std::fmax(0.1f, std::fmin(zoom, 5.0f));
-      updateCamera();
-    }
+void mouseButtonCallback(GLFWwindow *window, int button, int action, int mods)
+{
+  leftMousePressed = (button == GLFW_MOUSE_BUTTON_LEFT && action == GLFW_PRESS);
+  rightMousePressed = (button == GLFW_MOUSE_BUTTON_RIGHT && action == GLFW_PRESS);
+}
 
-	lastX = xpos;
-	lastY = ypos;
+void mousePositionCallback(GLFWwindow *window, double xpos, double ypos)
+{
+  if (leftMousePressed)
+  {
+    // compute new camera parameters
+    phi += (xpos - lastX) / width;
+    theta -= (ypos - lastY) / height;
+    theta = std::fmax(0.01f, std::fmin(theta, 3.14f));
+    updateCamera();
+  }
+  else if (rightMousePressed)
+  {
+    zoom += (ypos - lastY) / height;
+    zoom = std::fmax(0.1f, std::fmin(zoom, 5.0f));
+    updateCamera();
   }
 
-  void updateCamera() {
-    cameraPosition.x = zoom * sin(phi) * sin(theta);
-    cameraPosition.z = zoom * cos(theta);
-    cameraPosition.y = zoom * cos(phi) * sin(theta);
-    cameraPosition += lookAt;
+  lastX = xpos;
+  lastY = ypos;
+}
 
-    projection = glm::perspective(fovy, float(width) / float(height), zNear, zFar);
-    glm::mat4 view = glm::lookAt(cameraPosition, lookAt, glm::vec3(0, 0, 1));
-    projection = projection * view;
+void updateCamera()
+{
+  cameraPosition.x = zoom * sin(phi) * sin(theta);
+  cameraPosition.z = zoom * cos(theta);
+  cameraPosition.y = zoom * cos(phi) * sin(theta);
+  cameraPosition += lookAt;
 
-    GLint location;
+  projection = glm::perspective(fovy, float(width) / float(height), zNear, zFar);
+  glm::mat4 view = glm::lookAt(cameraPosition, lookAt, glm::vec3(0, 0, 1));
+  projection = projection * view;
 
-    glUseProgram(program[PROG_BOID]);
-    if ((location = glGetUniformLocation(program[PROG_BOID], "u_projMatrix")) != -1) {
-      glUniformMatrix4fv(location, 1, GL_FALSE, &projection[0][0]);
-    }
+  GLint location;
+
+  glUseProgram(program[PROG_BOID]);
+  if ((location = glGetUniformLocation(program[PROG_BOID], "u_projMatrix")) != -1)
+  {
+    glUniformMatrix4fv(location, 1, GL_FALSE, &projection[0][0]);
   }
+}
diff --git a/src/main.hpp b/src/main.hpp
index 88e9df7..853b9b6 100644
--- a/src/main.hpp
+++ b/src/main.hpp
@@ -36,8 +36,8 @@ const float fovy = (float) (PI / 4);
 const float zNear = 0.10f;
 const float zFar = 10.0f;
 // LOOK-1.2: for high DPI displays, you may want to double these settings.
-int width = 1280;
-int height = 720;
+int width = 1280*2;
+int height = 720*2;
 int pointSize = 2;
 
 // For camera controls