diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2f16f40..8f07e17 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -80,9 +80,35 @@ elseif(CMAKE_VERSION VERSION_LESS "3.24.0")
 else()
     set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES native)
 endif()
-target_compile_options(${CMAKE_PROJECT_NAME} PRIVATE "$<$<AND:$<CONFIG:Debug,RelWithDebInfo>,$<COMPILE_LANGUAGE:CUDA>>:-G>")
+# CUDA-only compile flags. Each generator expression is quoted so CMake cannot
+# split it at whitespace before evaluation; ";" separates the individual flags.
+target_compile_options(${CMAKE_PROJECT_NAME} PRIVATE
+    "$<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda;--expt-relaxed-constexpr>"
+    "$<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<OR:$<CONFIG:Debug>,$<CONFIG:RelWithDebInfo>>>:-G>"
+)
 set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT ${CMAKE_PROJECT_NAME})
 
+# Compile-time simulation settings. Every cache entry below reaches the sources
+# as a preprocessor definition of the same name.
+set(N_FOR_VIS 5000 CACHE STRING "Number of boids to simulate")
+set(CUDA_BLOCK_SIZE 128 CACHE STRING "CUDA block size")
+set(FPS_MEASURE_START 2 CACHE STRING "Seconds after start to begin frame rate measurements")
+set(FPS_MEASURE_DURATION 20 CACHE STRING "Number of seconds to measure frame rates for")
+option(VISUALIZE "Turn on OpenGL visualization" ON)
+option(FINE_GRAINED_CELLS "Use fine-grained cells with 27 neighbors instead of 8" ON)
+option(UNIFORM_GRID "Use uniform grids for efficient neighbor checks" OFF)
+option(COHERENT_GRID "Use semi-coherent memory access for uniform grids" OFF)
+option(FPS_MEASURE "Enable frame rate measurements" OFF)
+# String-valued entries are forwarded verbatim.
+foreach(setting IN ITEMS N_FOR_VIS CUDA_BLOCK_SIZE FPS_MEASURE_START FPS_MEASURE_DURATION)
+    target_compile_definitions(${CMAKE_PROJECT_NAME} PRIVATE ${setting}=${${setting}})
+endforeach()
+# Boolean options are normalized to 1 or 0, whatever spelling (ON, TRUE, yes, ...)
+# the user passed on the command line.
+foreach(toggle IN ITEMS VISUALIZE FINE_GRAINED_CELLS UNIFORM_GRID COHERENT_GRID FPS_MEASURE)
+    target_compile_definitions(${CMAKE_PROJECT_NAME} PRIVATE ${toggle}=$<BOOL:${${toggle}}>)
+endforeach()
+
 add_custom_command(
     TARGET ${CMAKE_PROJECT_NAME}
     PRE_BUILD
diff --git a/README.md b/README.md
index ee39093..c132e2b 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,79 @@
-**University of Pennsylvania, CIS 5650: GPU Programming and Architecture,
-Project 1 - Flocking**
+# University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 1 - Flocking
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+- Yunhao Qian
+  - [LinkedIn](https://www.linkedin.com/in/yunhao-qian-026980170)
+  - [GitHub](https://github.com/yunhao-qian)
+- Tested on my personal computer:
+  - OS: Windows 11, 24H2
+  - CPU: 13th Gen Intel(R) Core(TM) i7-13700 (2.10 GHz)
+  - GPU: NVIDIA GeForce RTX 4090
+  - RAM: 32.0 GB
 
-### (TODO: Your README)
+## Boids in Action
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+### Screenshot
+
+![Screenshot of boids](images/Screenshot%20of%20boids.png)
+
+The screenshot above is from an experiment with 100000 boids, 1x-sized cells, scattered uniform grid, and the block size of 128.
+
+### Recording
+
+![Recording of boids](images/Recording%20of%20boids.gif)
+
+The screen recording above is from an experiment with 320000 boids, 1x-sized cells, scattered uniform grid, and the block size of 128.
+
+## Changes to `CMakeLists.txt`
+
+- To share code among different configurations, I used lambda functions and `constexpr` in CUDA `__device__` code. To enable these features, I turned on the `--extended-lambda` and `--expt-relaxed-constexpr` flags using `target_compile_options()`.
+- To measure frame rates programmatically, all compile-time configurations are controlled by CMake definitions. These include `N_FOR_VIS`, `VISUALIZE`, `FINE_GRAINED_CELLS`, `UNIFORM_GRID`, `COHERENT_GRID`, and `CUDA_BLOCK_SIZE`.
+- The modified program has a special timed mode for measuring frame rates. To support this, CMake options are added, including `FPS_MEASURE`, `FPS_MEASURE_START`, and `FPS_MEASURE_DURATION`.
+
+## Performance Analysis
+
+### Methodology
+
+All experiments in this section are launched programmatically using [`measure_fps.py`](./scripts/measure_fps.py), which re-compiles the CMake project using specific arguments, runs the program, and captures the average frame rate (in frames per second, or FPS) from stdout. The measurement of each experiment starts 2 seconds after the program's launch and lasts for 20 seconds. After that, the program exits automatically. To avoid crashing the computer, more resource-consuming configurations in the number-of-boids sweep are skipped once the frame rate drops below 1 FPS or a single experiment (build plus run) takes over 100 seconds. The raw statistics behind the following plots are omitted here for conciseness; please refer to [`measurements.json`](./scripts/measurements.json) for the detailed numbers.
+
+### Number of Boids
+
+![Frame rate vs. number of boids](images/Frame%20rate%20vs%20number%20of%20boids.png)
+
+All experiments in this set use 1x-sized cells and the block size of 128.
+
+Discussion:
+
+- The naive implementation should be heavily compute-bound, and the time complexity is $O(N^2)$, where $N$ is the number of boids. Experiments confirm this trend in general, as the FPS drops consistently as $N$ increases.
+- The scattered & coherent uniform grid implementations do not show a monotonic trend at small values of $N$. The worst performance appears at $N \approx 20000$. This may be caused by thread divergence in warps, as not all neighbor cells are occupied by boids.
+- The two uniform-grid implementations reach their best performance at $N \approx 100000$. The GPU's compute capacity is probably saturated at this point.
+- When $N$ is very large, the time complexity is $O(N^2)$ for all implementations with or without uniform grids, as large $N$'s always increase the number of boids within the effective distance. This is confirmed by the linear trends in the log-log plot.
+
+### OpenGL Visualization
+
+While visualizing a point cloud in OpenGL has $O(N)$ time complexity, its amount of work is much lighter than boid simulations in general. When $N$ is small, experiments with visualization turned on have slightly lower FPS. When $N$ is large, the difference becomes barely detectable.
+
+### Block Size
+
+![Frame rate vs. block size](images/Frame%20rate%20vs%20block%20size.png)
+
+All experiments in this set use 320000 boids, no visualization, and 1x-sized cells.
+
+Discussion:
+
+- For the naive implementation, FPS improves as the block size increases from 8 to 64. This is because increased computation helps cover the latency of random global memory accesses. There is a small drop at 1024, which is probably due to reaching register limits.
+- The scattered uniform grid performs best at very small block sizes from 8 to 32, and then flattens. This is because the execution time is dominated by the latency of random global memory accesses. Increasing the block size further does not address this problem, and reduces the scheduler's ability to distribute blocks evenly among the streaming multiprocessors (SMs).
+- The coherent uniform grid has strong gains for 8 to 16, as increased computation covers the latency of semi-sequential global memory accesses. Once the program is compute-bound, increasing the block size further does not bring consistent improvements.
+
+### Coherent Uniform Grids
+
+The coherent uniform grid brings significant performance improvements, and this is the expected outcome. Sorting boid data by cells makes neighbor loops touch contiguous memory. Those data reside in $\leq 9$ pieces of contiguous memory if the cells are 1x-sized, and $\leq 4$ pieces if the cells are 2x-sized. When $N$ is large, the semi-sequential accesses are much faster than thousands of random accesses. When $N$ is small, the overhead of sorting boid data tends to dominate, resulting in the slightly lower FPS of the coherent uniform grid.
+
+### Cell Size
+
+![Frame rate vs. fine-grained cells](images/Frame%20rate%20vs%20fine-grained%20cells.png)
+
+All experiments in this set use no visualization, coherent uniform grids, and the block size of 128.
+
+When $N$ is small, 2x-sized cells should be slightly better, as the smaller number of neighbor cells means lighter looping overhead and better memory coherence. However, this advantage is not obvious in the experiments, as it is hard to tell that one configuration is clearly better than the other.
+
+When $N$ is very large, 1x-sized cells should be much better, as more fine-grained cells allow the program to check fewer neighboring boids. If the boid density is $\rho$, and the effective distance is $R$, 1x-sized cells result in $\rho (3 R)^3 = 27 \rho R^3$ neighboring boids, while 2x-sized cells result in $\rho (4 R)^3 = 64 \rho R^3$ neighboring boids. This discrepancy is confirmed by the roughly constant offset in the log-log plot.
diff --git a/images/Frame rate vs block size.png b/images/Frame rate vs block size.png
new file mode 100644
index 0000000..fd5c4e3
Binary files /dev/null and b/images/Frame rate vs block size.png differ
diff --git a/images/Frame rate vs fine-grained cells.png b/images/Frame rate vs fine-grained cells.png
new file mode 100644
index 0000000..5501652
Binary files /dev/null and b/images/Frame rate vs fine-grained cells.png differ
diff --git a/images/Frame rate vs number of boids.png b/images/Frame rate vs number of boids.png
new file mode 100644
index 0000000..b00f763
Binary files /dev/null and b/images/Frame rate vs number of boids.png differ
diff --git a/images/Recording of boids.gif b/images/Recording of boids.gif
new file mode 100644
index 0000000..43c45c9
Binary files /dev/null and b/images/Recording of boids.gif differ
diff --git a/images/Screenshot of boids.png b/images/Screenshot of boids.png
new file mode 100644
index 0000000..32a6e0d
Binary files /dev/null and b/images/Screenshot of boids.png differ
diff --git a/scripts/measure_fps.py b/scripts/measure_fps.py
new file mode 100644
index 0000000..394b8d5
--- /dev/null
+++ b/scripts/measure_fps.py
@@ -0,0 +1,229 @@
+import json
+import re
+import shutil
+import subprocess
+import time
+from pathlib import Path
+from typing import Any
+
+ROOT_DIR = Path(__file__).parent.parent.absolute()  # parent of this script's directory
+BUILD_DIR = ROOT_DIR / "build"  # deleted and re-created by build()
+BOIDS_EXE = BUILD_DIR / "bin" / "Release" / "cis5650_boids.exe"  # target executable
+MEASUREMENTS_JSON = Path(__file__).parent.absolute() / "measurements.json"  # FPS cache
+
+
+def measure_fps_vs_num_boids() -> None:
+    """Sweep the boid count for every method, with and without visualization.
+
+    Measurements are cached in MEASUREMENTS_JSON; each sweep stops early once a
+    run (build included) takes over 100 s or falls below 1 FPS.
+    """
+    measurement_data: list[dict[str, Any]] = []
+    if MEASUREMENTS_JSON.exists():
+        assert MEASUREMENTS_JSON.is_file()
+        measurement_data = json.loads(MEASUREMENTS_JSON.read_text(encoding="utf-8"))
+
+    def find_measurement(config: dict[str, Any]) -> dict[str, Any] | None:
+        """Return the first cached entry matching every key/value of config."""
+        for cached in measurement_data:
+            if all(k in cached and cached[k] == v for k, v in config.items()):
+                return cached
+        return None
+
+    nums_boids = [5000 * (2**i) for i in range(13)]
+    for name, base_config in [
+        ("naive", {"UNIFORM_GRID": 0, "COHERENT_GRID": 0}),
+        ("scattered uniform grid", {"UNIFORM_GRID": 1, "COHERENT_GRID": 0}),
+        ("coherent uniform grid", {"UNIFORM_GRID": 1, "COHERENT_GRID": 1}),
+    ]:
+        print(f"Measuring FPS for method: {name}")
+        # Visualize first so we can confirm correctness.
+        for visualize in [1, 0]:
+            for num_boids in nums_boids:
+                config = {
+                    "N_FOR_VIS": num_boids,
+                    "VISUALIZE": visualize,
+                    "FINE_GRAINED_CELLS": 1,
+                    **base_config,
+                    "CUDA_BLOCK_SIZE": 128,
+                }
+                measurement = find_measurement(config)
+                if measurement is None:
+                    print(f"Measuring FPS for config: {config}")
+                    start_time = time.perf_counter()  # monotonic, unlike time.time()
+                    fps = build_and_measure_fps(**config)
+                    duration = time.perf_counter() - start_time
+                    measurement = {**config, "duration": duration, "fps": fps}
+                    measurement_data.append(measurement)
+                    with MEASUREMENTS_JSON.open("w", encoding="utf-8") as file:
+                        json.dump(measurement_data, file, indent=4)
+                else:
+                    print(f"Found existing measurement for config: {config}")
+                print(f"  Duration: {measurement['duration']}")
+                print(f"  FPS: {measurement['fps']}")
+                if measurement["duration"] > 100.0:
+                    print("Skipping further measurements due to long duration")
+                    break
+                if measurement["fps"] < 1.0:
+                    print("Skipping further measurements due to low FPS")
+                    break
+
+
+def measure_fps_vs_block_size() -> None:
+    """Sweep the CUDA block size (8 to 1024) for every method at 320000 boids.
+
+    Visualization is off; measurements are cached in MEASUREMENTS_JSON and
+    configurations already present there are not measured again.
+    """
+    measurement_data: list[dict[str, Any]] = []
+    if MEASUREMENTS_JSON.exists():
+        assert MEASUREMENTS_JSON.is_file()
+        measurement_data = json.loads(MEASUREMENTS_JSON.read_text(encoding="utf-8"))
+
+    def find_measurement(config: dict[str, Any]) -> dict[str, Any] | None:
+        """Return the first cached entry matching every key/value of config."""
+        for cached in measurement_data:
+            if all(k in cached and cached[k] == v for k, v in config.items()):
+                return cached
+        return None
+
+    for name, base_config in [
+        ("naive", {"UNIFORM_GRID": 0, "COHERENT_GRID": 0}),
+        ("scattered uniform grid", {"UNIFORM_GRID": 1, "COHERENT_GRID": 0}),
+        ("coherent uniform grid", {"UNIFORM_GRID": 1, "COHERENT_GRID": 1}),
+    ]:
+        print(f"Measuring FPS for method: {name}")
+        for block_size in [8 * (2**i) for i in range(8)]:
+            config = {
+                "N_FOR_VIS": 320000,
+                "VISUALIZE": 0,
+                "FINE_GRAINED_CELLS": 1,
+                **base_config,
+                "CUDA_BLOCK_SIZE": block_size,
+            }
+            measurement = find_measurement(config)
+            if measurement is None:
+                print(f"Measuring FPS for config: {config}")
+                start_time = time.perf_counter()  # monotonic, unlike time.time()
+                fps = build_and_measure_fps(**config)
+                duration = time.perf_counter() - start_time
+                measurement = {**config, "duration": duration, "fps": fps}
+                measurement_data.append(measurement)
+                with MEASUREMENTS_JSON.open("w", encoding="utf-8") as file:
+                    json.dump(measurement_data, file, indent=4)
+            else:
+                print(f"Found existing measurement for config: {config}")
+            print(f"  Duration: {measurement['duration']}")
+            print(f"  FPS: {measurement['fps']}")
+
+
+def measure_fps_vs_fine_grained_cells() -> None:
+    """Sweep the boid count with FINE_GRAINED_CELLS off and then on.
+
+    Uses the coherent uniform grid, no visualization and a block size of 128.
+    Measurements are cached in MEASUREMENTS_JSON; there is no early stop here.
+    """
+    measurement_data: list[dict[str, Any]] = []
+    if MEASUREMENTS_JSON.exists():
+        assert MEASUREMENTS_JSON.is_file()
+        measurement_data = json.loads(MEASUREMENTS_JSON.read_text(encoding="utf-8"))
+
+    def find_measurement(config: dict[str, Any]) -> dict[str, Any] | None:
+        """Return the first cached entry matching every key/value of config."""
+        for cached in measurement_data:
+            if all(k in cached and cached[k] == v for k, v in config.items()):
+                return cached
+        return None
+
+    nums_boids = [5000 * (2**i) for i in range(13)]
+    for fine_grained in [0, 1]:
+        for num_boids in nums_boids:
+            config = {
+                "N_FOR_VIS": num_boids,
+                "VISUALIZE": 0,
+                "FINE_GRAINED_CELLS": fine_grained,
+                "UNIFORM_GRID": 1,
+                "COHERENT_GRID": 1,
+                "CUDA_BLOCK_SIZE": 128,
+            }
+            measurement = find_measurement(config)
+            if measurement is None:
+                print(f"Measuring FPS for config: {config}")
+                start_time = time.perf_counter()  # monotonic, unlike time.time()
+                fps = build_and_measure_fps(**config)
+                duration = time.perf_counter() - start_time
+                measurement = {**config, "duration": duration, "fps": fps}
+                measurement_data.append(measurement)
+                with MEASUREMENTS_JSON.open("w", encoding="utf-8") as file:
+                    json.dump(measurement_data, file, indent=4)
+            else:
+                print(f"Found existing measurement for config: {config}")
+            print(f"  Duration: {measurement['duration']}")
+            print(f"  FPS: {measurement['fps']}")
+
+
+def build_and_measure_fps(**kwargs: Any) -> float:  # kwargs: CMake -D settings
+    build(**kwargs)  # clean rebuild with these settings
+    return measure_fps()  # FPS parsed from the executable's stdout
+
+
+def build(**kwargs: Any) -> None:
+    """Configure and compile a Release build from a clean build directory.
+
+    Each keyword argument becomes a ``-D<name>=<value>`` CMake cache entry. The
+    FPS_MEASURE* settings are always forced to 1 / 2 / 20 and override kwargs.
+    Raises subprocess.CalledProcessError if either cmake invocation fails.
+    """
+    if BUILD_DIR.exists():
+        assert BUILD_DIR.is_dir()
+        shutil.rmtree(BUILD_DIR)
+
+    definitions = {
+        **kwargs,  # later keys win, so callers cannot change the timed mode
+        "FPS_MEASURE": 1,
+        "FPS_MEASURE_START": 2,
+        "FPS_MEASURE_DURATION": 20,
+    }
+    configure_command = [
+        "cmake",
+        "-G",
+        "Visual Studio 17 2022",
+        "-S",
+        str(ROOT_DIR),
+        "-B",
+        str(BUILD_DIR),
+        *(f"-D{key}={value}" for key, value in definitions.items()),
+    ]
+    subprocess.run(configure_command, check=True)
+    assert BUILD_DIR.is_dir()
+
+    build_command = [
+        "cmake",
+        "--build",
+        str(BUILD_DIR),
+        "--target",
+        "cis5650_boids",
+        "--config",
+        "Release",
+        "--parallel",
+    ]
+    subprocess.run(build_command, check=True)
+    assert BOIDS_EXE.is_file()
+
+
+def measure_fps() -> float:
+    """Run the built executable and return the number on its first "FPS: " line."""
+    assert BOIDS_EXE.is_file()
+    completed = subprocess.run(
+        [str(BOIDS_EXE)], cwd=BUILD_DIR, capture_output=True, check=True, text=True
+    )
+    hits = (re.match(r"^FPS: (.+)$", row) for row in completed.stdout.splitlines())
+    for hit in filter(None, hits):
+        return float(hit[1])
+    raise RuntimeError("FPS not found in output")
+
+
+if __name__ == "__main__":
+    measure_fps_vs_num_boids()  # boid-count sweep, all three methods
+    measure_fps_vs_block_size()  # block-size sweep, all three methods
+    measure_fps_vs_fine_grained_cells()  # FINE_GRAINED_CELLS off vs. on
diff --git a/scripts/measurements.json b/scripts/measurements.json
new file mode 100644
index 0000000..41bf9bd
--- /dev/null
+++ b/scripts/measurements.json
@@ -0,0 +1,1012 @@
+[
+    {
+        "N_FOR_VIS": 5000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 0,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 42.6227707862854,
+        "fps": 657.431
+    },
+    {
+        "N_FOR_VIS": 10000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 0,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 42.55230760574341,
+        "fps": 434.421
+    },
+    {
+        "N_FOR_VIS": 20000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 0,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 42.73219704627991,
+        "fps": 253.271
+    },
+    {
+        "N_FOR_VIS": 40000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 0,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 42.65107774734497,
+        "fps": 131.948
+    },
+    {
+        "N_FOR_VIS": 80000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 0,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.29531240463257,
+        "fps": 66.6082
+    },
+    {
+        "N_FOR_VIS": 160000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 0,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.46341681480408,
+        "fps": 22.0857
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 0,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 42.969216108322144,
+        "fps": 7.67103
+    },
+    {
+        "N_FOR_VIS": 640000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 0,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.66736817359924,
+        "fps": 2.0287
+    },
+    {
+        "N_FOR_VIS": 1280000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 0,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 47.77025842666626,
+        "fps": 0.522458
+    },
+    {
+        "N_FOR_VIS": 5000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 0,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.48864769935608,
+        "fps": 1044.75
+    },
+    {
+        "N_FOR_VIS": 10000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 0,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.19173836708069,
+        "fps": 569.543
+    },
+    {
+        "N_FOR_VIS": 20000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 0,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.61615967750549,
+        "fps": 292.536
+    },
+    {
+        "N_FOR_VIS": 40000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 0,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 44.331363916397095,
+        "fps": 145.788
+    },
+    {
+        "N_FOR_VIS": 80000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 0,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 45.7530574798584,
+        "fps": 69.3474
+    },
+    {
+        "N_FOR_VIS": 160000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 0,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 50.78683161735535,
+        "fps": 22.4769
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 0,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 65.49712753295898,
+        "fps": 7.7261
+    },
+    {
+        "N_FOR_VIS": 640000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 0,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 128.35280299186707,
+        "fps": 2.03304
+    },
+    {
+        "N_FOR_VIS": 5000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.357173919677734,
+        "fps": 609.024
+    },
+    {
+        "N_FOR_VIS": 10000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.732410192489624,
+        "fps": 454.698
+    },
+    {
+        "N_FOR_VIS": 20000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.6706976890564,
+        "fps": 403.515
+    },
+    {
+        "N_FOR_VIS": 40000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 44.57855582237244,
+        "fps": 448.533
+    },
+    {
+        "N_FOR_VIS": 80000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.58179759979248,
+        "fps": 856.177
+    },
+    {
+        "N_FOR_VIS": 160000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 42.86065483093262,
+        "fps": 607.599
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.05411076545715,
+        "fps": 360.284
+    },
+    {
+        "N_FOR_VIS": 640000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 44.03311252593994,
+        "fps": 152.258
+    },
+    {
+        "N_FOR_VIS": 1280000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.115172147750854,
+        "fps": 46.6988
+    },
+    {
+        "N_FOR_VIS": 2560000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.69062376022339,
+        "fps": 12.4909
+    },
+    {
+        "N_FOR_VIS": 5120000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 45.71691036224365,
+        "fps": 1.13697
+    },
+    {
+        "N_FOR_VIS": 10240000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 59.34166193008423,
+        "fps": 0.157158
+    },
+    {
+        "N_FOR_VIS": 5000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 42.99738645553589,
+        "fps": 974.337
+    },
+    {
+        "N_FOR_VIS": 10000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 44.17229962348938,
+        "fps": 677.989
+    },
+    {
+        "N_FOR_VIS": 20000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.16410231590271,
+        "fps": 507.029
+    },
+    {
+        "N_FOR_VIS": 40000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 42.93199133872986,
+        "fps": 631.217
+    },
+    {
+        "N_FOR_VIS": 80000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.353086948394775,
+        "fps": 1529.25
+    },
+    {
+        "N_FOR_VIS": 160000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.44966006278992,
+        "fps": 1070.75
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 42.97448468208313,
+        "fps": 545.961
+    },
+    {
+        "N_FOR_VIS": 640000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.00913715362549,
+        "fps": 178.42
+    },
+    {
+        "N_FOR_VIS": 1280000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.977482080459595,
+        "fps": 49.4848
+    },
+    {
+        "N_FOR_VIS": 2560000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.04587411880493,
+        "fps": 12.8192
+    },
+    {
+        "N_FOR_VIS": 5120000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 45.90328574180603,
+        "fps": 1.13995
+    },
+    {
+        "N_FOR_VIS": 10240000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 65.49869990348816,
+        "fps": 0.157247
+    },
+    {
+        "N_FOR_VIS": 5000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.63059067726135,
+        "fps": 708.078
+    },
+    {
+        "N_FOR_VIS": 10000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.029086112976074,
+        "fps": 554.618
+    },
+    {
+        "N_FOR_VIS": 20000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.42856407165527,
+        "fps": 464.936
+    },
+    {
+        "N_FOR_VIS": 40000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.17451572418213,
+        "fps": 748.213
+    },
+    {
+        "N_FOR_VIS": 80000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.031094551086426,
+        "fps": 888.023
+    },
+    {
+        "N_FOR_VIS": 160000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 42.710936307907104,
+        "fps": 866.863
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.268999338150024,
+        "fps": 877.069
+    },
+    {
+        "N_FOR_VIS": 640000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 42.97938346862793,
+        "fps": 678.783
+    },
+    {
+        "N_FOR_VIS": 1280000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 42.90176224708557,
+        "fps": 425.147
+    },
+    {
+        "N_FOR_VIS": 2560000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.3774847984314,
+        "fps": 176.343
+    },
+    {
+        "N_FOR_VIS": 5120000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 42.766531467437744,
+        "fps": 63.778
+    },
+    {
+        "N_FOR_VIS": 10240000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 46.134644746780396,
+        "fps": 18.2124
+    },
+    {
+        "N_FOR_VIS": 20480000,
+        "VISUALIZE": 1,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 44.231101751327515,
+        "fps": 4.87904
+    },
+    {
+        "N_FOR_VIS": 5000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.162721395492554,
+        "fps": 1183.29
+    },
+    {
+        "N_FOR_VIS": 10000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.026193618774414,
+        "fps": 907.845
+    },
+    {
+        "N_FOR_VIS": 20000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.53697109222412,
+        "fps": 659.143
+    },
+    {
+        "N_FOR_VIS": 40000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.02344369888306,
+        "fps": 1157.83
+    },
+    {
+        "N_FOR_VIS": 80000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.16648769378662,
+        "fps": 2114.93
+    },
+    {
+        "N_FOR_VIS": 160000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.108848571777344,
+        "fps": 1599.36
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.17498850822449,
+        "fps": 1451.75
+    },
+    {
+        "N_FOR_VIS": 640000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.273948192596436,
+        "fps": 1302.09
+    },
+    {
+        "N_FOR_VIS": 1280000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.08933973312378,
+        "fps": 695.029
+    },
+    {
+        "N_FOR_VIS": 2560000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.16365456581116,
+        "fps": 242.137
+    },
+    {
+        "N_FOR_VIS": 5120000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.797698974609375,
+        "fps": 71.665
+    },
+    {
+        "N_FOR_VIS": 10240000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.767372131347656,
+        "fps": 19.2754
+    },
+    {
+        "N_FOR_VIS": 20480000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 44.0957510471344,
+        "fps": 4.9888
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 0,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 8,
+        "duration": 135.69655895233154,
+        "fps": 1.8606
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 0,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 16,
+        "duration": 90.44691896438599,
+        "fps": 3.66607
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 0,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 32,
+        "duration": 70.0245943069458,
+        "fps": 6.5176
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 0,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 64,
+        "duration": 65.16104793548584,
+        "fps": 7.72391
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 0,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 256,
+        "duration": 65.64359068870544,
+        "fps": 7.72893
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 0,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 512,
+        "duration": 65.29334497451782,
+        "fps": 7.88712
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 0,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 1024,
+        "duration": 68.36514234542847,
+        "fps": 6.69629
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 8,
+        "duration": 43.08688473701477,
+        "fps": 652.441
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 16,
+        "duration": 43.30788469314575,
+        "fps": 589.888
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 32,
+        "duration": 43.20156240463257,
+        "fps": 543.022
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 64,
+        "duration": 42.874295711517334,
+        "fps": 548.221
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 256,
+        "duration": 42.78530263900757,
+        "fps": 557.764
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 512,
+        "duration": 42.60041666030884,
+        "fps": 556.336
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 0,
+        "CUDA_BLOCK_SIZE": 1024,
+        "duration": 43.15495443344116,
+        "fps": 556.064
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 8,
+        "duration": 42.688578844070435,
+        "fps": 1198.04
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 16,
+        "duration": 42.94251227378845,
+        "fps": 1449.47
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 32,
+        "duration": 43.37266540527344,
+        "fps": 1388.05
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 64,
+        "duration": 43.31553554534912,
+        "fps": 1393.16
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 256,
+        "duration": 43.2527232170105,
+        "fps": 1499.31
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 512,
+        "duration": 43.233431339263916,
+        "fps": 1404.74
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 1,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 1024,
+        "duration": 42.6162543296814,
+        "fps": 1452.13
+    },
+    {
+        "N_FOR_VIS": 5000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 0,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 42.91198372840881,
+        "fps": 1169.53
+    },
+    {
+        "N_FOR_VIS": 10000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 0,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 52.28997850418091,
+        "fps": 929.669
+    },
+    {
+        "N_FOR_VIS": 20000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 0,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 42.69576406478882,
+        "fps": 872.443
+    },
+    {
+        "N_FOR_VIS": 40000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 0,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 42.84919023513794,
+        "fps": 1265.27
+    },
+    {
+        "N_FOR_VIS": 80000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 0,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 45.40427613258362,
+        "fps": 1545.59
+    },
+    {
+        "N_FOR_VIS": 160000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 0,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 42.797423362731934,
+        "fps": 1647.58
+    },
+    {
+        "N_FOR_VIS": 320000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 0,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.972813844680786,
+        "fps": 1452.49
+    },
+    {
+        "N_FOR_VIS": 640000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 0,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.78973627090454,
+        "fps": 979.476
+    },
+    {
+        "N_FOR_VIS": 1280000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 0,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 44.02507781982422,
+        "fps": 351.216
+    },
+    {
+        "N_FOR_VIS": 2560000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 0,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 42.75337862968445,
+        "fps": 100.433
+    },
+    {
+        "N_FOR_VIS": 5120000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 0,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 42.96444058418274,
+        "fps": 26.3963
+    },
+    {
+        "N_FOR_VIS": 10240000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 0,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 43.17731285095215,
+        "fps": 6.69305
+    },
+    {
+        "N_FOR_VIS": 20480000,
+        "VISUALIZE": 0,
+        "FINE_GRAINED_CELLS": 0,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+        "duration": 45.49564719200134,
+        "fps": 1.68413
+    }
+]
\ No newline at end of file
diff --git a/scripts/plot_fps.py b/scripts/plot_fps.py
new file mode 100644
index 0000000..3db27b9
--- /dev/null
+++ b/scripts/plot_fps.py
@@ -0,0 +1,134 @@
+import json
+from pathlib import Path
+from typing import Any
+from collections.abc import Iterator
+from matplotlib import pyplot as plt
+
+MEASUREMENTS_JSON = Path(__file__).parent.absolute() / "measurements.json"
+ROOT_DIR = Path(__file__).parent.parent.absolute()  # parent of this script's directory
+IMAGES_DIR = ROOT_DIR / "images"  # the plot functions save their figures here
+
+with MEASUREMENTS_JSON.open(encoding="utf-8") as file:  # runs once, at import time
+    MEASUREMENT_DATA: list[dict[str, Any]] = json.load(file)
+
+
+def find_measurements(config: dict[str, Any]) -> Iterator[dict[str, Any]]:
+    """Yield each entry of MEASUREMENT_DATA whose fields equal every item of config."""
+    required = config.items()
+    for candidate in MEASUREMENT_DATA:
+        # A key absent from the entry counts as a mismatch.
+        if all(k in candidate and candidate[k] == v for k, v in required):
+            yield candidate
+
+
+def plot_fps_vs_num_boids() -> None:
+    """Plot FPS against boid count for each method, with and without visualization.
+
+    Uses only measurements taken with FINE_GRAINED_CELLS=1 and CUDA_BLOCK_SIZE=128,
+    draws one log-log curve per configuration, and saves the figure in IMAGES_DIR.
+    """
+    # (legend label, UNIFORM_GRID, COHERENT_GRID, VISUALIZE), in plotting order.
+    series = [
+        ("Naive (w/o vis.)", 0, 0, 0),
+        ("Naive (w/ vis.)", 0, 0, 1),
+        ("Scattered Uniform Grid (w/o vis.)", 1, 0, 0),
+        ("Scattered Uniform Grid (w/ vis.)", 1, 0, 1),
+        ("Coherent Uniform Grid (w/o vis.)", 1, 1, 0),
+        ("Coherent Uniform Grid (w/ vis.)", 1, 1, 1),
+    ]
+
+    # Both axes use a logarithmic scale.
+    figure, axis = plt.subplots()
+    axis.set_xscale("log")
+    axis.set_yscale("log")
+    axis.set_xlabel("Number of boids")
+    axis.set_ylabel("Frames per second")
+
+    for legend_label, uniform_grid, coherent_grid, visualize in series:
+        query = {
+            "UNIFORM_GRID": uniform_grid,
+            "COHERENT_GRID": coherent_grid,
+            "VISUALIZE": visualize,
+            "FINE_GRAINED_CELLS": 1,
+            "CUDA_BLOCK_SIZE": 128,
+        }
+        # A configuration with no matching runs is still plotted, as an empty curve.
+        runs = sorted(find_measurements(query), key=lambda run: run["N_FOR_VIS"])
+        boid_counts = [run["N_FOR_VIS"] for run in runs]
+        frame_rates = [run["fps"] for run in runs]
+        axis.plot(boid_counts, frame_rates, marker="o", label=legend_label)
+
+    axis.legend()
+    figure.tight_layout()
+    figure.savefig(IMAGES_DIR / "Frame rate vs number of boids.png")
+
+
+def plot_fps_vs_block_size() -> None:
+    """Plot frame rate against CUDA block size, one stacked subplot per method.
+
+    Uses the runs with N_FOR_VIS=320000, VISUALIZE=0 and FINE_GRAINED_CELLS=1 and
+    saves the figure in IMAGES_DIR.
+    """
+    # (subplot title, UNIFORM_GRID, COHERENT_GRID), top to bottom.
+    methods = [
+        ("Naive", 0, 0),
+        ("Scattered Uniform Grid", 1, 0),
+        ("Coherent Uniform Grid", 1, 1),
+    ]
+    figure, panels = plt.subplots(3, 1, sharex=True)
+    panels[-1].set_xlabel("Block size")
+
+    for panel, (title, uniform_grid, coherent_grid) in zip(panels, methods):
+        panel.set_title(title)
+        panel.set_xscale("log")
+        panel.set_ylabel("FPS")
+        query = {
+            "UNIFORM_GRID": uniform_grid,
+            "COHERENT_GRID": coherent_grid,
+            "N_FOR_VIS": 320000,
+            "VISUALIZE": 0,
+            "FINE_GRAINED_CELLS": 1,
+        }
+        runs = sorted(find_measurements(query), key=lambda r: r["CUDA_BLOCK_SIZE"])
+        block_sizes = [r["CUDA_BLOCK_SIZE"] for r in runs]
+        frame_rates = [r["fps"] for r in runs]
+        panel.plot(block_sizes, frame_rates, marker="o")
+
+    figure.align_labels()
+    figure.tight_layout()
+    figure.savefig(IMAGES_DIR / "Frame rate vs block size.png")
+
+
+def plot_fps_vs_fine_grained_cells() -> None:
+    """Plot and save frame rate vs boid count (log-log) for both grid cell sizes.
+
+    Uses coherent-grid runs without visualization at CUDA_BLOCK_SIZE=128.
+    """
+    figure, axis = plt.subplots()
+    axis.set_xscale("log")
+    axis.set_yscale("log")
+    axis.set_xlabel("Number of boids")
+    axis.set_ylabel("Frames per second")
+
+    shared = {
+        "VISUALIZE": 0,
+        "UNIFORM_GRID": 1,
+        "COHERENT_GRID": 1,
+        "CUDA_BLOCK_SIZE": 128,
+    }
+    for fine_grained, legend_label in [(0, "2x-sized cells"), (1, "1x-sized cells")]:
+        query = {**shared, "FINE_GRAINED_CELLS": fine_grained}
+        runs = sorted(find_measurements(query), key=lambda run: run["N_FOR_VIS"])
+        boid_counts = [run["N_FOR_VIS"] for run in runs]
+        frame_rates = [run["fps"] for run in runs]
+        axis.plot(boid_counts, frame_rates, marker="o", label=legend_label)
+
+    axis.legend()
+    figure.tight_layout()
+    figure.savefig(IMAGES_DIR / "Frame rate vs fine-grained cells.png")
+
+
+if __name__ == "__main__":
+    plot_fps_vs_num_boids()  # writes "Frame rate vs number of boids.png"
+    plot_fps_vs_block_size()  # writes "Frame rate vs block size.png"
+    plot_fps_vs_fine_grained_cells()  # writes "Frame rate vs fine-grained cells.png"
diff --git a/src/kernel.cu b/src/kernel.cu
index 7149917..38e72e6 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -47,7 +47,7 @@ void checkCUDAError(const char *msg, int line = -1) {
 *****************/
 
 /*! Block size used for CUDA kernel launch. */
-#define blockSize 128
+#define blockSize CUDA_BLOCK_SIZE  // injected by CMake (cache variable CUDA_BLOCK_SIZE, default 128)
 
 // LOOK-1.2 Parameters for the boids algorithm.
 // These worked well in our reference implementation.
@@ -95,6 +95,8 @@ int *dev_gridCellEndIndices;   // to this cell?
 
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
+glm::vec3 *dev_coherentPos;    // pos gathered into particleArrayIndices order
+glm::vec3 *dev_coherentVel1;   // vel1 gathered into the same order
 
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
@@ -167,7 +169,11 @@ void Boids::initSimulation(int N) {
   checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
 
   // LOOK-2.1 computing grid params
-  gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
+  {
+    // Fine-grained cells are one neighborhood radius wide; coarse cells are two.
+    const float searchRadius = std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
+    gridCellWidth = FINE_GRAINED_CELLS ? searchRadius : 2.0f * searchRadius;
+  }
   int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
   gridSideCount = 2 * halfSideCount;
 
@@ -179,6 +185,28 @@ void Boids::initSimulation(int N) {
   gridMinimum.z -= halfGridWidth;
 
   // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+  if constexpr (UNIFORM_GRID) {  // buffers for the uniform-grid steps
+    cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+    // Per-boid cell index; used as the sort key for dev_particleArrayIndices.
+    cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+    // Per-cell first/last positions in the sorted arrays (one int per grid cell each).
+    cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+
+    cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+    if constexpr (COHERENT_GRID) {  // reordered copies of pos/vel1, one vec3 per boid
+      cudaMalloc((void**)&dev_coherentPos, N * sizeof(glm::vec3));
+      checkCUDAErrorWithLine("cudaMalloc dev_coherentPos failed!");
+
+      cudaMalloc((void**)&dev_coherentVel1, N * sizeof(glm::vec3));
+      checkCUDAErrorWithLine("cudaMalloc dev_coherentVel1 failed!");
+    }
+  }
+
   cudaDeviceSynchronize();
 }
 
@@ -243,7 +271,46 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po
   // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
   // Rule 2: boids try to stay a distance d away from each other
   // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+
+  const glm::vec3 selfPos = pos[iSelf];
+
+  glm::vec3 centerSum(0.0f, 0.0f, 0.0f);      // rule 1: sum of neighbor positions
+  int centerCount = 0;
+  glm::vec3 separation(0.0f, 0.0f, 0.0f);     // rule 2: sum of (self - neighbor) offsets
+  glm::vec3 velocitySum(0.0f, 0.0f, 0.0f);    // rule 3: sum of neighbor velocities
+  int velocityCount = 0;
+
+  for (int other = 0; other < N; ++other) {
+    if (other != iSelf) {
+      const glm::vec3 otherPos = pos[other];
+      const float dist = glm::distance(selfPos, otherPos);
+      if (dist < rule1Distance) {
+        centerSum += otherPos;
+        ++centerCount;
+      }
+      if (dist < rule2Distance) {
+        separation -= otherPos - selfPos;
+      }
+      if (dist < rule3Distance) {
+        velocitySum += vel[other];
+        ++velocityCount;
+      }
+    }
+  }
+
+  // Rules 1 and 3 average over their neighbors, so they are skipped when there are none.
+  glm::vec3 delta(0.0f, 0.0f, 0.0f);
+  if (centerCount > 0) {
+    centerSum /= centerCount;
+    delta += (centerSum - selfPos) * rule1Scale;
+  }
+  delta += separation * rule2Scale;
+  if (velocityCount > 0) {
+    velocitySum /= velocityCount;
+    delta += velocitySum * rule3Scale;
+  }
+
+  return delta;
 }
 
 /**
@@ -255,6 +322,14 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
   // Compute a new velocity based on pos and vel1
   // Clamp the speed
   // Record the new velocity into vel2. Question: why NOT vel1?
+
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= N) {
+    return;
+  }
+  glm::vec3 newVel = vel1[idx] + computeVelocityChange(N, idx, pos, vel1);
+  float speed = glm::length(newVel);  // rescale (not normalize) so a zero velocity stays finite
+  vel2[idx] = speed > maxSpeed ? newVel * (maxSpeed / speed) : newVel;
 }
 
 /**
@@ -299,6 +374,17 @@ __global__ void kernComputeIndices(int N, int gridResolution,
     // - Label each boid with the index of its grid cell.
     // - Set up a parallel array of integer indices as pointers to the actual
     //   boid data in pos and vel1/vel2
+
+    const int boid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (boid < N) {
+      // Start from the identity mapping; callers reorder it when sorting by cell.
+      indices[boid] = boid;
+
+      // Truncate the boid's cell coordinates to integers, then clamp them into the grid.
+      glm::ivec3 cell((pos[boid] - gridMin) * inverseCellWidth);
+      cell = glm::clamp(cell, 0, gridResolution - 1);
+      gridIndices[boid] = gridIndex3Dto1D(cell.x, cell.y, cell.z, gridResolution);
+    }
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -316,6 +402,71 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
   // Identify the start point of each cell in the gridIndices array.
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
+
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < N) {
+    // Compare with both neighbors in the sorted array; end indices are inclusive.
+    const int cell = particleGridIndices[i];
+    const bool opensCell = (i == 0) || (particleGridIndices[i - 1] != cell);
+    const bool closesCell = (i == N - 1) || (particleGridIndices[i + 1] != cell);
+    if (opensCell) {
+      gridCellStartIndices[cell] = i;
+    }
+    if (closesCell) {
+      gridCellEndIndices[cell] = i;
+    }
+  }
+}
+
+/**
+* Calls visitor(x, y, z) for every in-bounds grid cell in the neighborhood of posSelf.
+*
+* With FINE_GRAINED_CELLS the neighborhood is the 3x3x3 block of cells centered on
+* the cell containing posSelf. Otherwise it is the 2x2x2 block whose lowest cell is
+* found by shifting posSelf down by half a cell before truncating to integers.
+*
+* The base cell is clamped to [0, gridResolution - 1] and candidate cells outside
+* that range are skipped. Cells are visited with x varying fastest, then y, then z.
+* visitor may be any callable accepting (int x, int y, int z).
+*/
+template <typename Visitor>
+__device__ void visitNeighborCells(
+  int gridResolution,
+  glm::vec3 gridMin,
+  float inverseCellWidth,
+  glm::vec3 posSelf,
+  Visitor visitor
+) {
+  // Offsets along each axis run from firstOffset up to and including 1.
+  constexpr int firstOffset = FINE_GRAINED_CELLS ? -1 : 0;
+
+  glm::ivec3 baseCell;
+  if constexpr (FINE_GRAINED_CELLS) {
+    baseCell = glm::ivec3((posSelf - gridMin) * inverseCellWidth);
+  } else {
+    baseCell = glm::ivec3((posSelf - gridMin) * inverseCellWidth - 0.5f);
+  }
+  baseCell = glm::clamp(baseCell, 0, gridResolution - 1);
+
+  // Reject an out-of-range coordinate as early as possible to skip whole rows/planes.
+  for (int dz = firstOffset; dz <= 1; ++dz) {
+    const int z = baseCell.z + dz;
+    if (z < 0 || z >= gridResolution) {
+      continue;
+    }
+    for (int dy = firstOffset; dy <= 1; ++dy) {
+      const int y = baseCell.y + dy;
+      if (y < 0 || y >= gridResolution) {
+        continue;
+      }
+      for (int dx = firstOffset; dx <= 1; ++dx) {
+        const int x = baseCell.x + dx;
+        if (x >= 0 && x < gridResolution) {
+          visitor(x, y, z);
+        }
+      }
+    }
+  }
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
@@ -332,6 +483,72 @@ __global__ void kernUpdateVelNeighborSearchScattered(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= N) {
+    return;
+  }
+  glm::vec3 posSelf = pos[idx];
+  glm::vec3 rule1PerceivedCenter(0.0f, 0.0f, 0.0f);
+  int rule1NumberOfNeighbors = 0;
+  glm::vec3 rule2C(0.0f, 0.0f, 0.0f);
+  glm::vec3 rule3PerceivedVelocity(0.0f, 0.0f, 0.0f);
+  int rule3NumberOfNeighbors = 0;
+
+  visitNeighborCells(
+    gridResolution,
+    gridMin,
+    inverseCellWidth,
+    posSelf,
+    [
+      =,
+      &rule1PerceivedCenter,
+      &rule1NumberOfNeighbors,
+      &rule2C,
+      &rule3PerceivedVelocity,
+      &rule3NumberOfNeighbors
+    ] __device__ (int x, int y, int z) {
+      int neighborGridIdx = gridIndex3Dto1D(x, y, z, gridResolution);
+      int startIdx = gridCellStartIndices[neighborGridIdx];
+      int endIdx = gridCellEndIndices[neighborGridIdx];
+      if (startIdx == -1 || endIdx == -1) {  // -1 marks a cell that holds no boids
+        return;
+      }
+      for (int i = startIdx; i <= endIdx; ++i) {  // endIdx is inclusive
+        int boidIdx = particleArrayIndices[i];  // sorted slot -> index into pos/vel1
+        if (boidIdx == idx) {
+          continue;
+        }
+        glm::vec3 posI = pos[boidIdx];
+        float distance = glm::distance(posSelf, posI);
+        if (distance < rule1Distance) {
+          rule1PerceivedCenter += posI;
+          ++rule1NumberOfNeighbors;
+        }
+        if (distance < rule2Distance) {
+          rule2C -= posI - posSelf;
+        }
+        if (distance < rule3Distance) {
+          rule3PerceivedVelocity += vel1[boidIdx];
+          ++rule3NumberOfNeighbors;
+        }
+      }
+    }
+  );
+
+  glm::vec3 newVel = vel1[idx];
+  if (rule1NumberOfNeighbors > 0) {
+    rule1PerceivedCenter /= rule1NumberOfNeighbors;
+    newVel += (rule1PerceivedCenter - posSelf) * rule1Scale;
+  }
+  newVel += rule2C * rule2Scale;
+  if (rule3NumberOfNeighbors > 0) {
+    rule3PerceivedVelocity /= rule3NumberOfNeighbors;
+    newVel += rule3PerceivedVelocity * rule3Scale;
+  }
+  // Cap the speed by rescaling only when it is exceeded; normalizing a zero vector gives NaN.
+  float speed = glm::length(newVel);
+  vel2[idx] = speed > maxSpeed ? newVel * (maxSpeed / speed) : newVel;
 }
 
 __global__ void kernUpdateVelNeighborSearchCoherent(
@@ -351,6 +568,89 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= N) {
+    return;
+  }
+  glm::vec3 posSelf = pos[idx];
+  glm::vec3 rule1PerceivedCenter(0.0f, 0.0f, 0.0f);
+  int rule1NumberOfNeighbors = 0;
+  glm::vec3 rule2C(0.0f, 0.0f, 0.0f);
+  glm::vec3 rule3PerceivedVelocity(0.0f, 0.0f, 0.0f);
+  int rule3NumberOfNeighbors = 0;
+
+  visitNeighborCells(
+    gridResolution,
+    gridMin,
+    inverseCellWidth,
+    posSelf,
+    [
+      =,
+      &rule1PerceivedCenter,
+      &rule1NumberOfNeighbors,
+      &rule2C,
+      &rule3PerceivedVelocity,
+      &rule3NumberOfNeighbors
+    ] __device__ (int x, int y, int z) {
+      int neighborGridIdx = gridIndex3Dto1D(x, y, z, gridResolution);
+      int startIdx = gridCellStartIndices[neighborGridIdx];
+      int endIdx = gridCellEndIndices[neighborGridIdx];
+      if (startIdx == -1 || endIdx == -1) {  // -1 marks a cell that holds no boids
+        return;
+      }
+      for (int i = startIdx; i <= endIdx; ++i) {  // endIdx is inclusive
+        if (i == idx) {  // pos/vel1 are indexed directly by sorted slot, so this is the boid itself
+          continue;
+        }
+        glm::vec3 posI = pos[i];
+        float distance = glm::distance(posSelf, posI);
+        if (distance < rule1Distance) {
+          rule1PerceivedCenter += posI;
+          ++rule1NumberOfNeighbors;
+        }
+        if (distance < rule2Distance) {
+          rule2C -= posI - posSelf;
+        }
+        if (distance < rule3Distance) {
+          rule3PerceivedVelocity += vel1[i];
+          ++rule3NumberOfNeighbors;
+        }
+      }
+    }
+  );
+
+  glm::vec3 newVel = vel1[idx];
+  if (rule1NumberOfNeighbors > 0) {
+    rule1PerceivedCenter /= rule1NumberOfNeighbors;
+    newVel += (rule1PerceivedCenter - posSelf) * rule1Scale;
+  }
+  newVel += rule2C * rule2Scale;
+  if (rule3NumberOfNeighbors > 0) {
+    rule3PerceivedVelocity /= rule3NumberOfNeighbors;
+    newVel += rule3PerceivedVelocity * rule3Scale;
+  }
+  // Cap the speed by rescaling only when it is exceeded; normalizing a zero vector gives NaN.
+  float speed = glm::length(newVel);
+  vel2[idx] = speed > maxSpeed ? newVel * (maxSpeed / speed) : newVel;
+}
+
+// Gathers pos and vel1 into the order given by particleArrayIndices: output
+// element i receives the data of boid particleArrayIndices[i].
+__global__ void kernPermutePosAndVel1(
+  int N,
+  int *particleArrayIndices,
+  glm::vec3 *pos,
+  glm::vec3 *vel1,
+  glm::vec3 *coherentPos,
+  glm::vec3 *coherentVel1
+) {
+  const int slot = blockIdx.x * blockDim.x + threadIdx.x;
+  if (slot < N) {
+    const int source = particleArrayIndices[slot];
+    coherentPos[slot] = pos[source];
+    coherentVel1[slot] = vel1[source];
+  }
 }
 
 /**
@@ -359,6 +659,16 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
 void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
   // TODO-1.2 ping-pong the velocity buffers
+  // One thread per boid, rounded up to a whole number of blocks.
+  const int numBlocks = (numObjects + blockSize - 1) / blockSize;
+
+  // Read dev_vel1, write the new velocities to dev_vel2, then move dev_pos with them.
+  kernUpdateVelocityBruteForce<<<numBlocks, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!");
+  kernUpdatePos<<<numBlocks, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdatePos failed!");
+
+  std::swap(dev_vel1, dev_vel2);  // the new velocities become the next step's input
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
@@ -374,6 +684,60 @@ void Boids::stepSimulationScatteredGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed
+
+  int gridSizeForParticles = (numObjects + blockSize - 1) / blockSize;
+  int gridSizeForCells = (gridCellCount + blockSize - 1) / blockSize;
+  // Label every boid with its cell index and reset the boid-index array to identity.
+  kernComputeIndices<<<gridSizeForParticles, blockSize>>>(
+    numObjects,
+    gridSideCount,
+    gridMinimum,
+    gridInverseCellWidth,
+    dev_pos,
+    dev_particleArrayIndices,
+    dev_particleGridIndices
+  );
+  checkCUDAErrorWithLine("kernComputeIndices failed!");
+  // Sort boid indices by cell index so boids in the same cell become contiguous.
+  {
+    thrust::device_ptr<int> dev_keys(dev_particleGridIndices);
+    thrust::device_ptr<int> dev_values(dev_particleArrayIndices);
+    thrust::sort_by_key(dev_keys, dev_keys + numObjects, dev_values);
+    checkCUDAErrorWithLine("thrust::sort_by_key failed!");
+  }
+  // Mark every cell as empty (-1) before recording the occupied ranges.
+  kernResetIntBuffer<<<gridSizeForCells, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  checkCUDAErrorWithLine("kernResetIntBuffer dev_gridCellStartIndices failed!");
+  kernResetIntBuffer<<<gridSizeForCells, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+  checkCUDAErrorWithLine("kernResetIntBuffer dev_gridCellEndIndices failed!");
+  // Record the first and last sorted position of each occupied cell.
+  kernIdentifyCellStartEnd<<<gridSizeForParticles, blockSize>>>(
+    numObjects,
+    dev_particleGridIndices,
+    dev_gridCellStartIndices,
+    dev_gridCellEndIndices
+  );
+  checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!");
+  // Compute new velocities into dev_vel2 from the neighbors found through the grid.
+  kernUpdateVelNeighborSearchScattered<<<gridSizeForParticles, blockSize>>>(
+    numObjects,
+    gridSideCount,
+    gridMinimum,
+    gridInverseCellWidth,
+    gridCellWidth,
+    dev_gridCellStartIndices,
+    dev_gridCellEndIndices,
+    dev_particleArrayIndices,
+    dev_pos,
+    dev_vel1,
+    dev_vel2
+  );
+  checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!");
+  // Advance the positions using the new velocities.
+  kernUpdatePos<<<gridSizeForParticles, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdatePos failed!");
+  // Ping-pong: the new velocities become the input of the next step.
+  std::swap(dev_vel1, dev_vel2);
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
@@ -392,6 +756,70 @@ void Boids::stepSimulationCoherentGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+  // Launch sequence for one coherent-grid step: index, sort, mark cell ranges, permute pos/vel1, update vel, update pos, swap.
+  int gridSizeForParticles = (numObjects + blockSize - 1) / blockSize; // ceil(numObjects / blockSize) blocks
+  int gridSizeForCells = (gridCellCount + blockSize - 1) / blockSize; // ceil(gridCellCount / blockSize) blocks
+
+  kernComputeIndices<<<gridSizeForParticles, blockSize>>>( // kernel body not in view; presumably fills both index buffers from dev_pos
+    numObjects,
+    gridSideCount,
+    gridMinimum,
+    gridInverseCellWidth,
+    dev_pos,
+    dev_particleArrayIndices,
+    dev_particleGridIndices
+  );
+  checkCUDAErrorWithLine("kernComputeIndices failed!");
+
+  { // scope limits the lifetime of the thrust wrappers
+    thrust::device_ptr<int> dev_keys(dev_particleGridIndices); // sort keys: dev_particleGridIndices
+    thrust::device_ptr<int> dev_values(dev_particleArrayIndices); // values reordered together with the keys
+    thrust::sort_by_key(dev_keys, dev_keys + numObjects, dev_values); // ascending key sort over the first numObjects entries
+    checkCUDAErrorWithLine("thrust::sort_by_key failed!");
+  }
+
+  kernResetIntBuffer<<<gridSizeForCells, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1); // -1 presumably marks "no boids in cell" -- confirm in consumers
+  checkCUDAErrorWithLine("kernResetIntBuffer dev_gridCellStartIndices failed!");
+  kernResetIntBuffer<<<gridSizeForCells, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1); // same reset value for the end-index buffer
+  checkCUDAErrorWithLine("kernResetIntBuffer dev_gridCellEndIndices failed!");
+
+  kernIdentifyCellStartEnd<<<gridSizeForParticles, blockSize>>>( // launched after the sort and after both resets
+    numObjects,
+    dev_particleGridIndices,
+    dev_gridCellStartIndices,
+    dev_gridCellEndIndices
+  );
+  checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!");
+
+  kernPermutePosAndVel1<<<gridSizeForParticles, blockSize>>>( // presumably gathers pos/vel1 through dev_particleArrayIndices -- TODO confirm
+    numObjects,
+    dev_particleArrayIndices,
+    dev_pos,
+    dev_vel1,
+    dev_coherentPos,
+    dev_coherentVel1
+  );
+  checkCUDAErrorWithLine("kernPermutePosAndVel1 failed!");
+
+  kernUpdateVelNeighborSearchCoherent<<<gridSizeForParticles, blockSize>>>( // unlike the scattered variant, no dev_particleArrayIndices argument
+    numObjects,
+    gridSideCount,
+    gridMinimum,
+    gridInverseCellWidth,
+    gridCellWidth,
+    dev_gridCellStartIndices,
+    dev_gridCellEndIndices,
+    dev_coherentPos,
+    dev_coherentVel1,
+    dev_vel2
+  );
+  checkCUDAErrorWithLine("kernUpdateVelNeighborSearchCoherent failed!");
+
+  kernUpdatePos<<<gridSizeForParticles, blockSize>>>(numObjects, dt, dev_coherentPos, dev_vel2); // operates on the coherent position buffer, not dev_pos
+  checkCUDAErrorWithLine("kernUpdatePos failed!");
+
+  std::swap(dev_pos, dev_coherentPos); // dev_pos now names the buffer that was passed to kernUpdatePos
+  std::swap(dev_vel1, dev_vel2); // dev_vel1 now names the buffer passed last to the velocity kernel
 }
 
 void Boids::endSimulation() {
@@ -400,6 +828,17 @@ void Boids::endSimulation() {
   cudaFree(dev_pos);
 
   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+  if constexpr (UNIFORM_GRID) { // compile-time switch; UNIFORM_GRID is a 0/1 compile definition
+    cudaFree(dev_particleArrayIndices); // NOTE(review): cudaFree return codes are not checked here
+    cudaFree(dev_particleGridIndices);
+    cudaFree(dev_gridCellStartIndices);
+    cudaFree(dev_gridCellEndIndices);
+
+    if constexpr (COHERENT_GRID) { // coherent buffers are freed only when both options are enabled
+      cudaFree(dev_coherentPos);
+      cudaFree(dev_coherentVel1);
+    }
+  } // NOTE(review): confirm the matching allocations are guarded by the same conditions
 }
 
 void Boids::unitTest() {
diff --git a/src/main.cpp b/src/main.cpp
index 9c917c0..1b5cf37 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -22,12 +22,12 @@
 // ================
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
-#define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
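+// VISUALIZE, UNIFORM_GRID and COHERENT_GRID are no longer #defined here; they
+// are CMake options passed as 0/1 compile definitions (see CMakeLists.txt).
+// Defaults there: VISUALIZE=ON, UNIFORM_GRID=OFF, COHERENT_GRID=OFF.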
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 5000;
+// N_FOR_VIS is now the CMake cache variable N_FOR_VIS (default 5000), passed as a compile definition.
 const float DT = 0.2f;
 
 /**
@@ -226,6 +226,11 @@ void initShaders(GLuint * program) {
     double timebase = 0;
     int frame = 0;
 
+    #if FPS_MEASURE
+      double startTime = glfwGetTime(); // reference timestamp; re-based when measurement begins
+      int totalFrames = -1; // -1 = still warming up; set to 0 when measurement begins
+    #endif // FPS_MEASURE
+
     Boids::unitTest(); // LOOK-1.2 We run some basic example code to make sure
                        // your CUDA development setup is ready to go.
 
@@ -264,6 +269,24 @@ void initShaders(GLuint * program) {
 
       glfwSwapBuffers(window);
       #endif
+
+      #if FPS_MEASURE
+      { // scoped so relativeTime does not leak into the rest of the loop body
+        double relativeTime = time - startTime; // `time` is declared outside this hunk
+        if (totalFrames < 0) { // negative count: measurement has not started yet
+          if (relativeTime >= FPS_MEASURE_START) { // warm-up of FPS_MEASURE_START seconds has elapsed
+            totalFrames = 0;
+            startTime = time; // re-base so relativeTime now counts from the start of measurement
+          }
+        } else {
+          ++totalFrames;
+          if (relativeTime >= FPS_MEASURE_DURATION) { // measured for at least FPS_MEASURE_DURATION seconds
+            std::cout << "FPS: " << (totalFrames / relativeTime) << std::endl; // average frames per second over the window
+            break; // leaves the enclosing loop (its header is outside this hunk)
+          }
+        }
+      }
+      #endif // FPS_MEASURE
     }
     glfwDestroyWindow(window);
     glfwTerminate();