CIS5650-Fall-2025 · yunhao-qian · Sep 8, 2025 · Sep 8, 2025 · Sep 9, 2025
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -80,9 +80,35 @@ elseif(CMAKE_VERSION VERSION_LESS "3.24.0")
 else()
     set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES native)
 endif()
-target_compile_options(${CMAKE_PROJECT_NAME} PRIVATE "$<$<AND:$<CONFIG:Debug,RelWithDebInfo>,$<COMPILE_LANGUAGE:CUDA>>:-G>")
+# nvcc-only flags that enable lambdas and relaxed constexpr rules in device
+# code. The generator expression is quoted and its options are ';'-separated
+# so that CMake does not split the argument on whitespace before the
+# expression is evaluated.
+target_compile_options(${CMAKE_PROJECT_NAME} PRIVATE
+    "$<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda;--expt-relaxed-constexpr>"
+)
+# Pass -G to CUDA sources only, and only in the Debug and RelWithDebInfo
+# configurations.
+target_compile_options(${CMAKE_PROJECT_NAME} PRIVATE
+    "$<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<OR:$<CONFIG:Debug>,$<CONFIG:RelWithDebInfo>>>:-G>"
+)
 set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT ${CMAKE_PROJECT_NAME})
 
+# Compile-time configuration of the simulation. Every cache entry declared
+# here is forwarded to the compiler as a preprocessor definition of the same
+# name by the target_compile_definitions() call at the end of this section.
+
+# Boolean switches (forwarded as 1 or 0).
+option(VISUALIZE "Turn on OpenGL visualization" ON)
+option(FINE_GRAINED_CELLS "Use fine-grained cells with 27 neighbors instead of 8" ON)
+option(UNIFORM_GRID "Use uniform grids for efficient neighbor checks" OFF)
+option(COHERENT_GRID "Use semi-coherent memory access for uniform grids" OFF)
+option(FPS_MEASURE "Enable frame rate measurements" OFF)
+
+# Numeric settings (forwarded verbatim).
+set(N_FOR_VIS 5000 CACHE STRING "Number of boids to simulate")
+set(CUDA_BLOCK_SIZE 128 CACHE STRING "CUDA block size")
+set(FPS_MEASURE_START 2 CACHE STRING "Seconds after start to begin frame rate measurements")
+set(FPS_MEASURE_DURATION 20 CACHE STRING "Number of seconds to measure frame rates for")
+
+# $<BOOL:...> normalizes any CMake truth value (ON/OFF, TRUE/FALSE, 1/0, ...)
+# to exactly 1 or 0, so the sources can test the macros with plain #if.
+target_compile_definitions(${CMAKE_PROJECT_NAME} PRIVATE
+    N_FOR_VIS=${N_FOR_VIS}
+    VISUALIZE=$<BOOL:${VISUALIZE}>
+    FINE_GRAINED_CELLS=$<BOOL:${FINE_GRAINED_CELLS}>
+    UNIFORM_GRID=$<BOOL:${UNIFORM_GRID}>
+    COHERENT_GRID=$<BOOL:${COHERENT_GRID}>
+    CUDA_BLOCK_SIZE=${CUDA_BLOCK_SIZE}
+    FPS_MEASURE=$<BOOL:${FPS_MEASURE}>
+    FPS_MEASURE_START=${FPS_MEASURE_START}
+    FPS_MEASURE_DURATION=${FPS_MEASURE_DURATION}
+)
+
 add_custom_command(
     TARGET ${CMAKE_PROJECT_NAME}
     PRE_BUILD

diff --git a/README.md b/README.md
@@ -1,11 +1,79 @@
-**University of Pennsylvania, CIS 5650: GPU Programming and Architecture,
-Project 1 - Flocking**
+# University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 1 - Flocking
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+- Yunhao Qian
+  - [LinkedIn](https://www.linkedin.com/in/yunhao-qian-026980170)
+  - [GitHub](https://github.com/yunhao-qian)
+- Tested on my personal computer:
+  - OS: Windows 11, 24H2
+  - CPU: 13th Gen Intel(R) Core(TM) i7-13700 (2.10 GHz)
+  - GPU: NVIDIA GeForce RTX 4090
+  - RAM: 32.0 GB
 
-### (TODO: Your README)
+## Boids in Action
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+### Screenshot
+
+![Screenshot of boids](images/Screenshot%20of%20boids.png)
+
+The screenshot above is from an experiment with 100000 boids, 1x-sized cells, scattered uniform grid, and the block size of 128.
+
+### Recording
+
+![Recording of boids](images/Recording%20of%20boids.gif)
+
+The screen recording above is from an experiment with 320000 boids, 1x-sized cells, scattered uniform grid, and the block size of 128.
+
+## Changes to `CMakeLists.txt`
+
+- To share code among different configurations, I used lambda functions and `constexpr` in CUDA `__device__` code. To enable these features, I turned on the `--extended-lambda` and `--expt-relaxed-constexpr` flags using `target_compile_options()`.
+- To measure frame rates programmatically, all compile-time configurations are controlled by CMake definitions. These include `N_FOR_VIS`, `VISUALIZE`, `FINE_GRAINED_CELLS`, `UNIFORM_GRID`, `COHERENT_GRID`, and `CUDA_BLOCK_SIZE`.
+- The modified program has a special timed mode for measuring frame rates. To support this, CMake options are added, including `FPS_MEASURE`, `FPS_MEASURE_START`, and `FPS_MEASURE_DURATION`.
+
+## Performance Analysis
+
+### Methodology
+
+All experiments in this section are launched programmatically using [`measure_fps.py`](./scripts/measure_fps.py), which re-compiles the CMake project using specific arguments, runs the program, and captures the average frame rate (in frames per second, or FPS) from stdout. The measurement of each experiment starts 2 seconds after the program's launch and lasts for 20 seconds. After that, the program exits automatically. To avoid crashing the computer, more resource-consuming configurations are skipped once the frame rate drops below 1 FPS or the program takes over 100 seconds to execute. The statistics used to create the following plots are omitted here for conciseness; please refer to [`measurements.json`](./scripts/measurements.json) for the detailed numbers.
+
+### Number of Boids
+
+![Frame rate vs. number of boids](images/Frame%20rate%20vs%20number%20of%20boids.png)
+
+All experiments in this set use 1x-sized cells and the block size of 128.
+
+Discussion:
+
+- The naive implementation should be heavily compute-bound, and the time complexity is $O(N^2)$, where $N$ is the number of boids. Experiments confirm this trend in general, as the FPS drops consistently as $N$ increases.
+- The scattered & coherent uniform grid implementations do not show a monotonic trend at small $N$. The worst performance appears at $N \approx 20000$. This may be caused by thread divergence in warps as not all neighbor cells are occupied by boids.
+- The two uniform-grid implementations reach their best performance at $N \approx 100000$. The GPU's compute capacity is probably saturated at this point.
+- When $N$ is very large, the time complexity is $O(N^2)$ for all implementations with or without uniform grids, as large $N$'s always increase the number of boids within the effective distance. This is confirmed by the linear trends in the log-log plot.
+
+### OpenGL Visualization
+
+While visualizing a point cloud in OpenGL has $O(N)$ time complexity, its amount of work is much lighter than boid simulations in general. When $N$ is small, experiments with visualization turned on have slightly lower FPS. When $N$ is large, the difference becomes barely detectable.
+
+### Block Size
+
+![Frame rate vs. block size](images/Frame%20rate%20vs%20block%20size.png)
+
+All experiments in this set use 32000 boids, no visualization, and 1x-sized cells.
+
+Discussion:
+
+- For the naive implementation, FPS improves as the block size increases from 8 to 64. This is because increased computation helps cover the latency of random global memory accesses. There is a small drop at 1024, which is probably due to reaching register limits.
+- The scattered uniform grid performs best at very small block sizes from 8 to 32, and then flattens. This is because the execution time is dominated by the latency of random global memory accesses. Increasing the block size further does not address this problem, and reduces the scheduler's ability to distribute blocks evenly among MP's.
+- The coherent uniform grid has strong gains for 8 to 16, as increased computation covers the latency of semi-sequential global memory accesses. Once the program is compute-bound, increasing the block size further does not bring consistent improvements.
+
+### Coherent Uniform Grids
+
+The coherent uniform grid brings significant performance improvements, and this is the expected outcome. Sorting boid data by cells makes neighbor loops touch contiguous memory. Those data reside in $\leq 9$ pieces of contiguous memory if the cells are 1x-sized, and $\leq 4$ pieces if the cells are 2x-sized. When $N$ is large, the semi-sequential accesses are much faster than thousands of random accesses. When $N$ is small, the overhead of sorting boid data tends to dominate, resulting in the slightly lower FPS of the coherent uniform grid.
+
+### Cell Size
+
+![Frame rate vs. fine-grained cells](images/Frame%20rate%20vs%20fine-grained%20cells.png)
+
+All experiments in this set use no visualization, coherent uniform grids, and the block size of 128.
+
+When $N$ is small, 2x-sized cells should be slightly better, as the smaller number of neighbor cells means lighter looping overhead and better memory coherence. However, this advantage is not obvious in experiments, as it is hard to tell that one configuration is clearly better than the other.
+
+When $N$ is very large, 1x-sized cells should be much better, as more fine-grained cells allow the program to check fewer neighboring boids. If the boid density is $\rho$, and the effective distance is $R$, 1x-sized cells result in $\rho (3 R)^3 = 27 \rho R^3$ neighboring boids, while 2x-sized cells result in $\rho (4 R)^3 = 64 \rho R^3$ neighboring boids. This discrepancy is confirmed by the roughly constant offset in the log-log plot.
diff --git a/images/Frame rate vs block size.png b/images/Frame rate vs block size.png
diff --git a/images/Frame rate vs fine-grained cells.png b/images/Frame rate vs fine-grained cells.png
diff --git a/images/Frame rate vs number of boids.png b/images/Frame rate vs number of boids.png
diff --git a/images/Recording of boids.gif b/images/Recording of boids.gif
diff --git a/images/Screenshot of boids.png b/images/Screenshot of boids.png
diff --git a/scripts/measure_fps.py b/scripts/measure_fps.py
@@ -0,0 +1,229 @@
+import json
+import re
+import shutil
+import subprocess
+import time
+from pathlib import Path
+from typing import Any
+
+ROOT_DIR = Path(__file__).parent.parent.absolute()
+BUILD_DIR = ROOT_DIR / "build"
+BOIDS_EXE = BUILD_DIR / "bin" / "Release" / "cis5650_boids.exe"
+MEASUREMENTS_JSON = Path(__file__).parent.absolute() / "measurements.json"
+
+
+def measure_fps_vs_num_boids() -> None:
+    measurement_data: list[dict[str, Any]] = []
+    if MEASUREMENTS_JSON.exists():
+        assert MEASUREMENTS_JSON.is_file()
+        with MEASUREMENTS_JSON.open(encoding="utf-8") as file:
+            measurement_data = json.load(file)
+    else:
+        measurement_data = []
+
+    def find_measurement(config: dict[str, Any]) -> dict[str, Any] | None:
+        for measurement in measurement_data:
+            if all(
+                key in measurement and measurement[key] == value
+                for key, value in config.items()
+            ):
+                return measurement
+        return None
+
+    nums_boids = [5000 * (2**i) for i in range(13)]
+    for name, base_config in [
+        ("naive", {"UNIFORM_GRID": 0, "COHERENT_GRID": 0}),
+        ("scattered uniform grid", {"UNIFORM_GRID": 1, "COHERENT_GRID": 0}),
+        ("coherent uniform grid", {"UNIFORM_GRID": 1, "COHERENT_GRID": 1}),
+    ]:
+        print(f"Measuring FPS for method: {name}")
+        # Visualize first so we can confirm correctness.
+        for visualize in [1, 0]:
+            for num_boids in nums_boids:
+                config = {
+                    "N_FOR_VIS": num_boids,
+                    "VISUALIZE": visualize,
+                    "FINE_GRAINED_CELLS": 1,
+                    **base_config,
+                    "CUDA_BLOCK_SIZE": 128,
+                }
+                measurement = find_measurement(config)
+                if measurement is None:
+                    print(f"Measuring FPS for config: {config}")
+                    start_time = time.time()
+                    fps = build_and_measure_fps(**config)
+                    duration = time.time() - start_time
+                    measurement = {**config, "duration": duration, "fps": fps}
+                    measurement_data.append(measurement)
+                    with MEASUREMENTS_JSON.open("w", encoding="utf-8") as file:
+                        json.dump(measurement_data, file, indent=4)
+                else:
+                    print(f"Found existing measurement for config: {config}")
+                print(f"  Duration: {measurement['duration']}")
+                print(f"  FPS: {measurement['fps']}")
+                if measurement["duration"] > 100.0:
+                    print("Skipping further measurements due to long duration")
+                    break
+                if measurement["fps"] < 1.0:
+                    print("Skipping further measurements due to low FPS")
+                    break
+
+
+def measure_fps_vs_block_size() -> None:
+    measurement_data: list[dict[str, Any]] = []
+    if MEASUREMENTS_JSON.exists():
+        assert MEASUREMENTS_JSON.is_file()
+        with MEASUREMENTS_JSON.open(encoding="utf-8") as file:
+            measurement_data = json.load(file)
+    else:
+        measurement_data = []
+
+    def find_measurement(config: dict[str, Any]) -> dict[str, Any] | None:
+        for measurement in measurement_data:
+            if all(
+                key in measurement and measurement[key] == value
+                for key, value in config.items()
+            ):
+                return measurement
+        return None
+
+    for name, base_config in [
+        ("naive", {"UNIFORM_GRID": 0, "COHERENT_GRID": 0}),
+        ("scattered uniform grid", {"UNIFORM_GRID": 1, "COHERENT_GRID": 0}),
+        ("coherent uniform grid", {"UNIFORM_GRID": 1, "COHERENT_GRID": 1}),
+    ]:
+        print(f"Measuring FPS for method: {name}")
+        for block_size in [8 * (2**i) for i in range(8)]:
+            config = {
+                "N_FOR_VIS": 320000,
+                "VISUALIZE": 0,
+                "FINE_GRAINED_CELLS": 1,
+                **base_config,
+                "CUDA_BLOCK_SIZE": block_size,
+            }
+            measurement = find_measurement(config)
+            if measurement is None:
+                print(f"Measuring FPS for config: {config}")
+                start_time = time.time()
+                fps = build_and_measure_fps(**config)
+                duration = time.time() - start_time
+                measurement = {**config, "duration": duration, "fps": fps}
+                measurement_data.append(measurement)
+                with MEASUREMENTS_JSON.open("w", encoding="utf-8") as file:
+                    json.dump(measurement_data, file, indent=4)
+            else:
+                print(f"Found existing measurement for config: {config}")
+            print(f"  Duration: {measurement['duration']}")
+            print(f"  FPS: {measurement['fps']}")
+
+
+def measure_fps_vs_fine_grained_cells() -> None:
+    measurement_data: list[dict[str, Any]] = []
+    if MEASUREMENTS_JSON.exists():
+        assert MEASUREMENTS_JSON.is_file()
+        with MEASUREMENTS_JSON.open(encoding="utf-8") as file:
+            measurement_data = json.load(file)
+    else:
+        measurement_data = []
+
+    def find_measurement(config: dict[str, Any]) -> dict[str, Any] | None:
+        for measurement in measurement_data:
+            if all(
+                key in measurement and measurement[key] == value
+                for key, value in config.items()
+            ):
+                return measurement
+        return None
+
+    nums_boids = [5000 * (2**i) for i in range(13)]
+    for fine_grained in [0, 1]:
+        for num_boids in nums_boids:
+            config = {
+                "N_FOR_VIS": num_boids,
+                "VISUALIZE": 0,
+                "FINE_GRAINED_CELLS": fine_grained,
+                "UNIFORM_GRID": 1,
+                "COHERENT_GRID": 1,
+                "CUDA_BLOCK_SIZE": 128,
+            }
+            measurement = find_measurement(config)
+            if measurement is None:
+                print(f"Measuring FPS for config: {config}")
+                start_time = time.time()
+                fps = build_and_measure_fps(**config)
+                duration = time.time() - start_time
+                measurement = {**config, "duration": duration, "fps": fps}
+                measurement_data.append(measurement)
+                with MEASUREMENTS_JSON.open("w", encoding="utf-8") as file:
+                    json.dump(measurement_data, file, indent=4)
+            else:
+                print(f"Found existing measurement for config: {config}")
+            print(f"  Duration: {measurement['duration']}")
+            print(f"  FPS: {measurement['fps']}")
+
+
+def build_and_measure_fps(**kwargs: str) -> float:
+    build(**kwargs)
+    return measure_fps()
+
+
+def build(**kwargs: str) -> None:
+    if BUILD_DIR.exists():
+        assert BUILD_DIR.is_dir()
+        shutil.rmtree(BUILD_DIR)
+
+    subprocess.run(
+        args=[
+            "cmake",
+            "-G",
+            "Visual Studio 17 2022",
+            "-S",
+            str(ROOT_DIR),
+            "-B",
+            str(BUILD_DIR),
+            *(
+                f"-D{key}={value}"
+                for key, value in {
+                    **kwargs,
+                    "FPS_MEASURE": 1,
+                    "FPS_MEASURE_START": 2,
+                    "FPS_MEASURE_DURATION": 20,
+                }.items()
+            ),
+        ],
+        check=True,
+    )
+    assert BUILD_DIR.is_dir()
+
+    subprocess.run(
+        [
+            "cmake",
+            "--build",
+            str(BUILD_DIR),
+            "--target",
+            "cis5650_boids",
+            "--config",
+            "Release",
+            "--parallel",
+        ],
+        check=True,
+    )
+    assert BOIDS_EXE.is_file()
+
+
+def measure_fps() -> float:
+    assert BOIDS_EXE.is_file()
+    result = subprocess.run(
+        [str(BOIDS_EXE)], cwd=BUILD_DIR, capture_output=True, check=True, text=True
+    )
+    for line in result.stdout.splitlines():
+        re_match = re.match(r"^FPS: (.+)$", line)
+        if re_match is not None:
+            return float(re_match[1])
+    raise RuntimeError("FPS not found in output")
+
+
+if __name__ == "__main__":
+    measure_fps_vs_num_boids()
+    measure_fps_vs_block_size()
+    measure_fps_vs_fine_grained_cells()