CIS5650-Fall-2025 · mialana · Sep 7, 2025 · Sep 8, 2025 · Sep 8, 2025 · Sep 8, 2025
diff --git a/.gitignore b/.gitignore
@@ -7,6 +7,8 @@ cis5650_boids_generated_kernel*
 *.xcodeproj
 build
 
+log.txt
+
 # Created by https://www.gitignore.io/api/linux,osx,sublimetext,windows,jetbrains,vim,emacs,cmake,c++,cuda,visualstudio,webstorm,eclipse,xcode
 
 ### Linux ###

diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json
@@ -0,0 +1,27 @@
+{
+    "configurations": [
+        {
+            "name": "Win32",
+            "includePath": [
+                "${workspaceFolder}/src/**",
+                "${workspaceFolder}/external/**/**"
+            ],
+            "defines": [ ],
+            "compilerPath": "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.1/bin/nvcc.exe"
+        },
+        {
+            "name": "Linux",
+            "includePath": [
+                "${workspaceFolder}/src/**",
+                "${workspaceFolder}/external/**/**"
+            ],
+            "defines": [ ],
+            "compilerPath": "/usr/local/cuda/bin/nvcc",
+            "cStandard": "gnu17",
+            "cppStandard": "gnu++17",
+            "intelliSenseMode": "linux-gcc-x64",
+            "configurationProvider": "ms-vscode.makefile-tools"
+        }
+    ],
+    "version": 4
+}
diff --git a/.vscode/extension.json b/.vscode/extension.json
@@ -0,0 +1,7 @@
+{
+    "recommendations": [
+        "nvidia.nsight-vscode-edition",
+        "ms-vscode.cpptools",
+        "ms-vscode.makefile-tools"
+    ]
+}
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -0,0 +1,36 @@
+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "type": "cmake",
+            "request": "launch",
+            "name": "CMake: Configure project",
+            "cmakeDebugType": "configure",
+            "clean": false,
+            "configureAll": false
+        },
+        {
+            "name": "CUDA C++: Launch",
+            "preLaunchTask": "CMake: build",
+            "type": "cuda-gdb",
+            "request": "launch",
+            "program": "${command:cmake.launchTargetPath}",
+            "logFile": "${workspaceFolder}/log.txt",
+            "cwd": "${workspaceFolder}",
+            "environment": [
+                {
+                    "name": "XDG_SESSION_TYPE",
+                    "value": "x11"
+                },
+                {
+                    "name": "WAYLAND_DISPLAY",
+                    "value": "''"
+                },
+                {
+                    "name": "__GL_SYNC_TO_VBLANK",
+                    "value": "0"
+                }
+            ]
+        }
+    ]
+}
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,73 @@
+{
+    "files.associations": {
+        "cpp": "cuda-cpp",
+        "hpp": "cuda-cpp",
+        "array": "cpp",
+        "atomic": "cpp",
+        "bit": "cpp",
+        "cctype": "cpp",
+        "charconv": "cpp",
+        "chrono": "cpp",
+        "clocale": "cpp",
+        "cmath": "cpp",
+        "compare": "cpp",
+        "concepts": "cpp",
+        "condition_variable": "cpp",
+        "cstdarg": "cpp",
+        "cstddef": "cpp",
+        "cstdint": "cpp",
+        "cstdio": "cpp",
+        "cstdlib": "cpp",
+        "cstring": "cpp",
+        "ctime": "cpp",
+        "cwchar": "cpp",
+        "cwctype": "cpp",
+        "deque": "cpp",
+        "map": "cpp",
+        "set": "cpp",
+        "string": "cpp",
+        "unordered_map": "cpp",
+        "vector": "cpp",
+        "exception": "cpp",
+        "expected": "cpp",
+        "algorithm": "cpp",
+        "functional": "cpp",
+        "iterator": "cpp",
+        "memory": "cpp",
+        "memory_resource": "cpp",
+        "numeric": "cpp",
+        "optional": "cpp",
+        "random": "cpp",
+        "ratio": "cpp",
+        "string_view": "cpp",
+        "system_error": "cpp",
+        "tuple": "cpp",
+        "type_traits": "cpp",
+        "utility": "cpp",
+        "format": "cpp",
+        "fstream": "cpp",
+        "initializer_list": "cpp",
+        "iomanip": "cpp",
+        "iosfwd": "cpp",
+        "iostream": "cpp",
+        "istream": "cpp",
+        "limits": "cpp",
+        "mutex": "cpp",
+        "new": "cpp",
+        "numbers": "cpp",
+        "ostream": "cpp",
+        "queue": "cpp",
+        "ranges": "cpp",
+        "semaphore": "cpp",
+        "span": "cpp",
+        "sstream": "cpp",
+        "stdexcept": "cpp",
+        "stop_token": "cpp",
+        "streambuf": "cpp",
+        "text_encoding": "cpp",
+        "thread": "cpp",
+        "cinttypes": "cpp",
+        "typeinfo": "cpp",
+        "variant": "cpp"
+    }
+}
diff --git a/.vscode/tasks.json b/.vscode/tasks.json
@@ -0,0 +1,16 @@
+{
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "type": "cmake",
+            "label": "CMake: build",
+            "command": "build",
+            "targets": [
+                "${command:cmake.buildTargetName}"
+            ],
+            "group": "build",
+            "problemMatcher": ["$nvcc"],
+            "detail": "CMake template build task"
+        }
+    ]
+}
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,5 +1,16 @@
 cmake_minimum_required(VERSION 3.18)
-project(cis5650_boids LANGUAGES CUDA CXX)
+
+project(cis5650_boids LANGUAGES CXX)
+
+# CUDA is enabled separately, after the MSVC-only nvcc flag below has been set.
+# The flag is appended only when it is not already present: CMAKE_CUDA_FLAGS is
+# written back to the cache with FORCE, so an unconditional append would add
+# one more copy of the flag on every re-configure.
+if (MSVC AND NOT "${CMAKE_CUDA_FLAGS}" MATCHES "--allow-unsupported-compiler")
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --allow-unsupported-compiler" CACHE STRING "Allow unsupported compiler" FORCE)
+endif()
+
+enable_language(CUDA)
 
 set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 

diff --git a/README.md b/README.md
@@ -1,11 +1,125 @@
-**University of Pennsylvania, CIS 5650: GPU Programming and Architecture,
-Project 1 - Flocking**
+# Boid Flocking in CUDA
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+![boid thumbnail](./images/boid_thumbnail.gif)
 
-### (TODO: Your README)
+University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 1
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+- Amy Liu
+  - [Personal Website](https://amyliu.dev), [LinkedIn](https://linkedin.com/in/miyalana), [GitHub](https://github.com/mialana)
+- Tested on: Fedora 42 KDE Plasma, Wayland Protocol, Optimus GPU (Intel(R) Core(TM) Ultra 9 275HX 32GiB, NVIDIA GeForce RTX 5070Ti 12227MiB)
+
+## Overview
+
+This project implements a real-time 3D flocking simulation in CUDA C++ based on [Craig Reynolds’ 1987 Boids algorithm](https://team.inria.fr/imagine/files/2014/10/flocks-hers-and-schools.pdf).
+
+To summarize it, the Boids algorithm is governed by three simple behavioral rules: cohesion (move toward the average position of neighbors), separation (avoid crowding too closely), and alignment (match the velocity of nearby boids).
+
+The codebase progresses through three implementations of increasing efficiency: a naïve all-pairs neighbor search, a uniform grid spatial partitioning approach to reduce unnecessary comparisons, and an optimized coherent grid variant that improves memory access locality. 
+
+Alongside implementing these kernels, the project emphasizes GPU performance analysis by measuring and comparing execution times, evaluating scalability with varying boid counts, and exploring how block sizes and grid structures affect efficiency.
+
+This project was completed for **CIS 5650**, a master’s-level course in GPU programming at the University of Pennsylvania.
+
+Notably, the final optimized implementation reached a framerate of **1800 FPS** with **100,000 particles** simulated every frame.
+
+## Demo Media
+
+Simulations are handled using a combination of GLFW, GLEW, and OpenGL.
+
+**10,000 Boids Simulation**
+![10000 boids](images/10000_boids.gif)
+
+Here, observe the "flocking" behavior in smaller clusters of boids.
+
+**100,000 Boids Simulation**
+![100000 boids](images/100000_boids.gif)
+
+Note that colors are mapped to particle velocity.
+
+**1 Million Boids Simulation**
+![1mil boids](images/1mil_boids.gif)
+
+Though unrealistic, the simulation can efficiently handle upwards of 1 million particles.
+
+## Data Collection
+
+Data was collected to understand the effect of different variables on the framerate of the simulation, where higher frames per second (FPS) signifies better performance.
+
+**Graph 1: Framerate vs Number of Boids (With Visualization)**
+![img](images/framerate_to_boids_with_visuals.png)
+
+Note, I included data from my first attempt at implementing the naive / brute-force calculation of boid behavior. In this attempt, I used a mask-based approach to a branching situation, as we studied that branching can be inefficient on the GPU. 
+
+```python
+for each neighbor j:
+    d = distance(pos[i], pos[j])
+    mask1 = (d < rule1MaxDist) ? 1 : 0
+    mask2 = (d < rule2MaxDist) ? 1 : 0
+    mask3 = (d < rule3MaxDist) ? 1 : 0
+    cohesion   += mask1 * rule1Factor
+    separation += mask2 * rule2Factor
+    alignment  += mask3 * rule3Factor
+```
+
+However, this implementation actually led to less than optimal results, as shown in the graph data. This first draft implementation is located [here](https://github.com/CIS5650-Fall-2025/Project1-CUDA-Flocking/blob/1d94c9d44171d3223b58e458969b70508d2585a0/src/kernel.cu#L259C1-L304C2) for perusal.
+
+Switching to a more standard `if-then` format showed an incredible performance improvement.
+
+```python
+for each neighbor j:
+    d = distance(pos[i], pos[j])
+    if d < rule1MaxDist:
+        cohesion   += rule1Factor
+    if d < rule2MaxDist:
+        separation += rule2Factor
+    if d < rule3MaxDist:
+        alignment  += rule3Factor
+```
+
+Looking into it, I learned that in many cases the compiler can turn simple `if` bodies like these into predicated instructions, so they run without real divergence even though they are written as "branches". In future coding, I will keep an eye out for this slight gotcha.
+
+---
+
+Furthermore, toggling off visualization in the program can eliminate GL-related operations. This effectively allows us to isolate and measure the efficiency of our CUDA implementation.
+
+**Graph 2: Framerate vs Number of Boids (No Visualization)**
+![img](images/framerate_to_boids_no_visuals.png)
+
+I was also introduced to the notion of "Blocks" in CUDA for this project. Briefly, a block is a group of threads that execute together within a program. Threads within a block can cooperate by sharing data through shared memory and synchronizing their execution. Blocks are in turn grouped into a "Grid", but the blocks within a grid have no ability to cooperate or communicate with each other (share memory, exchange data, synchronize execution, etc.).
+
+In this graph, I measure how choosing different block sizes will affect framerate in execution. From my observation, the performance improvement is logarithmic in nature.
+
+**Graph 3: Framerate vs CUDA block size (No Visualization)**
+![img](images/framerate_to_blocksize.png)
+
+## Other Performance Analysis
+
+#### For each implementation, how does changing the number of boids affect performance? Why?
+- Naive (all-pairs): Falls off fast as boids increase. This is expected given the very large NxN comparison overhead.
+
+- Uniform grid (scattered): Big improvement over naive, then flattens out. This is because we effectively cut the search space, but indirection (needing particleArrayIndices) hurts performance still.
+
+- Uniform grid (coherent): Best. Reordering makes neighbors mostly contiguous in memory, so reads are coalesced even as N grows. You can see the jump at 100K boids in the plots, and performance remains the best of the three even at 1 million boids.
+
+#### For each implementation, how do block count and block size affect performance? Why?
+Bigger blocks help up to around 256–512 threads per block, then it levels off.
+I believe initial improvement comes from the impact of higher occupancy and better memory coalescing, but after that, register/memory pressure and scheduling overhead cap that improvement.
+
+#### For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this expected? Why or why not?
+
+Yes—generally faster than scattered, especially at higher boid counts. This was expected: by reshuffling pos/vel into cell order, neighbor loops turn into mostly sequential reads, where GPUs specialize. The O(N) reshuffle cost gets amortized. There’s a small-N case where the reshuffle overhead can briefly outweigh the gains.
+
+#### Did changing cell width and checking 27 vs 8 neighboring cells affect performance? Why or why not?
+
+It mattered more in the scattered version and was roughly neutral in the coherent version.
+In scattered, more cells mean more cache-unfriendly indirect reads, so overhead adds up. In coherent, those neighbors live contiguously, so even with more cells, accesses are sequential and cheap. It’s the access pattern—not the raw number of cells—that really drives performance.
+
+## Build Instructions
+
+Prerequisites:
+- NVIDIA GPU
+- NVIDIA Driver (r580+)
+- CUDA toolkit (13.0+)
+- CMake
+
+See `run_linux.sh` for a simple template on building and running the program.
diff --git a/images/100000_boids.gif b/images/100000_boids.gif
diff --git a/images/10000_boids.gif b/images/10000_boids.gif
diff --git a/images/1mil_boids.gif b/images/1mil_boids.gif
diff --git a/images/boid_thumbnail.gif b/images/boid_thumbnail.gif
diff --git a/images/framerate_to_blocksize.png b/images/framerate_to_blocksize.png
diff --git a/images/framerate_to_boids_no_visuals.png b/images/framerate_to_boids_no_visuals.png
diff --git a/images/framerate_to_boids_with_visuals.png b/images/framerate_to_boids_with_visuals.png
diff --git a/images/framerates_to_boids_no_visuals.png b/images/framerates_to_boids_no_visuals.png
diff --git a/run_linux.sh b/run_linux.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+#
+# Optionally rebuild the project from scratch (Release), then launch the
+# simulation with the environment set to select X11 and turn off sync-to-vblank.
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+BUILD_DIR="$SCRIPT_DIR/build"
+
+# Remove any previous build tree, then configure and build a fresh one.
+# Returns non-zero as soon as one step fails so a broken build is never run.
+rebuild() {
+    if [ -e "$BUILD_DIR" ]; then
+        # Prefer `trash` (recoverable) when it is installed; otherwise delete.
+        if command -v trash > /dev/null 2>&1; then
+            trash "$BUILD_DIR" || return 1
+        else
+            rm -rf -- "$BUILD_DIR" || return 1
+        fi
+    fi
+
+    # -S/-B keep cmake pointed at this project whatever the caller's working
+    # directory is, so no cd (and no cd back afterwards) is needed.
+    cmake -S "$SCRIPT_DIR" -B "$BUILD_DIR" -DCMAKE_BUILD_TYPE=Release || return 1
+    cmake --build "$BUILD_DIR" -j "$(nproc --all)" || return 1
+}
+
+while true; do
+    # -r keeps backslashes literal; stop on EOF instead of looping forever.
+    read -r -p "Do you want to re-build? (y/n): " yn || exit 1
+    case $yn in
+        [Yy]* )
+        rebuild || exit 1
+        break;;
+        [Nn]* )
+        break;;
+        * ) echo "Invalid input. Please answer 'y' or 'n'.";;
+    esac
+done
+
+__GL_SYNC_TO_VBLANK=0 XDG_SESSION_TYPE=x11 WAYLAND_DISPLAY="" "$SCRIPT_DIR/build/bin/cis5650_boids"
diff --git a/scripts/graph1.py b/scripts/graph1.py
@@ -0,0 +1,42 @@
+"""Plot Graph 1: framerate vs. number of boids, with visualization enabled."""
+import matplotlib.pyplot as plt
+import numpy as np
+
+# Boid counts sampled on the x-axis.
+# NOTE(review): the last entry is 10,000,000 while the README discusses 1 million boids; confirm which was measured.
+boids = np.array([500, 5000, 100000, 10000000])
+
+# One row per implementation: (legend label, FPS per boid count,
+# marker/line format, (dx, dy) offset of the value labels in data units).
+series = [
+    ("Coherent Grid", np.array([295.15, 465.123, 1241.273, 901.641]), 'x--', (0, -60)),
+    ("Scattered Grid", np.array([151.12, 490.932, 640.674, 445.891]), 'd--', (0, 30)),
+    ("Naive (Branching)", np.array([41.253, 69.273, 111.93, 39.273]), 's--', (0, 45)),
+    ("Naive (Mask-based)", np.array([10.192, 4.293, 0.2, 0.2]), 'o--', (0, -60)),
+]
+
+plt.figure(figsize=(12, 7))
+
+# Draw every curve first and keep its handle so the labels below can reuse its colour.
+drawn = []
+for name, fps, fmt, _ in series:
+    curve, = plt.plot(boids, fps, fmt, label=name)
+    drawn.append(curve)
+
+# Print each sample's FPS next to its point, shifted to reduce overlap.
+for curve, (_, fps, _, offset) in zip(drawn, series):
+    shift_x, shift_y = offset
+    for count, rate in zip(boids, fps):
+        plt.text(count + shift_x, rate + shift_y, f"{rate:.2f}",
+                 fontsize=10, ha='center', color=curve.get_color(), fontweight='bold')
+
+plt.xscale("log")
+plt.xticks(boids, [str(count) for count in boids])
+plt.ylim(-100, 2000)
+
+plt.xlabel("Number of Boids (log-based x-scale)")
+plt.ylabel("Framerate (FPS)")
+plt.title("Framerate vs Number of Boids (With Visualization)", fontsize=18, fontweight='bold')
+plt.legend()
+plt.grid(True, linestyle="--", linewidth=0.5)
+plt.show()
-Original file line number
+Diff line change
@@ Expand Up / @@ -7,6 +7,8 @@ cis5650_boids_generated_kernel* @@
     *.xcodeproj
     build
+    log.txt
     # Created by https://www.gitignore.io/api/linux,osx,sublimetext,windows,jetbrains,vim,emacs,cmake,c++,cuda,visualstudio,webstorm,eclipse,xcode
     ### Linux ###
@@ Expand Down @@