Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ cis5650_boids_generated_kernel*
*.xcodeproj
build

log.txt

# Created by https://www.gitignore.io/api/linux,osx,sublimetext,windows,jetbrains,vim,emacs,cmake,c++,cuda,visualstudio,webstorm,eclipse,xcode

### Linux ###
Expand Down
27 changes: 27 additions & 0 deletions .vscode/c_cpp_properties.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"configurations": [
{
"name": "Win32",
"includePath": [
"${workspaceFolder}/src/**",
"${workspaceFolder}/external/**/**"
],
"defines": [ ],
"compilerPath": "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.1/bin/nvcc.exe"
},
{
"name": "Linux",
"includePath": [
"${workspaceFolder}/src/**",
"${workspaceFolder}/external/**/**"
],
"defines": [ ],
"compilerPath": "/usr/local/cuda/bin/nvcc",
"cStandard": "gnu17",
"cppStandard": "gnu++17",
"intelliSenseMode": "linux-gcc-x64",
"configurationProvider": "ms-vscode.makefile-tools"
}
],
"version": 4
}
7 changes: 7 additions & 0 deletions .vscode/extension.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"recommendations": [
"nvidia.nsight-vscode-edition",
"ms-vscode.cpptools",
"ms-vscode.makefile-tools",
"ms-vscode.cmake-tools"
]
}
36 changes: 36 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"version": "0.2.0",
"configurations": [
{
"type": "cmake",
"request": "launch",
"name": "CMake: Configure project",
"cmakeDebugType": "configure",
"clean": false,
"configureAll": false
},
{
"name": "CUDA C++: Launch",
"preLaunchTask": "CMake: build",
"type": "cuda-gdb",
"request": "launch",
"program": "${command:cmake.launchTargetPath}",
"logFile": "${workspaceFolder}/log.txt",
"cwd": "${workspaceFolder}",
"environment": [
{
"name": "XDG_SESSION_TYPE",
"value": "x11"
},
{
"name": "WAYLAND_DISPLAY",
"value": ""
},
{
"name": "__GL_SYNC_TO_VBLANK",
"value": "0"
}
]
}
]
}
73 changes: 73 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
{
"files.associations": {
"*.cpp": "cuda-cpp",
"*.hpp": "cuda-cpp",
"array": "cpp",
"atomic": "cpp",
"bit": "cpp",
"cctype": "cpp",
"charconv": "cpp",
"chrono": "cpp",
"clocale": "cpp",
"cmath": "cpp",
"compare": "cpp",
"concepts": "cpp",
"condition_variable": "cpp",
"cstdarg": "cpp",
"cstddef": "cpp",
"cstdint": "cpp",
"cstdio": "cpp",
"cstdlib": "cpp",
"cstring": "cpp",
"ctime": "cpp",
"cwchar": "cpp",
"cwctype": "cpp",
"deque": "cpp",
"map": "cpp",
"set": "cpp",
"string": "cpp",
"unordered_map": "cpp",
"vector": "cpp",
"exception": "cpp",
"expected": "cpp",
"algorithm": "cpp",
"functional": "cpp",
"iterator": "cpp",
"memory": "cpp",
"memory_resource": "cpp",
"numeric": "cpp",
"optional": "cpp",
"random": "cpp",
"ratio": "cpp",
"string_view": "cpp",
"system_error": "cpp",
"tuple": "cpp",
"type_traits": "cpp",
"utility": "cpp",
"format": "cpp",
"fstream": "cpp",
"initializer_list": "cpp",
"iomanip": "cpp",
"iosfwd": "cpp",
"iostream": "cpp",
"istream": "cpp",
"limits": "cpp",
"mutex": "cpp",
"new": "cpp",
"numbers": "cpp",
"ostream": "cpp",
"queue": "cpp",
"ranges": "cpp",
"semaphore": "cpp",
"span": "cpp",
"sstream": "cpp",
"stdexcept": "cpp",
"stop_token": "cpp",
"streambuf": "cpp",
"text_encoding": "cpp",
"thread": "cpp",
"cinttypes": "cpp",
"typeinfo": "cpp",
"variant": "cpp"
}
}
16 changes: 16 additions & 0 deletions .vscode/tasks.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"version": "2.0.0",
"tasks": [
{
"type": "cmake",
"label": "CMake: build",
"command": "build",
"targets": [
"${command:cmake.buildTargetName}"
],
"group": "build",
"problemMatcher": ["$nvcc"],
"detail": "CMake template build task"
}
]
}
9 changes: 8 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
cmake_minimum_required(VERSION 3.18)

# Declare the project exactly once, with C++ only. CUDA is enabled separately
# below so that the CUDA flags can be prepared before the CUDA compiler is
# set up by enable_language().
project(cis5650_boids LANGUAGES CXX)

# When building with MSVC, add nvcc's --allow-unsupported-compiler flag to the
# cached CUDA flags before the language is enabled.
# The flag is appended only if it is not already present: the cache entry is
# rewritten with FORCE, so an unconditional append would add one more copy of
# the flag on every re-configure (and change the compile flags each time).
if(MSVC)
  string(FIND "${CMAKE_CUDA_FLAGS}" "--allow-unsupported-compiler" allow_unsupported_pos)
  if(allow_unsupported_pos EQUAL -1)
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --allow-unsupported-compiler" CACHE STRING "Allow unsupported compiler" FORCE)
  endif()
endif()

enable_language(CUDA)

# Let IDE generators group targets into folders.
set_property(GLOBAL PROPERTY USE_FOLDERS ON)

Expand Down
130 changes: 122 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,125 @@
**University of Pennsylvania, CIS 5650: GPU Programming and Architecture,
Project 1 - Flocking**
# Boid Flocking in CUDA

* (TODO) YOUR NAME HERE
* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
![boid thumbnail](./images/boid_thumbnail.gif)

### (TODO: Your README)
University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 1

Include screenshots, analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
- Amy Liu
- [Personal Website](https://amyliu.dev), [LinkedIn](https://linkedin.com/in/miyalana), [GitHub](https://github.com/mialana)
- Tested on: Fedora 42 KDE Plasma, Wayland Protocol, Optimus GPU (Intel(R) Core(TM) Ultra 9 275HX 32GiB, NVIDIA GeForce RTX 5070Ti 12227MiB)

## Overview

This project implements a real-time 3D flocking simulation in CUDA C++ based on [Craig Reynolds’ 1987 Boids algorithm](https://team.inria.fr/imagine/files/2014/10/flocks-hers-and-schools.pdf).

To summarize it, the Boids algorithm is governed by three simple behavioral rules: cohesion (move toward the average position of neighbors), separation (avoid crowding too closely), and alignment (match the velocity of nearby boids).

The codebase progresses through three implementations of increasing efficiency: a naïve all-pairs neighbor search, a uniform grid spatial partitioning approach to reduce unnecessary comparisons, and an optimized coherent grid variant that improves memory access locality.

Alongside implementing these kernels, the project emphasizes GPU performance analysis by measuring and comparing execution times, evaluating scalability with varying boid counts, and exploring how block sizes and grid structures affect efficiency.

This project was completed for **CIS 5650**, a master’s-level course in GPU programming at the University of Pennsylvania.

Notably, the final optimized implementation achieved **1800 fps** runtime with **100000 particles** simulated every frame.

## Demo Media

Simulations are handled using a combination of GLFW, GLEW, and OpenGL.

**10,000 Boids Simulation**
![10000 boids](images/10000_boids.gif)

Here, observe the "flocking" behavior in smaller clusters of boids.

**100,000 Boids Simulation**
![100000 boids](images/100000_boids.gif)

Note that colors are mapped to particle velocity.

**1 Million Boids Simulation**
![1mil boids](images/1mil_boids.gif)

Though unrealistic, the simulation can efficiently handle upwards of 1 million particles.

## Data Collection

Data was collected to understand the effect of different variables on the framerate of the simulation, where higher frames per second (FPS) signifies better performance.

**Graph 1: Framerate vs Number of Boids (With Visualization)**
![img](images/framerate_to_boids_with_visuals.png)

Note, I included data from my first attempt at implementing the naive / brute-force calculation of boid behavior. In this attempt, I used a mask-based approach to a branching situation, as we studied that branching can be inefficient on the GPU.

```python
for each neighbor j:
d = distance(pos[i], pos[j])
mask1 = (d < rule1MaxDist) ? 1 : 0
mask2 = (d < rule2MaxDist) ? 1 : 0
mask3 = (d < rule3MaxDist) ? 1 : 0
cohesion += mask1 * rule1Factor
separation += mask2 * rule2Factor
alignment += mask3 * rule3Factor
```

However, this implementation actually led to less than optimal results, as shown in the graph data. This first draft implementation is located [here](https://github.com/CIS5650-Fall-2025/Project1-CUDA-Flocking/blob/1d94c9d44171d3223b58e458969b70508d2585a0/src/kernel.cu#L259C1-L304C2) for perusal.

Switching to a more standard `if-then` format showed an incredible performance improvement.

```python
for each neighbor j:
d = distance(pos[i], pos[j])
if d < rule1MaxDist:
cohesion += rule1Factor
if d < rule2MaxDist:
separation += rule2Factor
if d < rule3MaxDist:
alignment += rule3Factor
```

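Looking into it, I learned that the compiler can often turn short `if` bodies into predicated instructions instead of real jumps, so they cost far less than their classification as "branching" suggests. The mask-based version, on the other hand, always performs the extra multiplies and adds for every neighbor. In future coding, I will keep an eye out for this slight gotcha.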

---

Furthermore, toggling off visualization in the program can eliminate GL-related operations. This effectively allows us to isolate and measure the efficiency of our CUDA implementation.

**Graph 2: Framerate vs Number of Boids (No Visualization)**
![img](images/framerate_to_boids_no_visuals.png)

I was also introduced to the notion of "Blocks" in CUDA for this project. Briefly, a block is a group of threads that execute together within a kernel launch. Threads within a block can cooperate by sharing data through shared memory and synchronizing their execution. Blocks are in turn grouped into a "Grid", but blocks within a grid have no ability to cooperate or communicate with each other (share memory, exchange data, synchronize execution, etc).

In this graph, I measure how choosing different block sizes will affect framerate in execution. From my observation, the performance improvement is logarithmic in nature.

**Graph 3: Framerate vs CUDA block size (No Visualization)**
![img](images/framerate_to_blocksize.png)

## Other Performance Analysis

#### For each implementation, how does changing the number of boids affect performance? Why?
- Naive (all-pairs): Falls off fast as boids increase. Expected, with the very large NxN comparison overhead.

- Uniform grid (scattered): Big improvement over naive, then flattens out. This is because we effectively cut the search space, but indirection (needing particleArrayIndices) hurts performance still.

- Uniform grid (coherent): Best. Reordering makes neighbors mostly contiguous in memory, so reads are coalesced even as N grows. You can see the jump at 100K boids in the plots, and performance remains strong even at 1 million boids.

#### For each implementation, how do block count and block size affect performance? Why?
Bigger blocks help up to around 256–512 threads per block, then it levels off.
I believe initial improvement comes from the impact of higher occupancy and better memory coalescing, but after that, register/memory pressure and scheduling overhead cap that improvement.

#### For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this expected? Why or why not?

Yes—generally faster than scattered, especially at higher boid counts. This was expected: by reshuffling pos/vel into cell order, neighbor loops turn into mostly sequential reads, which is the access pattern GPUs handle best. The O(N) reshuffle cost gets amortized. There’s a small-N case where the reshuffle overhead can briefly outweigh the gains.

#### Did changing cell width and checking 27 vs 8 neighboring cells affect performance? Why or why not?

It mattered more in the scattered version and was roughly neutral in the coherent version.
In scattered, more cells mean more cache-unfriendly indirect reads, so overhead adds up. In coherent, those neighbors live contiguously, so even with more cells, accesses are sequential and cheap. It’s the access pattern—not the raw number of cells—that really drives performance.

## Build Instructions

Prerequisites:
- NVIDIA GPU
- NVIDIA Driver (r580+)
- CUDA toolkit (13.0+)
- CMake

See `run_linux.sh` for a simple template on building and running the program.
Binary file added images/100000_boids.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/10000_boids.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/1mil_boids.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/boid_thumbnail.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/framerate_to_blocksize.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/framerate_to_boids_no_visuals.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/framerate_to_boids_with_visuals.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/framerates_to_boids_no_visuals.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
25 changes: 25 additions & 0 deletions run_linux.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/usr/bin/env bash
#
# Optionally rebuild the project from scratch in Release mode, then launch the
# simulation binary from the caller's current working directory.
#
# The script never changes directory: CMake is driven with explicit source and
# build paths, so a failed step cannot leave the shell (or a later command) in
# the wrong directory.

# Absolute path of the directory containing this script (the project root).
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
if [ -z "$SCRIPT_DIR" ]; then
    echo "Could not determine the script directory." >&2
    exit 1
fi

BUILD_DIR="$SCRIPT_DIR/build"

while true; do
    # Stop on EOF / closed stdin instead of re-prompting forever.
    read -r -p "Do you want to re-build? (y/n): " yn || exit 1
    case $yn in
        [Yy]* )
            # Discard any previous build tree. Prefer `trash` (recoverable)
            # when it is installed; otherwise fall back to a plain delete.
            if [ -e "$BUILD_DIR" ]; then
                if command -v trash > /dev/null 2>&1; then
                    trash "$BUILD_DIR" || exit 1
                else
                    rm -rf -- "$BUILD_DIR" || exit 1
                fi
            fi

            # Configure and build; abort on failure so a stale or missing
            # binary is never launched below.
            cmake -S "$SCRIPT_DIR" -B "$BUILD_DIR" -DCMAKE_BUILD_TYPE=Release || exit 1
            cmake --build "$BUILD_DIR" -j "$(nproc --all)" || exit 1
            break;;
        [Nn]* )
            break;;
        * ) echo "Invalid input. Please answer 'y' or 'n'.";;
    esac
done

# Environment overrides for the launched program: vsync off, and session
# variables set so the program sees an X11 session rather than Wayland.
__GL_SYNC_TO_VBLANK=0 XDG_SESSION_TYPE=x11 WAYLAND_DISPLAY="" "$SCRIPT_DIR/build/bin/cis5650_boids"
45 changes: 45 additions & 0 deletions scripts/graph1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import matplotlib.pyplot as plt
import numpy as np

# Boid counts sampled along the x-axis.
# NOTE(review): the last sample is 10,000,000, while the README discusses a
# maximum of 1 million boids -- confirm this value is intended.
boids = np.array([500, 5000, 100000, 10000000])

# One row per implementation:
#   (legend label, plot format string, FPS measured at each boid count,
#    (dx, dy) offset in data units applied to that series' value labels)
series = [
    ("Coherent Grid", 'x--', np.array([295.15, 465.123, 1241.273, 901.641]), (0, -60)),
    ("Scattered Grid", 'd--', np.array([151.12, 490.932, 640.674, 445.891]), (0, 30)),
    ("Naive (Branching)", 's--', np.array([41.253, 69.273, 111.93, 39.273]), (0, 45)),
    ("Naive (Mask-based)", 'o--', np.array([10.192, 4.293, 0.2, 0.2]), (0, -60)),
]

plt.figure(figsize=(12,7))

# Draw every curve first, keeping each line handle so its color can be reused
# for the matching value labels.
handles = [
    plt.plot(boids, fps, fmt, label=name)[0]
    for name, fmt, fps, _ in series
]

# Write the FPS value next to each data point, shifted by the per-series
# offset to reduce overlap and tinted with the color of its curve.
for handle, (_, _, fps, (dx, dy)) in zip(handles, series):
    tint = handle.get_color()
    for count, rate in zip(boids, fps):
        plt.text(count+dx, rate+dy, f"{rate:.2f}", fontsize=10, ha='center', color=tint, fontweight='bold')

# Logarithmic x-axis with a tick at exactly each sampled boid count.
plt.xscale("log")
plt.xticks(boids, [str(count) for count in boids])

# Headroom below zero and above the data so offset labels stay inside the axes.
plt.ylim(-100, 2000)

plt.xlabel("Number of Boids (log-based x-scale)")
plt.ylabel("Framerate (FPS)")
plt.title("Framerate vs Number of Boids (With Visualization)", fontsize=18, fontweight='bold')
plt.legend()
plt.grid(True, linestyle="--", linewidth=0.5)
plt.show()
Loading