Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 27 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,35 @@ elseif(CMAKE_VERSION VERSION_LESS "3.24.0")
else()
set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES native)
endif()
# CUDA-only compiler flags.
#
# Every generator expression is quoted so that CMake passes it as ONE argument:
# an unquoted expression containing whitespace is split into several arguments
# before it is evaluated. Multiple flags inside a single expression are
# therefore separated with ";" (a CMake list) instead of spaces.
target_compile_options(${CMAKE_PROJECT_NAME} PRIVATE
  # Needed for the lambdas and constexpr functions used in __device__ code.
  "$<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda;--expt-relaxed-constexpr>"
  # Pass -G to the CUDA compiler in Debug and RelWithDebInfo configurations
  # only. $<OR:...> is used rather than the multi-value $<CONFIG:a,b> form so
  # that CMake releases older than 3.19 can still evaluate it.
  "$<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<OR:$<CONFIG:Debug>,$<CONFIG:RelWithDebInfo>>>:-G>"
)
# Make the project's target the startup project of the generated Visual Studio
# solution.
set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT ${CMAKE_PROJECT_NAME})

# ---------------------------------------------------------------------------
# Compile-time simulation settings.
#
# Each entry lives in the CMake cache, so it can be overridden on the command
# line with -D<NAME>=<value>. The values are forwarded to the sources as
# preprocessor definitions further below.
# ---------------------------------------------------------------------------

# Simulation size and neighbor-search strategy.
set(N_FOR_VIS 5000 CACHE STRING "Number of boids to simulate")
option(VISUALIZE "Turn on OpenGL visualization" ON)
option(FINE_GRAINED_CELLS "Use fine-grained cells with 27 neighbors instead of 8" ON)
option(UNIFORM_GRID "Use uniform grids for efficient neighbor checks" OFF)
option(COHERENT_GRID "Use semi-coherent memory access for uniform grids" OFF)
set(CUDA_BLOCK_SIZE 128 CACHE STRING "CUDA block size")

# Timed mode used for programmatic frame-rate measurements.
option(FPS_MEASURE "Enable frame rate measurements" OFF)
set(FPS_MEASURE_START 2 CACHE STRING "Seconds after start to begin frame rate measurements")
set(FPS_MEASURE_DURATION 20 CACHE STRING "Number of seconds to measure frame rates for")

# Boolean options are normalized with $<BOOL:...>, which evaluates to exactly
# 0 or 1, so the sources can test them with plain `#if NAME`.
target_compile_definitions(${CMAKE_PROJECT_NAME} PRIVATE
  N_FOR_VIS=${N_FOR_VIS}
  VISUALIZE=$<BOOL:${VISUALIZE}>
  FINE_GRAINED_CELLS=$<BOOL:${FINE_GRAINED_CELLS}>
  UNIFORM_GRID=$<BOOL:${UNIFORM_GRID}>
  COHERENT_GRID=$<BOOL:${COHERENT_GRID}>
  CUDA_BLOCK_SIZE=${CUDA_BLOCK_SIZE}
  FPS_MEASURE=$<BOOL:${FPS_MEASURE}>
  FPS_MEASURE_START=${FPS_MEASURE_START}
  FPS_MEASURE_DURATION=${FPS_MEASURE_DURATION}
)

add_custom_command(
TARGET ${CMAKE_PROJECT_NAME}
PRE_BUILD
Expand Down
84 changes: 76 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,79 @@
**University of Pennsylvania, CIS 5650: GPU Programming and Architecture,
Project 1 - Flocking**
# University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 1 - Flocking

* (TODO) YOUR NAME HERE
* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
- Yunhao Qian
- [LinkedIn](https://www.linkedin.com/in/yunhao-qian-026980170)
- [GitHub](https://github.com/yunhao-qian)
- Tested on my personal computer:
- OS: Windows 11, 24H2
- CPU: 13th Gen Intel(R) Core(TM) i7-13700 (2.10 GHz)
- GPU: NVIDIA GeForce RTX 4090
- RAM: 32.0 GB

### (TODO: Your README)
## Boids in Action

Include screenshots, analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
### Screenshot

![Screenshot of boids](images/Screenshot%20of%20boids.png)

The screenshot above is from an experiment with 100000 boids, 1x-sized cells, scattered uniform grid, and the block size of 128.

### Recording

![Recording of boids](images/Recording%20of%20boids.gif)

The screen recording above is from an experiment with 320000 boids, 1x-sized cells, scattered uniform grid, and the block size of 128.

## Changes to `CMakeLists.txt`

- To share code among different configurations, I used lambda functions and `constexpr` in CUDA `__device__` code. To enable these features, I turned on the `--extended-lambda` and `--expt-relaxed-constexpr` flags using `target_compile_options()`.
- To measure frame rates programmatically, all compile-time configurations are controlled by CMake definitions. These include `N_FOR_VIS`, `VISUALIZE`, `FINE_GRAINED_CELLS`, `UNIFORM_GRID`, `COHERENT_GRID`, and `CUDA_BLOCK_SIZE`.
- The modified program has a special timed mode for measuring frame rates. To support this, CMake options are added, including `FPS_MEASURE`, `FPS_MEASURE_START`, and `FPS_MEASURE_DURATION`.

## Performance Analysis

### Methodology

All experiments in this section are launched programmatically using [`measure_fps.py`](./scripts/measure_fps.py), which re-compiles the CMake project using specific arguments, runs the program, and captures the average frame rate (in frames per second, or FPS) from stdout. The measurement of each experiment starts 2 seconds after the program's launch and lasts for 20 seconds. After that, the program exits automatically. To avoid crashing the computer, more resource-consuming configurations are skipped once the frame rate drops below 1 FPS or a single build-and-run cycle takes over 100 seconds. The raw numbers behind the following plots are omitted here for conciseness; please refer to [`measurements.json`](./scripts/measurements.json) for them.

### Number of Boids

![Frame rate vs. number of boids](images/Frame%20rate%20vs%20number%20of%20boids.png)

All experiments in this set use 1x-sized cells and the block size of 128.

Discussion:

- The naive implementation should be heavily compute-bound, and the time complexity is $O(N^2)$, where $N$ is the number of boids. Experiments confirm this trend in general, as the FPS drops consistently as $N$ increases.
- The scattered & coherent uniform grid implementations do not show a monotonic trend at small $N$'s. The worst performance appears at $N \approx 20000$. This may be caused by thread divergence in warps as not all neighbor cells are occupied by boids.
- The two uniform-grid implementations reach their best performance at $N \approx 100000$. The GPU's compute throughput is probably saturated at this point.
- When $N$ is very large, the time complexity is $O(N^2)$ for all implementations with or without uniform grids, as large $N$'s always increase the number of boids within the effective distance. This is confirmed by the linear trends in the log-log plot.

### OpenGL Visualization

While visualizing a point cloud in OpenGL has $O(N)$ time complexity, its amount of work is much lighter than boid simulations in general. When $N$ is small, experiments with visualization turned on have slightly lower FPS. When $N$ is large, the difference becomes barely detectable.

### Block Size

![Frame rate vs. block size](images/Frame%20rate%20vs%20block%20size.png)

All experiments in this set use 320000 boids, no visualization, and 1x-sized cells.

Discussion:

- For the naive implementation, FPS improves as the block size increases from 8 to 64. This is because increased computation helps cover the latency of random global memory accesses. There is a small drop at 1024, which is probably due to reaching register limits.
- The scattered uniform grid performs best at very small block sizes from 8 to 32, and then flattens. This is because the execution time is dominated by the latency of random global memory accesses. Increasing the block size further does not address this problem, and reduces the scheduler's ability to distribute blocks evenly among the streaming multiprocessors (SMs).
- The coherent uniform grid has strong gains for 8 to 16, as increased computation covers the latency of semi-sequential global memory accesses. Once the program is compute-bound, increasing the block size further does not bring consistent improvements.

### Coherent Uniform Grids

The coherent uniform grid brings significant performance improvements, and this is the expected outcome. Sorting boid data by cells makes neighbor loops touch contiguous memory. Those data reside in $\leq 9$ pieces of contiguous memory if the cells are 1x-sized, and $\leq 4$ pieces if the cells are 2x-sized. When $N$ is large, the semi-sequential accesses are much faster than thousands of random accesses. When $N$ is small, the overhead of sorting boid data tends to dominate, resulting in the slightly lower FPS of the coherent uniform grid.

### Cell Size

![Frame rate vs. fine-grained cells](images/Frame%20rate%20vs%20fine-grained%20cells.png)

All experiments in this set use no visualization, coherent uniform grids, and the block size of 128.

When $N$ is small, 2x-sized cells should be slightly better, as the smaller number of neighbor cells means lighter looping overhead and better memory coherence. However, this advantage is not obvious in experiments, as it is hard to tell that one configuration is clearly better than the other.

When $N$ is very large, 1x-sized cells should be much better, as more fine-grained cells allow the program to check fewer neighboring boids. If the boid density is $\rho$, and the effective distance is $R$, 1x-sized cells result in $\rho (3 R)^3 = 27 \rho R^3$ neighboring boids, while 2x-sized cells result in $\rho (4 R)^3 = 64 \rho R^3$ neighboring boids. This discrepancy is confirmed by the roughly constant offset in the log-log plot.
Binary file added images/Frame rate vs block size.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/Frame rate vs fine-grained cells.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/Frame rate vs number of boids.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/Recording of boids.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/Screenshot of boids.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
229 changes: 229 additions & 0 deletions scripts/measure_fps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
import json
import re
import shutil
import subprocess
import time
from pathlib import Path
from typing import Any

# Repository root: the parent of the directory that contains this script.
ROOT_DIR = Path(__file__).parent.parent.absolute()
# CMake build tree; build() deletes and regenerates it for every configuration.
BUILD_DIR = ROOT_DIR / "build"
# Release executable that the build is expected to produce.
BOIDS_EXE = BUILD_DIR / "bin" / "Release" / "cis5650_boids.exe"
# Cache of finished measurements, stored next to this script.
MEASUREMENTS_JSON = Path(__file__).parent.absolute() / "measurements.json"


def measure_fps_vs_num_boids() -> None:
    """Measure FPS against the number of boids for all three methods.

    For every method (naive, scattered uniform grid, coherent uniform grid),
    first with visualization on and then off, the boid count is doubled from
    5000 up to 5000 * 2**12. Results are cached in ``MEASUREMENTS_JSON``: a
    configuration that already has an entry is reported instead of re-run, and
    the file is rewritten after every new measurement.

    A sweep over boid counts stops early once an entry took more than 100
    seconds (build included) or reported less than 1 FPS.
    """
    cached_results: list[dict[str, Any]] = []
    if MEASUREMENTS_JSON.exists():
        assert MEASUREMENTS_JSON.is_file()
        with MEASUREMENTS_JSON.open(encoding="utf-8") as file:
            cached_results = json.load(file)

    def find_cached(wanted: dict[str, Any]) -> dict[str, Any] | None:
        """Return the first cached entry that agrees with every item of ``wanted``."""
        return next(
            (
                entry
                for entry in cached_results
                if all(
                    key in entry and entry[key] == value
                    for key, value in wanted.items()
                )
            ),
            None,
        )

    boid_counts = [5000 << exponent for exponent in range(13)]
    methods = {
        "naive": {"UNIFORM_GRID": 0, "COHERENT_GRID": 0},
        "scattered uniform grid": {"UNIFORM_GRID": 1, "COHERENT_GRID": 0},
        "coherent uniform grid": {"UNIFORM_GRID": 1, "COHERENT_GRID": 1},
    }
    for method_name, grid_flags in methods.items():
        print(f"Measuring FPS for method: {method_name}")
        # Visualize first so we can confirm correctness.
        for visualize in (1, 0):
            for num_boids in boid_counts:
                config = {
                    "N_FOR_VIS": num_boids,
                    "VISUALIZE": visualize,
                    "FINE_GRAINED_CELLS": 1,
                    **grid_flags,
                    "CUDA_BLOCK_SIZE": 128,
                }
                entry = find_cached(config)
                if entry is not None:
                    print(f"Found existing measurement for config: {config}")
                else:
                    print(f"Measuring FPS for config: {config}")
                    started_at = time.time()
                    fps = build_and_measure_fps(**config)
                    elapsed = time.time() - started_at
                    entry = {**config, "duration": elapsed, "fps": fps}
                    cached_results.append(entry)
                    # Persist immediately so an interrupted sweep can resume.
                    with MEASUREMENTS_JSON.open("w", encoding="utf-8") as file:
                        json.dump(cached_results, file, indent=4)
                print(f" Duration: {entry['duration']}")
                print(f" FPS: {entry['fps']}")
                # Larger boid counts would only be slower, so abandon the rest
                # of this sweep once a run becomes impractically expensive.
                if entry["duration"] > 100.0:
                    print("Skipping further measurements due to long duration")
                    break
                if entry["fps"] < 1.0:
                    print("Skipping further measurements due to low FPS")
                    break


def measure_fps_vs_block_size() -> None:
    """Measure FPS against the CUDA block size for all three methods.

    For every method (naive, scattered uniform grid, coherent uniform grid)
    the block size is doubled from 8 up to 1024 while the boid count stays at
    320000, visualization is off and fine-grained cells are on. Results are
    cached in ``MEASUREMENTS_JSON``: a configuration that already has an entry
    is reported instead of re-run, and the file is rewritten after every new
    measurement.
    """
    cached_results: list[dict[str, Any]] = []
    if MEASUREMENTS_JSON.exists():
        assert MEASUREMENTS_JSON.is_file()
        with MEASUREMENTS_JSON.open(encoding="utf-8") as file:
            cached_results = json.load(file)

    def find_cached(wanted: dict[str, Any]) -> dict[str, Any] | None:
        """Return the first cached entry that agrees with every item of ``wanted``."""
        return next(
            (
                entry
                for entry in cached_results
                if all(
                    key in entry and entry[key] == value
                    for key, value in wanted.items()
                )
            ),
            None,
        )

    block_sizes = [8 << exponent for exponent in range(8)]
    methods = {
        "naive": {"UNIFORM_GRID": 0, "COHERENT_GRID": 0},
        "scattered uniform grid": {"UNIFORM_GRID": 1, "COHERENT_GRID": 0},
        "coherent uniform grid": {"UNIFORM_GRID": 1, "COHERENT_GRID": 1},
    }
    for method_name, grid_flags in methods.items():
        print(f"Measuring FPS for method: {method_name}")
        for block_size in block_sizes:
            config = {
                "N_FOR_VIS": 320000,
                "VISUALIZE": 0,
                "FINE_GRAINED_CELLS": 1,
                **grid_flags,
                "CUDA_BLOCK_SIZE": block_size,
            }
            entry = find_cached(config)
            if entry is not None:
                print(f"Found existing measurement for config: {config}")
            else:
                print(f"Measuring FPS for config: {config}")
                started_at = time.time()
                fps = build_and_measure_fps(**config)
                elapsed = time.time() - started_at
                entry = {**config, "duration": elapsed, "fps": fps}
                cached_results.append(entry)
                # Persist immediately so an interrupted sweep can resume.
                with MEASUREMENTS_JSON.open("w", encoding="utf-8") as file:
                    json.dump(cached_results, file, indent=4)
            print(f" Duration: {entry['duration']}")
            print(f" FPS: {entry['fps']}")


def measure_fps_vs_fine_grained_cells() -> None:
    """Measure FPS against the boid count with fine-grained cells off and on.

    Uses the coherent uniform grid, no visualization and a block size of 128.
    For ``FINE_GRAINED_CELLS`` equal to 0 and then 1, the boid count is doubled
    from 5000 up to 5000 * 2**12. Results are cached in ``MEASUREMENTS_JSON``:
    a configuration that already has an entry is reported instead of re-run,
    and the file is rewritten after every new measurement.
    """
    cached_results: list[dict[str, Any]] = []
    if MEASUREMENTS_JSON.exists():
        assert MEASUREMENTS_JSON.is_file()
        with MEASUREMENTS_JSON.open(encoding="utf-8") as file:
            cached_results = json.load(file)

    def find_cached(wanted: dict[str, Any]) -> dict[str, Any] | None:
        """Return the first cached entry that agrees with every item of ``wanted``."""
        return next(
            (
                entry
                for entry in cached_results
                if all(
                    key in entry and entry[key] == value
                    for key, value in wanted.items()
                )
            ),
            None,
        )

    boid_counts = [5000 << exponent for exponent in range(13)]
    for fine_grained in (0, 1):
        for num_boids in boid_counts:
            config = {
                "N_FOR_VIS": num_boids,
                "VISUALIZE": 0,
                "FINE_GRAINED_CELLS": fine_grained,
                "UNIFORM_GRID": 1,
                "COHERENT_GRID": 1,
                "CUDA_BLOCK_SIZE": 128,
            }
            entry = find_cached(config)
            if entry is not None:
                print(f"Found existing measurement for config: {config}")
            else:
                print(f"Measuring FPS for config: {config}")
                started_at = time.time()
                fps = build_and_measure_fps(**config)
                elapsed = time.time() - started_at
                entry = {**config, "duration": elapsed, "fps": fps}
                cached_results.append(entry)
                # Persist immediately so an interrupted sweep can resume.
                with MEASUREMENTS_JSON.open("w", encoding="utf-8") as file:
                    json.dump(cached_results, file, indent=4)
            print(f" Duration: {entry['duration']}")
            print(f" FPS: {entry['fps']}")


def build_and_measure_fps(**kwargs: int | str) -> float:
    """Rebuild the project with the given CMake cache entries and measure FPS.

    Args:
        **kwargs: CMake cache entries, each forwarded to the configure step as
            ``-D<name>=<value>``. Callers in this script pass integer values,
            so the annotation admits ``int`` as well as ``str``.

    Returns:
        The frame rate reported by the freshly built executable.
    """
    build(**kwargs)
    return measure_fps()


def build(**kwargs: int | str) -> None:
    """Configure and compile a fresh Release build of the boids executable.

    Any existing build directory is deleted first, so every configuration is
    compiled from scratch. The timed-measurement settings (``FPS_MEASURE=1``,
    start after 2 seconds, measure for 20 seconds) are always applied and
    override same-named entries in ``kwargs``.

    Args:
        **kwargs: CMake cache entries, each passed to the configure step as
            ``-D<name>=<value>``. Values are formatted with ``str()``, so the
            integers used by the callers in this script are accepted.

    Raises:
        subprocess.CalledProcessError: If the configure or the build step
            exits with a non-zero status.
    """
    if BUILD_DIR.exists():
        assert BUILD_DIR.is_dir()
        shutil.rmtree(BUILD_DIR)

    # Merged last on purpose: the timed-mode settings win over caller entries.
    cache_entries = {
        **kwargs,
        "FPS_MEASURE": 1,
        "FPS_MEASURE_START": 2,
        "FPS_MEASURE_DURATION": 20,
    }
    configure_command = [
        "cmake",
        "-G",
        "Visual Studio 17 2022",
        "-S",
        str(ROOT_DIR),
        "-B",
        str(BUILD_DIR),
    ]
    configure_command.extend(
        f"-D{name}={setting}" for name, setting in cache_entries.items()
    )
    subprocess.run(args=configure_command, check=True)
    assert BUILD_DIR.is_dir()

    compile_command = [
        "cmake",
        "--build",
        str(BUILD_DIR),
        "--target",
        "cis5650_boids",
        "--config",
        "Release",
        "--parallel",
    ]
    subprocess.run(compile_command, check=True)
    assert BOIDS_EXE.is_file()


def _parse_fps(stdout: str) -> float | None:
    """Return the value of the first ``FPS: <value>`` line in ``stdout``, or None."""
    for line in stdout.splitlines():
        re_match = re.match(r"^FPS: (.+)$", line)
        if re_match is not None:
            return float(re_match[1])
    return None


def measure_fps() -> float:
    """Run the built boids executable and return the frame rate it reports.

    The executable is run from the build directory with its output captured;
    the first stdout line of the form ``FPS: <value>`` supplies the result.

    Returns:
        The reported frame rate.

    Raises:
        subprocess.CalledProcessError: If the executable exits with a non-zero
            status.
        RuntimeError: If no ``FPS: <value>`` line is printed. The captured
            stdout and stderr are attached to the message, because
            ``capture_output=True`` would otherwise hide them completely.
        ValueError: If the text after ``FPS: `` is not a valid float.
    """
    assert BOIDS_EXE.is_file()
    result = subprocess.run(
        [str(BOIDS_EXE)], cwd=BUILD_DIR, capture_output=True, check=True, text=True
    )
    fps = _parse_fps(result.stdout)
    if fps is None:
        raise RuntimeError(
            "FPS not found in output\n"
            f"--- stdout ---\n{result.stdout}\n"
            f"--- stderr ---\n{result.stderr}"
        )
    return fps


if __name__ == "__main__":
    # Run the three sweeps back to back; each one resumes from the shared
    # measurement cache, so re-running the script only fills in the gaps.
    for run_sweep in (
        measure_fps_vs_num_boids,
        measure_fps_vs_block_size,
        measure_fps_vs_fine_grained_cells,
    ):
        run_sweep()
Loading