Skip to content

Commit 9c9a357

Browse files
[GrCUDA-HOTFIX] fix install and benchmarks (#20)
* fixed install dir (GRCUDA-67) * fixed python benchmarks not creating nested folders and not using experimental options (GRCUDA-68) * updated make for cuda benchmarks and readme (GRCUDA-68)
1 parent d63678d commit 9c9a357

File tree

5 files changed

+30
-48
lines changed

5 files changed

+30
-48
lines changed

README.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ mx unittest com.nvidia
288288
mx unittest com.nvidia.grcuda.test.BuildKernelTest#testBuildKernelwithNFILegacytSignature
289289
```
290290

291-
5. **Setup the grcuda-data sumbodule**
291+
5. **Setup the grcuda-data submodule**
292292
The `grcuda-data` repository is used as a `git` submodule to store data, results, and plots for demos, benchmarks, and publications. You will need this submodule to run the full benchmark suite, and some of the demos. To set up the submodule, follow this [`README`](https://github.com/AlbertoParravicini/grcuda-data/tree/master).
293293

294294
### Setup your IDE
@@ -325,6 +325,7 @@ Here, we explain how to setup IntelliJ Idea.
325325

326326
To measure the performance of GrCUDA on complex GPU applications, we have developed a custom benchmark suite, found in `projects/resources/python/benchmark`.
327327
These are the same benchmarks used in the [DAG-based Scheduling with Resource Sharing for Multi-task Applications in a Polyglot GPU Runtime](https://ieeexplore.ieee.org/abstract/document/9460491) paper.
328+
All commands are executed from `$GRCUDA_HOME/projects/resources/python/benchmark`.
328329

329330
Run a single benchmark with custom settings
330331
```console
@@ -336,6 +337,13 @@ Run all benchmarks
336337
graalpython --jvm --polyglot benchmark_wrapper.py -d -i 30
337338
```
338339

340+
To run the CUDA version of all benchmarks, first build them as follows. You might want to update the GPU architecture (the `-arch` flag) inside `$GRCUDA_HOME/projects/resources/cuda/Makefile` to reflect the hardware at your disposal.
341+
```console
342+
cd $GRCUDA_HOME/projects/resources/cuda;
343+
make
344+
cd -;
345+
```
346+
339347
Run the CUDA version of all benchmarks
340348
```console
341349
graalpython --jvm --polyglot benchmark_wrapper.py -d -i 30 -c

install.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,5 @@
33
mx build;
44

55
# Install for Java 8+;
6-
mkdir -p $GRAAL_HOME/jre/languages/grcuda;
6+
mkdir -p $GRAAL_HOME/languages/grcuda;
77
cp $GRCUDA_HOME/mxbuild/dists/jdk1.8/grcuda.jar $GRAAL_HOME/languages/grcuda/.;

projects/resources/cuda/Makefile

Lines changed: 10 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -27,51 +27,23 @@
2727
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
2828
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2929

30-
# Use NVCC;
30+
# Use NVCC.
31+
# Set the appropriate GPU architecture, check https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
3132
CXX=nvcc
3233
FLAGS = -std=c++11 -O3 -arch=sm_70
3334

34-
# Use Clang;
35-
CXX=$(CLANG_DIR)/clang++
36-
FLAGS = --cuda-gpu-arch=sm_70 -L/usr/local/cuda/lib64 -lcudart_static -ldl -lrt -pthread -std=c++11 -O3
35+
# (Experimental) Use Clang;
36+
# CXX=$(CLANG_DIR)/clang++
37+
# FLAGS = --cuda-gpu-arch=sm_70 -L/usr/local/cuda/lib64 -lcudart_static -ldl -lrt -pthread -std=c++11 -O3
3738

3839
BIN_FOLDER=bin
3940
FILES=main.cu benchmark.cu b1.cu b5.cu b6.cu b7.cu b8.cu b10.cu
4041

41-
.PHONY: full all b1 b5 b6 b7 b8 b10
42+
.PHONY: all clean
4243

43-
full:
44+
all:
45+
mkdir -p $(BIN_FOLDER);
4446
$(CXX) $(FILES) $(FLAGS) -o $(BIN_FOLDER)/b;
4547

46-
all: \
47-
full \
48-
b1 \
49-
b5 \
50-
b6 \
51-
b7 \
52-
b8 \
53-
b10
54-
55-
b1: b1*
56-
$(CXX) old/b1_default.cu $(FLAGS) -o $(BIN_FOLDER)/b1_default;
57-
$(CXX) old/b1_sync.cu $(FLAGS) -o $(BIN_FOLDER)/b1_sync
58-
59-
b5: b5*
60-
$(CXX) old/b5_default.cu $(FLAGS) -o $(BIN_FOLDER)/b5_default;
61-
$(CXX) old/b5_sync.cu $(FLAGS) -o $(BIN_FOLDER)/b5_sync
62-
63-
b6: b6*
64-
$(CXX) old/b6_default.cu $(FLAGS) -o $(BIN_FOLDER)/b6_default;
65-
$(CXX) old/b6_sync.cu $(FLAGS) -o $(BIN_FOLDER)/b6_sync
66-
67-
b7: b7*
68-
$(CXX) old/b7_default.cu $(FLAGS) -o $(BIN_FOLDER)/b7_default;
69-
$(CXX) old/b7_sync.cu $(FLAGS) -o $(BIN_FOLDER)/b7_sync
70-
71-
b8: b8*
72-
$(CXX) old/b8_default.cu $(FLAGS) -o $(BIN_FOLDER)/b8_default;
73-
$(CXX) old/b8_sync.cu $(FLAGS) -o $(BIN_FOLDER)/b8_sync
74-
75-
b10: b10*
76-
$(CXX) old/b10_default.cu $(FLAGS) -o $(BIN_FOLDER)/b10_default;
77-
$(CXX) old/b10_sync.cu $(FLAGS) -o $(BIN_FOLDER)/b10_sync
48+
clean:
49+
rm $(BIN_FOLDER)/*;

projects/resources/python/benchmark/benchmark_nvprof_wrapper.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
from benchmark_result import BenchmarkResult
3636
from benchmark_main import create_block_size_list
3737
from java.lang import System
38+
from pathlib import Path
3839

3940
##############################
4041
##############################
@@ -124,18 +125,18 @@
124125

125126
if POST_TURING:
126127
GRAALPYTHON_CMD_METRICS = """/usr/local/cuda/bin/ncu -f --print-units base --csv --log-file "{}" --profile-from-start off --target-processes all {} \
127-
{}/graalpython --vm.XX:MaxHeapSize={}G --jvm --polyglot --grcuda.RetrieveNewStreamPolicy={} {} --grcuda.ForceStreamAttach \
128+
{}/graalpython --vm.XX:MaxHeapSize={}G --jvm --polyglot --experimental-options --grcuda.RetrieveNewStreamPolicy={} {} --grcuda.ForceStreamAttach \
128129
--grcuda.ExecutionPolicy={} --grcuda.DependencyPolicy={} --grcuda.RetrieveParentStreamPolicy={} benchmark_main.py \
129130
-i {} -n {} --reinit false --realloc false -g {} -b {} --block_size_1d {} --block_size_2d {} --no_cpu_validation {} {} --nvprof
130131
"""
131132
GRAALPYTHON_CMD_TRACE = """/usr/local/cuda/bin/nvprof --csv --log-file "{}" --print-gpu-trace {} --profile-from-start off --profile-child-processes \
132-
{}/graalpython --vm.XX:MaxHeapSize={}G --jvm --polyglot --grcuda.RetrieveNewStreamPolicy={} {} --grcuda.ForceStreamAttach \
133+
{}/graalpython --vm.XX:MaxHeapSize={}G --jvm --polyglot --experimental-options --grcuda.RetrieveNewStreamPolicy={} {} --grcuda.ForceStreamAttach \
133134
--grcuda.ExecutionPolicy={} --grcuda.DependencyPolicy={} --grcuda.RetrieveParentStreamPolicy={} benchmark_main.py \
134135
-i {} -n {} --reinit false --realloc false -g {} -b {} --block_size_1d {} --block_size_2d {} --no_cpu_validation {} {} --nvprof
135136
"""
136137
else:
137138
GRAALPYTHON_CMD = """/usr/local/cuda/bin/nvprof --csv --log-file "{}" --print-gpu-trace {} --profile-from-start off --profile-child-processes \
138-
{}/graalpython --vm.XX:MaxHeapSize={}G --jvm --polyglot --grcuda.RetrieveNewStreamPolicy={} {} --grcuda.ForceStreamAttach \
139+
{}/graalpython --vm.XX:MaxHeapSize={}G --jvm --polyglot --experimental-options --grcuda.RetrieveNewStreamPolicy={} {} --grcuda.ForceStreamAttach \
139140
--grcuda.ExecutionPolicy={} --grcuda.DependencyPolicy={} --grcuda.RetrieveParentStreamPolicy={} benchmark_main.py \
140141
-i {} -n {} --reinit false --realloc false -g {} -b {} --block_size_1d {} --block_size_2d {} --no_cpu_validation {} {} --nvprof
141142
"""
@@ -169,7 +170,7 @@ def execute_grcuda_benchmark(benchmark, size, exec_policy, new_stream_policy,
169170
if not os.path.exists(output_folder_path):
170171
if debug:
171172
BenchmarkResult.log_message(f"creating result folder: {output_folder_path}")
172-
os.mkdir(output_folder_path)
173+
Path(output_folder_path).mkdir(parents=True, exist_ok=True)
173174
file_name = f"{b}_{exec_policy}_{'metric' if m else 'nometric'}_{prefetch}{'' if (POST_TURING and m) else '_%p'}.csv"
174175
output_path = os.path.join(output_folder_path, file_name)
175176

projects/resources/python/benchmark/benchmark_wrapper.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
from benchmark_result import BenchmarkResult
3636
from benchmark_main import create_block_size_list
3737
from java.lang import System
38+
from pathlib import Path
3839

3940
##############################
4041
##############################
@@ -160,15 +161,15 @@ def execute_cuda_benchmark(benchmark, size, block_size, exec_policy, num_iter, d
160161
if not os.path.exists(output_folder_path):
161162
if debug:
162163
BenchmarkResult.log_message(f"creating result folder: {output_folder_path}")
163-
os.mkdir(output_folder_path)
164+
Path(output_folder_path).mkdir(parents=True, exist_ok=True)
164165
output_path = os.path.join(output_folder_path, file_name)
165166

166167
benchmark_cmd = CUDA_CMD.format(benchmark, exec_policy, size, block_size["block_size_1d"],
167168
block_size["block_size_2d"], num_iter, num_blocks, "-r" if prefetch else "", "-a", output_path)
168169
start = System.nanoTime()
169170
result = subprocess.run(benchmark_cmd,
170171
shell=True,
171-
stdout=subprocess.STDOUT,
172+
stdout=None,
172173
cwd=f"{os.getenv('GRCUDA_HOME')}/projects/resources/cuda/bin")
173174
result.check_returncode()
174175
end = System.nanoTime()
@@ -179,7 +180,7 @@ def execute_cuda_benchmark(benchmark, size, block_size, exec_policy, num_iter, d
179180
##############################
180181
##############################
181182

182-
GRAALPYTHON_CMD = "graalpython --vm.XX:MaxHeapSize={}G --jvm --polyglot " \
183+
GRAALPYTHON_CMD = "graalpython --vm.XX:MaxHeapSize={}G --jvm --polyglot --experimental-options " \
183184
"--grcuda.RetrieveNewStreamPolicy={} {} --grcuda.ForceStreamAttach --grcuda.ExecutionPolicy={} --grcuda.DependencyPolicy={} " \
184185
"--grcuda.RetrieveParentStreamPolicy={} benchmark_main.py -i {} -n {} -g {} " \
185186
"--reinit false --realloc false -b {} --block_size_1d {} --block_size_2d {} --no_cpu_validation {} {} -o {}"
@@ -214,7 +215,7 @@ def execute_grcuda_benchmark(benchmark, size, block_sizes, exec_policy, new_stre
214215
if not os.path.exists(output_folder_path):
215216
if debug:
216217
BenchmarkResult.log_message(f"creating result folder: {output_folder_path}")
217-
os.mkdir(output_folder_path)
218+
Path(output_folder_path).mkdir(parents=True, exist_ok=True)
218219
output_path = os.path.join(output_folder_path, file_name)
219220
b1d_size = " ".join([str(b['block_size_1d']) for b in block_sizes])
220221
b2d_size = " ".join([str(b['block_size_2d']) for b in block_sizes])

0 commit comments

Comments
 (0)