Skip to content

Commit 4333459

Browse files
committed
Adding ATMI as the runtime layer to launch Chapel's generated GPU kernels
1 parent 9bc55ef commit 4333459

File tree

7 files changed

+302
-6
lines changed

7 files changed

+302
-6
lines changed

make/compiler/Makefile.hsa

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,11 @@ include $(CHPL_MAKE_HOME)/make/compiler/Makefile.gnu
33
ifdef CHPL_ROCM
44
# ROCm locations
55
CLOC=/opt/rocm/cloc/bin/cloc.sh
6-
LIBS+=-lhsa-runtime64 -lhsakmt -lm
6+
LIBS+=-latmi_runtime -lm
77

88
# TODO: move these in third-party directory?
9-
GEN_LFLAGS+=-L/opt/rocm/lib -L/opt/rocm/hsa/lib
10-
HSA_INCLUDES=-I/opt/rocm/hsa/include
9+
GEN_LFLAGS+=-L/opt/rocm/lib -L/opt/rocm/hsa/lib -L/opt/rocm/libatmi/lib
10+
HSA_INCLUDES=-I/opt/rocm/libatmi/include
1111
else
1212
# HSA locations
1313
CLOC=$(THIRD_PARTY_DIR)/hsa/cloc/bin/cloc.sh

runtime/include/chpl-atmi.h

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#ifndef _chpl_atmi_h_
2+
#define _chpl_atmi_h_
3+
4+
#include <atmi_runtime.h>
5+
#include <stddef.h> /* size_t */
6+
#include <stdint.h> /* uintXX_t */
7+
#ifndef __cplusplus
8+
#include <stdbool.h>
9+
#endif /* __cplusplus */
10+
11+
#include "chpltypes.h"
12+
#include "chpl-hsa-kernelparams.h"
13+
14+
/*
 * Handle of the reduction kernel. chpl_hsa_initialize() creates it and the
 * reduction host code passes it to atmi_task_launch().
 *
 * NOTE(review): this line and the gpu_kernels line below are definitions
 * (no `extern`) placed in a header that several .c files include. That only
 * links when the toolchain merges tentative definitions -- confirm, or
 * declare them `extern` here and define them once in chpl-atmi.c.
 */
atmi_kernel_t reduction_kernel;
15+
/*
 * Table of handles for the compiler-generated kernels, allocated by
 * chpl_hsa_initialize() and indexed by the kernel_idx argument of
 * hsa_enqueue_kernel().
 */
atmi_kernel_t *gpu_kernels;
16+
17+
/*
 * Implementation IDs used when a GPU implementation is attached to an ATMI
 * kernel (atmi_kernel_add_gpu_impl) and when that implementation is selected
 * at launch time (lparm->kernel_id).
 */
enum {
  REDUCTION_GPU_IMPL = 42,    /* implementation ID of the reduction kernel */
  GPU_KERNEL_IMPL    = 10565  /* implementation ID of the generated kernels */
};
21+
/*
22+
typedef struct __attribute__ ((aligned(HSA_ARGUMENT_ALIGN_BYTES))) {
23+
uint64_t in;
24+
uint64_t out;
25+
uint32_t count;
26+
} hsail_reduce_kernarg_t;
27+
28+
typedef struct __attribute__ ((aligned(HSA_ARGUMENT_ALIGN_BYTES))) {
29+
uint64_t bundle;
30+
} hsail_kernarg_t;
31+
*/
32+
33+
/**
 * Initialize the ATMI runtime, register the kernel modules and create the
 * kernel handles (reduction_kernel, gpu_kernels).
 *
 * @return ATMI_STATUS_SUCCESS on success, another value on failure.
 */
int chpl_hsa_initialize(void);

/**
 * Free the gpu_kernels table and finalize the ATMI runtime.
 */
int hsa_shutdown(void);

/**
 * Reduce `count` elements of `src`.
 *
 * @param op    name of the reduction; the int64 implementation currently
 *              ignores it and always computes a sum.
 * @param src   input elements.
 * @param count number of elements in `src`.
 * @return the reduced value.
 *
 * NOTE(review): the definition of hsa_reduce_int32 is commented out in
 * chpl-atmi-reducehost.c, so only the int64 variant can be linked.
 */
int32_t hsa_reduce_int32(const char *op, int32_t *src, size_t count);
int64_t hsa_reduce_int64(const char *op, int64_t *src, size_t count);

/**
 * Launch one of the compiler-generated GPU kernels.
 *
 * @param kernel_idx     index into gpu_kernels.
 * @param wkgrp_size_x   workgroup size in the x dimension.
 * @param wkitem_count_x total number of work items in the x dimension.
 * @param bundled_args   pointer handed to the kernel as its single argument.
 */
void hsa_enqueue_kernel(int kernel_idx, uint32_t wkgrp_size_x,
                        uint32_t wkitem_count_x, void *bundled_args);
40+
#endif //_chpl_atmi_h_

runtime/include/chpl-gen-includes.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
#include "chpl-tasks.h"
3131
#include "chpltypes.h"
3232
#ifdef TARGET_HSA
33-
#include "chpl-hsa.h"
33+
#include "chpl-atmi.h"
3434
#endif
3535

3636
//

runtime/src/Makefile.share

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,9 @@
1717

1818
ifeq ($(strip $(CHPL_MAKE_TARGET_COMPILER)),hsa)
1919
HSA_SRCS = \
20-
chpl-hsa.c \
20+
chpl-atmi.c \
2121
chpl-hsa-reducekernels.cl \
22-
chpl-hsa-reducehost.c
22+
chpl-atmi-reducehost.c
2323
endif
2424

2525
COMMON_LAUNCHER_SRCS = \

runtime/src/chpl-atmi-reducehost.c

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
2+
#include "chpl-atmi.h"
3+
#include "chplrt.h"
4+
#include "chplexit.h"
5+
#include "chpl-mem.h"
6+
7+
/*enum ReduceOp {
8+
MAX,
9+
MIN,
10+
SUM,
11+
PROD,
12+
BITAND,
13+
BITOR,
14+
BITXOR,
15+
LOGAND,
16+
LOGOR
17+
};
18+
*/
19+
20+
/*
21+
* Estimate and schedule the required number of GPU kernels
22+
*/
23+
static inline
24+
void atmi_sched_reducekernels(size_t count,
25+
void *darray[2], size_t *iter_ct,
26+
size_t *items_left)
27+
{
28+
size_t incount, outcount, i, iter, in, out;
29+
uint32_t max_num_wkgrps, num_wkgroups, grid_size_x;
30+
31+
const int num_args = 3;
32+
atmi_task_group_t task_group = {1, ATMI_TRUE};
33+
ATMI_LPARM(lparm);
34+
lparm->group = &task_group;
35+
lparm->kernel_id = REDUCTION_GPU_IMPL;
36+
lparm->synchronous = ATMI_FALSE;
37+
lparm->place = (atmi_place_t)ATMI_PLACE_GPU(0, 0);
38+
39+
incount = count;
40+
max_num_wkgrps = incount / WKGRP_SIZE;
41+
num_wkgroups = (max_num_wkgrps + SEQ_CHUNK_SIZE - 1) / SEQ_CHUNK_SIZE;
42+
grid_size_x = num_wkgroups * WKGRP_SIZE;
43+
outcount = num_wkgroups;
44+
iter = 0;
45+
while (grid_size_x > WKGRP_SIZE) {
46+
in = (iter & 1);
47+
out = (iter & 1) ^ 1;
48+
49+
void *args[] = {&darray[in], &darray[out], &incount};
50+
lparm->gridDim[0] = grid_size_x;
51+
lparm->groupDim[0] = WKGRP_SIZE;
52+
atmi_task_launch(lparm, reduction_kernel, args);
53+
54+
iter += 1;
55+
incount = outcount;
56+
max_num_wkgrps = incount / WKGRP_SIZE;
57+
num_wkgroups = (max_num_wkgrps + SEQ_CHUNK_SIZE - 1) / SEQ_CHUNK_SIZE;
58+
grid_size_x = num_wkgroups * WKGRP_SIZE;
59+
outcount = num_wkgroups;
60+
}
61+
62+
if (iter > 0) {
63+
atmi_task_group_sync(&task_group);
64+
}
65+
66+
(*items_left) = incount;
67+
(*iter_ct) = iter;
68+
}
69+
70+
/*int32_t hsa_reduce_int32(const char *op, int32_t *src, size_t count)
71+
{
72+
int32_t res;
73+
size_t iter, items_left, out, i;
74+
int32_t * darray[2];
75+
hsa_symbol_info_t * symbol_info;
76+
symbol_info = &kernel.symbol_info[0]; //TODO: Remove hardcoded 0 index
77+
darray[0] = src;
78+
if (0 != chpl_posix_memalign((void **) &darray[1], 64,
79+
count * sizeof(int32_t))) {
80+
chpl_exit_any(1);
81+
}
82+
83+
hsa_sched_reducekernels(count, symbol_info, (void**)darray,
84+
&iter, &items_left);
85+
86+
res = 0;
87+
out = (iter & 1);
88+
chpl_msg(2, "HSA: Using CPU to reduce %lu items\n", items_left);
89+
for (i = 0; i < items_left; ++i) res += darray[out][i];
90+
91+
chpl_free (darray[1]);
92+
return res;
93+
}*/
94+
95+
/*
 * Sum `count` 64-bit integers starting at `src`.
 *
 * The GPU reduction passes are scheduled by atmi_sched_reducekernels(); the
 * elements it leaves over are summed here on the CPU.
 *
 *   op     name of the requested reduction; currently ignored, the result is
 *          always a sum (see the FIXME after this function)
 *   src    input elements; also used as one of the two working buffers, so
 *          the scheduled kernels may write to it
 *   count  number of elements in src
 *
 * Returns the sum. Exits through chpl_exit_any(1) if the scratch buffer
 * cannot be allocated.
 */
int64_t hsa_reduce_int64(const char *op, int64_t *src, size_t count)
{
  int64_t res;
  size_t iter, items_left, out, i;
  int64_t *darray[2];

  (void)op;

  /* count * sizeof(int64_t) must not wrap around */
  if (count > SIZE_MAX / sizeof(int64_t)) {
    chpl_exit_any(1);
  }

  darray[0] = src;
  if (0 != chpl_posix_memalign((void **) &darray[1], 64,
                               count * sizeof(int64_t))) {
    chpl_exit_any(1);
  }

  atmi_sched_reducekernels(count, (void**)darray,
                           &iter, &items_left);

  res = 0;
  /* the last pass wrote into the buffer selected by the launch-count parity */
  out = (iter & 1);
  /* %lu expects unsigned long; size_t is not guaranteed to be that type */
  chpl_msg(2, "HSA: Using CPU to reduce %lu items\n",
           (unsigned long)items_left);
  for (i = 0; i < items_left; ++i) {
    res += darray[out][i];
  }

  chpl_free (darray[1]);
  return res;
}
117+
118+
//FIXME: use the op argument like this to extend this to different ops
119+
/*if (!strcasecmp(op, "Max"))
120+
opType = MAX;
121+
else if (!strcasecmp(op, "Min"))
122+
opType = MIN;
123+
else if (!strcasecmp(op, "Sum"))
124+
opType = SUM;
125+
else if (!strcasecmp(op, "Product"))
126+
opType = PROD;
127+
else if (!strcasecmp(op, "LogicalAnd"))
128+
opType = LOGAND;
129+
else if (!strcasecmp(op, "LogicalOr"))
130+
opType = LOGOR;
131+
else if (!strcasecmp(op, "BitwiseAnd"))
132+
opType = BITAND;
133+
else if (!strcasecmp(op, "BitwiseOr"))
134+
opType = BITOR;
135+
else if (!strcasecmp(op, "BitwiseXor"))
136+
opType = BITXOR; */

runtime/src/chpl-atmi.c

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
2+
#define _GNU_SOURCE
3+
#include <stdio.h>
4+
#include <stdlib.h>
5+
#include <string.h>
6+
#include <assert.h>
7+
#include <unistd.h>
8+
#include <sys/time.h>
9+
#include "chpl-atmi.h"
10+
#include "chplrt.h"
11+
#include "chpl-mem.h"
12+
#include "chplcgfns.h"
13+
14+
/*
 * Check an ATMI status value. If it is not ATMI_STATUS_SUCCESS, print a
 * diagnostic to stderr, finalize ATMI and return the status from the
 * function that used the macro (which must therefore return an integer).
 *
 *   status  expression yielding the status; evaluated exactly once
 *   msg     unquoted description of the step; it is stringified
 *
 * Wrapped in do { } while (0) so that `OUTPUT_ATMI_STATUS(...);` is a single
 * statement and is safe in an unbraced if/else.
 */
#define OUTPUT_ATMI_STATUS(status, msg)                                      \
  do {                                                                       \
    const int chpl_atmi_status_ = (int)(status);                             \
    if (ATMI_STATUS_SUCCESS != chpl_atmi_status_) {                          \
      fprintf(stderr, "ATMI support: %s failed, error code: 0x%x\n",         \
              #msg, (unsigned int)chpl_atmi_status_);                        \
      atmi_finalize();                                                       \
      return chpl_atmi_status_;                                              \
    }                                                                        \
  } while (0)
23+
24+
/**
25+
* Initialize the ATMI/HSA runtime
26+
*/
27+
int chpl_hsa_initialize(void)
28+
{
29+
atmi_status_t st = atmi_init(ATMI_DEVTYPE_ALL);
30+
if(st != ATMI_STATUS_SUCCESS) return -1;
31+
32+
char reduce_kernel_filename[1024];
33+
char gen_kernel_filename[1024];
34+
int arglen = strlen(chpl_executionCommand)+1;
35+
char* argCopy = chpl_mem_allocMany(arglen, sizeof(char),
36+
CHPL_RT_MD_CFG_ARG_COPY_DATA, 0, 0);
37+
char *binName;
38+
int cx;
39+
40+
cx = snprintf(reduce_kernel_filename, 1024,
41+
#ifdef ROCM
42+
"%s/runtime/src/%s/chpl-hsa-reducekernels.hsaco", CHPL_HOME,
43+
#else
44+
"%s/runtime/src/%s/chpl-hsa-reducekernels.o", CHPL_HOME,
45+
#endif
46+
CHPL_RUNTIME_OBJDIR);
47+
if (cx < 0 || cx >= 256) {
48+
OUTPUT_ATMI_STATUS(ATMI_STATUS_ERROR, Creating reduce kernel filename);
49+
}
50+
strcpy(argCopy, chpl_executionCommand);
51+
binName = strtok(argCopy, " ");
52+
#ifdef ROCM
53+
cx = snprintf(gen_kernel_filename, 1024, "%s_gpu.hsaco", binName);
54+
#else
55+
cx = snprintf(gen_kernel_filename, 1024, "%s_gpu.o", binName);
56+
#endif
57+
if (cx < 0 || cx >= 256) {
58+
OUTPUT_ATMI_STATUS(ATMI_STATUS_ERROR, Creating generated kernel filename);
59+
}
60+
chpl_mem_free(argCopy, 0, 0);
61+
62+
#ifdef ROCM
63+
atmi_platform_type_t module_type = AMDGCN;
64+
#else
65+
atmi_platform_type_t module_type = BRIG;
66+
#endif
67+
68+
/* FIXME: Create all reduction kernels, not just the int64-sum kernel */
69+
const char *modules[2] = {reduce_kernel_filename, gen_kernel_filename};
70+
atmi_platform_type_t module_types[2] = {module_type, module_type};
71+
st = atmi_module_register(modules, module_types, 2);
72+
OUTPUT_ATMI_STATUS(st, Registering all modules);
73+
74+
size_t reduction_arg_sizes[] = {sizeof(uint64_t), sizeof(uint64_t), sizeof(uint32_t)};
75+
const unsigned int num_reduction_args = sizeof(reduction_arg_sizes)/sizeof(reduction_arg_sizes[0]);
76+
atmi_kernel_create_empty(&reduction_kernel, num_reduction_args, reduction_arg_sizes);
77+
atmi_kernel_add_gpu_impl(reduction_kernel, "reduce_int64_sum", REDUCTION_GPU_IMPL);
78+
79+
size_t kernel_arg_sizes[] = {sizeof(uint64_t)};
80+
const unsigned int num_kernel_args = sizeof(kernel_arg_sizes)/sizeof(kernel_arg_sizes[0]);
81+
gpu_kernels = (atmi_kernel_t *)chpl_malloc(sizeof(atmi_kernel_t) * chpl_num_gpu_kernels);
82+
for (int64_t i = 0; i < chpl_num_gpu_kernels; ++i) {
83+
//FIXME: get the actual kernel name
84+
const char *kernel_name = chpl_gpu_kernels[i];
85+
atmi_kernel_create_empty(&gpu_kernels[i], num_kernel_args, kernel_arg_sizes);
86+
atmi_kernel_add_gpu_impl(gpu_kernels[i], kernel_name, GPU_KERNEL_IMPL);
87+
}
88+
89+
return ATMI_STATUS_SUCCESS;
90+
}
91+
92+
/**
93+
* Release resources used by the base kernels and tear down the HSA structures
94+
*/
95+
int hsa_shutdown(void)
96+
{
97+
chpl_free(gpu_kernels);
98+
atmi_finalize();
99+
}
100+
101+
/*
102+
* Enqueue/execute a kernel
103+
*/
104+
void hsa_enqueue_kernel(int kernel_idx, uint32_t wkgrp_size_x,
105+
uint32_t wkitem_count_x, void *bundled_args)
106+
{
107+
void *args[] = {&bundled_args};
108+
ATMI_LPARM_1D(lparm, wkitem_count_x);
109+
lparm->groupDim[0] = wkgrp_size_x;
110+
lparm->synchronous = ATMI_TRUE;
111+
112+
lparm->kernel_id = GPU_KERNEL_IMPL;
113+
lparm->place = (atmi_place_t)ATMI_PLACE_GPU(0, 0);
114+
atmi_task_launch(lparm, gpu_kernels[kernel_idx], args);
115+
}
116+

util/setchplenv_hsa.bash

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,10 @@ if [ "$1" == "debug" ]; then
105105
export CHPL_DEBUG=1
106106
fi
107107

108+
echo -n "Setting CHPL_ROCM"
109+
export CHPL_ROCM=1
110+
echo " to 1"
111+
108112
echo -n "Setting CHPL_LOCALE_MODEL"
109113
export CHPL_LOCALE_MODEL=hsa
110114
echo " to hsa"

0 commit comments

Comments
 (0)