Adding ATMI as the runtime layer to launch Chapel's generated GPU kernels

ashwinma · ashwinma · commit 4333459b0058 · 2016-12-01T18:56:15.000-06:00
diff --git a/make/compiler/Makefile.hsa b/make/compiler/Makefile.hsa
@@ -3,11 +3,11 @@ include $(CHPL_MAKE_HOME)/make/compiler/Makefile.gnu
 ifdef CHPL_ROCM
 # ROCm locations
 CLOC=/opt/rocm/cloc/bin/cloc.sh
-LIBS+=-lhsa-runtime64 -lhsakmt -lm
+LIBS+=-latmi_runtime -lm
 
 # TODO: move these in third-party directory?
-GEN_LFLAGS+=-L/opt/rocm/lib -L/opt/rocm/hsa/lib
-HSA_INCLUDES=-I/opt/rocm/hsa/include
+GEN_LFLAGS+=-L/opt/rocm/lib -L/opt/rocm/hsa/lib -L/opt/rocm/libatmi/lib
+HSA_INCLUDES=-I/opt/rocm/libatmi/include
 else
 # HSA locations
 CLOC=$(THIRD_PARTY_DIR)/hsa/cloc/bin/cloc.sh
diff --git a/runtime/include/chpl-atmi.h b/runtime/include/chpl-atmi.h
@@ -0,0 +1,40 @@
+#ifndef _chpl_atmi_h_
+#define _chpl_atmi_h_
+
+#include <atmi_runtime.h>
+#include <stddef.h> /* size_t */
+#include <stdint.h> /* uintXX_t */
+#ifndef __cplusplus
+#include <stdbool.h>
+#endif /* __cplusplus */
+
+#include "chpltypes.h"
+#include "chpl-hsa-kernelparams.h"
+
+atmi_kernel_t reduction_kernel; /* reduction kernel handle, created in chpl_hsa_initialize() */
+atmi_kernel_t *gpu_kernels;     /* table of generated-kernel handles, indexed by kernel_idx */
+/* NOTE(review): the two objects above are defined (not declared extern) in this header. */
+enum { /* implementation ids given to atmi_kernel_add_gpu_impl() and lparm->kernel_id */
+    GPU_KERNEL_IMPL = 10565,
+    REDUCTION_GPU_IMPL = 42
+};    
+/*
+typedef struct __attribute__ ((aligned(HSA_ARGUMENT_ALIGN_BYTES))) {
+    uint64_t in;
+    uint64_t out;
+    uint32_t count;
+} hsail_reduce_kernarg_t;
+
+typedef struct __attribute__ ((aligned(HSA_ARGUMENT_ALIGN_BYTES))) {
+    uint64_t bundle;
+} hsail_kernarg_t;
+*/
+
+int chpl_hsa_initialize(void); /* returns ATMI_STATUS_SUCCESS, or an error value on failure */
+/* NOTE(review): the only hsa_reduce_int32 body in chpl-atmi-reducehost.c is commented out. */
+int32_t hsa_reduce_int32(const char *op, int32_t *src, size_t count);
+int64_t hsa_reduce_int64(const char *op, int64_t *src, size_t count);
+/* Launches gpu_kernels[kernel_idx] with bundled_args as its single kernel argument. */
+void hsa_enqueue_kernel(int kernel_idx, uint32_t wkgrp_size_x,
+                        uint32_t wkitem_count_x, void *bundled_args);
+#endif //_chpl_atmi_h_
diff --git a/runtime/include/chpl-gen-includes.h b/runtime/include/chpl-gen-includes.h
@@ -30,7 +30,7 @@
 #include "chpl-tasks.h"
 #include "chpltypes.h"
 #ifdef TARGET_HSA
-#include "chpl-hsa.h"
+#include "chpl-atmi.h"
 #endif
 
 //
diff --git a/runtime/src/Makefile.share b/runtime/src/Makefile.share
@@ -17,9 +17,9 @@
 
 ifeq ($(strip $(CHPL_MAKE_TARGET_COMPILER)),hsa)
 HSA_SRCS = \
-	chpl-hsa.c \
+	chpl-atmi.c \
 	chpl-hsa-reducekernels.cl \
-	chpl-hsa-reducehost.c
+	chpl-atmi-reducehost.c
 endif
 
 COMMON_LAUNCHER_SRCS = \
diff --git a/runtime/src/chpl-atmi-reducehost.c b/runtime/src/chpl-atmi-reducehost.c
@@ -0,0 +1,136 @@
+
+#include "chpl-atmi.h"
+#include "chplrt.h"
+#include "chplexit.h"
+#include "chpl-mem.h"
+
+/*enum ReduceOp {
+  MAX,
+  MIN,
+  SUM,
+  PROD,
+  BITAND,
+  BITOR,
+  BITXOR,
+  LOGAND,
+  LOGOR
+  };
+  */
+
+/*
+ * Launch reduction_kernel on GPU 0 while the computed grid size exceeds
+ * WKGRP_SIZE, alternating input/output between darray[0] and darray[1].
+ * On return *iter_ct is the number of launches, *items_left the element count
+ * left for the caller, and darray[*iter_ct & 1] holds the data still to reduce.
+ */
+static inline
+void atmi_sched_reducekernels(size_t count,
+        void *darray[2], size_t *iter_ct,
+        size_t *items_left)
+{
+    size_t incount, outcount, iter, in, out;
+    uint32_t max_num_wkgrps, num_wkgroups, grid_size_x, kernarg_count;
+    atmi_task_group_t task_group = {1, ATMI_TRUE};
+    ATMI_LPARM(lparm);
+    lparm->group = &task_group;
+    lparm->kernel_id = REDUCTION_GPU_IMPL;
+    lparm->synchronous = ATMI_FALSE;
+    lparm->place = (atmi_place_t)ATMI_PLACE_GPU(0, 0);
+
+    incount = count;
+    max_num_wkgrps = incount / WKGRP_SIZE;
+    num_wkgroups = (max_num_wkgrps + SEQ_CHUNK_SIZE  - 1) / SEQ_CHUNK_SIZE;
+    grid_size_x = num_wkgroups * WKGRP_SIZE;
+    outcount = num_wkgroups;
+    iter = 0;
+    while (grid_size_x > WKGRP_SIZE) {
+        in = (iter & 1);
+        out = (iter & 1) ^ 1;
+        /* The count argument is registered as sizeof(uint32_t): pass a uint32_t. */
+        kernarg_count = (uint32_t)incount;
+        void *args[] = {&darray[in], &darray[out], &kernarg_count};
+        lparm->gridDim[0] = grid_size_x;
+        lparm->groupDim[0] = WKGRP_SIZE;
+        atmi_task_launch(lparm, reduction_kernel, args);
+        iter += 1;
+        incount = outcount;
+        max_num_wkgrps = incount / WKGRP_SIZE;
+        num_wkgroups = (max_num_wkgrps + SEQ_CHUNK_SIZE  - 1) / SEQ_CHUNK_SIZE;
+        grid_size_x = num_wkgroups * WKGRP_SIZE;
+        outcount = num_wkgroups;
+    }
+
+    if (iter > 0) {
+        atmi_task_group_sync(&task_group);
+    }
+    (*items_left) = incount;
+    (*iter_ct) = iter;
+}
+
+/*int32_t hsa_reduce_int32(const char *op, int32_t *src, size_t count)
+  {
+  int32_t res;
+  size_t iter, items_left, out, i;
+  int32_t * darray[2];
+  hsa_symbol_info_t * symbol_info;
+  symbol_info = &kernel.symbol_info[0]; //TODO: Remove hardcoded 0 index
+  darray[0] = src;
+  if (0 != chpl_posix_memalign((void **) &darray[1], 64,
+  count * sizeof(int32_t))) {
+  chpl_exit_any(1);
+  }
+
+  hsa_sched_reducekernels(count, symbol_info, (void**)darray,
+  &iter, &items_left);
+
+  res = 0;
+  out = (iter & 1);
+  chpl_msg(2, "HSA: Using CPU to reduce %lu items\n", items_left);
+  for (i = 0; i < items_left; ++i) res += darray[out][i];
+
+  chpl_free (darray[1]);
+  return res;
+  }*/
+
+/* Sum count int64 values from src: GPU passes first, the CPU adds the rest.
+ * NOTE(review): op is currently ignored; only summation is performed. */
+int64_t hsa_reduce_int64(const char *op, int64_t *src, size_t count)
+{
+    int64_t res = 0;
+    size_t iter, items_left, out, i;
+    int64_t *darray[2] = { src, NULL };
+
+    /* Scratch buffer that alternates with src between kernel launches. */
+    if (0 != chpl_posix_memalign((void **) &darray[1], 64,
+                count * sizeof(int64_t))) {
+        chpl_exit_any(1);
+    }
+
+    atmi_sched_reducekernels(count, (void**)darray, &iter, &items_left);
+    out = (iter & 1);  /* array written by the last launch (src if none) */
+    /* %lu expects unsigned long; size_t is not that type on every ABI. */
+    chpl_msg(2, "HSA: Using CPU to reduce %lu items\n",
+            (unsigned long)items_left);
+    for (i = 0; i < items_left; ++i) res += darray[out][i];
+    chpl_free (darray[1]);
+    return res;
+}
+
+//FIXME: use the op argument like this to extend this to different ops
+/*if (!strcasecmp(op, "Max"))
+  opType = MAX;
+  else if (!strcasecmp(op, "Min"))
+  opType = MIN;
+  else if (!strcasecmp(op, "Sum"))
+  opType = SUM;
+  else if (!strcasecmp(op, "Product"))
+  opType = PROD;
+  else if (!strcasecmp(op, "LogicalAnd"))
+  opType = LOGAND;
+  else if (!strcasecmp(op, "LogicalOr"))
+  opType = LOGOR;
+  else if (!strcasecmp(op, "BitwiseAnd"))
+  opType = BITAND;
+  else if (!strcasecmp(op, "BitwiseOr"))
+  opType = BITOR;
+  else if (!strcasecmp(op, "BitwiseXor"))
+  opType = BITXOR; */
diff --git a/runtime/src/chpl-atmi.c b/runtime/src/chpl-atmi.c
@@ -0,0 +1,116 @@
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include "chpl-atmi.h"
+#include "chplrt.h"
+#include "chpl-mem.h"
+#include "chplcgfns.h"
+
+/* On ATMI failure: report msg on stderr, finalize ATMI, return the status. */
+#define OUTPUT_ATMI_STATUS(status, msg) do { \
+    atmi_status_t atmi_st_ = (status); /* evaluate the argument once */ \
+    if (ATMI_STATUS_SUCCESS != atmi_st_) { \
+        fprintf(stderr, "ATMI support: %s failed, error code: 0x%x\n", \
+                #msg, (unsigned int)atmi_st_); \
+        atmi_finalize(); \
+        return atmi_st_; \
+    } } while (0)
+
+/**
+ * Initialize ATMI on all device types, register the reduction and generated
+ * kernel modules, and create the kernel handles used by the launch routines.
+ *
+ * Returns ATMI_STATUS_SUCCESS on success, -1 if atmi_init() fails, or the
+ * failing ATMI status (after atmi_finalize()) for any later checked error.
+ */
+int chpl_hsa_initialize(void)
+{
+    atmi_status_t st = atmi_init(ATMI_DEVTYPE_ALL);
+    if(st != ATMI_STATUS_SUCCESS) return -1;
+
+    /* Pick the module flavour once; no #ifdef inside snprintf() arguments. */
+#ifdef ROCM
+    const char *module_ext = "hsaco";
+    atmi_platform_type_t module_type = AMDGCN;
+#else
+    const char *module_ext = "o";
+    atmi_platform_type_t module_type = BRIG;
+#endif
+
+    char reduce_kernel_filename[1024];
+    char gen_kernel_filename[1024];
+    int cx;
+
+    cx = snprintf(reduce_kernel_filename, sizeof(reduce_kernel_filename),
+            "%s/runtime/src/%s/chpl-hsa-reducekernels.%s", CHPL_HOME,
+            CHPL_RUNTIME_OBJDIR, module_ext);
+    if (cx < 0 || (size_t)cx >= sizeof(reduce_kernel_filename)) {
+        OUTPUT_ATMI_STATUS(ATMI_STATUS_ERROR, Creating reduce kernel filename);
+    }
+
+    /* The binary name is the first space-delimited token of the execution
+     * command; format it in place so no scratch copy can be leaked. */
+    const char *binName = chpl_executionCommand +
+        strspn(chpl_executionCommand, " ");
+    cx = snprintf(gen_kernel_filename, sizeof(gen_kernel_filename),
+            "%.*s_gpu.%s", (int)strcspn(binName, " "), binName, module_ext);
+    if (cx < 0 || (size_t)cx >= sizeof(gen_kernel_filename)) {
+        OUTPUT_ATMI_STATUS(ATMI_STATUS_ERROR, Creating generated kernel filename);
+    }
+
+    /* FIXME: Create all reduction kernels, not just the int64-sum kernel */
+    const char *modules[2] = {reduce_kernel_filename, gen_kernel_filename};
+    atmi_platform_type_t module_types[2] = {module_type, module_type};
+    st = atmi_module_register(modules, module_types, 2);
+    OUTPUT_ATMI_STATUS(st, Registering all modules);
+
+    /* Reduction kernel arguments: two 64-bit values and a 32-bit count. */
+    size_t reduction_arg_sizes[] = {sizeof(uint64_t), sizeof(uint64_t), sizeof(uint32_t)};
+    const unsigned int num_reduction_args = sizeof(reduction_arg_sizes)/sizeof(reduction_arg_sizes[0]);
+    atmi_kernel_create_empty(&reduction_kernel, num_reduction_args, reduction_arg_sizes);
+    atmi_kernel_add_gpu_impl(reduction_kernel, "reduce_int64_sum", REDUCTION_GPU_IMPL);
+
+    /* Each generated kernel takes a single 64-bit argument. */
+    size_t kernel_arg_sizes[] = {sizeof(uint64_t)};
+    const unsigned int num_kernel_args = sizeof(kernel_arg_sizes)/sizeof(kernel_arg_sizes[0]);
+    gpu_kernels = (atmi_kernel_t *)chpl_malloc(sizeof(atmi_kernel_t) * chpl_num_gpu_kernels);
+    for (int64_t i = 0; i < chpl_num_gpu_kernels; ++i) {
+        //FIXME: get the actual kernel name
+        const char *kernel_name = chpl_gpu_kernels[i];
+        atmi_kernel_create_empty(&gpu_kernels[i], num_kernel_args, kernel_arg_sizes);
+        atmi_kernel_add_gpu_impl(gpu_kernels[i], kernel_name, GPU_KERNEL_IMPL);
+    }
+
+    return ATMI_STATUS_SUCCESS;
+}
+
+/** Free the generated-kernel table and finalize ATMI; always returns 0. */
+int hsa_shutdown(void)
+{
+    chpl_free(gpu_kernels);
+    gpu_kernels = NULL; /* do not leave a dangling table pointer behind */
+    atmi_finalize();
+    return 0; /* the original fell off the end of a non-void function */
+}
+ 
+/*
+ * Synchronously launch gpu_kernels[kernel_idx] on GPU 0, using 1-D launch
+ * parameters built from wkitem_count_x and a group size of wkgrp_size_x.
+ */
+void hsa_enqueue_kernel(int kernel_idx, uint32_t wkgrp_size_x,
+        uint32_t wkitem_count_x, void *bundled_args)
+{
+    ATMI_LPARM_1D(launch, wkitem_count_x);
+    void *kernargs[] = {&bundled_args};
+    launch->place = (atmi_place_t)ATMI_PLACE_GPU(0, 0);
+    launch->kernel_id = GPU_KERNEL_IMPL;
+    launch->synchronous = ATMI_TRUE;
+    launch->groupDim[0] = wkgrp_size_x;
+    atmi_task_launch(launch, gpu_kernels[kernel_idx], kernargs);
+}
+
diff --git a/util/setchplenv_hsa.bash b/util/setchplenv_hsa.bash
@@ -105,6 +105,10 @@ if [ "$1" == "debug" ]; then
     export CHPL_DEBUG=1
 fi
 
+echo -n "Setting CHPL_ROCM"
+export CHPL_ROCM=1
+echo " to 1"
+
 echo -n "Setting CHPL_LOCALE_MODEL"
 export CHPL_LOCALE_MODEL=hsa
 echo " to hsa"