|
| 1 | + |
| 2 | +#define _GNU_SOURCE |
| 3 | +#include <stdio.h> |
| 4 | +#include <stdlib.h> |
| 5 | +#include <string.h> |
| 6 | +#include <assert.h> |
| 7 | +#include <unistd.h> |
| 8 | +#include <sys/time.h> |
| 9 | +#include "chpl-atmi.h" |
| 10 | +#include "chplrt.h" |
| 11 | +#include "chpl-mem.h" |
| 12 | +#include "chplcgfns.h" |
| 13 | + |
/*
 * Report a failed ATMI call and bail out of the calling function.
 *
 * When `status` differs from ATMI_STATUS_SUCCESS this prints the stringified
 * `msg` together with the status code to stderr, calls atmi_finalize(), and
 * returns the status from the ENCLOSING function. It may therefore only be
 * used inside functions whose return type accepts an ATMI status value.
 *
 * `status` is evaluated exactly once. The do/while (0) wrapper makes the
 * expansion behave as a single statement, so it is safe in unbraced
 * if/else bodies; callers must terminate the invocation with a semicolon.
 */
#define OUTPUT_ATMI_STATUS(status, msg) \
do { \
  const atmi_status_t output_atmi_status_ = (status); \
  if (ATMI_STATUS_SUCCESS != output_atmi_status_) { \
    fprintf(stderr, "ATMI support: %s failed, error code: 0x%x\n", \
            #msg, (unsigned int)output_atmi_status_); \
    atmi_finalize(); \
    return output_atmi_status_; \
  } \
} while (0)
| 23 | + |
| 24 | +/** |
| 25 | + * Initialize the ATMI/HSA runtime |
| 26 | + */ |
| 27 | +int chpl_hsa_initialize(void) |
| 28 | +{ |
| 29 | + atmi_status_t st = atmi_init(ATMI_DEVTYPE_ALL); |
| 30 | + if(st != ATMI_STATUS_SUCCESS) return -1; |
| 31 | + |
| 32 | + char reduce_kernel_filename[1024]; |
| 33 | + char gen_kernel_filename[1024]; |
| 34 | + int arglen = strlen(chpl_executionCommand)+1; |
| 35 | + char* argCopy = chpl_mem_allocMany(arglen, sizeof(char), |
| 36 | + CHPL_RT_MD_CFG_ARG_COPY_DATA, 0, 0); |
| 37 | + char *binName; |
| 38 | + int cx; |
| 39 | + |
| 40 | + cx = snprintf(reduce_kernel_filename, 1024, |
| 41 | +#ifdef ROCM |
| 42 | + "%s/runtime/src/%s/chpl-hsa-reducekernels.hsaco", CHPL_HOME, |
| 43 | +#else |
| 44 | + "%s/runtime/src/%s/chpl-hsa-reducekernels.o", CHPL_HOME, |
| 45 | +#endif |
| 46 | + CHPL_RUNTIME_OBJDIR); |
| 47 | + if (cx < 0 || cx >= 256) { |
| 48 | + OUTPUT_ATMI_STATUS(ATMI_STATUS_ERROR, Creating reduce kernel filename); |
| 49 | + } |
| 50 | + strcpy(argCopy, chpl_executionCommand); |
| 51 | + binName = strtok(argCopy, " "); |
| 52 | +#ifdef ROCM |
| 53 | + cx = snprintf(gen_kernel_filename, 1024, "%s_gpu.hsaco", binName); |
| 54 | +#else |
| 55 | + cx = snprintf(gen_kernel_filename, 1024, "%s_gpu.o", binName); |
| 56 | +#endif |
| 57 | + if (cx < 0 || cx >= 256) { |
| 58 | + OUTPUT_ATMI_STATUS(ATMI_STATUS_ERROR, Creating generated kernel filename); |
| 59 | + } |
| 60 | + chpl_mem_free(argCopy, 0, 0); |
| 61 | + |
| 62 | +#ifdef ROCM |
| 63 | + atmi_platform_type_t module_type = AMDGCN; |
| 64 | +#else |
| 65 | + atmi_platform_type_t module_type = BRIG; |
| 66 | +#endif |
| 67 | + |
| 68 | + /* FIXME: Create all reduction kernels, not just the int64-sum kernel */ |
| 69 | + const char *modules[2] = {reduce_kernel_filename, gen_kernel_filename}; |
| 70 | + atmi_platform_type_t module_types[2] = {module_type, module_type}; |
| 71 | + st = atmi_module_register(modules, module_types, 2); |
| 72 | + OUTPUT_ATMI_STATUS(st, Registering all modules); |
| 73 | + |
| 74 | + size_t reduction_arg_sizes[] = {sizeof(uint64_t), sizeof(uint64_t), sizeof(uint32_t)}; |
| 75 | + const unsigned int num_reduction_args = sizeof(reduction_arg_sizes)/sizeof(reduction_arg_sizes[0]); |
| 76 | + atmi_kernel_create_empty(&reduction_kernel, num_reduction_args, reduction_arg_sizes); |
| 77 | + atmi_kernel_add_gpu_impl(reduction_kernel, "reduce_int64_sum", REDUCTION_GPU_IMPL); |
| 78 | + |
| 79 | + size_t kernel_arg_sizes[] = {sizeof(uint64_t)}; |
| 80 | + const unsigned int num_kernel_args = sizeof(kernel_arg_sizes)/sizeof(kernel_arg_sizes[0]); |
| 81 | + gpu_kernels = (atmi_kernel_t *)chpl_malloc(sizeof(atmi_kernel_t) * chpl_num_gpu_kernels); |
| 82 | + for (int64_t i = 0; i < chpl_num_gpu_kernels; ++i) { |
| 83 | + //FIXME: get the actual kernel name |
| 84 | + const char *kernel_name = chpl_gpu_kernels[i]; |
| 85 | + atmi_kernel_create_empty(&gpu_kernels[i], num_kernel_args, kernel_arg_sizes); |
| 86 | + atmi_kernel_add_gpu_impl(gpu_kernels[i], kernel_name, GPU_KERNEL_IMPL); |
| 87 | + } |
| 88 | + |
| 89 | + return ATMI_STATUS_SUCCESS; |
| 90 | +} |
| 91 | + |
| 92 | +/** |
| 93 | + * Release resources used by the base kernels and tear down the HSA structures |
| 94 | + */ |
| 95 | +int hsa_shutdown(void) |
| 96 | +{ |
| 97 | + chpl_free(gpu_kernels); |
| 98 | + atmi_finalize(); |
| 99 | +} |
| 100 | + |
| 101 | +/* |
| 102 | + * Enqueue/execute a kernel |
| 103 | + */ |
| 104 | +void hsa_enqueue_kernel(int kernel_idx, uint32_t wkgrp_size_x, |
| 105 | + uint32_t wkitem_count_x, void *bundled_args) |
| 106 | +{ |
| 107 | + void *args[] = {&bundled_args}; |
| 108 | + ATMI_LPARM_1D(lparm, wkitem_count_x); |
| 109 | + lparm->groupDim[0] = wkgrp_size_x; |
| 110 | + lparm->synchronous = ATMI_TRUE; |
| 111 | + |
| 112 | + lparm->kernel_id = GPU_KERNEL_IMPL; |
| 113 | + lparm->place = (atmi_place_t)ATMI_PLACE_GPU(0, 0); |
| 114 | + atmi_task_launch(lparm, gpu_kernels[kernel_idx], args); |
| 115 | +} |
| 116 | + |
0 commit comments