Skip to content

Commit eaf782d

Browse files
authored
[UR] Add new urKernelGetSuggestedLocalWorkSizeWithArgs() API (intel#21442)
We want to remove the ability to set kernel arguments separately from launching a kernel. However, the OpenCL spec requires all kernel arguments to be set before `urKernelGetSuggestedLocalWorkSize()` is called, so reaching that goal requires a new `urKernelGetSuggestedLocalWorkSizeWithArgs()` API that receives the arguments itself. For now, `urKernelGetSuggestedLocalWorkSizeWithArgs()` only calls `urKernelGetSuggestedLocalWorkSize()`, because a full implementation requires further changes in the sanitizer layer, which will be made in follow-up pull requests. See the conversation: intel#21290 (comment) Ref: intel#21290 --------- Signed-off-by: Lukasz Dorau <lukasz.dorau@intel.com>
1 parent 1d26f9f commit eaf782d

33 files changed

Lines changed: 702 additions & 10 deletions

unified-runtime/include/unified-runtime/ur_api.h

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -515,6 +515,8 @@ typedef enum ur_function_t {
515515
UR_FUNCTION_ENQUEUE_HOST_TASK_EXP = 309,
516516
/// Enumerator for ::urCommandBufferAppendKernelLaunchWithArgsExp
517517
UR_FUNCTION_COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_WITH_ARGS_EXP = 310,
518+
/// Enumerator for ::urKernelGetSuggestedLocalWorkSizeWithArgs
519+
UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS = 311,
518520
/// @cond
519521
UR_FUNCTION_FORCE_UINT32 = 0x7fffffff
520522
/// @endcond
@@ -9501,6 +9503,55 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
95019503
/// suggested local work size that will contain the result of the query
95029504
size_t *pSuggestedLocalWorkSize);
95039505

9506+
///////////////////////////////////////////////////////////////////////////////
9507+
/// @brief Set kernel args and get the suggested local work size for a kernel.
9508+
///
9509+
/// @details
9510+
/// - Query a suggested local work size for a kernel given a global size for
9511+
/// each dimension.
9512+
/// - The application may call this function from simultaneous threads for
9513+
/// the same context.
9514+
///
9515+
/// @returns
9516+
/// - ::UR_RESULT_SUCCESS
9517+
/// - ::UR_RESULT_ERROR_UNINITIALIZED
9518+
/// - ::UR_RESULT_ERROR_DEVICE_LOST
9519+
/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
9520+
/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
9521+
/// + `NULL == hKernel`
9522+
/// + `NULL == hQueue`
9523+
/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
9524+
/// + `NULL == pGlobalWorkOffset`
9525+
/// + `NULL == pGlobalWorkSize`
9526+
/// + `NULL == pSuggestedLocalWorkSize`
9527+
/// + `pArgs == NULL && numArgs > 0`
9528+
/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION
9529+
/// + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type`
9530+
/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE
9531+
UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs(
9532+
/// [in] handle of the kernel
9533+
ur_kernel_handle_t hKernel,
9534+
/// [in] handle of the queue object
9535+
ur_queue_handle_t hQueue,
9536+
/// [in] number of dimensions, from 1 to 3, to specify the global
9537+
/// and work-group work-items
9538+
uint32_t numWorkDim,
9539+
/// [in] pointer to an array of numWorkDim unsigned values that specify
9540+
/// the offset used to calculate the global ID of a work-item
9541+
const size_t *pGlobalWorkOffset,
9542+
/// [in] pointer to an array of numWorkDim unsigned values that specify
9543+
/// the number of global work-items in workDim that will execute the
9544+
/// kernel function
9545+
const size_t *pGlobalWorkSize,
9546+
/// [in] Number of entries in pArgs
9547+
uint32_t numArgs,
9548+
/// [in][optional][range(0, numArgs)] pointer to a list of kernel arg
9549+
/// properties.
9550+
const ur_exp_kernel_arg_properties_t *pArgs,
9551+
/// [out] pointer to an array of numWorkDim unsigned values that specify
9552+
/// suggested local work size that will contain the result of the query
9553+
size_t *pSuggestedLocalWorkSize);
9554+
95049555
///////////////////////////////////////////////////////////////////////////////
95059556
/// @brief Query the maximum number of work groups for a cooperative kernel
95069557
///
@@ -14581,6 +14632,21 @@ typedef struct ur_kernel_get_suggested_local_work_size_params_t {
1458114632
size_t **ppSuggestedLocalWorkSize;
1458214633
} ur_kernel_get_suggested_local_work_size_params_t;
1458314634

14635+
///////////////////////////////////////////////////////////////////////////////
14636+
/// @brief Function parameters for urKernelGetSuggestedLocalWorkSizeWithArgs
14637+
/// @details Each entry is a pointer to the parameter passed to the function;
14638+
/// allowing the callback the ability to modify the parameter's value
14639+
typedef struct ur_kernel_get_suggested_local_work_size_with_args_params_t {
14640+
ur_kernel_handle_t *phKernel;
14641+
ur_queue_handle_t *phQueue;
14642+
uint32_t *pnumWorkDim;
14643+
const size_t **ppGlobalWorkOffset;
14644+
const size_t **ppGlobalWorkSize;
14645+
uint32_t *pnumArgs;
14646+
const ur_exp_kernel_arg_properties_t **ppArgs;
14647+
size_t **ppSuggestedLocalWorkSize;
14648+
} ur_kernel_get_suggested_local_work_size_with_args_params_t;
14649+
1458414650
///////////////////////////////////////////////////////////////////////////////
1458514651
/// @brief Function parameters for urKernelSetArgValue
1458614652
/// @details Each entry is a pointer to the parameter passed to the function;

unified-runtime/include/unified-runtime/ur_api_funcs.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ _UR_API(urKernelRelease)
7272
_UR_API(urKernelGetNativeHandle)
7373
_UR_API(urKernelCreateWithNativeHandle)
7474
_UR_API(urKernelGetSuggestedLocalWorkSize)
75+
_UR_API(urKernelGetSuggestedLocalWorkSizeWithArgs)
7576
_UR_API(urKernelSetArgValue)
7677
_UR_API(urKernelSetArgLocal)
7778
_UR_API(urKernelSetArgPointer)

unified-runtime/include/unified-runtime/ur_ddi.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -521,6 +521,13 @@ typedef ur_result_t(UR_APICALL *ur_pfnKernelGetSuggestedLocalWorkSize_t)(
521521
ur_kernel_handle_t, ur_queue_handle_t, uint32_t, const size_t *,
522522
const size_t *, size_t *);
523523

524+
///////////////////////////////////////////////////////////////////////////////
525+
/// @brief Function-pointer for urKernelGetSuggestedLocalWorkSizeWithArgs
526+
typedef ur_result_t(
527+
UR_APICALL *ur_pfnKernelGetSuggestedLocalWorkSizeWithArgs_t)(
528+
ur_kernel_handle_t, ur_queue_handle_t, uint32_t, const size_t *,
529+
const size_t *, uint32_t, const ur_exp_kernel_arg_properties_t *, size_t *);
530+
524531
///////////////////////////////////////////////////////////////////////////////
525532
/// @brief Function-pointer for urKernelSetArgValue
526533
typedef ur_result_t(UR_APICALL *ur_pfnKernelSetArgValue_t)(
@@ -580,6 +587,8 @@ typedef struct ur_kernel_dditable_t {
580587
ur_pfnKernelGetNativeHandle_t pfnGetNativeHandle;
581588
ur_pfnKernelCreateWithNativeHandle_t pfnCreateWithNativeHandle;
582589
ur_pfnKernelGetSuggestedLocalWorkSize_t pfnGetSuggestedLocalWorkSize;
590+
ur_pfnKernelGetSuggestedLocalWorkSizeWithArgs_t
591+
pfnGetSuggestedLocalWorkSizeWithArgs;
583592
ur_pfnKernelSetArgValue_t pfnSetArgValue;
584593
ur_pfnKernelSetArgLocal_t pfnSetArgLocal;
585594
ur_pfnKernelSetArgPointer_t pfnSetArgPointer;

unified-runtime/include/unified-runtime/ur_print.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2169,6 +2169,19 @@ urPrintKernelGetSuggestedLocalWorkSizeParams(
21692169
const struct ur_kernel_get_suggested_local_work_size_params_t *params,
21702170
char *buffer, const size_t buff_size, size_t *out_size);
21712171

2172+
///////////////////////////////////////////////////////////////////////////////
2173+
/// @brief Print ur_kernel_get_suggested_local_work_size_with_args_params_t
2174+
/// struct
2175+
/// @returns
2176+
/// - ::UR_RESULT_SUCCESS
2177+
/// - ::UR_RESULT_ERROR_INVALID_SIZE
2178+
/// - `buff_size < out_size`
2179+
UR_APIEXPORT ur_result_t UR_APICALL
2180+
urPrintKernelGetSuggestedLocalWorkSizeWithArgsParams(
2181+
const struct ur_kernel_get_suggested_local_work_size_with_args_params_t
2182+
*params,
2183+
char *buffer, const size_t buff_size, size_t *out_size);
2184+
21722185
///////////////////////////////////////////////////////////////////////////////
21732186
/// @brief Print ur_kernel_set_arg_value_params_t struct
21742187
/// @returns

unified-runtime/include/unified-runtime/ur_print.hpp

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1373,6 +1373,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) {
13731373
case UR_FUNCTION_COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_WITH_ARGS_EXP:
13741374
os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_WITH_ARGS_EXP";
13751375
break;
1376+
case UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS:
1377+
os << "UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS";
1378+
break;
13761379
default:
13771380
os << "unknown enumerator";
13781381
break;
@@ -14923,6 +14926,67 @@ operator<<(std::ostream &os, [[maybe_unused]] const struct
1492314926
return os;
1492414927
}
1492514928

14929+
///////////////////////////////////////////////////////////////////////////////
14930+
/// @brief Print operator for the
14931+
/// ur_kernel_get_suggested_local_work_size_with_args_params_t type
14932+
/// @returns
14933+
/// std::ostream &
14934+
inline std::ostream &
14935+
operator<<(std::ostream &os, [[maybe_unused]] const struct
14936+
ur_kernel_get_suggested_local_work_size_with_args_params_t *params) {
14937+
14938+
os << ".hKernel = ";
14939+
14940+
ur::details::printPtr(os, *(params->phKernel));
14941+
14942+
os << ", ";
14943+
os << ".hQueue = ";
14944+
14945+
ur::details::printPtr(os, *(params->phQueue));
14946+
14947+
os << ", ";
14948+
os << ".numWorkDim = ";
14949+
14950+
os << *(params->pnumWorkDim);
14951+
14952+
os << ", ";
14953+
os << ".pGlobalWorkOffset = ";
14954+
14955+
ur::details::printPtr(os, *(params->ppGlobalWorkOffset));
14956+
14957+
os << ", ";
14958+
os << ".pGlobalWorkSize = ";
14959+
14960+
ur::details::printPtr(os, *(params->ppGlobalWorkSize));
14961+
14962+
os << ", ";
14963+
os << ".numArgs = ";
14964+
14965+
os << *(params->pnumArgs);
14966+
14967+
os << ", ";
14968+
os << ".pArgs = ";
14969+
ur::details::printPtr(os, reinterpret_cast<const void *>(*(params->ppArgs)));
14970+
if (*(params->ppArgs) != NULL) {
14971+
os << " {";
14972+
for (size_t i = 0; i < *params->pnumArgs; ++i) {
14973+
if (i != 0) {
14974+
os << ", ";
14975+
}
14976+
14977+
os << (*(params->ppArgs))[i];
14978+
}
14979+
os << "}";
14980+
}
14981+
14982+
os << ", ";
14983+
os << ".pSuggestedLocalWorkSize = ";
14984+
14985+
ur::details::printPtr(os, *(params->ppSuggestedLocalWorkSize));
14986+
14987+
return os;
14988+
}
14989+
1492614990
///////////////////////////////////////////////////////////////////////////////
1492714991
/// @brief Print operator for the ur_kernel_set_arg_value_params_t type
1492814992
/// @returns
@@ -22582,6 +22646,10 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os,
2258222646
os << (const struct ur_kernel_get_suggested_local_work_size_params_t *)
2258322647
params;
2258422648
} break;
22649+
case UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS: {
22650+
os << (const struct
22651+
ur_kernel_get_suggested_local_work_size_with_args_params_t *)params;
22652+
} break;
2258522653
case UR_FUNCTION_KERNEL_SET_ARG_VALUE: {
2258622654
os << (const struct ur_kernel_set_arg_value_params_t *)params;
2258722655
} break;

unified-runtime/scripts/core/kernel.yml

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -602,6 +602,55 @@ returns:
602602
- $X_RESULT_ERROR_UNSUPPORTED_FEATURE
603603
--- #--------------------------------------------------------------------------
604604
type: function
605+
desc: "Set kernel args and get the suggested local work size for a kernel."
606+
class: $xKernel
607+
name: GetSuggestedLocalWorkSizeWithArgs
608+
ordinal: "0"
609+
details:
610+
- "Query a suggested local work size for a kernel given a global size for each dimension."
611+
- "The application may call this function from simultaneous threads for the same context."
612+
params:
613+
- type: $x_kernel_handle_t
614+
name: hKernel
615+
desc: |
616+
[in] handle of the kernel
617+
- type: $x_queue_handle_t
618+
name: hQueue
619+
desc: |
620+
[in] handle of the queue object
621+
- type: uint32_t
622+
name: numWorkDim
623+
desc: |
624+
[in] number of dimensions, from 1 to 3, to specify the global
625+
and work-group work-items
626+
- type: const size_t*
627+
name: pGlobalWorkOffset
628+
desc: |
629+
[in] pointer to an array of numWorkDim unsigned values that specify
630+
the offset used to calculate the global ID of a work-item
631+
- type: const size_t*
632+
name: pGlobalWorkSize
633+
desc: |
634+
[in] pointer to an array of numWorkDim unsigned values that specify
635+
the number of global work-items in workDim that will execute the
636+
kernel function
637+
- type: uint32_t
638+
name: numArgs
639+
desc: "[in] Number of entries in pArgs"
640+
- type: "const $x_exp_kernel_arg_properties_t*"
641+
name: pArgs
642+
desc: "[in][optional][range(0, numArgs)] pointer to a list of kernel arg properties."
643+
- type: size_t*
644+
name: pSuggestedLocalWorkSize
645+
desc: |
646+
[out] pointer to an array of numWorkDim unsigned values that specify
647+
suggested local work size that will contain the result of the query
648+
returns:
649+
- $X_RESULT_ERROR_INVALID_NULL_POINTER:
650+
- "`pArgs == NULL && numArgs > 0`"
651+
- $X_RESULT_ERROR_UNSUPPORTED_FEATURE
652+
--- #--------------------------------------------------------------------------
653+
type: function
605654
desc: "Query the maximum number of work groups for a cooperative kernel"
606655
class: $xKernel
607656
name: SuggestMaxCooperativeGroupCount

unified-runtime/scripts/core/registry.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -730,7 +730,10 @@ etors:
730730
- name: COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_WITH_ARGS_EXP
731731
desc: Enumerator for $xCommandBufferAppendKernelLaunchWithArgsExp
732732
value: '310'
733-
max_id: '310'
733+
- name: KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS
734+
desc: Enumerator for $xKernelGetSuggestedLocalWorkSizeWithArgs
735+
value: '311'
736+
max_id: '311'
734737
---
735738
type: enum
736739
desc: Defines structure types

unified-runtime/source/adapters/cuda/kernel.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -445,6 +445,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
445445
return UR_RESULT_SUCCESS;
446446
}
447447

448+
// Adapter entry point for urKernelGetSuggestedLocalWorkSizeWithArgs.
// numArgs and pArgs are not used here (hence [[maybe_unused]]): the remaining
// parameters are passed unchanged to urKernelGetSuggestedLocalWorkSize() and
// its result is returned as-is.
UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs(
449+
ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim,
450+
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
451+
[[maybe_unused]] uint32_t numArgs,
452+
[[maybe_unused]] const ur_exp_kernel_arg_properties_t *pArgs,
453+
size_t *pSuggestedLocalWorkSize) {
454+
return urKernelGetSuggestedLocalWorkSize(hKernel, hQueue, workDim,
455+
pGlobalWorkOffset, pGlobalWorkSize,
456+
pSuggestedLocalWorkSize);
457+
}
458+
448459
UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants(
449460
ur_kernel_handle_t, uint32_t, const ur_specialization_constant_info_t *) {
450461
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;

unified-runtime/source/adapters/cuda/ur_interface_loader.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
141141
pDdiTable->pfnSetExecInfo = urKernelSetExecInfo;
142142
pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants;
143143
pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize;
144+
pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs =
145+
urKernelGetSuggestedLocalWorkSizeWithArgs;
144146
pDdiTable->pfnSuggestMaxCooperativeGroupCount =
145147
urKernelSuggestMaxCooperativeGroupCount;
146148
return UR_RESULT_SUCCESS;

unified-runtime/source/adapters/hip/kernel.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,3 +373,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
373373
pSuggestedLocalWorkSize);
374374
return UR_RESULT_SUCCESS;
375375
}
376+
377+
// Adapter entry point for urKernelGetSuggestedLocalWorkSizeWithArgs.
// numArgs and pArgs are not used here (hence [[maybe_unused]]): the remaining
// parameters are passed unchanged to urKernelGetSuggestedLocalWorkSize() and
// its result is returned as-is.
UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs(
378+
ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim,
379+
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
380+
[[maybe_unused]] uint32_t numArgs,
381+
[[maybe_unused]] const ur_exp_kernel_arg_properties_t *pArgs,
382+
size_t *pSuggestedLocalWorkSize) {
383+
return urKernelGetSuggestedLocalWorkSize(hKernel, hQueue, workDim,
384+
pGlobalWorkOffset, pGlobalWorkSize,
385+
pSuggestedLocalWorkSize);
386+
}

0 commit comments

Comments
 (0)