Skip to content

Commit 8e628e8

Browse files
tqchen and seberg
committed
Incorporate feedbacks from sberg
Co-authored-by: Sebastian Berg <sebastianb@nvidia.com>
1 parent bddb25b commit 8e628e8

File tree

1 file changed

+104
-109
lines changed

1 file changed

+104
-109
lines changed

include/dlpack/dlpack.h

Lines changed: 104 additions & 109 deletions
Original file line numberDiff line numberDiff line change
@@ -362,57 +362,51 @@ typedef struct DLManagedTensorVersioned {
362362
DLTensor dl_tensor;
363363
} DLManagedTensorVersioned;
364364

365-
//--------------------------------------------------------------------
366-
// DLPack C functions for speed exchange
367-
//--------------------------------------------------------------------
365+
//----------------------------------------------------------------------
366+
// DLPack `__c_dlpack_exchange_api__` fast exchange protocol definitions
367+
//----------------------------------------------------------------------
368368
/*!
369-
* \brief A generic C-style allocator that exposes allocation of a Tensor/Array.
369+
* \brief Request a producer library to create a new tensor.
370370
*
371-
* This information can then be used to set allocators of a callee to run allocations.
372-
* This information can then be used to set the callee's allocator to perform allocations.
373-
* This function can be exposed by the framework through the DLPackExchangeAPI.
371+
* Create a new `DLManagedTensorVersioned` within the context of the producer
372+
* library. The allocation is defined via the prototype DLTensor.
374373
*
375-
* This particular function does not assume a Python environment; as a result,
376-
* the error handling mechanism is different from Python-related functions.
374+
* This function is exposed by the framework through the DLPackExchangeAPI.
377375
*
378-
* \param prototype The prototype DLTensor to offer details about the device and shape.
379-
* Other field information will be ignored during allocation.
376+
* \param prototype The prototype DLTensor. Only the dtype, ndim, shape,
377+
* and device fields are used.
380378
* \param out The output DLManagedTensorVersioned.
381-
* \param error_ctx The context to set the error.
379+
* \param error_ctx Context for `SetError`.
382380
* \param SetError The function to set the error.
383-
* \return 0 on success, -1 on failure.
384-
* The callee should call SetError(error_ctx, kind, message) to set the error kind and message.
385-
* \note Error propagation via SetError.
381+
* \return The owning DLManagedTensorVersioned* or NULL on failure.
382+
* SetError is called exactly when NULL is returned (the implementor
383+
* must ensure this).
384+
* \note - As a C function, must not throw C++ exceptions.
385+
* - Error propagation via SetError to avoid any direct need
386+
* of Python API. Due to this `SetError` may have to ensure the GIL is
387+
* held since it will presumably set a Python error.
386388
*
387389
* \sa DLPackExchangeAPI
388390
*/
389-
typedef int (*DLPackManagedTensorAllocator)( //
390-
DLTensor* prototype, DLManagedTensorVersioned** out, void* error_ctx, //
391-
void (*SetError)(void* error_ctx, const char* kind, const char* message) //
391+
typedef int (*DLPackManagedTensorAllocator)( //
392+
DLTensor* prototype, DLManagedTensorVersioned** out, void* error_ctx, //
393+
void (*SetError)(void* error_ctx, const char* kind, const char* message) //
392394
);
393395

394396
/*!
395397
* \brief Exports a PyObject* Tensor/NDArray to a DLManagedTensorVersioned.
396398
*
397-
* This function is a C-style function pointer to quickly convert a PyObject* Tensor/NDArray
398-
* to a DLManagedTensorVersioned without going through the Python interpreter.
399-
*
400399
* This function does not perform any stream synchronization. The consumer should query
401400
* DLPackCurrentWorkStream to get the current work stream and launch kernels on it.
402401
*
403402
* This function is exposed by the framework through the DLPackExchangeAPI.
404403
*
405-
* This information can then be picked up by importers and libraries to perform a fast conversion.
406-
* This function should not throw any exceptions; if it fails, it should return -1 and
407-
* set the error message via PyErr_SetXXX.
408-
*
409-
* \param py_object The Python object to convert; this should be PyObject*.
410-
* We use void* to avoid dependency on Python.h.
411-
*
412-
* \param out The output DLManagedTensorVersioned.
413-
* \return 0 on success, -1 on failure. PyError should be set if -1 is returned.
414-
* \note We use void* to avoid dependency on Python.h, so this specific type is
415-
* not dependent on Python.h and can be copied to dlpack.h.
404+
* \param py_object The Python object to convert. Must have the same type
405+
* as the one the `DLPackExchangeAPI` was discovered from.
406+
* \return The owning DLManagedTensorVersioned* or NULL on failure with a
407+
* Python exception set. If the data cannot be described using DLPack
408+
* this should be a BufferError if possible.
409+
* \note - As a C function, must not throw C++ exceptions.
416410
*
417411
* \sa DLPackExchangeAPI, DLPackCurrentWorkStream
418412
*/
@@ -422,38 +416,26 @@ typedef int (*DLPackManagedTensorFromPyObjectNoSync)( //
422416
);
423417

424418
/*!
425-
* \brief Exports a PyObject* Tensor/NDArray to a DLTensor whose space is pre-allocated on stack.
426-
*
427-
* This function is a C-style function pointer to quickly convert a PyObject* Tensor/NDArray
428-
* to a DLTensor whose space is pre-allocated on stack without going through the Python interpreter.
419+
* \brief Exports a PyObject* Tensor/NDArray to a provided DLTensor.
429420
*
430-
* This is an non-owning conversion, the producer still owns the memory of data, strides, shape.
431-
* The liveness of DLTensor is only guaranteed until the consumer returns control to the caller.
421+
* This function provides a faster interface for temporary, non-owning exchange.
422+
* The producer (implementor) still owns the memory of data, strides, shape.
423+
* The liveness of the DLTensor and the data it views is only guaranteed until
424+
* control is returned.
432425
*
433-
* In the context of this function, we expect the producer to allocated space for data, strides and shape.
426+
* This function currently assumes that the producer (implementor) can fill
427+
* in the DLTensor shape and strides without the need for temporary allocations.
434428
*
435429
* This function does not perform any stream synchronization. The consumer should query
436430
* DLPackCurrentWorkStream to get the current work stream and launch kernels on it.
437431
*
438-
* This function is useful when the consumer do not need to retain the tensor memory.
439-
* It generally can provide about 2x faster conversion than DLPackManagedTensorFromPyObjectNoSync.
440-
*
441-
* For cases where consumer may needs to reorganize the tensor memory via temporary managed copy,
442-
* DLPackManagedTensorFromPyObjectNoSync should be used.
443-
*
444432
* This function is exposed by the framework through the DLPackExchangeAPI.
445433
*
446-
* This information can then be picked up by importers and libraries to perform a fast conversion.
447-
* This function should not throw any exceptions; if it fails, it should return -1 and
448-
* set the error message via PyErr_SetXXX.
449-
*
450-
* \param py_object The Python object to convert; this should be PyObject*.
451-
* We use void* to avoid dependency on Python.h.
452-
*
434+
* \param py_object The Python object to convert. Must have the same type
435+
* as the one the `DLPackExchangeAPI` was discovered from.
453436
* \param out The output DLTensor, whose space is pre-allocated on stack.
454-
* \return 0 on success, -1 on failure. PyError should be set if -1 is returned.
455-
* \note We use void* to avoid dependency on Python.h, so this specific type is
456-
* not dependent on Python.h and can be copied to dlpack.h.
437+
* \return 0 on success, -1 on failure with a Python exception set.
438+
* \note - As a C function, must not throw C++ exceptions.
457439
*
458440
* \sa DLPackExchangeAPI, DLPackCurrentWorkStream
459441
*/
@@ -465,21 +447,21 @@ typedef int (*DLPackDLTensorFromPyObjectNoSync)( //
465447
/*!
466448
* \brief Obtain the current work stream of a device.
467449
*
468-
* This function is a C-style function pointer to obtain the current work stream
469-
* of a device for frameworks that rely on a context manager to manage the stream.
450+
* Obtain the current work stream of a device from the producer framework.
470451
* For example, it should map to torch.cuda.current_stream in PyTorch.
471452
*
472-
* This function can be set to NULL if the framework does not rely on a context manager
473-
* to manage the stream. However, we encourage frameworks to provide this function
474-
* if possible.
475-
*
476-
* As if this field is not set, likely consumer cannot safely do stream based
477-
* exchange based on the
453+
* When device_type is kDLCPU, the consumer does not have to query the stream
454+
* and the producer can simply return NULL when queried.
455+
* The consumer does not have to do anything on stream sync or setting.
456+
* So a CPU-only framework can just provide a dummy implementation that
457+
* always sets out_current_stream[0] to NULL.
478458
*
479459
* \param device_type The device type.
480460
* \param device_id The device id.
481461
* \param out_current_stream The output current work stream.
482-
* \return 0 on success, -1 on failure.
462+
*
463+
* \return 0 on success, -1 on failure with a Python exception set.
464+
* \note - As a C function, must not throw C++ exceptions.
483465
*
484466
* \sa DLPackExchangeAPI
485467
*/
@@ -492,54 +474,47 @@ typedef int (*DLPackCurrentWorkStream)( //
492474
/*!
493475
* \brief Imports a DLManagedTensorVersioned to a PyObject* Tensor/NDArray.
494476
*
495-
* This function is a C-style function pointer to quickly convert a DLManagedTensorVersioned
496-
* to a PyObject* without going through the Python Interpreter.
477+
* Convert an owning DLManagedTensorVersioned* to the Python tensor of the
478+
* producer (implementor) library with the correct type.
497479
*
498480
* This function does not perform any stream synchronization.
499481
*
500482
* This function is exposed by the framework through the DLPackExchangeAPI.
501483
*
502-
* \param tensor The DLManagedTensorVersioned to convert.
484+
* \param tensor The DLManagedTensorVersioned to convert; ownership of
485+
* the data is stolen.
503486
* \param out_py_object The output Python object.
504-
* \return 0 on success, -1 on failure. PyError should be set if -1 is returned.
505-
* \note We use void* to avoid dependency on Python.h, so this specific type is
506-
* not dependent on Python.h and can be copied to dlpack.h.
487+
* \return 0 on success, -1 on failure with a Python exception set.
507488
*
508489
* \sa DLPackExchangeAPI
509490
*/
510-
typedef int (*DLPackManagedTensorToPyObjectNoSync)( //
511-
DLManagedTensorVersioned* tensor, void** out_py_object //
491+
typedef int (*DLPackManagedTensorToPyObjectNoSync)( //
492+
DLManagedTensorVersioned* tensor, //
493+
void** out_py_object //
512494
);
513495

514496
/*!
515497
* \brief Framework-specific function pointers table for DLPack exchange.
516498
*
517-
* Guidelines for leveraging DLPackExchangeAPI:
499+
* Additionally to `__dlpack__()` we define a C function table sharable by
500+
* Python implementations via `__c_dlpack_exchange_api__`.
501+
* This attribute must be set on the type as a Python integer compatible
502+
* with `PyLong_FromVoidPtr`/`PyLong_AsVoidPtr`.
518503
*
519-
* There are generally two kinds of consumer needs for DLPack exchange:
520-
* - N0: library support, where consumer.kernel(x, y, z) would like to run a kernel
521-
* with the data from x, y, z. The consumer is also expected to run the kernel with the same
522-
* stream context as the producer. For example, when x, y, z is torch.Tensor,
523-
* consumer should query exchange_api->current_work_stream to get the
524-
* current stream and launch the kernel with the same stream.
525-
* This setup is necessary for no synchronization in kernel launch and maximum compatibility
526-
* with CUDA graph capture in the producer.
527-
* This is the desirable behavior for library extension support for frameworks like PyTorch.
528-
* - N1: data ingestion and retention
504+
* A consumer library may use a pattern such as:
529505
*
530-
* Note that obj.__dlpack__() API should provide useful ways for N1.
531-
* The primary focus of the current DLPackExchangeAPI is to enable faster exchange N0
532-
* with the support of the function pointer current_work_stream.
506+
* \code
533507
*
534-
* Array/Tensor libraries should statically create and initialize this structure
535-
* then return a pointer to DLPackExchangeAPI as an int value in Tensor/Array.
536-
* The DLPackExchangeAPI* should stay alive throughout the lifetime of the process.
508+
* PyObject *api_obj = type(tensor_obj).__c_dlpack_exchange_api__; // as C-code
509+
* MyDLPackExchangeAPI *api = PyLong_AsVoidPtr(api_obj);
510+
* if (api == NULL && PyErr_Occurred()) { goto handle_error; }
537511
*
538-
* One simple way to do so is to create a static instance of DLPackExchangeAPI
539-
* within the framework and return a pointer to it. The following code
540-
* shows an example to do so in C++. It should also be reasonably easy
541-
* to do so in other languages.
512+
* \endcode
513+
*
514+
* Note that this must be defined on the type. The consumer should look up the
515+
* attribute on the type and may cache the result for each unique type.
542516
*
517+
* The precise API table is given by:
543518
* \code
544519
* struct MyDLPackExchangeAPI : public DLPackExchangeAPI {
545520
* MyDLPackExchangeAPI() {
@@ -560,55 +535,75 @@ typedef int (*DLPackManagedTensorToPyObjectNoSync)( //
560535
* };
561536
* \endcode
562537
*
563-
* Each framework should attach a dunder `__c_dlpack_exchange_api__` integer
564-
* to point to the DLPackExchangeAPI* pointer.
538+
* Guidelines for leveraging DLPackExchangeAPI:
565539
*
566-
* Importantly, the attribute should be attached to the class of the Tensor, not the instance.
540+
* There are generally two kinds of consumer needs for DLPack exchange:
541+
* - N0: library support, where consumer.kernel(x, y, z) would like to run a kernel
542+
* with the data from x, y, z. The consumer is also expected to run the kernel with the same
543+
* stream context as the producer. For example, when x, y, z is torch.Tensor,
544+
* consumer should query exchange_api->current_work_stream to get the
545+
* current stream and launch the kernel with the same stream.
546+
* This setup is necessary for no synchronization in kernel launch and maximum compatibility
547+
* with CUDA graph capture in the producer.
548+
* This is the desirable behavior for library extension support for frameworks like PyTorch.
549+
* - N1: data ingestion and retention
567550
*
568-
* mypackage.Tensor.__c_dlpack_exchange_api__ = MyPackageDLPackExchangeAPI
551+
* Note that obj.__dlpack__() API should provide useful ways for N1.
552+
* The primary focus of the current DLPackExchangeAPI is to enable faster exchange N0
553+
* with the support of the function pointer current_work_stream.
569554
*
570-
* or equivalently:
555+
* Array/Tensor libraries should statically create and initialize this structure
556+
* then return a pointer to DLPackExchangeAPI as an int value in Tensor/Array.
557+
* The DLPackExchangeAPI* must stay alive throughout the lifetime of the process.
571558
*
572-
* type(tensor_obj).__c_dlpack_exchange_api__ = MyPackageDLPackExchangeAPI
559+
* One simple way to do so is to create a static instance of DLPackExchangeAPI
560+
* within the framework and return a pointer to it. The following code
561+
* shows an example to do so in C++. It should also be reasonably easy
562+
* to do so in other languages.
573563
*/
574564
struct DLPackExchangeAPI {
575565
/*!
576-
* \brief The current DLPack version.
566+
* \brief The provided DLPack version; the consumer must check major version
567+
* compatibility before using this struct.
577568
*/
578569
DLPackVersion version;
579570
/*!
580571
* \brief Optional pointer to an older DLPackExchangeAPI in the chain.
581572
*
582-
* It should be set to NULL if the framework does not support older versions.
573+
* It must be NULL if the framework does not support older versions.
574+
* If the current major version is larger than the one supported by the
575+
* consumer, the consumer may walk this to find an earlier supported version.
583576
*
584577
* \sa DLPackExchangeAPI
585578
*/
586579
struct DLPackExchangeAPI* prev_version_api;
587580
/*!
588-
* \brief Framework-specific function pointer for DLPackManagedTensorAllocator
581+
* \brief Producer function pointer for DLPackManagedTensorAllocator
582+
* This function must not be NULL.
589583
* \sa DLPackManagedTensorAllocator
590584
*/
591585
DLPackManagedTensorAllocator managed_tensor_allocator;
592586
/*!
593-
* \brief Framework-specific function pointer for DLPackManagedTensorFromPyObject
587+
* \brief Producer function pointer for DLPackManagedTensorFromPyObject
588+
* This function must not be NULL.
594589
* \sa DLPackManagedTensorFromPyObject
595590
*/
596591
DLPackManagedTensorFromPyObjectNoSync managed_tensor_from_py_object_no_sync;
597592
/*!
598-
* \brief Framework-specific function pointer for DLPackManagedTensorToPyObject
593+
* \brief Producer function pointer for DLPackManagedTensorToPyObject
594+
* This function must not be NULL.
599595
* \sa DLPackManagedTensorToPyObject
600596
*/
601597
DLPackManagedTensorToPyObjectNoSync managed_tensor_to_py_object_no_sync;
602598
/*!
603-
* \brief Framework-specific function pointer for DLPackDLTensorFromPyObject
599+
* \brief Producer function pointer for DLPackDLTensorFromPyObject
600+
* This function can be NULL when the producer does not support this function.
604601
* \sa DLPackDLTensorFromPyObjectNoSync
605602
*/
606603
DLPackDLTensorFromPyObjectNoSync dltensor_from_py_object_no_sync;
607604
/*!
608-
* \brief Framework-specific function pointer for DLPackCurrentWorkStream
609-
*
610-
* This function can be set to NULL if the framework does not rely on context manager to manage the stream.
611-
*
605+
* \brief Producer function pointer for DLPackCurrentWorkStream
606+
* This function must not be NULL.
612607
* \sa DLPackCurrentWorkStream
613608
*/
614609
DLPackCurrentWorkStream current_work_stream;

0 commit comments

Comments
 (0)