@@ -362,57 +362,51 @@ typedef struct DLManagedTensorVersioned {
362362 DLTensor dl_tensor;
363363} DLManagedTensorVersioned;
364364
365- // --------------------------------------------------------------------
366- // DLPack C functions for speed exchange
367- // --------------------------------------------------------------------
365+ // ----------------------------------------------------------------------
366+ // DLPack `__c_dlpack_exchange_api__` fast exchange protocol definitions
367+ // ----------------------------------------------------------------------
368368/* !
369- * \brief A generic C-style allocator that exposes allocation of a Tensor/Array .
369+ * \brief Request a producer library to create a new tensor .
370370 *
371- * This information can then be used to set allocators of a callee to run allocations.
372- * This information can then be used to set the callee's allocator to perform allocations.
373- * This function can be exposed by the framework through the DLPackExchangeAPI.
371+ * Create a new `DLManagedTensorVersioned` within the context of the producer
372+ * library. The allocation is defined via the prototype DLTensor.
374373 *
375- * This particular function does not assume a Python environment; as a result,
376- * the error handling mechanism is different from Python-related functions.
374+ * This function is exposed by the framework through the DLPackExchangeAPI.
377375 *
378- * \param prototype The prototype DLTensor to offer details about the device and shape.
379- * Other field information will be ignored during allocation .
376+ * \param prototype The prototype DLTensor. Only the dtype, ndim, shape,
377+ *                  and device fields are used.
380378 * \param out The output DLManagedTensorVersioned.
381- * \param error_ctx The context to set the error .
379+ * \param error_ctx Context for `SetError`.
382380 * \param SetError The function to set the error.
383- * \return 0 on success, -1 on failure.
384- * The callee should call SetError(error_ctx, kind, message) to set the error kind and message.
385- * \note Error propagation via SetError.
381+ * \return 0 on success, -1 on failure.
382+ *         SetError is called exactly when -1 is returned (the implementor
383+ *         must ensure this).
384+ * \note - As a C function, must not throw C++ exceptions.
385+ *       - Error propagation via SetError to avoid any direct need
386+ *         of Python API. Due to this `SetError` may have to ensure the GIL is
387+ *         held since it will presumably set a Python error.
386388 *
387389 * \sa DLPackExchangeAPI
388390 */
389- typedef int (*DLPackManagedTensorAllocator)( //
390- DLTensor* prototype, DLManagedTensorVersioned** out, void * error_ctx, //
391- void (*SetError)(void * error_ctx, const char * kind, const char * message) //
391+ typedef int (*DLPackManagedTensorAllocator)( //
392+ DLTensor* prototype, DLManagedTensorVersioned** out, void * error_ctx, //
393+ void (*SetError)(void * error_ctx, const char * kind, const char * message) //
392394);
393395
394396/* !
395397 * \brief Exports a PyObject* Tensor/NDArray to a DLManagedTensorVersioned.
396398 *
397- * This function is a C-style function pointer to quickly convert a PyObject* Tensor/NDArray
398- * to a DLManagedTensorVersioned without going through the Python interpreter.
399- *
400399 * This function does not perform any stream synchronization. The consumer should query
401400 * DLPackCurrentWorkStream to get the current work stream and launch kernels on it.
402401 *
403402 * This function is exposed by the framework through the DLPackExchangeAPI.
404403 *
405- * This information can then be picked up by importers and libraries to perform a fast conversion.
406- * This function should not throw any exceptions; if it fails, it should return -1 and
407- * set the error message via PyErr_SetXXX.
408- *
409- * \param py_object The Python object to convert; this should be PyObject*.
410- * We use void* to avoid dependency on Python.h.
411- *
412- * \param out The output DLManagedTensorVersioned.
413- * \return 0 on success, -1 on failure. PyError should be set if -1 is returned.
414- * \note We use void* to avoid dependency on Python.h, so this specific type is
415- * not dependent on Python.h and can be copied to dlpack.h.
404+ * \param py_object The Python object to convert. Must have the same type
405+ * as the one the `DLPackExchangeAPI` was discovered from.
406+ * \return 0 on success, -1 on failure with a Python exception set.
407+ *         If the data cannot be described using DLPack
408+ *         this should be a BufferError if possible.
409+ * \note - As a C function, must not throw C++ exceptions.
416410 *
417411 * \sa DLPackExchangeAPI, DLPackCurrentWorkStream
418412 */
@@ -422,38 +416,26 @@ typedef int (*DLPackManagedTensorFromPyObjectNoSync)( //
422416);
423417
424418/* !
425- * \brief Exports a PyObject* Tensor/NDArray to a DLTensor whose space is pre-allocated on stack.
426- *
427- * This function is a C-style function pointer to quickly convert a PyObject* Tensor/NDArray
428- * to a DLTensor whose space is pre-allocated on stack without going through the Python interpreter.
419+ * \brief Exports a PyObject* Tensor/NDArray to a provided DLTensor.
429420 *
430- * This is an non-owning conversion, the producer still owns the memory of data, strides, shape.
431- * The liveness of DLTensor is only guaranteed until the consumer returns control to the caller.
421+ * This function provides a faster interface for temporary, non-owning, exchange.
422+ * The producer (implementor) still owns the memory of data, strides, shape.
423+ * The liveness of the DLTensor and the data it views is only guaranteed until
424+ * control is returned.
432425 *
433- * In the context of this function, we expect the producer to allocated space for data, strides and shape.
426+ * This function currently assumes that the producer (implementor) can fill
427+ * in the DLTensor shape and strides without the need for temporary allocations.
434428 *
435429 * This function does not perform any stream synchronization. The consumer should query
436430 * DLPackCurrentWorkStream to get the current work stream and launch kernels on it.
437431 *
438- * This function is useful when the consumer do not need to retain the tensor memory.
439- * It generally can provide about 2x faster conversion than DLPackManagedTensorFromPyObjectNoSync.
440- *
441- * For cases where consumer may needs to reorganize the tensor memory via temporary managed copy,
442- * DLPackManagedTensorFromPyObjectNoSync should be used.
443- *
444432 * This function is exposed by the framework through the DLPackExchangeAPI.
445433 *
446- * This information can then be picked up by importers and libraries to perform a fast conversion.
447- * This function should not throw any exceptions; if it fails, it should return -1 and
448- * set the error message via PyErr_SetXXX.
449- *
450- * \param py_object The Python object to convert; this should be PyObject*.
451- * We use void* to avoid dependency on Python.h.
452- *
434+ * \param py_object The Python object to convert. Must have the same type
435+ * as the one the `DLPackExchangeAPI` was discovered from.
453436 * \param out The output DLTensor, whose space is pre-allocated on stack.
454- * \return 0 on success, -1 on failure. PyError should be set if -1 is returned.
455- * \note We use void* to avoid dependency on Python.h, so this specific type is
456- * not dependent on Python.h and can be copied to dlpack.h.
437+ * \return 0 on success, -1 on failure with a Python exception set.
438+ * \note - As a C function, must not throw C++ exceptions.
457439 *
458440 * \sa DLPackExchangeAPI, DLPackCurrentWorkStream
459441 */
@@ -465,21 +447,21 @@ typedef int (*DLPackDLTensorFromPyObjectNoSync)( //
465447/* !
466448 * \brief Obtain the current work stream of a device.
467449 *
468- * This function is a C-style function pointer to obtain the current work stream
469- * of a device for frameworks that rely on a context manager to manage the stream.
450+ * Obtain the current work stream of a device from the producer framework.
470451 * For example, it should map to torch.cuda.current_stream in PyTorch.
471452 *
472- * This function can be set to NULL if the framework does not rely on a context manager
473- * to manage the stream. However, we encourage frameworks to provide this function
474- * if possible.
475- *
476- * As if this field is not set, likely consumer cannot safely do stream based
477- * exchange based on the
453+ * When device_type is kDLCPU, the consumer does not have to query the stream
454+ * and the producer can simply return NULL when queried.
455+ * The consumer does not have to do anything about stream synchronization or setting.
456+ * So a CPU-only framework can just provide a dummy implementation that
457+ * always sets out_current_stream[0] to NULL.
478458 *
479459 * \param device_type The device type.
480460 * \param device_id The device id.
481461 * \param out_current_stream The output current work stream.
482- * \return 0 on success, -1 on failure.
462+ *
463+ * \return 0 on success, -1 on failure with a Python exception set.
464+ * \note - As a C function, must not throw C++ exceptions.
483465 *
484466 * \sa DLPackExchangeAPI
485467 */
@@ -492,54 +474,47 @@ typedef int (*DLPackCurrentWorkStream)( //
492474/* !
493475 * \brief Imports a DLManagedTensorVersioned to a PyObject* Tensor/NDArray.
494476 *
495- * This function is a C-style function pointer to quickly convert a DLManagedTensorVersioned
496- * to a PyObject* without going through the Python Interpreter .
477+ * Convert an owning DLManagedTensorVersioned* to the Python tensor of the
478+ * producer (implementor) library with the correct type.
497479 *
498480 * This function does not perform any stream synchronization.
499481 *
500482 * This function is exposed by the framework through the DLPackExchangeAPI.
501483 *
502- * \param tensor The DLManagedTensorVersioned to convert.
484+ * \param tensor The DLManagedTensorVersioned to convert; ownership of
485+ *        the data is stolen.
503486 * \param out_py_object The output Python object.
504- * \return 0 on success, -1 on failure. PyError should be set if -1 is returned.
505- * \note We use void* to avoid dependency on Python.h, so this specific type is
506- * not dependent on Python.h and can be copied to dlpack.h.
487+ * \return 0 on success, -1 on failure with a Python exception set.
507488 *
508489 * \sa DLPackExchangeAPI
509490 */
510- typedef int (*DLPackManagedTensorToPyObjectNoSync)( //
511- DLManagedTensorVersioned* tensor, void ** out_py_object //
491+ typedef int (*DLPackManagedTensorToPyObjectNoSync)( //
492+ DLManagedTensorVersioned* tensor, //
493+ void ** out_py_object //
512494);
513495
514496/* !
515497 * \brief Framework-specific function pointers table for DLPack exchange.
516498 *
517- * Guidelines for leveraging DLPackExchangeAPI:
499+ * In addition to `__dlpack__()` we define a C function table sharable by
500+ * Python implementations via `__c_dlpack_exchange_api__`.
501+ * This attribute must be set on the type as a Python integer compatible
502+ * with `PyLong_FromVoidPtr`/`PyLong_AsVoidPtr`.
518503 *
519- * There are generally two kinds of consumer needs for DLPack exchange:
520- * - N0: library support, where consumer.kernel(x, y, z) would like to run a kernel
521- * with the data from x, y, z. The consumer is also expected to run the kernel with the same
522- * stream context as the producer. For example, when x, y, z is torch.Tensor,
523- * consumer should query exchange_api->current_work_stream to get the
524- * current stream and launch the kernel with the same stream.
525- * This setup is necessary for no synchronization in kernel launch and maximum compatibility
526- * with CUDA graph capture in the producer.
527- * This is the desirable behavior for library extension support for frameworks like PyTorch.
528- * - N1: data ingestion and retention
504+ * A consumer library may use a pattern such as:
529505 *
530- * Note that obj.__dlpack__() API should provide useful ways for N1.
531- * The primary focus of the current DLPackExchangeAPI is to enable faster exchange N0
532- * with the support of the function pointer current_work_stream.
506+ * \code
533507 *
534- * Array/Tensor libraries should statically create and initialize this structure
535- * then return a pointer to DLPackExchangeAPI as an int value in Tensor/Array.
536- * The DLPackExchangeAPI* should stay alive throughout the lifetime of the process.
508+ * PyObject *api_obj = type(tensor_obj).__c_dlpack_exchange_api__; // as C-code
509+ * MyDLPackExchangeAPI *api = PyLong_AsVoidPtr(api_obj);
510+ * if (api == NULL && PyErr_Occurred()) { goto handle_error; }
537511 *
538- * One simple way to do so is to create a static instance of DLPackExchangeAPI
539- * within the framework and return a pointer to it. The following code
540- * shows an example to do so in C++. It should also be reasonably easy
541- * to do so in other languages .
512+ * \endcode
513+ *
514+ * Note that this must be defined on the type. The consumer should look up the
515+ * attribute on the type and may cache the result for each unique type.
542516 *
517+ * The precise API table is given by:
543518 * \code
544519 * struct MyDLPackExchangeAPI : public DLPackExchangeAPI {
545520 * MyDLPackExchangeAPI() {
@@ -560,55 +535,75 @@ typedef int (*DLPackManagedTensorToPyObjectNoSync)( //
560535 * };
561536 * \endcode
562537 *
563- * Each framework should attach a dunder `__c_dlpack_exchange_api__` integer
564- * to point to the DLPackExchangeAPI* pointer.
538+ * Guidelines for leveraging DLPackExchangeAPI:
565539 *
566- * Importantly, the attribute should be attached to the class of the Tensor, not the instance.
540+ * There are generally two kinds of consumer needs for DLPack exchange:
541+ * - N0: library support, where consumer.kernel(x, y, z) would like to run a kernel
542+ * with the data from x, y, z. The consumer is also expected to run the kernel with the same
543+ * stream context as the producer. For example, when x, y, z is torch.Tensor,
544+ * consumer should query exchange_api->current_work_stream to get the
545+ * current stream and launch the kernel with the same stream.
546+ * This setup is necessary for no synchronization in kernel launch and maximum compatibility
547+ * with CUDA graph capture in the producer.
548+ * This is the desirable behavior for library extension support for frameworks like PyTorch.
549+ * - N1: data ingestion and retention
567550 *
568- * mypackage.Tensor.__c_dlpack_exchange_api__ = MyPackageDLPackExchangeAPI
551+ * Note that obj.__dlpack__() API should provide useful ways for N1.
552+ * The primary focus of the current DLPackExchangeAPI is to enable faster exchange N0
553+ * with the support of the function pointer current_work_stream.
569554 *
570- * or equivalently:
555+ * Array/Tensor libraries should statically create and initialize this structure
556+ * then return a pointer to DLPackExchangeAPI as an int value in Tensor/Array.
557+ * The DLPackExchangeAPI* must stay alive throughout the lifetime of the process.
571558 *
572- * type(tensor_obj).__c_dlpack_exchange_api__ = MyPackageDLPackExchangeAPI
559+ * One simple way to do so is to create a static instance of DLPackExchangeAPI
560+ * within the framework and return a pointer to it. The following code
561+ * shows an example to do so in C++. It should also be reasonably easy
562+ * to do so in other languages.
573563 */
574564struct DLPackExchangeAPI {
575565 /* !
576- * \brief The current DLPack version.
566+ * \brief The provided DLPack version; the consumer must check major
567+ * version compatibility before using this struct.
577568 */
578569 DLPackVersion version;
579570 /* !
580571 * \brief Optional pointer to an older DLPackExchangeAPI in the chain.
581572 *
582- * It should be set to NULL if the framework does not support older versions.
573+ * It must be NULL if the framework does not support older versions.
574+ * If the current major version is larger than the one supported by the
575+ * consumer, the consumer may walk this to find an earlier supported version.
583576 *
584577 * \sa DLPackExchangeAPI
585578 */
586579 struct DLPackExchangeAPI * prev_version_api;
587580 /* !
588- * \brief Framework-specific function pointer for DLPackManagedTensorAllocator
581+ * \brief Producer function pointer for DLPackManagedTensorAllocator
582+ * This function must not be NULL.
589583 * \sa DLPackManagedTensorAllocator
590584 */
591585 DLPackManagedTensorAllocator managed_tensor_allocator;
592586 /* !
593- * \brief Framework-specific function pointer for DLPackManagedTensorFromPyObject
587+ * \brief Producer function pointer for DLPackManagedTensorFromPyObject
588+ * This function must not be NULL.
594589 * \sa DLPackManagedTensorFromPyObject
595590 */
596591 DLPackManagedTensorFromPyObjectNoSync managed_tensor_from_py_object_no_sync;
597592 /* !
598- * \brief Framework-specific function pointer for DLPackManagedTensorToPyObject
593+ * \brief Producer function pointer for DLPackManagedTensorToPyObject
594+ * This function must not be NULL.
599595 * \sa DLPackManagedTensorToPyObject
600596 */
601597 DLPackManagedTensorToPyObjectNoSync managed_tensor_to_py_object_no_sync;
602598 /* !
603- * \brief Framework-specific function pointer for DLPackDLTensorFromPyObject
599+ * \brief Producer function pointer for DLPackDLTensorFromPyObject
600+ * This function can be NULL when the producer does not support this function.
604601 * \sa DLPackDLTensorFromPyObjectNoSync
605602 */
606603 DLPackDLTensorFromPyObjectNoSync dltensor_from_py_object_no_sync;
607604 /* !
608- * \brief Framework-specific function pointer for DLPackCurrentWorkStream
609- *
610- * This function can be set to NULL if the framework does not rely on context manager to manage the stream.
611- *
605+ * \brief Producer function pointer for DLPackCurrentWorkStream
606+ * This function must not be NULL.
612607 * \sa DLPackCurrentWorkStream
613608 */
614609 DLPackCurrentWorkStream current_work_stream;
0 commit comments