@@ -419,35 +419,35 @@ void HgiVulkanBlitCmds::CopyBufferCpuToGpu(
419419 if (!buffer->IsCPUStagingAddress (copyOp.cpuSourceBuffer ) ||
420420 copyOp.sourceByteOffset != copyOp.destinationByteOffset ) {
421421
422- // Offset into the src buffer.
423- const uint8_t * const src =
424- static_cast <const uint8_t *>(copyOp.cpuSourceBuffer ) +
425- copyOp.sourceByteOffset ;
422+ // Offset into the src buffer
423+ const auto src =
424+ static_cast <const std::byte *>(copyOp.cpuSourceBuffer ) +
425+ copyOp.sourceByteOffset ;
426426
427427 // Offset into the dst buffer.
428- uint8_t * const dst =
429- static_cast <uint8_t *>(buffer->GetCPUStagingAddress ()) +
430- copyOp.destinationByteOffset ;
428+ const auto dst =
429+ static_cast <std::byte *>(buffer->GetCPUStagingAddress ()) +
430+ copyOp.destinationByteOffset ;
431431
432432 memcpy (dst, src, copyOp.byteSize );
433433 }
434434
435- // Schedule copy data from staging buffer to device-local buffer.
436- HgiVulkanBuffer* stagingBuffer = buffer-> GetStagingBuffer ();
437-
438- if (TF_VERIFY ( stagingBuffer)) {
439- VkBufferCopy copyRegion = {};
435+ // Schedule copy data from staging buffer to device-local buffer if needed .
436+ // With UMA/ReBAR, the staging address is already the device buffer, so no
437+ // additional copy is necessary.
438+ if (HgiVulkanBuffer* stagingBuffer = buffer-> GetStagingBuffer ( )) {
439+ VkBufferCopy copyRegion{};
440440 // Note we use the destinationByteOffset as the srcOffset here. The staging buffer
441441 // should be prepared with the same data layout of the destination buffer.
442442 copyRegion.srcOffset = copyOp.destinationByteOffset ;
443443 copyRegion.dstOffset = copyOp.destinationByteOffset ;
444444 copyRegion.size = copyOp.byteSize ;
445445
446446 vkCmdCopyBuffer (
447- _commandBuffer->GetVulkanCommandBuffer (),
447+ _commandBuffer->GetVulkanCommandBuffer (),
448448 stagingBuffer->GetVulkanBuffer (),
449449 buffer->GetVulkanBuffer (),
450- 1 ,
450+ 1 ,
451451             &copyRegion);
452452 }
453453}
@@ -467,43 +467,44 @@ HgiVulkanBlitCmds::CopyBufferGpuToCpu(HgiBufferGpuToCpuOp const& copyOp)
467467 HgiVulkanBuffer* buffer = static_cast <HgiVulkanBuffer*>(
468468 copyOp.gpuSourceBuffer .Get ());
469469
470- // Make sure there is a staging buffer in the buffer by asking for cpuAddr.
471- void * cpuAddress = buffer->GetCPUStagingAddress ();
472- HgiVulkanBuffer* stagingBuffer = buffer->GetStagingBuffer ();
473- if (!TF_VERIFY (stagingBuffer)) {
474- return ;
470+ // Schedule copy data from device-local buffer to staging buffer if needed.
471+ // With UMA/ReBAR, the staging address is already the device buffer, so no
472+ // additional copy is necessary.
473+ size_t srcOffset = copyOp.sourceByteOffset ;
474+ if (HgiVulkanBuffer* stagingBuffer = buffer->GetStagingBuffer ()) {
475+ // Copy from device-local GPU buffer into CPU staging buffer
476+ VkBufferCopy copyRegion = {};
477+ copyRegion.srcOffset = srcOffset;
478+ // No need to use dst offset during intermediate step of copying into
479+ // staging buffer.
480+ copyRegion.dstOffset = 0 ;
481+ copyRegion.size = copyOp.byteSize ;
482+ vkCmdCopyBuffer (
483+ _commandBuffer->GetVulkanCommandBuffer (),
484+ buffer->GetVulkanBuffer (),
485+ stagingBuffer->GetVulkanBuffer (),
486+ 1 ,
487+            &copyRegion);
488+ // No need to offset into the staging buffer for the next copy.
489+ srcOffset = 0 ;
475490 }
476491
477- // Copy from device-local GPU buffer into GPU staging buffer
478- VkBufferCopy copyRegion = {};
479- copyRegion.srcOffset = copyOp.sourceByteOffset ;
480- // No need to use dst offset during intermediate step of copying into
481- // staging buffer.
482- copyRegion.dstOffset = 0 ;
483- copyRegion.size = copyOp.byteSize ;
484- vkCmdCopyBuffer (
485- _commandBuffer->GetVulkanCommandBuffer (),
486- buffer->GetVulkanBuffer (),
487- stagingBuffer->GetVulkanBuffer (),
488- 1 ,
489- ©Region);
490-
491- // Next schedule a callback when the above GPU-GPU copy completes.
492+ // Next schedule a callback when the above GPU-CPU copy completes.
492493
493494 // Offset into the dst buffer
494- char * dst = (( char *) copyOp.cpuDestinationBuffer ) +
495+ const auto dst = static_cast <std::byte*>( copyOp.cpuDestinationBuffer ) +
495496 copyOp.destinationByteOffset ;
496497
497- // No need to offset into src buffer since we copied into staging buffer
498- // without dst offset.
499- const char * src = (( const char *) cpuAddress) ;
498+ const auto src =
499+ static_cast < const std::byte*>(buffer-> GetCPUStagingAddress ()) +
500+ srcOffset ;
500501
501502 // bytes to copy
502- size_t size = copyOp.byteSize ;
503+ const size_t size = copyOp.byteSize ;
503504
504505 // Copy to cpu buffer when cmd buffer has been executed
505506 _commandBuffer->AddCompletedHandler (
506- [dst, src, size]{ memcpy (dst, src, size);}
507+ [dst, src, size]{ memcpy (dst, src, size); }
507508 );
508509}
509510
0 commit comments