@@ -391,35 +391,35 @@ void HgiVulkanBlitCmds::CopyBufferCpuToGpu(
391391 if (!buffer->IsCPUStagingAddress (copyOp.cpuSourceBuffer ) ||
392392 copyOp.sourceByteOffset != copyOp.destinationByteOffset ) {
393393
394- // Offset into the src buffer.
395- const uint8_t * const src =
396- static_cast <const uint8_t *>(copyOp.cpuSourceBuffer ) +
397- copyOp.sourceByteOffset ;
394+ // Offset into the src buffer
395+ const auto src =
396+ static_cast <const std::byte *>(copyOp.cpuSourceBuffer ) +
397+ copyOp.sourceByteOffset ;
398398
399399 // Offset into the dst buffer.
400- uint8_t * const dst =
401- static_cast <uint8_t *>(buffer->GetCPUStagingAddress ()) +
402- copyOp.destinationByteOffset ;
400+ const auto dst =
401+ static_cast <std::byte *>(buffer->GetCPUStagingAddress ()) +
402+ copyOp.destinationByteOffset ;
403403
404404 memcpy (dst, src, copyOp.byteSize );
405405 }
406406
407- // Schedule copy data from staging buffer to device-local buffer.
408- HgiVulkanBuffer* stagingBuffer = buffer-> GetStagingBuffer ();
409-
410- if (TF_VERIFY ( stagingBuffer)) {
411- VkBufferCopy copyRegion = {};
407+ // Schedule copy data from staging buffer to device-local buffer if needed .
408+ // With UMA/ReBAR, the staging address is already the device buffer, so no
409+ // additional copy is necessary.
410+ if (HgiVulkanBuffer* stagingBuffer = buffer-> GetStagingBuffer ( )) {
411+ VkBufferCopy copyRegion{};
412412 // Note we use the destinationByteOffset as the srcOffset here. The staging buffer
413413 // should be prepared with the same data layout of the destination buffer.
414414 copyRegion.srcOffset = copyOp.destinationByteOffset ;
415415 copyRegion.dstOffset = copyOp.destinationByteOffset ;
416416 copyRegion.size = copyOp.byteSize ;
417417
418418 vkCmdCopyBuffer (
419- _commandBuffer->GetVulkanCommandBuffer (),
419+ _commandBuffer->GetVulkanCommandBuffer (),
420420 stagingBuffer->GetVulkanBuffer (),
421421 buffer->GetVulkanBuffer (),
422- 1 ,
422+ 1 ,
423423 &copyRegion);
424424 }
425425}
@@ -439,43 +439,44 @@ HgiVulkanBlitCmds::CopyBufferGpuToCpu(HgiBufferGpuToCpuOp const& copyOp)
439439 HgiVulkanBuffer* buffer = static_cast <HgiVulkanBuffer*>(
440440 copyOp.gpuSourceBuffer .Get ());
441441
442- // Make sure there is a staging buffer in the buffer by asking for cpuAddr.
443- void * cpuAddress = buffer->GetCPUStagingAddress ();
444- HgiVulkanBuffer* stagingBuffer = buffer->GetStagingBuffer ();
445- if (!TF_VERIFY (stagingBuffer)) {
446- return ;
442+ // Schedule copy data from device-local buffer to staging buffer if needed.
443+ // With UMA/ReBAR, the staging address is already the device buffer, so no
444+ // additional copy is necessary.
445+ size_t srcOffset = copyOp.sourceByteOffset ;
446+ if (HgiVulkanBuffer* stagingBuffer = buffer->GetStagingBuffer ()) {
447+ // Copy from device-local GPU buffer into CPU staging buffer
448+ VkBufferCopy copyRegion = {};
449+ copyRegion.srcOffset = srcOffset;
450+ // No need to use dst offset during intermediate step of copying into
451+ // staging buffer.
452+ copyRegion.dstOffset = 0 ;
453+ copyRegion.size = copyOp.byteSize ;
454+ vkCmdCopyBuffer (
455+ _commandBuffer->GetVulkanCommandBuffer (),
456+ buffer->GetVulkanBuffer (),
457+ stagingBuffer->GetVulkanBuffer (),
458+ 1 ,
459+ &copyRegion);
460+ // No need to offset into the staging buffer for the next copy.
461+ srcOffset = 0 ;
447462 }
448463
449- // Copy from device-local GPU buffer into GPU staging buffer
450- VkBufferCopy copyRegion = {};
451- copyRegion.srcOffset = copyOp.sourceByteOffset ;
452- // No need to use dst offset during intermediate step of copying into
453- // staging buffer.
454- copyRegion.dstOffset = 0 ;
455- copyRegion.size = copyOp.byteSize ;
456- vkCmdCopyBuffer (
457- _commandBuffer->GetVulkanCommandBuffer (),
458- buffer->GetVulkanBuffer (),
459- stagingBuffer->GetVulkanBuffer (),
460- 1 ,
461- &copyRegion);
462-
463- // Next schedule a callback when the above GPU-GPU copy completes.
464+ // Next schedule a callback when the above GPU-CPU copy completes.
464465
465466 // Offset into the dst buffer
466- char * dst = (( char *) copyOp.cpuDestinationBuffer ) +
467+ const auto dst = static_cast <std::byte*>( copyOp.cpuDestinationBuffer ) +
467468 copyOp.destinationByteOffset ;
468469
469- // No need to offset into src buffer since we copied into staging buffer
470- // without dst offset.
471- const char * src = (( const char *) cpuAddress) ;
470+ const auto src =
471+ static_cast < const std::byte*>(buffer-> GetCPUStagingAddress ()) +
472+ srcOffset ;
472473
473474 // bytes to copy
474- size_t size = copyOp.byteSize ;
475+ const size_t size = copyOp.byteSize ;
475476
476477 // Copy to cpu buffer when cmd buffer has been executed
477478 _commandBuffer->AddCompletedHandler (
478- [dst, src, size]{ memcpy (dst, src, size);}
479+ [dst, src, size]{ memcpy (dst, src, size); }
479480 );
480481}
481482
0 commit comments