
Commit c724ef8

[hdSt, hgiVulkan] UMA and ReBAR support

1 parent: c41f2c1

File tree: 9 files changed, +281 −159 lines

pxr/imaging/hdSt/resourceRegistry.cpp (10 additions, 6 deletions)

@@ -843,15 +843,18 @@ HdStResourceRegistry::_Commit()
                    if (req.range && req.range->RequiresStaging()) {
                        const size_t numElements =
                            source->GetNumElements();
-                       // Avoid calling functions on
+                       // Avoid calling functions on
                        // HdNullBufferSources
                        if (numElements > 0) {
-                           stagingBufferSize += numElements *
+                           stagingBufferSize.fetch_add(
+                               numElements *
                                HdDataSizeOfTupleType(
-                                   source->GetTupleType());
+                                   source->GetTupleType()),
+                               std::memory_order_relaxed);
                        }
-                       stagingBufferSize +=
-                           _GetChainedStagingSize(source);
+                       stagingBufferSize.fetch_add(
+                           _GetChainedStagingSize(source),
+                           std::memory_order_relaxed);
                    }
                }
            }

@@ -934,7 +937,8 @@ HdStResourceRegistry::_Commit()
        HD_TRACE_SCOPE("Copy");
        // 4. copy phase:
        //
-       _stagingBuffer->Resize(stagingBufferSize);
+       _stagingBuffer->Resize(
+           stagingBufferSize.load(std::memory_order_relaxed));

        for (_PendingSource &pendingSource : _pendingSources) {
            HdBufferArrayRangeSharedPtr &dstRange = pendingSource.range;
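A note on the change above: the staging-size pass evidently runs across worker threads, so the plain `+=` on stagingBufferSize becomes fetch_add on what is presumably now a std::atomic<size_t> declared earlier in _Commit. A minimal sketch of the pattern (hypothetical names, not the Hd code): relaxed ordering is enough because only the increment itself must be atomic; the thread join supplies the happens-before edge for the final load.

#include <atomic>
#include <cstddef>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    // Hypothetical stand-in for stagingBufferSize: a total that many workers
    // bump concurrently and one thread reads only after all of them join.
    std::atomic<std::size_t> totalBytes{0};

    std::vector<std::thread> workers;
    for (int t = 0; t < 4; ++t) {
        workers.emplace_back([&totalBytes] {
            for (int i = 0; i < 1000; ++i) {
                // Relaxed suffices: only the addition itself must be atomic;
                // no other memory traffic is ordered through this counter.
                totalBytes.fetch_add(16, std::memory_order_relaxed);
            }
        });
    }
    for (auto &w : workers) {
        w.join();  // join() provides the happens-before edge for the load below
    }

    std::printf("total staged: %zu bytes\n",
                totalBytes.load(std::memory_order_relaxed));
    return 0;
}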

pxr/imaging/hdSt/stagingBuffer.cpp (6 additions, 6 deletions)

@@ -19,7 +19,7 @@ HdStStagingBuffer::HdStStagingBuffer(HdStResourceRegistry *resourceRegistry)
    , _capacity(0)
    , _activeSlot(0)
{
-   _tripleBuffered = resourceRegistry->GetHgi()->GetCapabilities()->
+   _isUma = resourceRegistry->GetHgi()->GetCapabilities()->
        IsSet(HgiDeviceCapabilitiesBitsUnifiedMemory);
}

@@ -70,8 +70,8 @@ HdStStagingBuffer::StageCopy(HgiBufferCpuToGpuOp const &copyOp)
    // expensive than just submitting the CPU to GPU copy operation directly.
    // The value of 'queueThreshold' is estimated (when is the extra memcpy
    // into the staging buffer slower than immediately issuing a gpu upload)
-   static const int queueThreshold = 512*1024;
-   if (!_tripleBuffered && copyOp.byteSize > queueThreshold) {
+   static constexpr int queueThreshold = 512 * 1024;
+   if (!_isUma && copyOp.byteSize > queueThreshold) {
        HgiBlitCmds* blitCmds = _resourceRegistry->GetGlobalBlitCmds();
        blitCmds->CopyBufferCpuToGpu(copyOp);
        return;

@@ -154,7 +154,7 @@ HdStStagingBuffer::Flush()

    blitCmds->PushDebugGroup(__ARCH_PRETTY_FUNCTION__);

-   if (!_tripleBuffered) {
+   if (!_isUma) {
        // If this isn't UMA then blit the staging buffer to GPU.
        HgiBufferCpuToGpuOp op;
        HgiBufferHandle buffer = _handles[_activeSlot];

@@ -167,9 +167,9 @@ HdStStagingBuffer::Flush()
        op.destinationByteOffset = 0;
        op.byteSize = _head;
        blitCmds->CopyBufferCpuToGpu(op);
-       blitCmds->InsertMemoryBarrier(HgiMemoryBarrierAll);
    }

+   blitCmds->InsertMemoryBarrier(HgiMemoryBarrierAll);
    for (auto const &copyOp : _gpuCopyOps) {
        blitCmds->CopyBufferGpuToGpu(copyOp);
    }

@@ -179,7 +179,7 @@ HdStStagingBuffer::Flush()
    _gpuCopyOps.clear();
    _head = 0;

-   if (_tripleBuffered) {
+   if (_isUma) {
        _activeSlot++;
        _activeSlot = (_activeSlot < MULTIBUFFERING) ? _activeSlot : 0;
    }
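The rename from _tripleBuffered to _isUma matches what the flag actually tracks: on unified-memory devices the CPU writes land directly in GPU-visible memory, so Flush skips the blit but must rotate through MULTIBUFFERING slots so new CPU writes don't overwrite memory the GPU may still be reading, while discrete GPUs reuse a single slot behind an explicit blit. Note also that InsertMemoryBarrier moves outside the conditional, so the UMA path is now covered by the barrier too. A simplified model of the slot rotation (hypothetical type, not the Hd class):

#include <cstddef>
#include <cstdio>

// Hypothetical, simplified model of HdStStagingBuffer's flush behavior.
constexpr std::size_t MULTIBUFFERING = 3;

struct StagingModel {
    std::size_t activeSlot = 0;
    bool isUma = false;

    void Flush() {
        if (!isUma) {
            // Discrete GPU: an explicit CPU-to-GPU blit of the single staging
            // slot into the device-local buffer would be recorded here.
        }
        // Both paths then insert a memory barrier and replay the queued
        // GPU-to-GPU copies, as in the diff above.
        if (isUma) {
            // UMA: the GPU may still be reading this slot, so the next frame
            // writes into the following slot instead of reusing this one.
            activeSlot = (activeSlot + 1 < MULTIBUFFERING) ? activeSlot + 1 : 0;
        }
    }
};

int main() {
    StagingModel uma{0, true};
    for (int frame = 0; frame < 4; ++frame) {
        std::printf("frame %d staged into slot %zu\n", frame, uma.activeSlot);
        uma.Flush();  // rotates to the next slot on UMA
    }
    return 0;
}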

pxr/imaging/hdSt/stagingBuffer.h (2 additions, 2 deletions)

@@ -65,11 +65,11 @@ class HdStStagingBuffer

    HdStResourceRegistry *_resourceRegistry;
    HgiBufferHandle _handles[MULTIBUFFERING];
+   std::vector<HgiBufferGpuToGpuOp> _gpuCopyOps;
    size_t _head;
    size_t _capacity;
    size_t _activeSlot;
-   bool _tripleBuffered;
-   std::vector<HgiBufferGpuToGpuOp> _gpuCopyOps;
+   bool _isUma;
};

PXR_NAMESPACE_CLOSE_SCOPE

pxr/imaging/hgiVulkan/blitCmds.cpp (41 additions, 40 deletions)

@@ -419,35 +419,35 @@ void HgiVulkanBlitCmds::CopyBufferCpuToGpu(
    if (!buffer->IsCPUStagingAddress(copyOp.cpuSourceBuffer) ||
        copyOp.sourceByteOffset != copyOp.destinationByteOffset) {

-       // Offset into the src buffer.
-       const uint8_t* const src =
-           static_cast<const uint8_t*>(copyOp.cpuSourceBuffer) +
-           copyOp.sourceByteOffset;
+       // Offset into the src buffer
+       const auto src =
+           static_cast<const std::byte*>(copyOp.cpuSourceBuffer) +
+           copyOp.sourceByteOffset;

        // Offset into the dst buffer.
-       uint8_t* const dst =
-           static_cast<uint8_t*>(buffer->GetCPUStagingAddress()) +
-           copyOp.destinationByteOffset;
+       const auto dst =
+           static_cast<std::byte*>(buffer->GetCPUStagingAddress()) +
+           copyOp.destinationByteOffset;

        memcpy(dst, src, copyOp.byteSize);
    }

-   // Schedule copy data from staging buffer to device-local buffer.
-   HgiVulkanBuffer* stagingBuffer = buffer->GetStagingBuffer();
-
-   if (TF_VERIFY(stagingBuffer)) {
-       VkBufferCopy copyRegion = {};
+   // Schedule copy data from staging buffer to device-local buffer if needed.
+   // With UMA/ReBAR, the staging address is already the device buffer, so no
+   // additional copy is necessary.
+   if (HgiVulkanBuffer* stagingBuffer = buffer->GetStagingBuffer()) {
+       VkBufferCopy copyRegion{};
        // Note we use the destinationByteOffset as the srcOffset here. The staging buffer
        // should be prepared with the same data layout of the destination buffer.
        copyRegion.srcOffset = copyOp.destinationByteOffset;
        copyRegion.dstOffset = copyOp.destinationByteOffset;
        copyRegion.size = copyOp.byteSize;

        vkCmdCopyBuffer(
-       _commandBuffer->GetVulkanCommandBuffer(),
+           _commandBuffer->GetVulkanCommandBuffer(),
            stagingBuffer->GetVulkanBuffer(),
            buffer->GetVulkanBuffer(),
-       1,
+           1,
            &copyRegion);
    }
}
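The behavioral shift here: GetStagingBuffer() returning null stops being a TF_VERIFY failure and becomes the expected UMA/ReBAR case, where the memcpy into the staging address has already written the device buffer. A toy model of that control flow, with hypothetical stand-in types rather than the real Hgi/Vulkan API:

#include <cstddef>
#include <cstdio>
#include <cstring>
#include <vector>

// Hypothetical stand-in for HgiVulkanBuffer. On UMA/ReBAR the CPU-visible
// staging address maps the device allocation itself, so there is no separate
// staging buffer and StagingBuffer() returns null.
struct BufferModel {
    std::vector<unsigned char> storage;  // stands in for the Vulkan allocation
    bool hostVisibleDeviceMemory = false;

    void *CpuStagingAddress() { return storage.data(); }
    BufferModel *StagingBuffer() {
        return hostVisibleDeviceMemory ? nullptr : this;
    }
};

void Upload(BufferModel &dst, const void *src, std::size_t size) {
    // Step 1 (always): fill the CPU-visible address.
    std::memcpy(dst.CpuStagingAddress(), src, size);

    // Step 2 (discrete GPUs only): hop from staging into device-local memory.
    if (BufferModel *staging = dst.StagingBuffer()) {
        (void)staging;  // a vkCmdCopyBuffer would be recorded here
        std::printf("recorded staging -> device copy of %zu bytes\n", size);
    } else {
        std::printf("UMA/ReBAR: memcpy already wrote device memory\n");
    }
}

int main() {
    BufferModel uma{std::vector<unsigned char>(64), true};
    BufferModel discrete{std::vector<unsigned char>(64), false};
    const char payload[] = "vertex data";
    Upload(uma, payload, sizeof payload);
    Upload(discrete, payload, sizeof payload);
    return 0;
}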
@@ -467,43 +467,44 @@ HgiVulkanBlitCmds::CopyBufferGpuToCpu(HgiBufferGpuToCpuOp const& copyOp)
    HgiVulkanBuffer* buffer = static_cast<HgiVulkanBuffer*>(
        copyOp.gpuSourceBuffer.Get());

-   // Make sure there is a staging buffer in the buffer by asking for cpuAddr.
-   void* cpuAddress = buffer->GetCPUStagingAddress();
-   HgiVulkanBuffer* stagingBuffer = buffer->GetStagingBuffer();
-   if (!TF_VERIFY(stagingBuffer)) {
-       return;
+   // Schedule copy data from device-local buffer to staging buffer if needed.
+   // With UMA/ReBAR, the staging address is already the device buffer, so no
+   // additional copy is necessary.
+   size_t srcOffset = copyOp.sourceByteOffset;
+   if (HgiVulkanBuffer* stagingBuffer = buffer->GetStagingBuffer()) {
+       // Copy from device-local GPU buffer into CPU staging buffer
+       VkBufferCopy copyRegion = {};
+       copyRegion.srcOffset = srcOffset;
+       // No need to use dst offset during intermediate step of copying into
+       // staging buffer.
+       copyRegion.dstOffset = 0;
+       copyRegion.size = copyOp.byteSize;
+       vkCmdCopyBuffer(
+           _commandBuffer->GetVulkanCommandBuffer(),
+           buffer->GetVulkanBuffer(),
+           stagingBuffer->GetVulkanBuffer(),
+           1,
+           &copyRegion);
+       // No need to offset into the staging buffer for the next copy.
+       srcOffset = 0;
    }

-   // Copy from device-local GPU buffer into GPU staging buffer
-   VkBufferCopy copyRegion = {};
-   copyRegion.srcOffset = copyOp.sourceByteOffset;
-   // No need to use dst offset during intermediate step of copying into
-   // staging buffer.
-   copyRegion.dstOffset = 0;
-   copyRegion.size = copyOp.byteSize;
-   vkCmdCopyBuffer(
-       _commandBuffer->GetVulkanCommandBuffer(),
-       buffer->GetVulkanBuffer(),
-       stagingBuffer->GetVulkanBuffer(),
-       1,
-       &copyRegion);
-
-   // Next schedule a callback when the above GPU-GPU copy completes.
+   // Next schedule a callback when the above GPU-CPU copy completes.

    // Offset into the dst buffer
-   char* dst = ((char*) copyOp.cpuDestinationBuffer) +
+   const auto dst = static_cast<std::byte*>(copyOp.cpuDestinationBuffer) +
        copyOp.destinationByteOffset;

-   // No need to offset into src buffer since we copied into staging buffer
-   // without dst offset.
-   const char* src = ((const char*) cpuAddress);
+   const auto src =
+       static_cast<const std::byte*>(buffer->GetCPUStagingAddress()) +
+       srcOffset;

    // bytes to copy
-   size_t size = copyOp.byteSize;
+   const size_t size = copyOp.byteSize;

    // Copy to cpu buffer when cmd buffer has been executed
    _commandBuffer->AddCompletedHandler(
-       [dst, src, size]{ memcpy(dst, src, size);}
+       [dst, src, size]{ memcpy(dst, src, size); }
    );
}
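The readback side stays asynchronous: the lambda captures dst, src, and size when the copy is recorded, but the memcpy runs only once the command buffer completes. A self-contained model of such a completion-handler queue (hypothetical type; in the real code the handlers fire after the command buffer's fence signals):

#include <cstddef>
#include <cstdio>
#include <cstring>
#include <functional>
#include <utility>
#include <vector>

// Hypothetical stand-in for HgiVulkanCommandBuffer's completion callbacks.
struct CommandBufferModel {
    std::vector<std::function<void()>> completedHandlers;

    void AddCompletedHandler(std::function<void()> fn) {
        completedHandlers.push_back(std::move(fn));
    }

    // In the real code the handlers run once the submitted command buffer's
    // fence signals; here we simply invoke them on demand.
    void SimulateCompletion() {
        for (auto const &fn : completedHandlers) fn();
        completedHandlers.clear();
    }
};

int main() {
    char gpuVisible[] = "readback payload";  // plays the staging memory
    char cpuDestination[sizeof gpuVisible] = {};

    CommandBufferModel cmdBuf;
    char *dst = cpuDestination;
    const char *src = gpuVisible;
    const std::size_t size = sizeof gpuVisible;

    // Recorded now, executed only at completion, mirroring the diff above.
    cmdBuf.AddCompletedHandler([dst, src, size] { std::memcpy(dst, src, size); });

    std::printf("before completion: '%s'\n", cpuDestination);
    cmdBuf.SimulateCompletion();
    std::printf("after completion:  '%s'\n", cpuDestination);
    return 0;
}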
