Commit bd4e20a — [hdSt, hgiVulkan] UMA and ReBAR support
Parent commit: 14e978d

File tree: 9 files changed (+265 / -143 lines)

pxr/imaging/hdSt/resourceRegistry.cpp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -843,15 +843,18 @@ HdStResourceRegistry::_Commit()
843843
if (req.range && req.range->RequiresStaging()) {
844844
const size_t numElements =
845845
source->GetNumElements();
846-
// Avoid calling functions on
846+
// Avoid calling functions on
847847
// HdNullBufferSources
848848
if (numElements > 0) {
849-
stagingBufferSize += numElements *
849+
stagingBufferSize.fetch_add(
850+
numElements *
850851
HdDataSizeOfTupleType(
851-
source->GetTupleType());
852+
source->GetTupleType()),
853+
std::memory_order_relaxed);
852854
}
853-
stagingBufferSize +=
854-
_GetChainedStagingSize(source);
855+
stagingBufferSize.fetch_add(
856+
_GetChainedStagingSize(source),
857+
std::memory_order_relaxed);
855858
}
856859
}
857860
}
@@ -934,7 +937,8 @@ HdStResourceRegistry::_Commit()
934937
HD_TRACE_SCOPE("Copy");
935938
// 4. copy phase:
936939
//
937-
_stagingBuffer->Resize(stagingBufferSize);
940+
_stagingBuffer->Resize(
941+
stagingBufferSize.load(std::memory_order_relaxed));
938942

939943
for (_PendingSource &pendingSource : _pendingSources) {
940944
HdBufferArrayRangeSharedPtr &dstRange = pendingSource.range;

pxr/imaging/hdSt/stagingBuffer.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ HdStStagingBuffer::HdStStagingBuffer(HdStResourceRegistry *resourceRegistry)
1919
, _capacity(0)
2020
, _activeSlot(0)
2121
{
22-
_tripleBuffered = resourceRegistry->GetHgi()->GetCapabilities()->
22+
_isUma = resourceRegistry->GetHgi()->GetCapabilities()->
2323
IsSet(HgiDeviceCapabilitiesBitsUnifiedMemory);
2424
}
2525

@@ -70,8 +70,8 @@ HdStStagingBuffer::StageCopy(HgiBufferCpuToGpuOp const &copyOp)
7070
// expensive than just submitting the CPU to GPU copy operation directly.
7171
// The value of 'queueThreshold' is estimated (when is the extra memcpy
7272
// into the staging buffer slower than immediately issuing a gpu upload)
73-
static const int queueThreshold = 512*1024;
74-
if (!_tripleBuffered && copyOp.byteSize > queueThreshold) {
73+
static constexpr int queueThreshold = 512 * 1024;
74+
if (!_isUma && copyOp.byteSize > queueThreshold) {
7575
HgiBlitCmds* blitCmds = _resourceRegistry->GetGlobalBlitCmds();
7676
blitCmds->CopyBufferCpuToGpu(copyOp);
7777
return;
@@ -153,7 +153,7 @@ HdStStagingBuffer::Flush()
153153

154154
blitCmds->PushDebugGroup(__ARCH_PRETTY_FUNCTION__);
155155

156-
if (!_tripleBuffered) {
156+
if (!_isUma) {
157157
// If this isn't UMA then blit the staging buffer to GPU.
158158
HgiBufferCpuToGpuOp op;
159159
HgiBufferHandle buffer = _handles[_activeSlot];
@@ -166,9 +166,9 @@ HdStStagingBuffer::Flush()
166166
op.destinationByteOffset = 0;
167167
op.byteSize = _head;
168168
blitCmds->CopyBufferCpuToGpu(op);
169-
blitCmds->InsertMemoryBarrier(HgiMemoryBarrierAll);
170169
}
171170

171+
blitCmds->InsertMemoryBarrier(HgiMemoryBarrierAll);
172172
for (auto const &copyOp : _gpuCopyOps) {
173173
blitCmds->CopyBufferGpuToGpu(copyOp);
174174
}
@@ -178,7 +178,7 @@ HdStStagingBuffer::Flush()
178178
_gpuCopyOps.clear();
179179
_head = 0;
180180

181-
if (_tripleBuffered) {
181+
if (_isUma) {
182182
_activeSlot++;
183183
_activeSlot = (_activeSlot < MULTIBUFFERING) ? _activeSlot : 0;
184184
}

pxr/imaging/hdSt/stagingBuffer.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,11 +65,11 @@ class HdStStagingBuffer
6565

6666
HdStResourceRegistry *_resourceRegistry;
6767
HgiBufferHandle _handles[MULTIBUFFERING];
68+
std::vector<HgiBufferGpuToGpuOp> _gpuCopyOps;
6869
size_t _head;
6970
size_t _capacity;
7071
size_t _activeSlot;
71-
bool _tripleBuffered;
72-
std::vector<HgiBufferGpuToGpuOp> _gpuCopyOps;
72+
bool _isUma;
7373
};
7474

7575
PXR_NAMESPACE_CLOSE_SCOPE

pxr/imaging/hgiVulkan/blitCmds.cpp

Lines changed: 41 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -391,35 +391,35 @@ void HgiVulkanBlitCmds::CopyBufferCpuToGpu(
391391
if (!buffer->IsCPUStagingAddress(copyOp.cpuSourceBuffer) ||
392392
copyOp.sourceByteOffset != copyOp.destinationByteOffset) {
393393

394-
// Offset into the src buffer.
395-
const uint8_t* const src =
396-
static_cast<const uint8_t*>(copyOp.cpuSourceBuffer) +
397-
copyOp.sourceByteOffset;
394+
// Offset into the src buffer
395+
const auto src =
396+
static_cast<const std::byte*>(copyOp.cpuSourceBuffer) +
397+
copyOp.sourceByteOffset;
398398

399399
// Offset into the dst buffer.
400-
uint8_t* const dst =
401-
static_cast<uint8_t*>(buffer->GetCPUStagingAddress()) +
402-
copyOp.destinationByteOffset;
400+
const auto dst =
401+
static_cast<std::byte*>(buffer->GetCPUStagingAddress()) +
402+
copyOp.destinationByteOffset;
403403

404404
memcpy(dst, src, copyOp.byteSize);
405405
}
406406

407-
// Schedule copy data from staging buffer to device-local buffer.
408-
HgiVulkanBuffer* stagingBuffer = buffer->GetStagingBuffer();
409-
410-
if (TF_VERIFY(stagingBuffer)) {
411-
VkBufferCopy copyRegion = {};
407+
// Schedule copy data from staging buffer to device-local buffer if needed.
408+
// With UMA/ReBAR, the staging address is already the device buffer, so no
409+
// additional copy is necessary.
410+
if (HgiVulkanBuffer* stagingBuffer = buffer->GetStagingBuffer()) {
411+
VkBufferCopy copyRegion{};
412412
// Note we use the destinationByteOffset as the srcOffset here. The staging buffer
413413
// should be prepared with the same data layout of the destination buffer.
414414
copyRegion.srcOffset = copyOp.destinationByteOffset;
415415
copyRegion.dstOffset = copyOp.destinationByteOffset;
416416
copyRegion.size = copyOp.byteSize;
417417

418418
vkCmdCopyBuffer(
419-
_commandBuffer->GetVulkanCommandBuffer(),
419+
_commandBuffer->GetVulkanCommandBuffer(),
420420
stagingBuffer->GetVulkanBuffer(),
421421
buffer->GetVulkanBuffer(),
422-
1,
422+
1,
423423
&copyRegion);
424424
}
425425
}
@@ -439,43 +439,44 @@ HgiVulkanBlitCmds::CopyBufferGpuToCpu(HgiBufferGpuToCpuOp const& copyOp)
439439
HgiVulkanBuffer* buffer = static_cast<HgiVulkanBuffer*>(
440440
copyOp.gpuSourceBuffer.Get());
441441

442-
// Make sure there is a staging buffer in the buffer by asking for cpuAddr.
443-
void* cpuAddress = buffer->GetCPUStagingAddress();
444-
HgiVulkanBuffer* stagingBuffer = buffer->GetStagingBuffer();
445-
if (!TF_VERIFY(stagingBuffer)) {
446-
return;
442+
// Schedule copy data from device-local buffer to staging buffer if needed.
443+
// With UMA/ReBAR, the staging address is already the device buffer, so no
444+
// additional copy is necessary.
445+
size_t srcOffset = copyOp.sourceByteOffset;
446+
if (HgiVulkanBuffer* stagingBuffer = buffer->GetStagingBuffer()) {
447+
// Copy from device-local GPU buffer into CPU staging buffer
448+
VkBufferCopy copyRegion = {};
449+
copyRegion.srcOffset = srcOffset;
450+
// No need to use dst offset during intermediate step of copying into
451+
// staging buffer.
452+
copyRegion.dstOffset = 0;
453+
copyRegion.size = copyOp.byteSize;
454+
vkCmdCopyBuffer(
455+
_commandBuffer->GetVulkanCommandBuffer(),
456+
buffer->GetVulkanBuffer(),
457+
stagingBuffer->GetVulkanBuffer(),
458+
1,
459+
&copyRegion);
460+
// No need to offset into the staging buffer for the next copy.
461+
srcOffset = 0;
447462
}
448463

449-
// Copy from device-local GPU buffer into GPU staging buffer
450-
VkBufferCopy copyRegion = {};
451-
copyRegion.srcOffset = copyOp.sourceByteOffset;
452-
// No need to use dst offset during intermediate step of copying into
453-
// staging buffer.
454-
copyRegion.dstOffset = 0;
455-
copyRegion.size = copyOp.byteSize;
456-
vkCmdCopyBuffer(
457-
_commandBuffer->GetVulkanCommandBuffer(),
458-
buffer->GetVulkanBuffer(),
459-
stagingBuffer->GetVulkanBuffer(),
460-
1,
461-
&copyRegion);
462-
463-
// Next schedule a callback when the above GPU-GPU copy completes.
464+
// Next schedule a callback when the above GPU-CPU copy completes.
464465

465466
// Offset into the dst buffer
466-
char* dst = ((char*) copyOp.cpuDestinationBuffer) +
467+
const auto dst = static_cast<std::byte*>(copyOp.cpuDestinationBuffer) +
467468
copyOp.destinationByteOffset;
468469

469-
// No need to offset into src buffer since we copied into staging buffer
470-
// without dst offset.
471-
const char* src = ((const char*) cpuAddress);
470+
const auto src =
471+
static_cast<const std::byte*>(buffer->GetCPUStagingAddress()) +
472+
srcOffset;
472473

473474
// bytes to copy
474-
size_t size = copyOp.byteSize;
475+
const size_t size = copyOp.byteSize;
475476

476477
// Copy to cpu buffer when cmd buffer has been executed
477478
_commandBuffer->AddCompletedHandler(
478-
[dst, src, size]{ memcpy(dst, src, size);}
479+
[dst, src, size]{ memcpy(dst, src, size); }
479480
);
480481
}
481482

Commit comments: 0