
Commit c724ef8

[hdSt, hgiVulkan] UMA and ReBAR support

1 parent: c41f2c1

File tree: 9 files changed, +281 −159 lines

pxr/imaging/hdSt/resourceRegistry.cpp (10 additions, 6 deletions)

@@ -843,15 +843,18 @@ HdStResourceRegistry::_Commit()
                    if (req.range && req.range->RequiresStaging()) {
                        const size_t numElements =
                            source->GetNumElements();
-                       // Avoid calling functions on
+                       // Avoid calling functions on
                        // HdNullBufferSources
                        if (numElements > 0) {
-                           stagingBufferSize += numElements *
+                           stagingBufferSize.fetch_add(
+                               numElements *
                                HdDataSizeOfTupleType(
-                                   source->GetTupleType());
+                                   source->GetTupleType()),
+                               std::memory_order_relaxed);
                        }
-                       stagingBufferSize +=
-                           _GetChainedStagingSize(source);
+                       stagingBufferSize.fetch_add(
+                           _GetChainedStagingSize(source),
+                           std::memory_order_relaxed);
                    }
                }
            }

@@ -934,7 +937,8 @@ HdStResourceRegistry::_Commit()
        HD_TRACE_SCOPE("Copy");
        // 4. copy phase:
        //
-       _stagingBuffer->Resize(stagingBufferSize);
+       _stagingBuffer->Resize(
+           stagingBufferSize.load(std::memory_order_relaxed));

        for (_PendingSource &pendingSource : _pendingSources) {
            HdBufferArrayRangeSharedPtr &dstRange = pendingSource.range;
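A note on the change above: the staging-size pass evidently runs across worker threads, so the plain `+=` on stagingBufferSize becomes fetch_add on what is presumably now a std::atomic<size_t> declared earlier in _Commit. A minimal sketch of the pattern (hypothetical names, not the Hd code): relaxed ordering is enough because only the increment itself must be atomic; the thread join supplies the happens-before edge for the final load.

#include <atomic>
#include <cstddef>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    // Hypothetical stand-in for stagingBufferSize: a total that many workers
    // bump concurrently and one thread reads only after all of them join.
    std::atomic<std::size_t> totalBytes{0};

    std::vector<std::thread> workers;
    for (int t = 0; t < 4; ++t) {
        workers.emplace_back([&totalBytes] {
            for (int i = 0; i < 1000; ++i) {
                // Relaxed suffices: only the addition itself must be atomic;
                // no other memory traffic is ordered through this counter.
                totalBytes.fetch_add(16, std::memory_order_relaxed);
            }
        });
    }
    for (auto &w : workers) {
        w.join();  // join() provides the happens-before edge for the load below
    }

    std::printf("total staged: %zu bytes\n",
                totalBytes.load(std::memory_order_relaxed));
    return 0;
}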

pxr/imaging/hdSt/stagingBuffer.cpp (6 additions, 6 deletions)

@@ -19,7 +19,7 @@ HdStStagingBuffer::HdStStagingBuffer(HdStResourceRegistry *resourceRegistry)
    , _capacity(0)
    , _activeSlot(0)
{
-   _tripleBuffered = resourceRegistry->GetHgi()->GetCapabilities()->
+   _isUma = resourceRegistry->GetHgi()->GetCapabilities()->
        IsSet(HgiDeviceCapabilitiesBitsUnifiedMemory);
}

@@ -70,8 +70,8 @@ HdStStagingBuffer::StageCopy(HgiBufferCpuToGpuOp const &copyOp)
    // expensive than just submitting the CPU to GPU copy operation directly.
    // The value of 'queueThreshold' is estimated (when is the extra memcpy
    // into the staging buffer slower than immediately issuing a gpu upload)
-   static const int queueThreshold = 512*1024;
-   if (!_tripleBuffered && copyOp.byteSize > queueThreshold) {
+   static constexpr int queueThreshold = 512 * 1024;
+   if (!_isUma && copyOp.byteSize > queueThreshold) {
        HgiBlitCmds* blitCmds = _resourceRegistry->GetGlobalBlitCmds();
        blitCmds->CopyBufferCpuToGpu(copyOp);
        return;

@@ -154,7 +154,7 @@ HdStStagingBuffer::Flush()

    blitCmds->PushDebugGroup(__ARCH_PRETTY_FUNCTION__);

-   if (!_tripleBuffered) {
+   if (!_isUma) {
        // If this isn't UMA then blit the staging buffer to GPU.
        HgiBufferCpuToGpuOp op;
        HgiBufferHandle buffer = _handles[_activeSlot];

@@ -167,9 +167,9 @@ HdStStagingBuffer::Flush()
        op.destinationByteOffset = 0;
        op.byteSize = _head;
        blitCmds->CopyBufferCpuToGpu(op);
-       blitCmds->InsertMemoryBarrier(HgiMemoryBarrierAll);
    }

+   blitCmds->InsertMemoryBarrier(HgiMemoryBarrierAll);
    for (auto const &copyOp : _gpuCopyOps) {
        blitCmds->CopyBufferGpuToGpu(copyOp);
    }

@@ -179,7 +179,7 @@ HdStStagingBuffer::Flush()
    _gpuCopyOps.clear();
    _head = 0;

-   if (_tripleBuffered) {
+   if (_isUma) {
        _activeSlot++;
        _activeSlot = (_activeSlot < MULTIBUFFERING) ? _activeSlot : 0;
    }
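The rename from _tripleBuffered to _isUma matches what the flag actually tracks: on unified-memory devices the CPU writes land directly in GPU-visible memory, so Flush skips the blit but must rotate through MULTIBUFFERING slots so new CPU writes don't overwrite memory the GPU may still be reading, while discrete GPUs reuse a single slot behind an explicit blit. Note also that InsertMemoryBarrier moves outside the conditional, so the UMA path is now covered by the barrier too. A simplified model of the slot rotation (hypothetical type, not the Hd class):

#include <cstddef>
#include <cstdio>

// Hypothetical, simplified model of HdStStagingBuffer's flush behavior.
constexpr std::size_t MULTIBUFFERING = 3;

struct StagingModel {
    std::size_t activeSlot = 0;
    bool isUma = false;

    void Flush() {
        if (!isUma) {
            // Discrete GPU: an explicit CPU-to-GPU blit of the single staging
            // slot into the device-local buffer would be recorded here.
        }
        // Both paths then insert a memory barrier and replay the queued
        // GPU-to-GPU copies, as in the diff above.
        if (isUma) {
            // UMA: the GPU may still be reading this slot, so the next frame
            // writes into the following slot instead of reusing this one.
            activeSlot = (activeSlot + 1 < MULTIBUFFERING) ? activeSlot + 1 : 0;
        }
    }
};

int main() {
    StagingModel uma{0, true};
    for (int frame = 0; frame < 4; ++frame) {
        std::printf("frame %d staged into slot %zu\n", frame, uma.activeSlot);
        uma.Flush();  // rotates to the next slot on UMA
    }
    return 0;
}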

pxr/imaging/hdSt/stagingBuffer.h (2 additions, 2 deletions)

@@ -65,11 +65,11 @@ class HdStStagingBuffer

    HdStResourceRegistry *_resourceRegistry;
    HgiBufferHandle _handles[MULTIBUFFERING];
+   std::vector<HgiBufferGpuToGpuOp> _gpuCopyOps;
    size_t _head;
    size_t _capacity;
    size_t _activeSlot;
-   bool _tripleBuffered;
-   std::vector<HgiBufferGpuToGpuOp> _gpuCopyOps;
+   bool _isUma;
};

PXR_NAMESPACE_CLOSE_SCOPE

pxr/imaging/hgiVulkan/blitCmds.cpp (41 additions, 40 deletions)

@@ -419,35 +419,35 @@ void HgiVulkanBlitCmds::CopyBufferCpuToGpu(
    if (!buffer->IsCPUStagingAddress(copyOp.cpuSourceBuffer) ||
        copyOp.sourceByteOffset != copyOp.destinationByteOffset) {

-       // Offset into the src buffer.
-       const uint8_t* const src =
-           static_cast<const uint8_t*>(copyOp.cpuSourceBuffer) +
-           copyOp.sourceByteOffset;
+       // Offset into the src buffer
+       const auto src =
+           static_cast<const std::byte*>(copyOp.cpuSourceBuffer) +
+           copyOp.sourceByteOffset;

        // Offset into the dst buffer.
-       uint8_t* const dst =
-           static_cast<uint8_t*>(buffer->GetCPUStagingAddress()) +
-           copyOp.destinationByteOffset;
+       const auto dst =
+           static_cast<std::byte*>(buffer->GetCPUStagingAddress()) +
+           copyOp.destinationByteOffset;

        memcpy(dst, src, copyOp.byteSize);
    }

-   // Schedule copy data from staging buffer to device-local buffer.
-   HgiVulkanBuffer* stagingBuffer = buffer->GetStagingBuffer();
-
-   if (TF_VERIFY(stagingBuffer)) {
-       VkBufferCopy copyRegion = {};
+   // Schedule copy data from staging buffer to device-local buffer if needed.
+   // With UMA/ReBAR, the staging address is already the device buffer, so no
+   // additional copy is necessary.
+   if (HgiVulkanBuffer* stagingBuffer = buffer->GetStagingBuffer()) {
+       VkBufferCopy copyRegion{};
        // Note we use the destinationByteOffset as the srcOffset here. The staging buffer
        // should be prepared with the same data layout of the destination buffer.
        copyRegion.srcOffset = copyOp.destinationByteOffset;
        copyRegion.dstOffset = copyOp.destinationByteOffset;
        copyRegion.size = copyOp.byteSize;

        vkCmdCopyBuffer(
-       _commandBuffer->GetVulkanCommandBuffer(),
+           _commandBuffer->GetVulkanCommandBuffer(),
            stagingBuffer->GetVulkanBuffer(),
            buffer->GetVulkanBuffer(),
-       1,
+           1,
            &copyRegion);
    }
}
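The behavioral shift here: GetStagingBuffer() returning null stops being a TF_VERIFY failure and becomes the expected UMA/ReBAR case, where the memcpy into the staging address has already written the device buffer. A toy model of that control flow, with hypothetical stand-in types rather than the real Hgi/Vulkan API:

#include <cstddef>
#include <cstdio>
#include <cstring>
#include <vector>

// Hypothetical stand-in for HgiVulkanBuffer. On UMA/ReBAR the CPU-visible
// staging address maps the device allocation itself, so there is no separate
// staging buffer and StagingBuffer() returns null.
struct BufferModel {
    std::vector<unsigned char> storage;  // stands in for the Vulkan allocation
    bool hostVisibleDeviceMemory = false;

    void *CpuStagingAddress() { return storage.data(); }
    BufferModel *StagingBuffer() {
        return hostVisibleDeviceMemory ? nullptr : this;
    }
};

void Upload(BufferModel &dst, const void *src, std::size_t size) {
    // Step 1 (always): fill the CPU-visible address.
    std::memcpy(dst.CpuStagingAddress(), src, size);

    // Step 2 (discrete GPUs only): hop from staging into device-local memory.
    if (BufferModel *staging = dst.StagingBuffer()) {
        (void)staging;  // a vkCmdCopyBuffer would be recorded here
        std::printf("recorded staging -> device copy of %zu bytes\n", size);
    } else {
        std::printf("UMA/ReBAR: memcpy already wrote device memory\n");
    }
}

int main() {
    BufferModel uma{std::vector<unsigned char>(64), true};
    BufferModel discrete{std::vector<unsigned char>(64), false};
    const char payload[] = "vertex data";
    Upload(uma, payload, sizeof payload);
    Upload(discrete, payload, sizeof payload);
    return 0;
}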
@@ -467,43 +467,44 @@ HgiVulkanBlitCmds::CopyBufferGpuToCpu(HgiBufferGpuToCpuOp const& copyOp)
    HgiVulkanBuffer* buffer = static_cast<HgiVulkanBuffer*>(
        copyOp.gpuSourceBuffer.Get());

-   // Make sure there is a staging buffer in the buffer by asking for cpuAddr.
-   void* cpuAddress = buffer->GetCPUStagingAddress();
-   HgiVulkanBuffer* stagingBuffer = buffer->GetStagingBuffer();
-   if (!TF_VERIFY(stagingBuffer)) {
-       return;
+   // Schedule copy data from device-local buffer to staging buffer if needed.
+   // With UMA/ReBAR, the staging address is already the device buffer, so no
+   // additional copy is necessary.
+   size_t srcOffset = copyOp.sourceByteOffset;
+   if (HgiVulkanBuffer* stagingBuffer = buffer->GetStagingBuffer()) {
+       // Copy from device-local GPU buffer into CPU staging buffer
+       VkBufferCopy copyRegion = {};
+       copyRegion.srcOffset = srcOffset;
+       // No need to use dst offset during intermediate step of copying into
+       // staging buffer.
+       copyRegion.dstOffset = 0;
+       copyRegion.size = copyOp.byteSize;
+       vkCmdCopyBuffer(
+           _commandBuffer->GetVulkanCommandBuffer(),
+           buffer->GetVulkanBuffer(),
+           stagingBuffer->GetVulkanBuffer(),
+           1,
+           &copyRegion);
+       // No need to offset into the staging buffer for the next copy.
+       srcOffset = 0;
    }

-   // Copy from device-local GPU buffer into GPU staging buffer
-   VkBufferCopy copyRegion = {};
-   copyRegion.srcOffset = copyOp.sourceByteOffset;
-   // No need to use dst offset during intermediate step of copying into
-   // staging buffer.
-   copyRegion.dstOffset = 0;
-   copyRegion.size = copyOp.byteSize;
-   vkCmdCopyBuffer(
-       _commandBuffer->GetVulkanCommandBuffer(),
-       buffer->GetVulkanBuffer(),
-       stagingBuffer->GetVulkanBuffer(),
-       1,
-       &copyRegion);
-
-   // Next schedule a callback when the above GPU-GPU copy completes.
+   // Next schedule a callback when the above GPU-CPU copy completes.

    // Offset into the dst buffer
-   char* dst = ((char*) copyOp.cpuDestinationBuffer) +
+   const auto dst = static_cast<std::byte*>(copyOp.cpuDestinationBuffer) +
        copyOp.destinationByteOffset;

-   // No need to offset into src buffer since we copied into staging buffer
-   // without dst offset.
-   const char* src = ((const char*) cpuAddress);
+   const auto src =
+       static_cast<const std::byte*>(buffer->GetCPUStagingAddress()) +
+       srcOffset;

    // bytes to copy
-   size_t size = copyOp.byteSize;
+   const size_t size = copyOp.byteSize;

    // Copy to cpu buffer when cmd buffer has been executed
    _commandBuffer->AddCompletedHandler(
-       [dst, src, size]{ memcpy(dst, src, size);}
+       [dst, src, size]{ memcpy(dst, src, size); }
    );
}
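The readback side stays asynchronous: the lambda captures dst, src, and size when the copy is recorded, but the memcpy runs only once the command buffer completes. A self-contained model of such a completion-handler queue (hypothetical type; in the real code the handlers fire after the command buffer's fence signals):

#include <cstddef>
#include <cstdio>
#include <cstring>
#include <functional>
#include <utility>
#include <vector>

// Hypothetical stand-in for HgiVulkanCommandBuffer's completion callbacks.
struct CommandBufferModel {
    std::vector<std::function<void()>> completedHandlers;

    void AddCompletedHandler(std::function<void()> fn) {
        completedHandlers.push_back(std::move(fn));
    }

    // In the real code the handlers run once the submitted command buffer's
    // fence signals; here we simply invoke them on demand.
    void SimulateCompletion() {
        for (auto const &fn : completedHandlers) fn();
        completedHandlers.clear();
    }
};

int main() {
    char gpuVisible[] = "readback payload";  // plays the staging memory
    char cpuDestination[sizeof gpuVisible] = {};

    CommandBufferModel cmdBuf;
    char *dst = cpuDestination;
    const char *src = gpuVisible;
    const std::size_t size = sizeof gpuVisible;

    // Recorded now, executed only at completion, mirroring the diff above.
    cmdBuf.AddCompletedHandler([dst, src, size] { std::memcpy(dst, src, size); });

    std::printf("before completion: '%s'\n", cpuDestination);
    cmdBuf.SimulateCompletion();
    std::printf("after completion:  '%s'\n", cpuDestination);
    return 0;
}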
