Skip to content

Commit a63dd74

Browse files
MarijnS95 and claude authored
Add ComputeEncoder abstraction with parallel/serial barrier modes (llvm#1100)
Introduces a generic command encoder abstraction for recording GPU commands to a command buffer.

`ComputeEncoder` provides `dispatch()` and `barrier()` operations with two modes:
- `Parallel`: no automatic barriers; the caller manages synchronization.
- `Serial`: auto-inserts barriers between commands, using the tracked destination scope as the next source.

Each backend implements barrier tracking natively:
- Metal: `MTL::BarrierScope` accumulated on the encoder
- Vulkan: `VkPipelineStageFlags`/`VkAccessFlags` on the command buffer
- DX12: UAV barrier flag on the command buffer

VK/DX store barrier state on the command buffer (encoders hold a back-reference) so it persists across encoder lifetimes. Metal stores it on the encoder since each `MTL::ComputeCommandEncoder` is a separate native object with implicit inter-encoder ordering.

Creating a parallel encoder flushes a full barrier on VK/DX to ensure prior work is visible. `endEncoding()` flushes any pending barriers.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent a997bac commit a63dd74

7 files changed

Lines changed: 544 additions & 52 deletions

File tree

include/API/CommandBuffer.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@
1717
#define OFFLOADTEST_API_COMMANDBUFFER_H
1818

1919
#include "API/API.h"
20+
#include "API/Encoder.h"
21+
22+
#include "llvm/Support/Error.h"
23+
24+
#include <memory>
2025

2126
namespace offloadtest {
2227

@@ -30,6 +35,15 @@ class CommandBuffer {
3035
CommandBuffer &operator=(const CommandBuffer &) = delete;
3136

3237
GPUAPI getKind() const { return Kind; }
38+
39+
/// Create a compute command encoder for recording dispatch commands.
40+
/// Barriers are automatically inserted between commands.
41+
virtual llvm::Expected<std::unique_ptr<ComputeEncoder>>
42+
createComputeEncoder() {
43+
return llvm::createStringError(
44+
std::errc::not_supported,
45+
"createComputeEncoder not implemented for this backend");
46+
}
3347
};
3448

3549
} // namespace offloadtest

include/API/Device.h

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,14 +53,9 @@ struct InputLayoutDesc {
5353
std::optional<uint32_t> InstanceStepRate;
5454
};
5555

56-
struct DXBinding {
57-
uint32_t Register;
58-
uint32_t Space;
59-
};
60-
6156
struct ResourceBindingDesc {
6257
ResourceKind Kind;
63-
DXBinding DXBinding;
58+
DirectXBinding DXBinding;
6459
std::optional<VulkanBinding> VKBinding;
6560
uint32_t DescriptorCount;
6661
};

include/API/Encoder.h

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
//===- Encoder.h - Offload API Command Encoder Abstraction ----------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef OFFLOADTEST_API_ENCODER_H
10+
#define OFFLOADTEST_API_ENCODER_H
11+
12+
#include "API/API.h"
13+
14+
#include "llvm/ADT/StringRef.h"
15+
#include "llvm/Support/Error.h"
16+
17+
#include <cstddef>
18+
#include <cstdint>
19+
20+
namespace offloadtest {
21+
22+
class Buffer;
23+
24+
/// Base class for all command encoders. An encoder records commands into a
25+
/// command buffer. Call endEncoding() when done recording. Barriers are
26+
/// automatically inserted between commands.
27+
class CommandEncoder {
28+
GPUAPI API;
29+
bool Ended = false;
30+
31+
protected:
32+
/// Backend-specific cleanup. Called exactly once, either explicitly via
33+
/// endEncoding() or implicitly from the most-derived destructor.
34+
virtual void endEncodingImpl() = 0;
35+
36+
public:
37+
explicit CommandEncoder(GPUAPI API) : API(API) {}
38+
virtual ~CommandEncoder();
39+
CommandEncoder(const CommandEncoder &) = delete;
40+
CommandEncoder &operator=(const CommandEncoder &) = delete;
41+
42+
GPUAPI getAPI() const { return API; }
43+
bool isEnded() const { return Ended; }
44+
45+
/// Copy \p Size bytes from \p Src at \p SrcOffset to \p Dst at
46+
/// \p DstOffset.
47+
virtual llvm::Error copyBufferToBuffer(Buffer &Src, size_t SrcOffset,
48+
Buffer &Dst, size_t DstOffset,
49+
size_t Size) = 0;
50+
51+
/// Begin a named debug group. Visible in GPU debuggers (PIX, RenderDoc,
52+
/// Xcode). Must be balanced by a corresponding popDebugGroup() call.
53+
virtual void pushDebugGroup(llvm::StringRef Label) {}
54+
55+
/// End the most recently pushed debug group.
56+
virtual void popDebugGroup() {}
57+
58+
/// Insert a point-in-time debug marker.
59+
virtual void insertDebugSignpost(llvm::StringRef Label) {}
60+
61+
/// Finish recording. No further commands may be recorded after this call.
62+
/// Idempotent: safe to call more than once. If not called explicitly, the
63+
/// most-derived destructor invokes it as a safeguard against leaked open
64+
/// encoders.
65+
void endEncoding() {
66+
if (Ended)
67+
return;
68+
endEncodingImpl();
69+
Ended = true;
70+
}
71+
};
72+
73+
/// Encoder for recording compute dispatch commands.
74+
class ComputeEncoder : public CommandEncoder {
75+
public:
76+
using CommandEncoder::CommandEncoder;
77+
78+
/// Dispatch a compute grid. GroupCount specifies how many workgroups to
79+
/// launch in each dimension. The workgroup size is derived from the bound
80+
/// pipeline state (e.g. the shader's numthreads attribute).
81+
virtual llvm::Error dispatch(uint32_t GroupCountX, uint32_t GroupCountY,
82+
uint32_t GroupCountZ) = 0;
83+
};
84+
85+
} // namespace offloadtest
86+
87+
#endif // OFFLOADTEST_API_ENCODER_H

lib/API/DX/Device.cpp

Lines changed: 93 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333

3434
#include "API/Capabilities.h"
3535
#include "API/Device.h"
36+
#include "API/Encoder.h"
3637
#include "API/FormatConversion.h"
3738
#include "DXFeatures.h"
3839
#include "Support/Pipeline.h"
@@ -43,6 +44,7 @@
4344
#include "llvm/ADT/SmallVector.h"
4445
#include "llvm/Object/DXContainer.h"
4546
#include "llvm/Support/Error.h"
47+
#include "llvm/Support/FormatVariadic.h"
4648
#include "llvm/Support/Signals.h"
4749

4850
#include <atomic>
@@ -533,6 +535,8 @@ class DXCommandBuffer : public offloadtest::CommandBuffer {
533535
public:
534536
ComPtr<ID3D12CommandAllocator> Allocator;
535537
ComPtr<ID3D12GraphicsCommandList> CmdList;
538+
/// Whether a UAV barrier is pending from a prior compute command.
539+
bool PendingUAVBarrier = false;
536540

537541
static llvm::Expected<std::unique_ptr<DXCommandBuffer>>
538542
create(ComPtr<ID3D12Device> Device) {
@@ -557,6 +561,20 @@ class DXCommandBuffer : public offloadtest::CommandBuffer {
557561
return CB->getKind() == GPUAPI::DirectX;
558562
}
559563

564+
/// Marks a UAV barrier as pending on this command buffer. Nothing is recorded
/// here; flushBarrier() records the barrier and clears the flag.
void addPendingUAVBarrier() { PendingUAVBarrier = true; }
565+
566+
void flushBarrier() {
567+
if (!PendingUAVBarrier)
568+
return;
569+
const D3D12_RESOURCE_BARRIER Barrier =
570+
CD3DX12_RESOURCE_BARRIER::UAV(nullptr);
571+
CmdList->ResourceBarrier(1, &Barrier);
572+
PendingUAVBarrier = false;
573+
}
574+
575+
llvm::Expected<std::unique_ptr<offloadtest::ComputeEncoder>>
576+
createComputeEncoder() override;
577+
560578
private:
561579
DXCommandBuffer() : CommandBuffer(GPUAPI::DirectX) {}
562580
};
@@ -606,6 +624,63 @@ struct DescriptorAllocator {
606624
: Heap(Heap), DescIncSize(DescIncSize), Capacity(Capacity) {}
607625
};
608626

627+
class DXComputeEncoder : public offloadtest::ComputeEncoder {
628+
DXCommandBuffer &CB;
629+
630+
void addUAVBarrier() {
631+
CB.addPendingUAVBarrier();
632+
CB.flushBarrier();
633+
}
634+
635+
public:
636+
DXComputeEncoder(DXCommandBuffer &CB)
637+
: ComputeEncoder(GPUAPI::DirectX), CB(CB) {}
638+
639+
~DXComputeEncoder() override { endEncoding(); }
640+
641+
static bool classof(const CommandEncoder *E) {
642+
return E->getAPI() == GPUAPI::DirectX;
643+
}
644+
645+
// D3D12 debug labels require WinPixEventRuntime for the proper event
646+
// encoding. Without it, BeginEvent/EndEvent/SetMarker with metadata type 0
647+
// crash the D3D12 debug layer, so leave these as no-ops for now.
648+
void pushDebugGroup(llvm::StringRef Label) override {}
649+
void popDebugGroup() override {}
650+
void insertDebugSignpost(llvm::StringRef Label) override {}
651+
652+
llvm::Error dispatch(uint32_t GroupCountX, uint32_t GroupCountY,
653+
uint32_t GroupCountZ) override {
654+
addUAVBarrier();
655+
insertDebugSignpost(llvm::formatv("Dispatch [{0},{1},{2}]", GroupCountX,
656+
GroupCountY, GroupCountZ)
657+
.str());
658+
CB.CmdList->Dispatch(GroupCountX, GroupCountY, GroupCountZ);
659+
return llvm::Error::success();
660+
}
661+
662+
llvm::Error copyBufferToBuffer(offloadtest::Buffer &Src, size_t SrcOffset,
663+
offloadtest::Buffer &Dst, size_t DstOffset,
664+
size_t Size) override {
665+
auto &DXSrc = static_cast<DXBuffer &>(Src);
666+
auto &DXDst = static_cast<DXBuffer &>(Dst);
667+
addUAVBarrier();
668+
insertDebugSignpost(llvm::formatv("CopyBuffer {0}B", Size).str());
669+
CB.CmdList->CopyBufferRegion(DXDst.Buffer.Get(), DstOffset,
670+
DXSrc.Buffer.Get(), SrcOffset, Size);
671+
return llvm::Error::success();
672+
}
673+
674+
void endEncodingImpl() override { popDebugGroup(); }
675+
};
676+
677+
/// Creates a DXComputeEncoder that records into this command buffer and opens
/// its "ComputeEncoder" debug group before handing the encoder to the caller.
llvm::Expected<std::unique_ptr<offloadtest::ComputeEncoder>>
DXCommandBuffer::createComputeEncoder() {
  auto Encoder = std::make_unique<DXComputeEncoder>(*this);
  // Opened here on behalf of the caller; the encoder's endEncodingImpl()
  // issues the matching pop.
  Encoder->pushDebugGroup("ComputeEncoder");
  return Encoder;
}
683+
609684
class DXDevice : public offloadtest::Device {
610685
private:
611686
ComPtr<IDXCoreAdapter> Adapter;
@@ -1594,25 +1669,17 @@ class DXDevice : public offloadtest::Device {
15941669
}
15951670

15961671
void addUploadBeginBarrier(InvocationState &IS, ComPtr<ID3D12Resource> R) {
1597-
const D3D12_RESOURCE_BARRIER Barrier = {
1598-
D3D12_RESOURCE_BARRIER_TYPE_TRANSITION,
1599-
D3D12_RESOURCE_BARRIER_FLAG_NONE,
1600-
{D3D12_RESOURCE_TRANSITION_BARRIER{
1601-
R.Get(), D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES,
1602-
D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_COPY_DEST}}};
1672+
const D3D12_RESOURCE_BARRIER Barrier = CD3DX12_RESOURCE_BARRIER::Transition(
1673+
R.Get(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_COPY_DEST);
16031674
IS.CB->CmdList->ResourceBarrier(1, &Barrier);
16041675
}
16051676

16061677
void addUploadEndBarrier(InvocationState &IS, ComPtr<ID3D12Resource> R,
16071678
bool IsUAV) {
1608-
const D3D12_RESOURCE_BARRIER Barrier = {
1609-
D3D12_RESOURCE_BARRIER_TYPE_TRANSITION,
1610-
D3D12_RESOURCE_BARRIER_FLAG_NONE,
1611-
{D3D12_RESOURCE_TRANSITION_BARRIER{
1612-
R.Get(), D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES,
1613-
D3D12_RESOURCE_STATE_COPY_DEST,
1614-
IsUAV ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
1615-
: D3D12_RESOURCE_STATE_GENERIC_READ}}};
1679+
const D3D12_RESOURCE_BARRIER Barrier = CD3DX12_RESOURCE_BARRIER::Transition(
1680+
R.Get(), D3D12_RESOURCE_STATE_COPY_DEST,
1681+
IsUAV ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
1682+
: D3D12_RESOURCE_STATE_GENERIC_READ);
16161683
IS.CB->CmdList->ResourceBarrier(1, &Barrier);
16171684
}
16181685

@@ -1715,10 +1782,18 @@ class DXDevice : public offloadtest::Device {
17151782
}
17161783
}
17171784

1718-
const llvm::ArrayRef<int> DispatchSize =
1719-
llvm::ArrayRef<int>(P.Shaders[0].DispatchSize);
1720-
1721-
IS.CB->CmdList->Dispatch(DispatchSize[0], DispatchSize[1], DispatchSize[2]);
1785+
{
1786+
auto EncoderOrErr = IS.CB->createComputeEncoder();
1787+
if (!EncoderOrErr)
1788+
return EncoderOrErr.takeError();
1789+
auto &Encoder = *EncoderOrErr.get();
1790+
const llvm::ArrayRef<int> DispatchSize =
1791+
llvm::ArrayRef<int>(P.Shaders[0].DispatchSize);
1792+
if (auto Err = Encoder.dispatch(DispatchSize[0], DispatchSize[1],
1793+
DispatchSize[2]))
1794+
return Err;
1795+
Encoder.endEncoding();
1796+
}
17221797

17231798
auto CopyBackResource = [&IS, this](ResourcePair &R) {
17241799
if (R.first->isTexture()) {

lib/API/Device.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
//===----------------------------------------------------------------------===//
1111

1212
#include "API/Device.h"
13+
#include "API/Encoder.h"
1314
#include "API/FormatConversion.h"
1415

1516
#include "Config.h"
@@ -21,6 +22,8 @@
2122

2223
using namespace offloadtest;
2324

25+
// Out-of-line definition with an empty body: ending an encoder that is still
// open is left to the derived-class destructors, it is not done here.
CommandEncoder::~CommandEncoder() {}
26+
2427
Buffer::~Buffer() {}
2528

2629
CommandBuffer::~CommandBuffer() {}

0 commit comments

Comments
 (0)