Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions enzyme/Enzyme/AdjointGenerator.h
Original file line number Diff line number Diff line change
Expand Up @@ -4038,10 +4038,14 @@ class AdjointGenerator : public llvm::InstVisitor<AdjointGenerator> {
case Intrinsic::nvvm_membar_cta:
case Intrinsic::nvvm_membar_gl:
case Intrinsic::nvvm_membar_sys: {
SmallVector<Value *, 1> args = {};
auto cal = cast<CallInst>(
Builder2.CreateCall(getIntrinsicDeclaration(M, ID), args));
cal->setCallingConv(getIntrinsicDeclaration(M, ID)->getCallingConv());
auto &Call = cast<CallBase>(I);
SmallVector<Value *, 4> args;
args.reserve(Call.arg_size());
for (unsigned i = 0; i < Call.arg_size(); ++i)
args.push_back(gutils->getNewFromOriginal(Call.getArgOperand(i)));
auto *Fn = getIntrinsicDeclaration(M, ID);
auto cal = cast<CallInst>(Builder2.CreateCall(Fn, args));
cal->setCallingConv(Fn->getCallingConv());
cal->setDebugLoc(gutils->getNewFromOriginal(I.getDebugLoc()));
return false;
}
Expand Down
9 changes: 8 additions & 1 deletion enzyme/Enzyme/EnzymeLogic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4605,10 +4605,17 @@ Function *EnzymeLogic::CreatePrimalAndGradient(
auto BarrierInst = Arch == Triple::amdgcn
? (llvm::Intrinsic::ID)Intrinsic::amdgcn_s_barrier
: (llvm::Intrinsic::ID)Intrinsic::nvvm_barrier0;
#endif
SmallVector<Value *, 1> BarrierArgs;
#if LLVM_VERSION_MAJOR > 20
if (Arch != Triple::amdgcn) {
BarrierArgs.push_back(ConstantInt::get(
Type::getInt32Ty(gutils->newFunc->getContext()), 0));
}
#endif
instbuilder.CreateCall(
getIntrinsicDeclaration(gutils->newFunc->getParent(), BarrierInst),
{});
BarrierArgs);
OldEntryInsts->moveAfter(entry);
sharedBlock->moveAfter(entry);
IRBuilder<> sbuilder(sharedBlock);
Expand Down
106 changes: 106 additions & 0 deletions enzyme/test/Enzyme/ReverseMode/cuda-barrier-sync.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
; RUN: split-file %s %t
; RUN: if [ %llvmver -le 20 ]; then %opt < %t/llvm20.ll %newLoadEnzyme -enzyme-preopt=false -enzyme-detect-readthrow=0 -passes="enzyme" -S | FileCheck %t/llvm20.ll; fi
; RUN: if [ %llvmver -gt 20 ]; then %opt < %t/llvm21plus.ll %newLoadEnzyme -enzyme-preopt=false -enzyme-detect-readthrow=0 -passes="enzyme" -S | FileCheck %t/llvm21plus.ll; fi

;--- llvm20.ll
target triple = "nvptx64-nvidia-cuda"

declare void @llvm.nvvm.barrier0()
declare float @__enzyme_autodiff(float (float)*, ...)

; Function under differentiation for the LLVM <= 20 half of this test:
; it calls the argument-less @llvm.nvvm.barrier0 intrinsic and then
; returns %x + 1.0. The FileCheck patterns for @diffef_sync further down
; look for two barrier0 calls in the generated derivative.
define float @f_sync(float %x) {
entry:
call void @llvm.nvvm.barrier0()
%res = fadd float %x, 1.000000e+00
ret float %res
}

; Driver: hands @f_sync and the input %x to @__enzyme_autodiff and returns
; whatever that call produces.
define float @test(float %x) {
entry:
%r = call float (float (float)*, ...) @__enzyme_autodiff(float (float)* @f_sync, float %x)
ret float %r
}

; CHECK: define internal { float } @diffef_sync(float %x, float %differeturn)
; CHECK: call void @llvm.nvvm.barrier0()
; CHECK: call void @llvm.nvvm.barrier0()
; CHECK: ret { float }

;--- llvm21plus.ll
target triple = "nvptx64-nvidia-cuda"

declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32)
declare void @llvm.nvvm.barrier.cta.sync.aligned.count(i32, i32)
declare float @__enzyme_autodiff(...)

; Function under differentiation: calls
; @llvm.nvvm.barrier.cta.sync.aligned.all with the constant i32 operand 7,
; then returns %x + 1.0. The patterns for @diffef_sync_all further down
; look for two calls that keep the same constant operand.
define float @f_sync_all(float %x) {
entry:
call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 7)
%res = fadd float %x, 1.000000e+00
ret float %res
}

; Driver: hands @f_sync_all and the input %x to @__enzyme_autodiff and
; returns whatever that call produces.
define float @test_all(float %x) {
entry:
%r = call float (float (float)*, ...) @__enzyme_autodiff(float (float)* @f_sync_all, float %x)
ret float %r
}

; Same shape as @f_sync_all, but the intrinsic operand is the function
; argument %id instead of a constant. The patterns for
; @diffef_sync_all_dyn further down look for two calls that pass %id.
define float @f_sync_all_dyn(i32 %id, float %x) {
entry:
call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 %id)
%res = fadd float %x, 1.000000e+00
ret float %res
}

; Driver: forwards @f_sync_all_dyn together with %id and %x to
; @__enzyme_autodiff and returns whatever that call produces.
define float @test_all_dyn(i32 %id, float %x) {
entry:
%r = call float (float (i32, float)*, ...) @__enzyme_autodiff(float (i32, float)* @f_sync_all_dyn, i32 %id, float %x)
ret float %r
}

; Function under differentiation: calls
; @llvm.nvvm.barrier.cta.sync.aligned.count with the constant i32 operands
; (7, 16), then returns %x + 1.0. The patterns for @diffef_sync_count
; further down look for two calls that keep both constants.
define float @f_sync_count(float %x) {
entry:
call void @llvm.nvvm.barrier.cta.sync.aligned.count(i32 7, i32 16)
%res = fadd float %x, 1.000000e+00
ret float %res
}

; Driver: hands @f_sync_count and the input %x to @__enzyme_autodiff and
; returns whatever that call produces.
define float @test_count(float %x) {
entry:
%r = call float (float (float)*, ...) @__enzyme_autodiff(float (float)* @f_sync_count, float %x)
ret float %r
}

; Same shape as @f_sync_count, but both intrinsic operands are the
; function arguments %id and %n instead of constants. The patterns for
; @diffef_sync_count_dyn further down look for two calls that pass
; (%id, %n).
define float @f_sync_count_dyn(i32 %id, i32 %n, float %x) {
entry:
call void @llvm.nvvm.barrier.cta.sync.aligned.count(i32 %id, i32 %n)
%res = fadd float %x, 1.000000e+00
ret float %res
}

; Driver: forwards @f_sync_count_dyn together with %id, %n and %x to
; @__enzyme_autodiff and returns whatever that call produces.
define float @test_count_dyn(i32 %id, i32 %n, float %x) {
entry:
%r = call float (float (i32, i32, float)*, ...) @__enzyme_autodiff(float (i32, i32, float)* @f_sync_count_dyn, i32 %id, i32 %n, float %x)
ret float %r
}

; CHECK: define internal { float } @diffef_sync_all(float %x, float %differeturn)
; CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 7)
; CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 7)
; CHECK: ret { float }

; CHECK: define internal { float } @diffef_sync_all_dyn(i32 %id, float %x, float %differeturn)
; CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 %id)
; CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 %id)
; CHECK: ret { float }

; CHECK: define internal { float } @diffef_sync_count(float %x, float %differeturn)
; CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.count(i32 7, i32 16)
; CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.count(i32 7, i32 16)
; CHECK: ret { float }

; CHECK: define internal { float } @diffef_sync_count_dyn(i32 %id, i32 %n, float %x, float %differeturn)
; CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.count(i32 %id, i32 %n)
; CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.count(i32 %id, i32 %n)
; CHECK: ret { float }
Loading