Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions enzyme/Enzyme/DiffeGradientUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,67 @@ SmallVector<SelectInst *, 4> DiffeGradientUtils::addToDiffe(
return res;
}

if (auto VecT = dyn_cast<VectorType>(VT)) {
if (!VecT->getElementCount().isScalable()) {
Type *elemTy = VecT->getElementType();
auto elemBytes = (DL.getTypeSizeInBits(elemTy) + 7) / 8;

// Only handle element-aligned windows
if (elemBytes != 0 && start % elemBytes == 0 && size % elemBytes == 0) {
unsigned left_idx = start / elemBytes;
unsigned right_idx = (start + size) / elemBytes; // exclusive

unsigned numElts = VecT->getElementCount().getFixedValue();
if (left_idx > numElts)
left_idx = numElts;
if (right_idx > numElts)
right_idx = numElts;

auto maskVec = [&](Value *dsub) -> Value * {
if (left_idx == 0 && right_idx == numElts)
return dsub;
Value *masked = Constant::getNullValue(VT);
for (unsigned i = left_idx; i < right_idx; i++) {
Comment thread
minansys marked this conversation as resolved.
Value *vidx =
ConstantInt::get(Type::getInt32Ty(val->getContext()), i);
Value *el = BuilderM.CreateExtractElement(dsub, vidx);
masked = BuilderM.CreateInsertElement(masked, el, vidx);
}
return masked;
};

if (getWidth() == 1) {
SmallVector<unsigned, 1> eidxs;
for (auto idx : idxs.slice(ignoreFirstSlicesOfDif))
eidxs.push_back((unsigned)cast<ConstantInt>(idx)->getZExtValue());

Value *subdif = extractMeta(BuilderM, dif, eidxs);
return addToDiffe(val, maskVec(subdif), BuilderM, addingType, idxs,
mask);
} else {
SmallVector<SelectInst *, 4> res;
for (unsigned j = 0; j < getWidth(); j++) {
SmallVector<Value *, 1> lidxs;
SmallVector<unsigned, 1> eidxs = {(unsigned)j};

lidxs.push_back(
ConstantInt::get(Type::getInt32Ty(val->getContext()), j));
for (auto idx : idxs.slice(ignoreFirstSlicesOfDif))
eidxs.push_back((unsigned)cast<ConstantInt>(idx)->getZExtValue());
for (auto idx : idxs)
lidxs.push_back(idx);

Value *subdif = extractMeta(BuilderM, dif, eidxs);
for (auto v : addToDiffe(val, maskVec(subdif), BuilderM, addingType,
lidxs, mask))
res.push_back(v);
}
return res;
}
}
}
}

llvm::errs() << " VT: " << *VT << " idxs:{";
for (auto idx : idxs)
llvm::errs() << *idx << ",";
Expand Down
39 changes: 39 additions & 0 deletions enzyme/test/Enzyme/ReverseMode/partial_vec_window.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
; RUN: %opt < %s %newLoadEnzyme -passes="enzyme,function(mem2reg,early-cse,sroa,instsimplify,%simplifycfg,adce)" -enzyme-preopt=false -S | FileCheck %s
; Regression test: partial-window accumulation into a fixed vector (<2 x float>).
; Previously asserted: "unhandled accumulate with partial sizes".

source_filename = "partial_vec_window"
target triple = "x86_64-pc-linux-gnu"

%ret2v = type { <2 x float>, <2 x float> }

; Builds a %ret2v aggregate from a scalar: field 0 is a <2 x float> with %x in
; lane 0 and 0.0 in lane 1; field 1 is an all-zero <2 x float>.
define %ret2v @make(float %x) {
entry:
; Lane 0 receives %x; lane 1 keeps the 0.0 from zeroinitializer.
%v0 = insertelement <2 x float> zeroinitializer, float %x, i32 0
; Start from undef and fill both fields so the returned aggregate is fully defined.
%r0 = insertvalue %ret2v undef, <2 x float> %v0, 0
%r1 = insertvalue %ret2v %r0, <2 x float> zeroinitializer, 1
ret %ret2v %r1
}

; Function under differentiation. Takes field 0 of @make(%x), spills that
; <2 x float> to a stack slot, and returns the float read back from the start
; of the slot. The 4-byte load covers only part of the 8-byte vector store.
define float @tester(float %x) {
entry:
%call = call %ret2v @make(float %x)
%vec = extractvalue %ret2v %call, 0

; Force "partial" use: only the first 4 bytes of the <2 x float>
%tmp = alloca <2 x float>, align 8
store <2 x float> %vec, <2 x float>* %tmp, align 8
; Reinterpret the vector slot as a float pointer so the load is one element wide.
%fp = bitcast <2 x float>* %tmp to float*
%a = load float, float* %fp, align 4

ret float %a
}

; Driver for the test: calls the variadic __enzyme_autodiff declaration with
; @tester and %x, and returns the float it yields. The Enzyme pass named in
; the RUN line is expected to rewrite this call (presumably into a call to the
; generated gradient of @tester).
define float @test_derivative(float %x) {
entry:
%d = call float (float (float)*, ...) @__enzyme_autodiff(float (float)* @tester, float %x)
ret float %d
}

declare float @__enzyme_autodiff(float (float)*, ...)
; CHECK: @diffetester
Comment thread
minansys marked this conversation as resolved.
60 changes: 60 additions & 0 deletions enzyme/test/Enzyme/ReverseModeVector/partial_vec_window.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
; RUN: if [ %llvmver -lt 16 ]; then %opt < %s %loadEnzyme -enzyme -enzyme-preopt=false -mem2reg -simplifycfg -early-cse -S | FileCheck %s; fi
; RUN: %opt < %s %newLoadEnzyme -passes="enzyme,function(mem2reg,%simplifycfg,early-cse)" -enzyme-preopt=false -S | FileCheck %s

; Regression test: reverse vector mode handles partial-window accumulation into a fixed vector.

%struct.Gradients = type { [2 x float] }
%ret2v = type { <2 x float>, <2 x float> }

declare %struct.Gradients @__enzyme_autodiff(float (float)*, ...)

; Builds a %ret2v aggregate from a scalar: field 0 is a <2 x float> with %x in
; lane 0 and 0.0 in lane 1; field 1 is an all-zero <2 x float>.
define %ret2v @make(float %x) {
entry:
; Lane 0 receives %x; lane 1 keeps the 0.0 from zeroinitializer.
%v0 = insertelement <2 x float> zeroinitializer, float %x, i32 0
; Start from undef and fill both fields so the returned aggregate is fully defined.
%r0 = insertvalue %ret2v undef, <2 x float> %v0, 0
%r1 = insertvalue %ret2v %r0, <2 x float> zeroinitializer, 1
ret %ret2v %r1
}

; Function under differentiation. Takes field 0 of @make(%x), spills that
; <2 x float> to a stack slot, and returns the float read back from the start
; of the slot. The 4-byte load covers only part of the 8-byte vector store.
; Value names here (%call, %vec, %tmp) are referenced by the FileCheck
; patterns at the end of this file, so they must not be renamed.
define float @tester(float %x) {
entry:
%call = call %ret2v @make(float %x)
%vec = extractvalue %ret2v %call, 0
%tmp = alloca <2 x float>, align 8
store <2 x float> %vec, <2 x float>* %tmp, align 8
; Reinterpret the vector slot as a float pointer so the load is one element wide.
%fp = bitcast <2 x float>* %tmp to float*
%a = load float, float* %fp, align 4
ret float %a
}

; Driver for the vector-mode test: calls the variadic __enzyme_autodiff
; declaration with @tester, the metadata string "enzyme_width" followed by
; i64 2, and %x. The result is returned as %struct.Gradients, which wraps a
; [2 x float] (one float per requested width lane, going by the width of 2
; passed here).
define %struct.Gradients @test_derivative(float %x) {
entry:
%d = call %struct.Gradients (float (float)*, ...) @__enzyme_autodiff(float (float)* @tester, metadata !"enzyme_width", i64 2, float %x)
ret %struct.Gradients %d
}

; CHECK-LABEL: define internal { [2 x float] } @diffe2tester(float %x, [2 x float] %differeturn)
; CHECK: entry:
; CHECK: %"vec'de" = alloca [2 x <2 x float>]
; CHECK: %"call'de" = alloca [2 x %ret2v]
; CHECK: %"x'de" = alloca [2 x float]
; CHECK: %call_augmented = call [2 x %ret2v] @augmented_make(float %x)
; CHECK: %"tmp'ipa" = alloca <2 x float>
; CHECK: %"tmp'ipa1" = alloca <2 x float>
; CHECK: %[[D0:.+]] = extractvalue [2 x float] %differeturn, 0
; CHECK: %[[L0:.+]] = load float, {{.*}}align 4{{.*}}
; CHECK: %[[A0:.+]] = fadd fast float %[[L0]], %[[D0]]
; CHECK: store float %[[A0]], {{.*}}align 4{{.*}}
; CHECK: %[[D1:.+]] = extractvalue [2 x float] %differeturn, 1
; CHECK: %[[L1:.+]] = load float, {{.*}}align 4{{.*}}
; CHECK: %[[A1:.+]] = fadd fast float %[[L1]], %[[D1]]
; CHECK: store float %[[A1]], {{.*}}align 4{{.*}}
; CHECK: %[[V0:.+]] = load <2 x float>, {{.*}}align 8{{.*}}
; CHECK: %[[V1:.+]] = load <2 x float>, {{.*}}align 8{{.*}}
; CHECK: %[[PACK:.+]] = load [2 x <2 x float>], {{.*}}align 8
; CHECK: %[[LANE0V:.+]] = extractvalue [2 x <2 x float>] %[[PACK]], 0
; CHECK: %[[LANE0:.+]] = extractelement <2 x float> %[[LANE0V]], i32 0
; CHECK: %[[LANE1V:.+]] = extractvalue [2 x <2 x float>] %[[PACK]], 1
; CHECK: %[[LANE1:.+]] = extractelement <2 x float> %[[LANE1V]], i32 0
; CHECK: %[[MAKE:.+]] = call { [2 x float] } @diffe2make(float %x)
; CHECK: ret { [2 x float] }
Loading