#ifndef __STREAMING_CHOLESKY_HPP__
#define __STREAMING_CHOLESKY_HPP__

// SYCL and ac_types headers providing the sycl:: free functions, fpga_reg,
// ac_int and ac_complex used below
#include <sycl/sycl.hpp>
#include <sycl/ext/intel/fpga_extensions.hpp>
#include <sycl/ext/intel/ac_types/ac_complex.hpp>
#include <sycl/ext/intel/ac_types/ac_int.hpp>

#include <type_traits>

#include "constexpr_math.hpp"
#include "tuple.hpp"
#include "unrolled_loop.hpp"
namespace fpga_linalg {
/*
Cholesky decomposition - Computes L such that A=LL* where:
- A is the input matrix (hermitian, positive definite)
- L is a lower triangular matrix
- L* is the conjugate transpose of L
This function implements a modified version of the Cholesky–Banachiewicz
algorithm.
  Pseudo code (the outer loop runs over all columns of the matrix):

  for (column = 0; column < rows; column++) {
    for (row = column; row < rows; row++) {
      float sum = 0;
      for (k = 0; k < column; k++)
        sum += L[row][k] * L[column][k];

      if (row == column)
        L[row][column] = sqrt(A[row][row] - sum);
      else
        L[row][column] = (A[row][column] - sum) / L[column][column];
    }
  }
The input and output matrices are consumed/produced from/to pipes.
*/
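// Usage sketch: the pipe aliases, the kernel name and the template values
// below are hypothetical and only illustrate how this functor might be
// launched from a SYCL queue; real designs define their own pipes, sizes
// and RAW latency.
//
//   using AMatrixPipe = sycl::ext::intel::pipe<class APipeID,
//                                              fpga_tools::NTuple<float, 4>,
//                                              3>;
//   using LMatrixPipe = sycl::ext::intel::pipe<class LPipeID, float, 3>;
//
//   q.single_task<class CholeskyKernel>(
//       fpga_linalg::StreamingCholesky<float, false, 8, 16, 4,
//                                      AMatrixPipe, LMatrixPipe>{});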
template <typename T, // The datatype for the computation
bool is_complex, // True if T is ac_complex<X>
int rows, // Number of rows==columns in the A matrices
int raw_latency, // Read after write (RAW) latency (in iterations) of
// the triangular loop of this function.
// This value depends on the FPGA target, the
// datatype, the target frequency, etc.
// This value will have to be tuned for optimal
// performance. Refer to the Triangular Loop
// design pattern tutorial.
// In general, find a high value for which the
// compiler is able to achieve an II of 1 and
// go down from there.
          int pipe_size,    // Number of elements read/written per pipe
                            // operation to read the input matrix
          typename AIn,     // A matrix input pipe, receives pipe_size
                            // elements from the pipe with each read
          typename LOut     // L matrix output pipe, sends one element to the
                            // pipe with each write.
                            // Only the lower-left elements of L are
                            // sent in row order, starting with row 0.
>
struct StreamingCholesky {
void operator()() const {
// Functional assertions
static_assert(rows >= 4,
"Only matrices of size 4x4 and over are supported");
static_assert(pipe_size >= 1,
"The pipe must be able to contain at least one element");
// Set the computation type to T or ac_complex<T> depending on the value
// of is_complex
using TT = std::conditional_t<is_complex, ac_complex<T>, T>;
constexpr int kColumns = rows;
// Number of lower-left elements in the L output matrix
constexpr int kLMatrixSize = kColumns * (kColumns + 1) / 2;
// Compute Cholesky decompositions as long as matrices are given as inputs
while (1) {
// Break memories up to store pipe_size elements per bank
constexpr short kBankwidth = pipe_size * sizeof(TT);
constexpr unsigned short kNumBanks = rows / pipe_size;
// When specifying numbanks for a memory, it must be a power of 2.
// Unused banks will be automatically optimized away.
constexpr short kNumBanksNextPow2 =
fpga_tools::Pow2(fpga_tools::CeilLog2(kNumBanks));
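      // Illustration (assumed values): with TT = float, rows = 8 and
      // pipe_size = 4, kBankwidth is 16 bytes and kNumBanks is 2, which is
      // already a power of 2.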
[[intel::numbanks(kNumBanksNextPow2)]] // NO-FORMAT: Attribute
[[intel::bankwidth(kBankwidth)]] // NO-FORMAT: Attribute
[[intel::private_copies(4)]] // NO-FORMAT: Attribute
[[intel::max_replicates(1)]] // NO-FORMAT: Attribute
TT a_load[rows][kColumns];
// Two copies of L to be able to load two complete rows per iteration
// Multiple private copies to be able to overlap multiple loop
// iterations
[[intel::private_copies(4)]] // NO-FORMAT: Attribute
TT l_result_compute[rows][kColumns];
[[intel::private_copies(4)]] // NO-FORMAT: Attribute
TT l_result_compute_copy[rows][kColumns];
[[intel::private_copies(2)]] // NO-FORMAT: Attribute
TT l_result[kLMatrixSize];
// Copy a matrix from the pipe to a local memory
// Number of pipe reads of pipe_size required to read a full column
constexpr int kExtraIteration = ((rows % pipe_size) != 0) ? 1 : 0;
constexpr int kLoopIterPerColumn = (rows / pipe_size) + kExtraIteration;
// Number of pipe reads of pipe_size to read all the matrices
constexpr int kLoopIter = kLoopIterPerColumn * kColumns;
// Size in bits of the loop iterator over kLoopIter iterations
constexpr int kLoopIterBitSize =
fpga_tools::BitsForMaxValue<kLoopIter + 1>();
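      // Illustration (assumed values): with rows = 8 and pipe_size = 4,
      // kLoopIterPerColumn = 2 and kLoopIter = 16, i.e. sixteen pipe reads
      // of four elements each bring in one full 8x8 matrix.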
[[intel::initiation_interval(1)]] // NO-FORMAT: Attribute
for (ac_int<kLoopIterBitSize, false> li = 0; li < kLoopIter; li++) {
fpga_tools::NTuple<TT, pipe_size> pipe_read = AIn::read();
int write_idx = li % kLoopIterPerColumn;
fpga_tools::UnrolledLoop<kLoopIterPerColumn>([&](auto k) {
fpga_tools::UnrolledLoop<pipe_size>([&](auto t) {
if constexpr (k * pipe_size + t < kColumns) {
if (write_idx == k) {
a_load[li / kLoopIterPerColumn][k * pipe_size + t] =
pipe_read.template get<t>();
}
}
// Delay data signals to create a vine-based data distribution
// to lower signal fanout.
pipe_read.template get<t>() =
sycl::ext::intel::fpga_reg(pipe_read.template get<t>());
});
write_idx = sycl::ext::intel::fpga_reg(write_idx);
});
}
// Computation of the number of iterations required for the triangular
// loop. Refer to the triangular_loop tutorial for details on how
// to compute these.
constexpr int kRegularIterations = kColumns * (kColumns + 1) / 2;
constexpr int kExtraIterations = (raw_latency - 1) * raw_latency / 2;
constexpr int kExtraIterationsToRemove =
kColumns >= raw_latency
? 0
: (raw_latency - kColumns) * (raw_latency - kColumns + 1) / 2;
constexpr int kTotalIterations =
kRegularIterations + kExtraIterations - kExtraIterationsToRemove;
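      // Illustration (assumed values): with rows = 8 and raw_latency = 16,
      // kRegularIterations = 36, kExtraIterations = 120 and
      // kExtraIterationsToRemove = 36, so the loop below runs for 120
      // iterations instead of the 36 strictly useful ones.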
// Compute the L matrix
int column = 0;
int row = 0;
TT div_term{0};
[[intel::ivdep(raw_latency)]] // NO-FORMAT: Attribute
for (int iteration = 0; iteration < kTotalIterations; iteration++) {
// Perform the dot product of the elements of the two rows indexed by
// row and column from element 0 to column
TT sum = 0;
fpga_tools::UnrolledLoop<kColumns>([&](auto k) {
TT to_add;
bool should_compute = k < column;
TT mul_lhs = should_compute ? l_result_compute[row][k] : T{0};
TT mul_rhs = should_compute ? l_result_compute_copy[column][k] : T{0};
if constexpr (is_complex) {
to_add = mul_lhs * mul_rhs.conj();
} else {
to_add = mul_lhs * mul_rhs;
}
sum += to_add;
});
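        // The unrolled loop above always spans all kColumns lanes and masks
        // out lanes with k >= column via should_compute; a data-dependent
        // trip count could not be fully unrolled in hardware.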
TT a_loaded = (row < rows) ? a_load[row][column] : TT{0};
TT diff = a_loaded - sum;
// Only do useful work for meaningful iterations
TT to_store;
if (row == column) {
// Perform the reciprocal sqrt rather than the sqrt because:
// - it has a shorter latency and will reduce the RAW latency
// of the loop
// - the result of the sqrt is used as a divisor which is also
// a long operation, so replacing x/sqrt by x*rsqrt will save
// latency
          // - the diagonal elements will need to be inverted later, but we
          //   can do that outside of this loop, when we transfer the L
          //   matrix to the pipe
if constexpr (is_complex) {
div_term = {sycl::rsqrt(diff.r()), 0};
} else {
div_term = sycl::rsqrt(diff);
}
to_store = div_term;
} else {
to_store = diff * div_term;
}
if (column <= row) {
// Store the results to two working copies of L to be able to read
// two complete rows at each iteration
l_result_compute[row][column] = to_store;
l_result_compute_copy[row][column] = to_store;
// Store the result to the output matrix
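          // l_result packs the lower triangle in row-major order, so element
          // (row, column) lives at index row * (row + 1) / 2 + column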
if constexpr (is_complex) {
l_result[row * (row + 1) / 2 + column] = to_store.conj();
} else {
l_result[row * (row + 1) / 2 + column] = to_store;
}
}
// Update loop indexes
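        // Triangular-loop pattern: when a new column starts, row may restart
        // below the diagonal (at rows - raw_latency). Those extra iterations
        // store nothing, since column <= row is false, and only pad the
        // distance between loop-carried dependences so that the
        // ivdep(raw_latency) declaration above remains valid.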
if (row == (rows - 1)) {
column = column + 1;
row = sycl::min(column, rows - raw_latency);
} else {
row = row + 1;
}
} // end of iteration
// Go over the L matrix and write each element to the pipe
int l_idx = 0;
[[intel::loop_coalesce(2)]] // NO-FORMAT: Attribute
for (int row = 0; row < rows; row++) {
for (int column = 0; column <= row; column++) {
TT to_write;
TT current_l_value = l_result[l_idx];
// The diagonal elements need to be inverted as the
// inversion was removed from the above compute loop
// to reduce the RAW latency
if (row == column) {
if constexpr (is_complex) {
to_write = {1 / current_l_value.r(), 0};
          } else {
to_write = 1 / current_l_value;
}
} else {
to_write = current_l_value;
}
LOut::write(to_write);
l_idx++;
}
}
} // end of while(1)
} // end of operator
}; // end of struct
} // namespace fpga_linalg
#endif /* __STREAMING_CHOLESKY_HPP__ */