Skip to content

Commit 32668a7

Browse files
authored
CPU mx.linalg.cholesky_inverse and mx.linalg.tri_inv (#1307)
* add cholesky inv + tri inv * always run tri_inv on cpu * consistent naming
1 parent 780c197 commit 32668a7

File tree

7 files changed

+267
-62
lines changed

7 files changed

+267
-62
lines changed

docs/src/python/linalg.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@ Linear Algebra
99
:toctree: _autosummary
1010

1111
inv
12+
tri_inv
1213
norm
1314
cholesky
15+
cholesky_inv
1416
qr
1517
svd

mlx/backend/common/inverse.cpp

Lines changed: 103 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,106 @@
1010
#include <lapack.h>
1111
#endif
1212

13+
// Wrapper to account for differences in
14+
// LAPACK implementations (basically how to pass the 'uplo' string to fortran).
15+
int strtri_wrapper(char uplo, char diag, float* matrix, int N) {
16+
int info;
17+
18+
#ifdef LAPACK_FORTRAN_STRLEN_END
19+
strtri_(
20+
/* uplo = */ &uplo,
21+
/* diag = */ &diag,
22+
/* N = */ &N,
23+
/* a = */ matrix,
24+
/* lda = */ &N,
25+
/* info = */ &info,
26+
/* uplo_len = */ static_cast<size_t>(1),
27+
/* diag_len = */ static_cast<size_t>(1));
28+
#else
29+
strtri_(
30+
/* uplo = */ &uplo,
31+
/* diag = */ &diag,
32+
/* N = */ &N,
33+
/* a = */ matrix,
34+
/* lda = */ &N,
35+
/* info = */ &info);
36+
#endif
37+
38+
return info;
39+
}
40+
1341
namespace mlx::core {
1442

15-
void inverse_impl(const array& a, array& inv) {
43+
void general_inv(array& inv, int N, int i) {
44+
int info;
45+
auto ipiv = array::Data{allocator::malloc_or_wait(sizeof(int) * N)};
46+
// Compute LU factorization.
47+
sgetrf_(
48+
/* m = */ &N,
49+
/* n = */ &N,
50+
/* a = */ inv.data<float>() + N * N * i,
51+
/* lda = */ &N,
52+
/* ipiv = */ static_cast<int*>(ipiv.buffer.raw_ptr()),
53+
/* info = */ &info);
54+
55+
if (info != 0) {
56+
std::stringstream ss;
57+
ss << "inverse_impl: LU factorization failed with error code " << info;
58+
throw std::runtime_error(ss.str());
59+
}
60+
61+
static const int lwork_query = -1;
62+
float workspace_size = 0;
63+
64+
// Compute workspace size.
65+
sgetri_(
66+
/* m = */ &N,
67+
/* a = */ nullptr,
68+
/* lda = */ &N,
69+
/* ipiv = */ nullptr,
70+
/* work = */ &workspace_size,
71+
/* lwork = */ &lwork_query,
72+
/* info = */ &info);
73+
74+
if (info != 0) {
75+
std::stringstream ss;
76+
ss << "inverse_impl: LU workspace calculation failed with error code "
77+
<< info;
78+
throw std::runtime_error(ss.str());
79+
}
80+
81+
const int lwork = workspace_size;
82+
auto scratch = array::Data{allocator::malloc_or_wait(sizeof(float) * lwork)};
83+
84+
// Compute inverse.
85+
sgetri_(
86+
/* m = */ &N,
87+
/* a = */ inv.data<float>() + N * N * i,
88+
/* lda = */ &N,
89+
/* ipiv = */ static_cast<int*>(ipiv.buffer.raw_ptr()),
90+
/* work = */ static_cast<float*>(scratch.buffer.raw_ptr()),
91+
/* lwork = */ &lwork,
92+
/* info = */ &info);
93+
94+
if (info != 0) {
95+
std::stringstream ss;
96+
ss << "inverse_impl: inversion failed with error code " << info;
97+
throw std::runtime_error(ss.str());
98+
}
99+
}
100+
101+
void tri_inv(array& inv, int N, int i, bool upper) {
102+
const char uplo = upper ? 'L' : 'U';
103+
const char diag = 'N';
104+
int info = strtri_wrapper(uplo, diag, inv.data<float>() + N * N * i, N);
105+
if (info != 0) {
106+
std::stringstream ss;
107+
ss << "inverse_impl: triangular inversion failed with error code " << info;
108+
throw std::runtime_error(ss.str());
109+
}
110+
}
111+
112+
void inverse_impl(const array& a, array& inv, bool tri, bool upper) {
16113
// Lapack uses the column-major convention. We take advantage of the following
17114
// identity to avoid transposing (see
18115
// https://math.stackexchange.com/a/340234):
@@ -24,63 +121,11 @@ void inverse_impl(const array& a, array& inv) {
24121
const int N = a.shape(-1);
25122
const size_t num_matrices = a.size() / (N * N);
26123

27-
int info;
28-
auto ipiv = array::Data{allocator::malloc_or_wait(sizeof(int) * N)};
29-
30124
for (int i = 0; i < num_matrices; i++) {
31-
// Compute LU factorization.
32-
sgetrf_(
33-
/* m = */ &N,
34-
/* n = */ &N,
35-
/* a = */ inv.data<float>() + N * N * i,
36-
/* lda = */ &N,
37-
/* ipiv = */ static_cast<int*>(ipiv.buffer.raw_ptr()),
38-
/* info = */ &info);
39-
40-
if (info != 0) {
41-
std::stringstream ss;
42-
ss << "inverse_impl: LU factorization failed with error code " << info;
43-
throw std::runtime_error(ss.str());
44-
}
45-
46-
static const int lwork_query = -1;
47-
float workspace_size = 0;
48-
49-
// Compute workspace size.
50-
sgetri_(
51-
/* m = */ &N,
52-
/* a = */ nullptr,
53-
/* lda = */ &N,
54-
/* ipiv = */ nullptr,
55-
/* work = */ &workspace_size,
56-
/* lwork = */ &lwork_query,
57-
/* info = */ &info);
58-
59-
if (info != 0) {
60-
std::stringstream ss;
61-
ss << "inverse_impl: LU workspace calculation failed with error code "
62-
<< info;
63-
throw std::runtime_error(ss.str());
64-
}
65-
66-
const int lwork = workspace_size;
67-
auto scratch =
68-
array::Data{allocator::malloc_or_wait(sizeof(float) * lwork)};
69-
70-
// Compute inverse.
71-
sgetri_(
72-
/* m = */ &N,
73-
/* a = */ inv.data<float>() + N * N * i,
74-
/* lda = */ &N,
75-
/* ipiv = */ static_cast<int*>(ipiv.buffer.raw_ptr()),
76-
/* work = */ static_cast<float*>(scratch.buffer.raw_ptr()),
77-
/* lwork = */ &lwork,
78-
/* info = */ &info);
79-
80-
if (info != 0) {
81-
std::stringstream ss;
82-
ss << "inverse_impl: inversion failed with error code " << info;
83-
throw std::runtime_error(ss.str());
125+
if (tri) {
126+
tri_inv(inv, N, i, upper);
127+
} else {
128+
general_inv(inv, N, i);
84129
}
85130
}
86131
}
@@ -89,7 +134,7 @@ void Inverse::eval(const std::vector<array>& inputs, array& output) {
89134
if (inputs[0].dtype() != float32) {
90135
throw std::runtime_error("[Inverse::eval] only supports float32.");
91136
}
92-
inverse_impl(inputs[0], output);
137+
inverse_impl(inputs[0], output, tri_, upper_);
93138
}
94139

95140
} // namespace mlx::core

mlx/linalg.cpp

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,7 @@ std::vector<array> svd(const array& a, StreamOrDevice s /* = {} */) {
238238
{a});
239239
}
240240

241-
array inv(const array& a, StreamOrDevice s /* = {} */) {
241+
array inv_impl(const array& a, bool tri, bool upper, StreamOrDevice s) {
242242
if (a.dtype() != float32) {
243243
std::ostringstream msg;
244244
msg << "[linalg::inv] Arrays must type float32. Received array "
@@ -258,7 +258,21 @@ array inv(const array& a, StreamOrDevice s /* = {} */) {
258258
}
259259

260260
return array(
261-
a.shape(), a.dtype(), std::make_shared<Inverse>(to_stream(s)), {a});
261+
a.shape(),
262+
a.dtype(),
263+
std::make_shared<Inverse>(to_stream(s), tri, upper),
264+
{a});
265+
}
266+
267+
array inv(const array& a, StreamOrDevice s /* = {} */) {
268+
return inv_impl(a, /*tri=*/false, /*upper=*/true, s);
269+
}
270+
271+
array tri_inv(
272+
const array& a,
273+
bool upper /* = true */,
274+
StreamOrDevice s /* = {} */) {
275+
return inv_impl(a, /*tri=*/true, upper, s);
262276
}
263277

264278
array cholesky(
@@ -292,4 +306,37 @@ array cholesky(
292306
{a});
293307
}
294308

309+
array cholesky_inv(
310+
const array& L,
311+
bool upper /* = false */,
312+
StreamOrDevice s /* = {} */) {
313+
if (L.dtype() != float32) {
314+
std::ostringstream msg;
315+
msg << "[linalg::cholesky] Arrays must type float32. Received array "
316+
<< "with type " << L.dtype() << ".";
317+
throw std::invalid_argument(msg.str());
318+
}
319+
320+
if (L.ndim() < 2) {
321+
std::ostringstream msg;
322+
msg << "[linalg::cholesky] Arrays must have >= 2 dimensions. Received array "
323+
"with "
324+
<< L.ndim() << " dimensions.";
325+
throw std::invalid_argument(msg.str());
326+
}
327+
328+
if (L.shape(-1) != L.shape(-2)) {
329+
throw std::invalid_argument(
330+
"[linalg::cholesky] Cholesky inverse is only defined for square "
331+
"matrices.");
332+
}
333+
334+
array L_inv = tri_inv(L, upper, s);
335+
if (upper) {
336+
return matmul(L_inv, swapaxes(L_inv, -1, -2, s), s);
337+
} else {
338+
return matmul(swapaxes(L_inv, -1, -2, s), L_inv, s);
339+
}
340+
}
341+
295342
} // namespace mlx::core::linalg

mlx/linalg.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,10 @@ std::vector<array> svd(const array& a, StreamOrDevice s = {});
6666

6767
array inv(const array& a, StreamOrDevice s = {});
6868

69+
array tri_inv(const array& a, bool upper = false, StreamOrDevice s = {});
70+
6971
array cholesky(const array& a, bool upper = false, StreamOrDevice s = {});
7072

73+
array cholesky_inv(const array& a, bool upper = false, StreamOrDevice s = {});
74+
7175
} // namespace mlx::core::linalg

mlx/primitives.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2127,7 +2127,8 @@ class SVD : public Primitive {
21272127
/* Matrix inversion primitive. */
21282128
class Inverse : public UnaryPrimitive {
21292129
public:
2130-
explicit Inverse(Stream stream) : UnaryPrimitive(stream) {}
2130+
explicit Inverse(Stream stream, bool tri, bool upper)
2131+
: UnaryPrimitive(stream), tri_(tri), upper_(upper) {}
21312132

21322133
void eval_cpu(const std::vector<array>& inputs, array& output) override;
21332134
void eval_gpu(const std::vector<array>& inputs, array& output) override;
@@ -2137,6 +2138,8 @@ class Inverse : public UnaryPrimitive {
21372138

21382139
private:
21392140
void eval(const std::vector<array>& inputs, array& output);
2141+
bool tri_;
2142+
bool upper_;
21402143
};
21412144

21422145
class Cholesky : public UnaryPrimitive {

python/src/linalg.cpp

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,31 @@ void init_linalg(nb::module_& parent_module) {
261261
array: ``ainv`` such that ``dot(a, ainv) = dot(ainv, a) = eye(a.shape[0])``
262262
)pbdoc");
263263
m.def(
264+
"tri_inv",
265+
&tri_inv,
266+
"a"_a,
267+
"upper"_a,
268+
nb::kw_only(),
269+
"stream"_a = nb::none(),
270+
nb::sig(
271+
"def tri_inv(a: array, upper: bool = False, *, stream: Union[None, Stream, Device] = None) -> array"),
272+
R"pbdoc(
273+
Compute the inverse of a triangular square matrix.
274+
275+
This function supports arrays with at least 2 dimensions. When the input
276+
has more than two dimensions, the inverse is computed for each matrix
277+
in the last two dimensions of ``a``.
278+
279+
Args:
280+
a (array): Input array.
281+
upper (array): Whether the array is upper or lower triangular. Defaults to ``False``.
282+
stream (Stream, optional): Stream or device. Defaults to ``None``
283+
in which case the default stream of the default device is used.
284+
285+
Returns:
286+
array: ``ainv`` such that ``dot(a, ainv) = dot(ainv, a) = eye(a.shape[0])``
287+
)pbdoc");
288+
m.def(
264289
"cholesky",
265290
&cholesky,
266291
"a"_a,
@@ -286,8 +311,46 @@ void init_linalg(nb::module_& parent_module) {
286311
in which case the default stream of the default device is used.
287312
288313
Returns:
289-
array: If ``upper = False``, it returns a lower trinagular ``L`` matrix such
314+
array: If ``upper = False``, it returns a lower triangular ``L`` matrix such
290315
that ``dot(L, L.T) = a``. If ``upper = True``, it returns an upper triangular
291316
``U`` matrix such that ``dot(U.T, U) = a``.
292317
)pbdoc");
318+
m.def(
319+
"cholesky_inv",
320+
&cholesky_inv,
321+
"a"_a,
322+
"upper"_a = false,
323+
nb::kw_only(),
324+
"stream"_a = nb::none(),
325+
nb::sig(
326+
"def cholesky_inv(L: array, upper: bool = False, *, stream: Union[None, Stream, Device] = None) -> array"),
327+
R"pbdoc(
328+
Compute the inverse of a real symmetric positive semi-definite matrix using it's Cholesky decomposition L.
329+
330+
Let A be a real symmetric positive semi-definite matrix and L its Cholesky definition such that:
331+
332+
.. math::
333+
334+
\begin{aligned}
335+
\mathbf{A} = \mathbf{L}\mathbf{L}^T
336+
\end{aligned}
337+
338+
This function computes :math:`\mathbf{A}^{-1}`.
339+
340+
This function supports arrays with at least 2 dimensions. When the input
341+
has more than two dimensions, the Cholesky inverse is computed for each matrix
342+
in the last two dimensions of ``L``.
343+
344+
If the input matrix is not a triangular matrix behaviour is undefined.
345+
346+
Args:
347+
L (array): Input array.
348+
upper (bool, optional): If ``True``, return the upper triangular Cholesky factor.
349+
If ``False``, return the lower triangular Cholesky factor. Default: ``False``.
350+
stream (Stream, optional): Stream or device. Defaults to ``None``
351+
in which case the default stream of the default device is used.
352+
353+
Returns:
354+
array: :math:`A^{-1}` where :math:`\mathbf{A} = \mathbf{L}\mathbf{L}^T`.
355+
)pbdoc");
293356
}

0 commit comments

Comments
 (0)