Description
For a linear model that looks like:
vector mu = X * b + x1 .* r1[i1] + x2 .* r2[i2] + x3 .* r3[i3] + x4 .* r4[i4] + ...
we can get a speed boost by doing all the random effects terms together and avoiding temporaries from the multi-indexes (r1[i1]), the elementwise multiplications, and the additions.
With this model: https://github.com/bbbales2/computations_in_glms/blob/master/benchmark_model/mrp_varmat.stan
Replacing the code:
vector[N] mu_varmat = Intercept + Xc * b_varmat
+ r_age_varmat[J_age]
+ r_educ_varmat[J_educ]
+ r_educ_age_varmat[J_educ_age]
+ r_educ_eth_varmat[J_educ_eth]
+ r_eth_varmat[J_eth]
+ r_male_eth_varmat[J_male_eth]
+ r_state_varmat[J_state];
with:
vector[N] mu_varmat = Intercept + Xc * b_varmat +
ranef(J_age, r_age_varmat, J_educ, r_educ_varmat, J_educ_age, r_educ_age_varmat, J_educ_eth, r_educ_eth_varmat, J_eth, r_eth_varmat, J_male_eth, r_male_eth_varmat, J_state, r_state_varmat);
Makes the per-gradient varmat timings go from about 9ms on my computer to about 7ms when using the mrp_all.json
dataset in that repo. There's a discussion over here of a fancier ranef
function.
The very rough C++ implementation of ranef is:
/**
 * Accumulate several random-effects terms in one pass: returns a vector
 * whose n-th entry is
 *   v1[i1[n]] + v2[i2[n]] + ... + v7[i7[n]]
 * (1-based Stan indexing), avoiding the temporaries that the equivalent
 * `v1[i1] + v2[i2] + ...` expression would allocate for each multi-index,
 * elementwise multiply, and addition.
 *
 * @tparam T a `var_value` matrix type (var<mat>)
 * @param i1..i7 1-based index vectors; all must have the same length N
 * @param v1..v7 random-effect coefficient vectors indexed by i1..i7
 * @return length-N vector of summed random effects
 * @throw std::invalid_argument if the index vectors' sizes differ
 */
template <typename T,
          require_var_matrix_t<T>* = nullptr>
inline T ranef(const std::vector<int>& i1, const T& v1,
               const std::vector<int>& i2, const T& v2,
               const std::vector<int>& i3, const T& v3,
               const std::vector<int>& i4, const T& v4,
               const std::vector<int>& i5, const T& v5,
               const std::vector<int>& i6, const T& v6,
               const std::vector<int>& i7, const T& v7,
               std::ostream*) {
  // Every index vector must agree in length.  (The original draft only
  // checked i1 vs i2, and reported failures under the name "dot_product".)
  check_matching_sizes("ranef", "i1", i1, "i2", i2);
  check_matching_sizes("ranef", "i1", i1, "i3", i3);
  check_matching_sizes("ranef", "i1", i1, "i4", i4);
  check_matching_sizes("ranef", "i1", i1, "i5", i5);
  check_matching_sizes("ranef", "i1", i1, "i6", i6);
  check_matching_sizes("ranef", "i1", i1, "i7", i7);
  // NOTE(review): indices are assumed to lie in [1, vk.size()] -- consider
  // a check_range on each ik before exposing this as a user-facing function.
  const size_t N = i1.size();
  typename T::value_type res_val(N);
  // Copy the indices into arena memory up front (range construction, not a
  // per-element loop) so the reverse pass can read them after the caller's
  // vectors are gone.
  arena_t<std::vector<int>> ai1(i1.begin(), i1.end());
  arena_t<std::vector<int>> ai2(i2.begin(), i2.end());
  arena_t<std::vector<int>> ai3(i3.begin(), i3.end());
  arena_t<std::vector<int>> ai4(i4.begin(), i4.end());
  arena_t<std::vector<int>> ai5(i5.begin(), i5.end());
  arena_t<std::vector<int>> ai6(i6.begin(), i6.end());
  arena_t<std::vector<int>> ai7(i7.begin(), i7.end());
  for (size_t n = 0; n < N; ++n) {
    // coeff() for reads; coeffRef() is the mutable accessor and is only
    // needed on the left-hand side.
    res_val.coeffRef(n) = v1.val().coeff(ai1[n] - 1)
                        + v2.val().coeff(ai2[n] - 1)
                        + v3.val().coeff(ai3[n] - 1)
                        + v4.val().coeff(ai4[n] - 1)
                        + v5.val().coeff(ai5[n] - 1)
                        + v6.val().coeff(ai6[n] - 1)
                        + v7.val().coeff(ai7[n] - 1);
  }
  T res = res_val;
  // Reverse pass: the adjoint of output element n flows unchanged into each
  // of the seven indexed coefficient entries.
  reverse_pass_callback([v1, ai1,
                         v2, ai2,
                         v3, ai3,
                         v4, ai4,
                         v5, ai5,
                         v6, ai6,
                         v7, ai7, res]() mutable {
    for (size_t n = 0; n < ai1.size(); ++n) {
      const double adj = res.adj().coeff(n);
      v1.adj().coeffRef(ai1[n] - 1) += adj;
      v2.adj().coeffRef(ai2[n] - 1) += adj;
      v3.adj().coeffRef(ai3[n] - 1) += adj;
      v4.adj().coeffRef(ai4[n] - 1) += adj;
      v5.adj().coeffRef(ai5[n] - 1) += adj;
      v6.adj().coeffRef(ai6[n] - 1) += adj;
      v7.adj().coeffRef(ai7[n] - 1) += adj;
    }
  });
  return res;
}
@t4c1 should we implement a ranef function like this? Or can we achieve the same thing transparently with expressions? Will the answer be different for mat<var> versus var<mat>?
Current Version:
v3.4.0