add lazy_convolution.hpp, minor improvements

adamant-pwn · adamant-pwn · commit 9efb2ed4b50b · 2026-01-05T01:05:39.000+01:00
diff --git a/cp-algo/math/convolution.hpp b/cp-algo/math/convolution.hpp
@@ -14,7 +14,12 @@ namespace cp_algo::math {
 // Writes the result into `a`; performs in-place when possible (modint path).
 template<class VecA, class VecB>
 void convolution_prefix(VecA& a, VecB const& b, size_t need) {
-    using T = typename std::decay_t<VecA>::value_type;
+    using T = std::decay_t<decltype(a[0])>;
+    if constexpr (modint_type<T>) {
+        // Use NTT-based truncated multiplication. Works in-place on `a`.
+        fft::mul_truncate(a, b, need);
+        return;
+    }
     size_t na = std::min(need, std::size(a));
     size_t nb = std::min(need, std::size(b));
     a.resize(na);
@@ -24,11 +29,7 @@ void convolution_prefix(VecA& a, VecB const& b, size_t need) {
         a.clear();
         return;
     }
-
-    if constexpr (modint_type<T>) {
-        // Use NTT-based truncated multiplication. Works in-place on `a`.
-        fft::mul_truncate(a, bv, need);
-    } else if constexpr (std::is_same_v<T, fft::point>) {
+    if constexpr (std::is_same_v<T, fft::point>) {
         size_t conv_len = na + nb - 1;
         size_t n = std::bit_ceil(conv_len);
         n = std::max(n, (size_t)fft::flen);
diff --git a/cp-algo/math/lazy_convolution.hpp b/cp-algo/math/lazy_convolution.hpp
@@ -0,0 +1,66 @@
+#ifndef CP_ALGO_MATH_LAZY_MULTIPLY_HPP
+#define CP_ALGO_MATH_LAZY_MULTIPLY_HPP
+
+#include "convolution.hpp"
+#include <algorithm>
+
+namespace cp_algo::math {
+    // Online ("lazy") product C = A * B of two sequences whose terms only
+    // become known one index at a time.
+    //
+    // a0, b0 -- the terms A[0] and B[0].
+    // get_ab -- called as get_ab(A, B, C, i) for i = 1, ..., n - 1 in increasing
+    //           order; must return a pair-like (A[i], B[i]). During the call A
+    //           and B hold exactly i terms, and C[i] has not yet received
+    //           A[i] * B[0] + A[0] * B[i] (those are added right afterwards).
+    // n      -- number of terms of C to compute.
+    //
+    // Returns C with exactly n terms. For n == 0 the result is empty and for
+    // n == 1 it is {a0 * b0}; get_ab is not called in either case.
+    template<typename base>
+    auto lazy_multiply(base a0, base b0, auto &&get_ab, size_t n) {
+        big_vector<base> A = {a0}, B = {b0};
+        big_vector<base> C(n);
+        if (n == 0) {
+            // C has no slot 0 to write to.
+            return C;
+        }
+        C[0] = a0 * b0;
+        // Handles the index range [l, r); requires r - l >= 1.
+        auto cdq = [&](this auto &&cdq, size_t l, size_t r) -> void {
+            if (r - l == 1) {
+                auto [al, bl] = get_ab(A, B, C, l);
+                A.push_back(al);
+                B.push_back(bl);
+                C[l] += A[l] * B[0] + A[0] * B[l];
+                return;
+            }
+            auto m = (l + r) / 2;
+            cdq(l, m);
+            // Add the products that pair the new terms [l, m) with earlier terms
+            // into C[m..r). A_pref may reach up to m so that pairs with both
+            // indices in [l, m) are taken once, through B_suf * A_pref; B_pref
+            // stops at l so the same pairs are not taken a second time. Both are
+            // capped at r - l: a term at index >= r - l cannot land below r.
+            auto A_pref = std::span(A).subspan(0, std::min(m, r - l));
+            auto B_pref = std::span(B).subspan(0, std::min(l, r - l));
+            big_vector<base> A_suf(std::from_range, std::span(A).subspan(l, m - l));
+            big_vector<base> B_suf(std::from_range, std::span(B).subspan(l, m - l));
+            convolution_prefix(A_suf, B_pref, r - l);
+            convolution_prefix(B_suf, A_pref, r - l);
+            A_suf.resize(r - l);
+            B_suf.resize(r - l);
+            for(size_t i = m; i < r; i++) {
+                C[i] += A_suf[i - l] + B_suf[i - l];
+            }
+            cdq(m, r);
+        };
+        // The empty range [1, 1) would never reach the r - l == 1 base case.
+        if (n > 1) {
+            cdq(1, n);
+        }
+        return C;
+    }
+}
+
+#endif // CP_ALGO_MATH_LAZY_MULTIPLY_HPP
diff --git a/cp-algo/number_theory/modint.hpp b/cp-algo/number_theory/modint.hpp
@@ -25,7 +25,7 @@ namespace cp_algo::math {
         constexpr modint_base(Int2 rr) {
             to_modint().setr(UInt((rr + modmod()) % mod()));
         }
-        modint inv() const {
+        constexpr modint inv() const {
             return bpow(to_modint(), mod() - 2);
         }
         modint operator - () const {
diff --git a/cp-algo/util/simd.hpp b/cp-algo/util/simd.hpp
@@ -31,6 +31,7 @@ namespace cp_algo {
     using u16x4 = simd<uint16_t, 4>;
     using i16x4 = simd<int16_t, 4>;
     using u8x32 = simd<uint8_t, 32>;
+    using u8x16 = simd<uint8_t, 16>;
     using u8x8 = simd<uint8_t, 8>;
     using u8x4 = simd<uint8_t, 4>;
     using dx4 = simd<double, 4>;

Original file line number	Diff line number	Diff line change
`@@ -25,7 +25,7 @@ namespace cp_algo::math {`
`25`	`25`	`constexpr modint_base(Int2 rr) {`
`26`	`26`	`to_modint().setr(UInt((rr + modmod()) % mod()));`
`27`	`27`	`}`
`28`		`- modint inv() const {`
	`28`	`+ constexpr modint inv() const {`
`29`	`29`	`return bpow(to_modint(), mod() - 2);`
`30`	`30`	`}`
`31`	`31`	`modint operator - () const {`