Subset compose + test

adamant-pwn · adamant-pwn · commit 0b656a5cf8ef · 2025-12-22T16:40:52.000+01:00
diff --git a/cp-algo/math/subset_convolution.hpp b/cp-algo/math/subset_convolution.hpp
@@ -11,18 +11,18 @@
 #include <cstring>
 CP_ALGO_SIMD_PRAGMA_PUSH
 namespace cp_algo::math {
-    const size_t logn = 20;
+    const size_t max_logn = 20; // length of every per-element array the transforms below loop over
     
     enum transform_dir { forw, inv };
     
     template<auto N, transform_dir direction>
     inline void or_transform(auto &&a) {
-        [[gnu::assume(N <= 1ull << 30)]];
+        [[gnu::assume(N <= 1 << 30)]];
         if constexpr (N <= 32) {
             for(size_t i = 1; i < N; i *= 2) {
                 for(size_t j = 0; j < N; j += 2 * i) {
                     for(size_t k = j; k < j + i; k++) {
-                        for(size_t z = 0; z < logn; z++) {
+                        for(size_t z = 0; z < max_logn; z++) {
                             if constexpr (direction == forw) {
                                 a[k + i][z] += a[k][z];
                             } else {
@@ -37,8 +37,8 @@ namespace cp_algo::math {
             or_transform<half, direction>(&a[0]);
             or_transform<half, direction>(&a[half]);
             for (size_t i = 0; i < half; i++) {
-                #pragma GCC unroll logn
-                for(size_t z = 0; z < logn; z++) {
+                #pragma GCC unroll max_logn
+                for(size_t z = 0; z < max_logn; z++) {
                     if constexpr (direction == forw) {
                         a[i + half][z] += a[i][z];
                     } else {
@@ -85,7 +85,7 @@ namespace cp_algo::math {
         // Create array buffers for each input
         auto create_buffers = [bottoms]<typename... Args>(const Args&...) {
             return std::make_tuple(
-                big_vector<std::array<typename std::decay_t<Args>::value_type, logn>>(bottoms)...
+                big_vector<std::array<typename std::decay_t<Args>::value_type, max_logn>>(bottoms)...
             );
         };
         auto buffers = std::apply(create_buffers, input_tuple);
@@ -130,8 +130,8 @@ namespace cp_algo::math {
             for(size_t i = 0; i < bottoms; i += K) {
                 std::apply([&](auto&... bufs) {
                     auto extract_one = [&](auto& buf) {
-                        std::array<u64x4, logn> aa;
-                        for(size_t j = 0; j < logn; j++) {
+                        std::array<u64x4, max_logn> aa;
+                        for(size_t j = 0; j < max_logn; j++) {
                             for(size_t z = 0; z < K; z++) {
                                 aa[j][z] = buf[i + z][j].getr();
                             }
@@ -145,7 +145,7 @@ namespace cp_algo::math {
                     // Write results back: only first array needs to be written
                     auto& first_buf = std::get<0>(std::forward_as_tuple(bufs...));
                     const auto& first_aa = std::get<0>(aa_tuple);
-                    for(size_t j = 0; j < logn; j++) {
+                    for(size_t j = 0; j < max_logn; j++) {
                         for(size_t z = 0; z < K; z++) {
                             first_buf[i + z][j].setr((uint32_t)first_aa[j][z]);
                         }
@@ -179,33 +179,40 @@ namespace cp_algo::math {
 
     template<typename base>
     big_vector<base> subset_convolution(std::span<base> inpa, std::span<base> inpb) {
-        auto outpa = on_rank_vectors([](auto &a, auto const& b) {
-            std::decay_t<decltype(a)> res = {};
-            const auto mod = base::mod();
-            const auto imod = math::inv2(-mod);
-            const auto r4 = u64x4() + uint64_t(-1) % mod + 1;
-            for(size_t i = 0; i < logn; i++) {
-                for(size_t j = 0; i + j + 1 < logn; j++) {
-                    res[i + j + 1] += (u64x4)_mm256_mul_epu32(__m256i(a[i]), __m256i(b[j]));
-                }
-                if (i == logn / 2) {
-                    for(size_t k = logn - 2; k < logn; k++) {
-                        res[k] = res[k] >= base::modmod8() ? res[k] - base::modmod8() : res[k];
+        big_vector<base> outpa; // filled inside the with_bit_floor callback, returned at the end
+        with_bit_floor(std::size(inpa), [&]<auto N>() { // N is the compile-time size supplied by with_bit_floor
+            constexpr size_t lgn = std::bit_width(N) - 1; // floor(log2(N))
+            [[gnu::assume(lgn <= max_logn)]];
+            outpa = on_rank_vectors([](auto &a, auto const& b) { // callback: truncated, index-shifted product of a and b, stored back into a
+                std::decay_t<decltype(a)> res = {};
+                const auto mod = base::mod();
+                const auto imod = math::inv2(-mod);
+                const auto modmod8 = base::modmod8();
+                const auto r4 = u64x4() + uint64_t(-1) % mod + 1;
+                auto add = [&](size_t i) { // res[i + j + 1] += a[i] * b[j] for every j with i + j + 1 < lgn
+                    if constexpr (lgn) for(size_t j = 0; i + j + 1 < lgn; j++) {
+                        res[i + j + 1] += (u64x4)_mm256_mul_epu32(__m256i(a[i]), __m256i(b[j]));
                     }
+                };
+                if constexpr (lgn) for(size_t i = 0; i < lgn / 2; i++) { add(i); }
+                if constexpr (lgn >= 20) { // midway conditional subtraction on the top two slots; presumably guards the 64-bit sums against overflow
+                    res[lgn - 1] = res[lgn - 1] >= modmod8 ? res[lgn - 1] - modmod8 : res[lgn - 1];
+                    res[lgn - 2] = res[lgn - 2] >= modmod8 ? res[lgn - 2] - modmod8 : res[lgn - 2];
                 }
+                if constexpr (lgn) for(size_t i = lgn / 2; i < lgn; i++) { add(i); }
+                if constexpr (lgn) for(size_t k = 0; k < lgn; k++) { // reduce each slot below mod and write it to a
+                    res[k] = montgomery_reduce(res[k], mod, imod);
+                    res[k] = montgomery_mul(res[k], r4, mod, imod);
+                    a[k] = res[k] >= mod ? res[k] - mod : res[k];
+                }
+            }, inpa, inpb);
+            // Entry 0 is overwritten; every other entry additionally receives the terms that involve inpa[0] or inpb[0].
+            outpa[0] = inpa[0] * inpb[0];
+            for(size_t i = 1; i < std::size(inpa); i++) {
+                outpa[i] += inpa[i] * inpb[0] + inpa[0] * inpb[i];
             }
-            for(size_t k = 0; k < logn; k++) {
-                res[k] = montgomery_reduce(res[k], mod, imod);
-                res[k] = montgomery_mul(res[k], r4, mod, imod);
-                a[k] = res[k] >= mod ? res[k] - mod : res[k];
-            }
-        }, inpa, inpb);
-        
-        outpa[0] = inpa[0] * inpb[0];
-        for(size_t i = 1; i < std::size(inpa); i++) {
-            outpa[i] += inpa[i] * inpb[0] + inpa[0] * inpb[i];
-        }
-        checkpoint("fix 0");
+            checkpoint("fix 0");
+        });
         return outpa;
     }
 
@@ -215,11 +222,50 @@ namespace cp_algo::math {
             return big_vector<base>{1};
         }
         size_t N = std::size(inpa);
-        auto out0 = subset_exp<base>(std::span(inpa).first(N / 2));
+        auto out0 = subset_exp(std::span(inpa).first(N / 2));
         auto out1 = subset_convolution<base>(out0, std::span(inpa).last(N / 2));
         out0.insert(end(out0), begin(out1), end(out1));
+        cp_algo::checkpoint("extend out");
         return out0;
     }
+
+    // Composes every coefficient list in fd with inpa. Each recursion level appends the formal
+    // derivative of the last list, halves inpa, and convolves the results back together.
+    template<typename base>
+    big_vector<big_vector<base>> subset_compose(big_vector<std::span<base>> fd, std::span<base> inpa) {
+        const size_t terms = size(fd[0]);
+        if (size(inpa) == 1) {
+            // Single coefficient left: evaluate each polynomial at inpa[0] via a table of powers.
+            big_vector<base> powers(terms, 1);
+            for (size_t e = 1; e < terms; e++) { powers[e] = powers[e - 1] * inpa[0]; }
+            big_vector<big_vector<base>> values(size(fd), {base(0)});
+            for (size_t s = 0; s < size(fd); s++) {
+                for (size_t e = 0; e < size(fd[s]); e++) { values[s][0] += powers[e] * fd[s][e]; }
+            }
+            cp_algo::checkpoint("base case");
+            return values;
+        }
+        // fd holds spans, so the derivative's storage must stay alive in this frame during the recursion.
+        big_vector<base> derivative(terms);
+        for (size_t d = 1; d < terms; d++) { derivative[d - 1] = fd.back()[d] * base(d); }
+        fd.push_back(derivative);
+        cp_algo::checkpoint("fdk");
+        const size_t half = std::size(inpa) / 2;
+        auto result = subset_compose(fd, std::span(inpa).first(half));
+        for (size_t s = 1; s < size(fd); s++) {
+            // Entry s - 1 is extended by entry s convolved with the upper half of inpa.
+            auto upper = subset_convolution<base>(result[s], std::span(inpa).last(half));
+            result[s - 1].insert(end(result[s - 1]), begin(upper), end(upper));
+        }
+        result.pop_back();
+        cp_algo::checkpoint("combine");
+        return result;
+    }
+
+    template<typename base> // single-polynomial entry point: wraps f in a one-element list and returns its entry
+    big_vector<base> subset_compose(std::span<base> f, std::span<base> inpa) {
+        return subset_compose(big_vector<std::span<base>>{f}, inpa)[0];
+    }
 }
 #pragma GCC pop_options
 #endif // CP_ALGO_MATH_SUBSET_CONVOLUTION_HPP
diff --git a/verify/math/subset_compose.test.cpp b/verify/math/subset_compose.test.cpp
@@ -0,0 +1,42 @@
+// @brief Polynomial Composite Set Power Series
+#define PROBLEM "https://judge.yosupo.jp/problem/polynomial_composite_set_power_series"
+#pragma GCC optimize("O3,unroll-loops")
+#include <bits/allocator.h>
+#pragma GCC target("avx2")
+#include <iostream>
+#include "blazingio/blazingio.min.hpp"
+#define CP_ALGO_CHECKPOINT
+#include "cp-algo/number_theory/modint.hpp"
+#include "cp-algo/math/subset_convolution.hpp"
+#include <bits/stdc++.h>
+
+using namespace std;
+
+const int mod = 998244353; // modulus passed to the modint type below
+using base = cp_algo::math::modint<mod>; // element type of f, a and the printed result
+
+void solve() { // reads M, n, the M values of f and the 2^n values of a, then prints subset_compose(f, a)
+    size_t M, n;
+    cin >> M >> n;
+    size_t N = size_t(1) << n; // shift in size_t: a plain `1 << n` would be evaluated as int
+    cp_algo::big_vector<base> f(M);
+    for(auto &it: f) {cin >> it;}
+    cp_algo::big_vector<base> a(N);
+    for(auto &it: a) {cin >> it;}
+    cp_algo::checkpoint("read");
+    auto c = cp_algo::math::subset_compose<base>(f, a);
+    for(auto &it: c) {cout << it << ' ';} // values are separated by single spaces
+    cp_algo::checkpoint("write");
+    cp_algo::checkpoint<1>();
+}
+
+// Program entry: applies the stream sync/tie settings, then runs the single test case.
+signed main() {
+    //freopen("input.txt", "r", stdin);
+    ios::sync_with_stdio(0);
+    cin.tie(0);
+    // One test case; for multi-test input the count would be read from cin instead.
+    for (int cases = 1; cases > 0; cases--) {
+        solve();
+    }
+}