cp-algorithms
diff --git a/cp-algo/math/cvector.hpp b/cp-algo/math/cvector.hpp
Lines changed: 116 additions & 116 deletions
diff --git a/cp-algo/math/fft.hpp b/cp-algo/math/fft.hpp
Lines changed: 46 additions & 6 deletions
diff --git a/cp-algo/util/checkpoint.hpp b/cp-algo/util/checkpoint.hpp
Lines changed: 17 additions & 0 deletions
@@ -1,39 +1,24 @@
 #ifndef CP_ALGO_MATH_CVECTOR_HPP
 #define CP_ALGO_MATH_CVECTOR_HPP
-#include <algorithm>
-#include <cassert>
-#include <complex>
-#include <vector>
+#include "../util/complex.hpp"
+#include "../util/checkpoint.hpp"
+#include <experimental/simd>
 #include <ranges>
 namespace cp_algo::math::fft {
     using ftype = double;
-    static constexpr size_t bytes = 32;
-    static constexpr size_t flen = bytes / sizeof(ftype);
-    using point = std::complex<ftype>;
-    using vftype [[gnu::vector_size(bytes)]] = ftype;
-    using vpoint = std::complex<vftype>;
+    using point = complex<ftype>;
+    using vftype = std::experimental::native_simd<ftype>;
+    using vpoint = complex<vftype>;
+    static constexpr size_t flen = vftype::size();
 
-#define WITH_IV(...)                             \
-  [&]<size_t ... i>(std::index_sequence<i...>) { \
-      return __VA_ARGS__;                        \
-  }(std::make_index_sequence<flen>());
-
-    template<typename ft>
-    constexpr ft to_ft(auto x) {
-        return ft{} + x;
-    }
-    template<typename pt>
-    constexpr pt to_pt(point r) {
-        using ft = std::conditional_t<std::is_same_v<point, pt>, ftype, vftype>;
-        return {to_ft<ft>(r.real()), to_ft<ft>(r.imag())};
-    }
     struct cvector {
-        static constexpr size_t pre_roots = 1 << 17;
+        static constexpr size_t pre_roots = 1 << 15;
         std::vector<vftype> x, y;
         cvector(size_t n) {
             n = std::max(flen, std::bit_ceil(n));
             x.resize(n / flen);
             y.resize(n / flen);
+            checkpoint("cvector create");
         }
         template<class pt = point>
         void set(size_t k, pt t) {
@@ -60,132 +45,147 @@ namespace cp_algo::math::fft {
         size_t size() const {
             return flen * std::size(x);
         }
+
+
+        // Multiplies one flen-wide block of A by the matching block of B.
+        //
+        // Reading the flen lanes that start at index k as polynomial
+        // coefficients, the result is A_block * B_block reduced modulo
+        // (x^flen - rt), where rt = eval_point(k / flen / 2), negated when the
+        // block number k / flen is odd.
+        //
+        // k: index of the first element of the block (dot() passes multiples
+        //    of flen).
+        // A, B: the two operands; neither is modified.
+        // Returns the vpoint holding the flen resulting coefficients.
+        static auto dot_block(size_t k, cvector const& A, cvector const& B) {
+            auto rt = eval_point(k / flen / 2);
+            if(k / flen % 2) {
+                rt = -rt;
+            }
+            auto [Bvx, Bvy] = B.vget(k);
+            auto [Brvx, Brvy] = vpoint(Bvx, Bvy) * vpoint(real(rt), imag(rt));
+            auto [Ax, Ay] = A.vget(k);
+            // copy_to(..., vector_aligned) is undefined unless the destination
+            // is aligned to memory_alignment_v<vftype>. A plain ftype array only
+            // guarantees alignof(ftype), so the buffers are over-aligned here.
+            constexpr size_t align = std::experimental::memory_alignment_v<vftype>;
+            static_assert(flen * sizeof(ftype) % align == 0,
+                          "upper half of the buffers must stay vector-aligned");
+            alignas(align) ftype Bx[2 * flen], By[2 * flen];
+            // Layout: [0, flen) holds B_block * rt, [flen, 2 * flen) holds B_block.
+            Bvx.copy_to(Bx + flen, std::experimental::vector_aligned);
+            Bvy.copy_to(By + flen, std::experimental::vector_aligned);
+            Brvx.copy_to(Bx, std::experimental::vector_aligned);
+            Brvy.copy_to(By, std::experimental::vector_aligned);
+            vpoint res = {0, 0};
+            for(size_t i = 0; i < flen; i++) {
+                // The window starting at flen - i gives lane j the value
+                // B[j - i] for j >= i and rt * B[flen + j - i] for j < i, i.e.
+                // B_block shifted by i lanes with the wrapped part scaled by rt.
+                vftype Bsx, Bsy;
+                Bsx.copy_from(Bx + flen - i, std::experimental::element_aligned);
+                Bsy.copy_from(By + flen - i, std::experimental::element_aligned);
+                // Lane i of A is broadcast to every lane before multiplying.
+                res += vpoint(Ax[i], Ay[i]) * vpoint(Bsx, Bsy);
+            }
+            return res;
+        }
+
         void dot(cvector const& t) {
-            size_t n = size();
+            size_t n = this->size();
             for(size_t k = 0; k < n; k += flen) {
-                set(k, get<vpoint>(k) * t.get<vpoint>(k));
+                set(k, dot_block(k, *this, t));
             }
+            checkpoint("dot");
         }
-        static const cvector roots;
-        template<class pt = point>
-        static pt root(size_t n, size_t k) {
-            if(n < pre_roots) {
-                return roots.get<pt>(n + k);
+        static const cvector roots, evalp;
+        static std::array<size_t, pre_roots> eval_args;
+        
+        template<bool precalc = false>
+        static size_t eval_arg(size_t n) {
+            if(n < pre_roots && !precalc) {
+                return eval_args[n];
+            } else if(n == 0) {
+                return 0;
             } else {
-                auto arg = std::numbers::pi / ftype(n);
-                if constexpr(std::is_same_v<pt, point>) {
-                    return {cos(ftype(k) * arg), sin(ftype(k) * arg)};
-                } else {
-                    return WITH_IV(pt{vftype{cos(ftype(k + i) * arg)...},
-                                      vftype{sin(ftype(k + i) * arg)...}});
-                }
+                return eval_arg(n / 2) | (n & 1) << (std::bit_width(n) - 1);
             }
         }
-        template<class pt = point>
+        // Returns polar(1, pi * k / n).
+        //
+        // The value is read from the `roots` table (slot n + k) when n is below
+        // pre_roots. With precalc == true the table is bypassed, which is how
+        // the table itself gets filled.
+        template<bool precalc = false>
+        static auto root(size_t n, size_t k) {
+            bool const from_table = !precalc && n < pre_roots;
+            if(!from_table) {
+                ftype const angle = std::numbers::pi / static_cast<ftype>(n) * static_cast<ftype>(k);
+                return polar(1., angle);
+            }
+            return roots.get(n + k);
+        }
+        // Returns the n-th evaluation point used by the transform.
+        //
+        // For n == 0 the point is 1. Otherwise it is
+        // root(2 * N, eval_arg(n)), where N is the largest power of two not
+        // exceeding n. Values below pre_roots come from the `evalp` table
+        // unless precalc == true (used while that table is being built).
+        template<bool precalc = false>
+        static point eval_point(size_t n) {
+            if(!precalc && n < pre_roots) {
+                return evalp.get(n);
+            }
+            if(n == 0) {
+                return 1;
+            }
+            size_t const top_bit = std::bit_floor(n);
+            return root(2 * top_bit, eval_arg(n));
+        }
+
+        template<bool precalc = false>
         static void exec_on_roots(size_t n, size_t m, auto &&callback) {
-            size_t step = sizeof(pt) / sizeof(point);
-            pt cur;
-            pt arg = to_pt<pt>(root<point>(n, step));
-            for(size_t i = 0; i < m; i += step) {
-                if(i % 64 == 0 || n < pre_roots) {
-                    cur = root<pt>(n, i);
+            point cur;
+            point arg = root<precalc>(n, 1);
+            for(size_t i = 0; i < m; i++) {
+                if(precalc || i % 32 == 0 || n < pre_roots) {
+                    cur = root<precalc>(n, i);
                 } else {
                     cur *= arg;
                 }
                 callback(i, cur);
             }
         }
+        // Invokes callback(i, eval_point(i)) for i = 0, 1, ..., n - 1 in
+        // increasing order.
+        static void exec_on_evals(size_t n, auto &&callback) {
+            size_t idx = 0;
+            while(idx < n) {
+                callback(idx, eval_point(idx));
+                idx++;
+            }
+        }
 
         void ifft() {
             size_t n = size();
-            for(size_t i = 1; i < n; i *= 2) {
-                for(size_t j = 0; j < n; j += 2 * i) {
-                    auto butterfly = [&]<class pt>(size_t k, pt rt) {
-                        k += j;
-                        auto t = get<pt>(k + i) * conj(rt);
-                        set(k + i, get<pt>(k) - t);
-                        set(k, get<pt>(k) + t);
-                    };
-                    if(2 * i <= flen) {
-                        exec_on_roots(i, i, butterfly);
-                    } else {
-                        exec_on_roots<vpoint>(i, i, butterfly);
+            for(size_t i = flen; i <= n / 2; i *= 2) {
+                exec_on_evals(n / (2 * i), [&](size_t k, point rt) {
+                    k *= 2 * i;
+                    vpoint vrt = {real(rt), imag(rt)};
+                    for(size_t j = k; j < k + i; j += flen) {
+                        auto A = get<vpoint>(j) + get<vpoint>(j + i);
+                        auto B = get<vpoint>(j) - get<vpoint>(j + i);
+                        set(j, A);
+                        set(j + i, B * conj(vrt));
                     }
-                }
+                });
             }
+            checkpoint("ifft");
             for(size_t k = 0; k < n; k += flen) {
-                set(k, get<vpoint>(k) /= to_pt<vpoint>(ftype(n)));
+                set(k, get<vpoint>(k) /= (ftype)(n / flen));
             }
         }
         void fft() {
             size_t n = size();
-            for(size_t i = n / 2; i >= 1; i /= 2) {
-                for(size_t j = 0; j < n; j += 2 * i) {
-                    auto butterfly = [&]<class pt>(size_t k, pt rt) {
-                        k += j;
-                        auto A = get<pt>(k) + get<pt>(k + i);
-                        auto B = get<pt>(k) - get<pt>(k + i);
-                        set(k, A);
-                        set(k + i, B * rt);
-                    };
-                    if(2 * i <= flen) {
-                        exec_on_roots(i, i, butterfly);
-                    } else {
-                        exec_on_roots<vpoint>(i, i, butterfly);
+            for(size_t i = n / 2; i >= flen; i /= 2) {
+                exec_on_evals(n / (2 * i), [&](size_t k, point rt) {
+                    k *= 2 * i;
+                    vpoint vrt = {real(rt), imag(rt)};
+                    for(size_t j = k; j < k + i; j += flen) {
+                        auto t = get<vpoint>(j + i) * vrt;
+                        set(j + i, get<vpoint>(j) - t);
+                        set(j, get<vpoint>(j) + t);
                     }
-                }
+                });
             }
+            checkpoint("fft");
         }
     };
+    // Lookup table for eval_arg: entry i is i with its bits reversed within
+    // bit_width(i) bits. Each entry is derived from entry i / 2 by placing the
+    // lowest bit of i at the top position.
+    std::array<size_t, cvector::pre_roots> cvector::eval_args = []() {
+        std::array<size_t, pre_roots> table = {};
+        for(size_t idx = 1; idx < pre_roots; idx++) {
+            size_t const low_bit = idx & 1;
+            auto const top_pos = std::bit_width(idx) - 1;
+            table[idx] = table[idx >> 1] | (low_bit << top_pos);
+        }
+        return table;
+    }();
     const cvector cvector::roots = []() {
         cvector res(pre_roots);
         for(size_t n = 1; n < res.size(); n *= 2) {
-            auto base = std::polar(1., std::numbers::pi / ftype(n));
-            point cur = 1;
-            for(size_t k = 0; k < n; k++) {
-                if((k & 15) == 0) {
-                    cur = std::polar(1., std::numbers::pi * ftype(k) / ftype(n));
-                }
-                res.set(n + k, cur);
-                cur *= base;
-            }
+            cvector::exec_on_roots<true>(n, n, [&](size_t k, auto rt) {
+                res.set(n + k, rt);
+            });
         }
         return res;
     }();
-
-    template<typename base>
-    struct dft {
-        cvector A;
-        
-        dft(std::vector<base> const& a, size_t n): A(n) {
-            for(size_t i = 0; i < std::min(n, a.size()); i++) {
-                A.set(i, a[i]);
-            }
-            if(n) {
-                A.fft();
-            }
-        }
-
-        std::vector<base> operator *= (dft const& B) {
-            assert(A.size() == B.A.size());
-            size_t n = A.size();
-            if(!n) {
-                return std::vector<base>();
-            }
-            A.dot(B.A);
-            A.ifft();
-            std::vector<base> res(n);
-            for(size_t k = 0; k < n; k++) {
-                res[k] = A.get(k);
-            }
-            return res;
-        }
-
-        auto operator * (dft const& B) const {
-            return dft(*this) *= B;
+    const cvector cvector::evalp = []() {
+        cvector res(pre_roots);
+        for(size_t n = 0; n < res.size(); n++) {
+            res.set(n, cvector::eval_point<true>(n));
         }
-
-        point operator [](int i) const {return A.get(i);}
-    };
+        return res;
+    }();
 }
 #endif // CP_ALGO_MATH_CVECTOR_HPP
@@ -1,10 +1,13 @@
 #ifndef CP_ALGO_MATH_FFT_HPP
 #define CP_ALGO_MATH_FFT_HPP
 #include "../number_theory/modint.hpp"
+#include "../util/checkpoint.hpp"
 #include "cvector.hpp"
+#include <ranges>
+#include <iostream>
 namespace cp_algo::math::fft {
     template<modint_type base>
-    struct dft<base> {
+    struct dft {
         int split;
         cvector A, B;
 
@@ -18,6 +21,7 @@ namespace cp_algo::math::fft {
                 B.set(ti, B.get(ti) + quo * rt);
 
             });
+            checkpoint("dft init");
             if(n) {
                 A.fft();
                 B.fft();
@@ -31,12 +35,47 @@ namespace cp_algo::math::fft {
                 res = {};
                 return;
             }
-            for(size_t i = 0; i < n; i += flen) {
-                auto tmp = A.vget(i) * D.vget(i) + B.vget(i) * C.vget(i);
-                A.set(i, A.vget(i) * C.vget(i));
-                B.set(i, B.vget(i) * D.vget(i));
-                C.set(i, tmp);
+            // Block-wise product: for every flen-wide block the four products
+            // A*C, A*D, B*C and B*D are formed modulo (x^flen - rt), with
+            // rt = eval_point(k / flen / 2), negated for odd block numbers.
+            // The results overwrite A (A*C), C (A*D + B*C) and B (B*D).
+            for(size_t k = 0; k < n; k += flen) {
+                auto rt = cvector::eval_point(k / flen / 2);
+                if(k / flen % 2) {
+                    rt = -rt;
+                }
+                auto [Ax, Ay] = A.vget(k);
+                auto [Bx, By] = B.vget(k);
+                auto [Cvx, Cvy] = C.vget(k);
+                auto [Dvx, Dvy] = D.vget(k);
+                auto [Crvx, Crvy] = vpoint(Cvx, Cvy) * vpoint(real(rt), imag(rt));
+                auto [Drvx, Drvy] = vpoint(Dvx, Dvy) * vpoint(real(rt), imag(rt));
+                // copy_to(..., vector_aligned) is undefined unless the target
+                // is aligned to memory_alignment_v<vftype>. Plain ftype arrays
+                // only guarantee alignof(ftype), so over-align the buffers.
+                constexpr size_t align = std::experimental::memory_alignment_v<vftype>;
+                static_assert(flen * sizeof(ftype) % align == 0,
+                              "upper half of the buffers must stay vector-aligned");
+                alignas(align) ftype Cx[2 * flen], Cy[2 * flen];
+                alignas(align) ftype Dx[2 * flen], Dy[2 * flen];
+                // Layout: [0, flen) holds the block times rt, [flen, 2 * flen)
+                // holds the block itself.
+                Cvx.copy_to(Cx + flen, std::experimental::vector_aligned);
+                Cvy.copy_to(Cy + flen, std::experimental::vector_aligned);
+                Dvx.copy_to(Dx + flen, std::experimental::vector_aligned);
+                Dvy.copy_to(Dy + flen, std::experimental::vector_aligned);
+                Crvx.copy_to(Cx, std::experimental::vector_aligned);
+                Crvy.copy_to(Cy, std::experimental::vector_aligned);
+                Drvx.copy_to(Dx, std::experimental::vector_aligned);
+                Drvy.copy_to(Dy, std::experimental::vector_aligned);
+                vpoint AC, AD, BC, BD;
+                AC = AD = BC = BD = {0, 0};
+                for(size_t i = 0; i < flen; i++) {
+                    // Window at flen - i: the block shifted by i lanes, with
+                    // the wrapped-around lanes taken from the rt-scaled copy.
+                    vftype Csx, Csy, Dsx, Dsy;
+                    Csx.copy_from(Cx + flen - i, std::experimental::element_aligned);
+                    Csy.copy_from(Cy + flen - i, std::experimental::element_aligned);
+                    Dsx.copy_from(Dx + flen - i, std::experimental::element_aligned);
+                    Dsy.copy_from(Dy + flen - i, std::experimental::element_aligned);
+                    vpoint As = {Ax[i], Ay[i]}, Bs = {Bx[i], By[i]};
+                    vpoint Cs = {Csx, Csy}, Ds = {Dsx, Dsy};
+                    AC += As * Cs;
+                    AD += As * Ds;
+                    BC += Bs * Cs;
+                    BD += Bs * Ds;
+                }
+                A.set(k, AC);
+                C.set(k, AD + BC);
+                B.set(k, BD);
             }
+            checkpoint("dot");
             A.ifft();
             B.ifft();
             C.ifft();
@@ -58,6 +97,7 @@ namespace cp_algo::math::fft {
                 int64_t B2 = llround(imag(Bi));
                 res[n + i] = B0 + B1 * split + B2 * splitsplit;
             });
+            checkpoint("recover mod");
         }
         void mul_inplace(auto &&B, auto& res, size_t k) {
             mul(B.A, B.B, res, k);
 
@@ -0,0 +1,17 @@
+#ifndef CP_ALGO_UTIL_CHECKPOINT_HPP
+#define CP_ALGO_UTIL_CHECKPOINT_HPP
+#include <iostream>
+#include <chrono>
+#include <string>
+namespace cp_algo {
+    // Reports the processor time (as measured by clock()) that elapsed since
+    // the previous call to checkpoint().
+    //
+    // msg: label for the measurement. When non-empty, "<msg>: <delta> ms" is
+    //      written to std::cerr. When empty, nothing is printed and the call
+    //      only resets the reference timestamp.
+    //
+    // Declared inline because the definition lives in a header: without it,
+    // including this header from more than one translation unit produces
+    // multiple-definition link errors. The function-local static is then
+    // shared by all translation units.
+    inline void checkpoint(std::string const& msg = "") {
+        // Timestamp of the previous call, in seconds; starts at 0.
+        static double last = 0;
+        double now = (double)clock() / CLOCKS_PER_SEC;
+        double delta = now - last;
+        last = now;
+        if(msg.size()) {
+            std::cerr << msg << ": " << delta * 1000 << " ms\n";
+        }
+    }
+}
+#endif // CP_ALGO_UTIL_CHECKPOINT_HPP