diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt index 36b17925ffd..e9705cd653a 100644 --- a/common/cuda_hip/CMakeLists.txt +++ b/common/cuda_hip/CMakeLists.txt @@ -46,6 +46,7 @@ set(CUDA_HIP_SOURCES solver/cb_gmres_kernels.cpp sketch/count_sketch_kernels.cpp sketch/gaussian_sketch_kernels.cpp + sketch/sparse_stack_kernels.cpp solver/idr_kernels.cpp solver/multigrid_kernels.cpp stop/criterion_kernels.cpp diff --git a/common/cuda_hip/sketch/sparse_stack_kernels.cpp b/common/cuda_hip/sketch/sparse_stack_kernels.cpp new file mode 100644 index 00000000000..006ce921349 --- /dev/null +++ b/common/cuda_hip/sketch/sparse_stack_kernels.cpp @@ -0,0 +1,44 @@ +// SPDX-FileCopyrightText: 2017 - 2026 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/sketch/sparse_stack_kernels.hpp" + +#include + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace sparse_stack { + + +template +void generate(std::shared_ptr exec, + size_type sketch_size, size_type input_size, size_type zeta, + array& hash_map, array& signs, + uint64 seed) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SPARSE_STACK_GENERATE); + +template +void apply(std::shared_ptr exec, size_type zeta, + const array& hash_map, const array& signs, + matrix::view::dense b, + matrix::view::dense x) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSE_STACK_APPLY); + +template +void rapply(std::shared_ptr exec, size_type zeta, + const array& hash_map, const array& signs, + matrix::view::dense b, + matrix::view::dense x) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSE_STACK_RAPPLY); + + +} // namespace sparse_stack +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 4e5ef948802..43afff6d5b4 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -95,8 +95,6 @@ target_sources( matrix/scaled_permutation.cpp matrix/sellp.cpp matrix/sparsity_csr.cpp - sketch/count_sketch.cpp - sketch/gaussian_sketch.cpp multigrid/fixed_coarsening.cpp multigrid/pgm.cpp preconditioner/batch_jacobi.cpp @@ -110,6 +108,9 @@ target_sources( reorder/mc64.cpp reorder/rcm.cpp reorder/scaled_reordered.cpp + sketch/count_sketch.cpp + sketch/gaussian_sketch.cpp + sketch/sparse_stack.cpp solver/batch_bicgstab.cpp solver/batch_cg.cpp solver/bicg.cpp diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 7cf914458e4..351f42c1884 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2026 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -57,6 +57,9 @@ #include "core/preconditioner/jacobi_kernels.hpp" #include "core/preconditioner/sor_kernels.hpp" #include "core/reorder/rcm_kernels.hpp" +#include "core/sketch/count_sketch_kernels.hpp" +#include "core/sketch/gaussian_sketch_kernels.hpp" +#include "core/sketch/sparse_stack_kernels.hpp" #include "core/solver/batch_bicgstab_kernels.hpp" #include "core/solver/batch_cg_kernels.hpp" #include "core/solver/bicg_kernels.hpp" @@ -76,8 +79,6 @@ #include "core/solver/multigrid_kernels.hpp" #include "core/solver/pipe_cg_kernels.hpp" #include "core/solver/upper_trs_kernels.hpp" -#include "core/sketch/count_sketch_kernels.hpp" -#include "core/sketch/gaussian_sketch_kernels.hpp" #include "core/stop/criterion_kernels.hpp" #include "core/stop/residual_norm_kernels.hpp" @@ -1190,6 +1191,16 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COUNT_SKETCH_RAPPLY); } // namespace count_sketch +namespace sparse_stack { + + +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSE_STACK_GENERATE); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSE_STACK_APPLY); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSE_STACK_RAPPLY); + + +} // namespace sparse_stack + } // namespace GKO_HOOK_MODULE } // namespace kernels diff --git a/core/sketch/count_sketch.cpp b/core/sketch/count_sketch.cpp index dfb716f0505..5c6dd65ddc9 100644 --- a/core/sketch/count_sketch.cpp +++ b/core/sketch/count_sketch.cpp @@ -50,6 +50,15 @@ CountSketch::create( } +template +std::unique_ptr> +CountSketch::create( + std::shared_ptr exec) +{ + return std::unique_ptr{new CountSketch(exec)}; +} + + template void CountSketch::apply_sketch_impl( const matrix::Dense* b, matrix::Dense* x) const diff --git a/core/sketch/gaussian_sketch.cpp b/core/sketch/gaussian_sketch.cpp index 4f11e2aefbf..f1c4cf48282 100644 --- a/core/sketch/gaussian_sketch.cpp +++ b/core/sketch/gaussian_sketch.cpp @@ -51,6 +51,15 @@ std::unique_ptr> GaussianSketch::create( } + +template +std::unique_ptr> GaussianSketch::create( + std::shared_ptr exec) +{ + return std::unique_ptr{new GaussianSketch(exec)}; +} + + template void GaussianSketch::apply_sketch_impl( const matrix::Dense* b, matrix::Dense* x) const diff --git a/core/sketch/sparse_stack.cpp b/core/sketch/sparse_stack.cpp new file mode 100644 index 00000000000..32a81a8ce36 --- /dev/null +++ b/core/sketch/sparse_stack.cpp @@ -0,0 +1,88 @@ +// SPDX-FileCopyrightText: 2017 - 2026 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include +#include +#include + +#include "core/sketch/sparse_stack_kernels.hpp" + + +namespace gko { +namespace sketch { +namespace sparse_stack { +namespace { + + +GKO_REGISTER_OPERATION(generate, sparse_stack::generate); +GKO_REGISTER_OPERATION(apply, sparse_stack::apply); +GKO_REGISTER_OPERATION(rapply, sparse_stack::rapply); + + +} // anonymous namespace +} // namespace sparse_stack + + +template +SparseStack::SparseStack( + std::shared_ptr exec, size_type sketch_size, + size_type input_size, size_type zeta, uint64 seed) + : EnableLinOp>( + exec, dim<2>{sketch_size, input_size}), + zeta_{zeta}, + hash_map_{exec, input_size * zeta}, + signs_{exec, input_size * zeta}, + seed_{seed} +{ + exec->run(sparse_stack::make_generate(sketch_size, input_size, zeta_, + hash_map_, signs_, seed_)); +} + + +template +std::unique_ptr> +SparseStack::create(std::shared_ptr exec, + size_type sketch_size, + size_type input_size, size_type zeta, + uint64 seed) +{ + return std::unique_ptr{ + new SparseStack(exec, sketch_size, input_size, zeta, seed)}; +} + +template +std::unique_ptr> +SparseStack::create(std::shared_ptr exec) +{ + return std::unique_ptr{new SparseStack(exec)}; +} + + +template +void SparseStack::apply_sketch_impl( + const matrix::Dense* b, matrix::Dense* x) const +{ + this->get_executor()->run(sparse_stack::make_apply( + zeta_, hash_map_, signs_, b->get_const_device_view(), + x->get_device_view())); +} + + +template +void SparseStack::rapply_sketch_impl( + const matrix::Dense* b, matrix::Dense* x) const +{ + this->get_executor()->run(sparse_stack::make_rapply( + zeta_, hash_map_, signs_, b->get_const_device_view(), + x->get_device_view())); +} + + +#define GKO_DECLARE_SPARSE_STACK(ValueType, IndexType) \ + class SparseStack +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSE_STACK); + + +} // namespace sketch +} // namespace gko diff --git a/core/sketch/sparse_stack_kernels.hpp b/core/sketch/sparse_stack_kernels.hpp new file mode 100644 index 00000000000..a1c4ea94586 --- /dev/null +++ b/core/sketch/sparse_stack_kernels.hpp @@ -0,0 +1,61 @@ +// SPDX-FileCopyrightText: 2017 - 2026 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_CORE_SKETCH_SPARSE_STACK_KERNELS_HPP_ +#define GKO_CORE_SKETCH_SPARSE_STACK_KERNELS_HPP_ + + +#include +#include +#include + +#include "core/base/kernel_declaration.hpp" + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_SPARSE_STACK_GENERATE(ValueType, IndexType) \ + void generate(std::shared_ptr exec, \ + size_type sketch_size, size_type input_size, size_type zeta, \ + array& hash_map, array& signs, \ + gko::uint64 seed) + +#define GKO_DECLARE_SPARSE_STACK_APPLY(ValueType, IndexType) \ + void apply(std::shared_ptr exec, size_type zeta, \ + const array& hash_map, \ + const array& signs, \ + matrix::view::dense b, \ + matrix::view::dense x) + +#define GKO_DECLARE_SPARSE_STACK_RAPPLY(ValueType, IndexType) \ + void rapply(std::shared_ptr exec, size_type zeta, \ + const array& hash_map, \ + const array& signs, \ + matrix::view::dense b, \ + matrix::view::dense x) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_SPARSE_STACK_GENERATE(ValueType, IndexType); \ + template \ + GKO_DECLARE_SPARSE_STACK_APPLY(ValueType, IndexType); \ + template \ + GKO_DECLARE_SPARSE_STACK_RAPPLY(ValueType, IndexType) + + +GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(sparse_stack, + GKO_DECLARE_ALL_AS_TEMPLATES); + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // GKO_CORE_SKETCH_SPARSE_STACK_KERNELS_HPP_ diff --git a/core/test/sketch/CMakeLists.txt b/core/test/sketch/CMakeLists.txt index 115d551e467..bcefac2cce8 100644 --- a/core/test/sketch/CMakeLists.txt +++ b/core/test/sketch/CMakeLists.txt @@ -1,2 +1,3 @@ ginkgo_create_test(gaussian_sketch) ginkgo_create_test(count_sketch) +ginkgo_create_test(sparse_stack) diff --git a/core/test/sketch/sparse_stack.cpp b/core/test/sketch/sparse_stack.cpp new file mode 100644 index 00000000000..41fcb247918 --- /dev/null +++ b/core/test/sketch/sparse_stack.cpp @@ -0,0 +1,160 @@ +// SPDX-FileCopyrightText: 2017 - 2026 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include + +#include + +#include + +#include +#include + +#include "core/test/utils.hpp" + + +namespace { + + +template +class SparseStackCore : public ::testing::Test { +protected: + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Dense = gko::matrix::Dense; + using Sketch = gko::sketch::SparseStack; + + SparseStackCore() : exec(gko::ReferenceExecutor::create()) {} + + std::shared_ptr exec; +}; + +TYPED_TEST_SUITE(SparseStackCore, gko::test::ValueIndexTypes, + PairTypenameNameGenerator); + + +TYPED_TEST(SparseStackCore, ReturnsCorrectSketchSize) +{ + auto sketch = TestFixture::Sketch::create(this->exec, 7, 20, 1, 42); + + EXPECT_EQ(sketch->get_sketch_size(), 7); +} + + +TYPED_TEST(SparseStackCore, ReturnsCorrectInputSize) +{ + auto sketch = TestFixture::Sketch::create(this->exec, 7, 20, 1, 42); + + EXPECT_EQ(sketch->get_input_size(), 20); +} + + +TYPED_TEST(SparseStackCore, ReturnsCorrectSeed) +{ + auto sketch = TestFixture::Sketch::create(this->exec, 7, 20, 1, 123); + + EXPECT_EQ(sketch->get_seed(), 123); +} + + +TYPED_TEST(SparseStackCore, HashMapHasCorrectSize) +{ + auto sketch = TestFixture::Sketch::create(this->exec, 7, 20, 4, 42); + + EXPECT_EQ(sketch->get_hash_map().get_size(), 80); + EXPECT_EQ(sketch->get_signs().get_size(), 80); +} + + +TYPED_TEST(SparseStackCore, SignsHasCorrectSize) +{ + auto sketch = TestFixture::Sketch::create(this->exec, 7, 20, 4, 42); + + EXPECT_EQ(sketch->get_signs().get_size(), 80); +} + + +TYPED_TEST(SparseStackCore, ApplyThrowsOnDimensionMismatch) +{ + using Dense = typename TestFixture::Dense; + + auto sketch = TestFixture::Sketch::create(this->exec, 3, 5, 1, 42); + // b has wrong number of rows (4 instead of 5) + auto b = Dense::create(this->exec, gko::dim<2>{4, 3}); + auto x = Dense::create(this->exec, gko::dim<2>{3, 3}); + + ASSERT_THROW(sketch->apply(b, x), gko::DimensionMismatch); +} + + +TYPED_TEST(SparseStackCore, ApplyThrowsOnOutputDimensionMismatch) +{ + using Dense = typename TestFixture::Dense; + + auto sketch = TestFixture::Sketch::create(this->exec, 3, 5, 1, 42); + auto b = Dense::create(this->exec, gko::dim<2>{5, 3}); + // x has wrong number of rows (4 instead of 3) + auto x = Dense::create(this->exec, gko::dim<2>{4, 3}); + + ASSERT_THROW(sketch->apply(b, x), gko::DimensionMismatch); +} + + +TYPED_TEST(SparseStackCore, ApplyThrowsOnColumnMismatch) +{ + using Dense = typename TestFixture::Dense; + + auto sketch = TestFixture::Sketch::create(this->exec, 3, 5, 1, 42); + auto b = Dense::create(this->exec, gko::dim<2>{5, 3}); + // x has wrong number of columns (2 instead of 3) + auto x = Dense::create(this->exec, gko::dim<2>{3, 2}); + + ASSERT_THROW(sketch->apply(b, x), gko::DimensionMismatch); +} + + +TYPED_TEST(SparseStackCore, RapplyThrowsOnDimensionMismatch) +{ + using Dense = typename TestFixture::Dense; + + auto sketch = TestFixture::Sketch::create(this->exec, 3, 5, 1, 42); + // rapply: b must have cols == 5 (input_size), this has 4 + auto b = Dense::create(this->exec, gko::dim<2>{4, 4}); + auto x = Dense::create(this->exec, gko::dim<2>{4, 3}); + + ASSERT_THROW(sketch->rapply(b, x), gko::DimensionMismatch); +} + + +TYPED_TEST(SparseStackCore, RapplyThrowsOnOutputColumnMismatch) +{ + using Dense = typename TestFixture::Dense; + + auto sketch = TestFixture::Sketch::create(this->exec, 3, 5, 4, 42); + auto b = Dense::create(this->exec, gko::dim<2>{4, 5}); + // x must have cols == 3 (sketch_size), this has 4 + auto x = Dense::create(this->exec, gko::dim<2>{4, 4}); + + ASSERT_THROW(sketch->rapply(b, x), gko::DimensionMismatch); +} + +TYPED_TEST(SparseStackCore, RapplyThrowsOnRowMismatch) +{ + using Dense = typename TestFixture::Dense; + + // S is 3x5 + auto sketch = TestFixture::Sketch::create(this->exec, 3, 5, 4, 42); + // b has 4 rows + auto b = Dense::create(this->exec, gko::dim<2>{4, 5}); + // Error: x has 5 rows instead of 3 + auto x = Dense::create(this->exec, gko::dim<2>{5, 3}); + + ASSERT_THROW(sketch->rapply(b, x), gko::DimensionMismatch); +} + + +} // namespace + diff --git a/examples/sketch-and-solve/sketch-and-solve.cpp b/examples/sketch-and-solve/sketch-and-solve.cpp index 0173e110561..6734288ac35 100644 --- a/examples/sketch-and-solve/sketch-and-solve.cpp +++ b/examples/sketch-and-solve/sketch-and-solve.cpp @@ -6,8 +6,6 @@ // where A is (m x n) with m >> n. Sketching reduces the system from // (m x n) to (k x n) with n < k << m, then solves the smaller problem. -#include - #include #include #include @@ -17,6 +15,8 @@ #include +#include + using ValueType = double; using RealValueType = gko::remove_complex; @@ -58,9 +58,9 @@ struct LeastSquaresProblem { std::unique_ptr x_true; }; -LeastSquaresProblem generate_problem( - std::shared_ptr exec, gko::size_type m, - gko::size_type n, unsigned int data_seed) +LeastSquaresProblem generate_problem(std::shared_ptr exec, + gko::size_type m, gko::size_type n, + unsigned int data_seed) { auto host = exec->get_master(); std::mt19937 rng(data_seed); @@ -96,10 +96,9 @@ LeastSquaresProblem generate_problem( // Solve the sketched least-squares problem: min_x || S*A*x - S*b ||_2 // via normal equations on the sketched system: (SA)^T SA x = (SA)^T Sb -void sketch_and_solve(std::shared_ptr exec, - const vec* A, const vec* b, vec* x, - gko::LinOp* sketch_op, const std::string& label, - int num_reps, long setup_us) +void sketch_and_solve(std::shared_ptr exec, const vec* A, + const vec* b, vec* x, gko::LinOp* sketch_op, + const std::string& label, int num_reps, long setup_us) { auto m = A->get_size()[0]; auto n = A->get_size()[1]; @@ -140,11 +139,10 @@ void sketch_and_solve(std::shared_ptr exec, auto solver = gko::solver::Gmres::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters( - static_cast(10 * n)), - gko::stop::ResidualNorm::build() - .with_reduction_factor(RealValueType{1e-14})) + .with_criteria(gko::stop::Iteration::build().with_max_iters( + static_cast(10 * n)), + gko::stop::ResidualNorm::build() + .with_reduction_factor(RealValueType{1e-14})) .on(exec) ->generate(gko::share(std::move(AtA))); @@ -195,11 +193,10 @@ void direct_solve(std::shared_ptr exec, const vec* A, auto solver = gko::solver::Gmres::build() - .with_criteria( - gko::stop::Iteration::build().with_max_iters( - static_cast(10 * n)), - gko::stop::ResidualNorm::build() - .with_reduction_factor(RealValueType{1e-14})) + .with_criteria(gko::stop::Iteration::build().with_max_iters( + static_cast(10 * n)), + gko::stop::ResidualNorm::build() + .with_reduction_factor(RealValueType{1e-14})) .on(exec) ->generate(gko::share(std::move(AtA))); @@ -222,10 +219,11 @@ void direct_solve(std::shared_ptr exec, const vec* A, auto rel_res = res_norm->at(0, 0) / b_norm->at(0, 0); std::cout << std::left << std::setw(16) << "Direct" << std::right - << std::setw(10) << "---" << " " << std::setw(10) << "---" - << " " << std::setw(10) << solve_us << " us" - << std::setw(14) << std::scientific << std::setprecision(4) - << rel_res << std::endl; + << std::setw(10) << "---" + << " " << std::setw(10) << "---" + << " " << std::setw(10) << solve_us << " us" << std::setw(14) + << std::scientific << std::setprecision(4) << rel_res + << std::endl; } @@ -246,6 +244,8 @@ int main(int argc, char* argv[]) cxxopts::value()->default_value("50")) ("k,sketch-size", "Sketch dimension (default: 4*n)", cxxopts::value()->default_value("0")) + ("z,zeta", "Non-zeros per column for SparseStack", + cxxopts::value()->default_value("4")) ("s,seed", "Random seed for sketch operators", cxxopts::value()->default_value("42")) ("d,data-seed", "Random seed for problem generation", @@ -270,6 +270,7 @@ int main(int argc, char* argv[]) auto data_seed = args["data-seed"].as(); auto num_reps = args["num-reps"].as(); auto sketch_type = args["sketch"].as(); + auto zeta = static_cast(args["zeta"].as()); auto k = static_cast(args["sketch-size"].as()); if (k == 0) { k = std::min(4 * n, m); @@ -281,8 +282,8 @@ int main(int argc, char* argv[]) std::cout << "Problem: " << m << " x " << n << " (overdetermined" << ", ratio " << m / n << ":1)" - << "\nSketch size: " << k << " (compression " - << std::fixed << std::setprecision(1) + << "\nSketch size: " << k << " (compression " << std::fixed + << std::setprecision(1) << static_cast(m) / static_cast(k) << "x)" << "\nExecutor: " << exec_name << " Seed: " << seed << " Reps: " << num_reps << "\n" @@ -294,23 +295,24 @@ int main(int argc, char* argv[]) // Header // Setup = time to create the sketch operator (generate random data) // Sketch = time to compute S*A and S*b - // Solve = time to form normal equations + CG solve (on sketched or full system) + // Solve = time to form normal equations + CG solve (on sketched or full + // system) std::cout << std::left << std::setw(16) << "Method" << std::right << std::setw(13) << "Setup" << std::setw(13) << "Sketch" - << std::setw(13) << "Solve" << std::setw(14) - << "||Ax-b||/||b||" << "\n" + << std::setw(13) << "Solve" << std::setw(14) << "||Ax-b||/||b||" + << "\n" << std::string(69, '-') << std::endl; if (sketch_type == "all" || sketch_type == "gaussian") { exec->synchronize(); auto t0 = std::chrono::steady_clock::now(); - auto gaussian = gko::sketch::GaussianSketch::create( - exec, k, m, seed); + auto gaussian = + gko::sketch::GaussianSketch::create(exec, k, m, seed); exec->synchronize(); auto t1 = std::chrono::steady_clock::now(); - auto setup_us = std::chrono::duration_cast( - t1 - t0) - .count(); + auto setup_us = + std::chrono::duration_cast(t1 - t0) + .count(); auto x = vec::create(exec, gko::dim<2>{n, 1}); sketch_and_solve(exec, problem.A.get(), problem.b.get(), x.get(), gaussian.get(), "Gaussian", num_reps, setup_us); @@ -323,14 +325,32 @@ int main(int argc, char* argv[]) exec, k, m, seed); exec->synchronize(); auto t1 = std::chrono::steady_clock::now(); - auto setup_us = std::chrono::duration_cast( - t1 - t0) - .count(); + auto setup_us = + std::chrono::duration_cast(t1 - t0) + .count(); auto x = vec::create(exec, gko::dim<2>{n, 1}); sketch_and_solve(exec, problem.A.get(), problem.b.get(), x.get(), cs.get(), "CountSketch", num_reps, setup_us); } + if (sketch_type == "all" || sketch_type == "sparsestack") { + exec->synchronize(); + auto t0 = std::chrono::steady_clock::now(); + + auto ss = gko::sketch::SparseStack::create( + exec, k, m, zeta, seed); + + exec->synchronize(); + auto t1 = std::chrono::steady_clock::now(); + auto setup_us = + std::chrono::duration_cast(t1 - t0) + .count(); + auto x = vec::create(exec, gko::dim<2>{n, 1}); + + sketch_and_solve(exec, problem.A.get(), problem.b.get(), x.get(), + ss.get(), "SparseStack", num_reps, setup_us); + } + // Direct baseline { auto x = vec::create(exec, gko::dim<2>{n, 1}); diff --git a/include/ginkgo/core/sketch/count_sketch.hpp b/include/ginkgo/core/sketch/count_sketch.hpp index 27125769a1d..271fe8ba84b 100644 --- a/include/ginkgo/core/sketch/count_sketch.hpp +++ b/include/ginkgo/core/sketch/count_sketch.hpp @@ -53,6 +53,14 @@ class CountSketch std::shared_ptr exec, size_type sketch_size, size_type input_size, uint64 seed); + /** + * Creates an uninitialized CountSketch operator. + * + * @param exec executor where the sketch lives + */ + static std::unique_ptr create( + std::shared_ptr exec); + /** Returns the random seed. */ uint64 get_seed() const { return seed_; } diff --git a/include/ginkgo/core/sketch/gaussian_sketch.hpp b/include/ginkgo/core/sketch/gaussian_sketch.hpp index 0d099aa4fc9..ae842c84d40 100644 --- a/include/ginkgo/core/sketch/gaussian_sketch.hpp +++ b/include/ginkgo/core/sketch/gaussian_sketch.hpp @@ -52,6 +52,14 @@ class GaussianSketch std::shared_ptr exec, size_type sketch_size, size_type input_size, uint64 seed); + /** + * Creates an uninitialized GaussianSketch operator. + * + * @param exec executor where the sketch lives + */ + static std::unique_ptr create( + std::shared_ptr exec); + /** Returns the random seed used to generate this sketch. */ uint64 get_seed() const { return seed_; } diff --git a/include/ginkgo/core/sketch/sparse_stack.hpp b/include/ginkgo/core/sketch/sparse_stack.hpp new file mode 100644 index 00000000000..c5fff4c4d6a --- /dev/null +++ b/include/ginkgo/core/sketch/sparse_stack.hpp @@ -0,0 +1,106 @@ +// SPDX-FileCopyrightText: 2017 - 2026 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_PUBLIC_CORE_SKETCH_SPARSE_STACK_HPP_ +#define GKO_PUBLIC_CORE_SKETCH_SPARSE_STACK_HPP_ + + +#include + +#include +#include +#include +#include +#include +#include + + +namespace gko { +namespace sketch { + + +/** + * SparseStack is a matrix-free sketch operator. + * + * Each input row i is mapped to zeta output rows hash_map[i * zeta + z] + * with signs signs[i * zeta + z] for z in [0, zeta). + * Only O(zeta * m) storage is needed (no explicit matrix). + * + * @tparam ValueType precision of matrix elements + * @tparam IndexType type for hash_map indices + */ +template +class SparseStack : public EnableLinOp, + SketchOperator> { + friend class EnableLinOp; + friend class EnablePolymorphicObject>; + +public: + using value_type = ValueType; + using index_type = IndexType; + + /** + * Creates a SparseStack operator. + * + * @param exec executor where the sketch lives + * @param sketch_size sketch dimension (number of output rows) + * @param input_size input dimension (number of input rows) + * @param zeta number of non-zeros per column + * @param seed random seed for reproducibility + */ + static std::unique_ptr create( + std::shared_ptr exec, size_type sketch_size, + size_type input_size, size_type zeta, uint64 seed); + + /** + * Creates an uninitialized SparseStack operator. + * + * @param exec executor where the sketch lives + */ + static std::unique_ptr create(std::shared_ptr exec); + + /** Returns the random seed. */ + uint64 get_seed() const { return seed_; } + + /** Returns the number of non-zeros per column. */ + size_type get_zeta() const { return zeta_; } + + /** Returns the hash map array (zeta * m entries, each in [0, k)). */ + const array& get_hash_map() const { return hash_map_; } + + /** Returns the signs array (zeta * m entries, each +1 or -1). */ + const array& get_signs() const { return signs_; } + +protected: + void apply_sketch_impl(const matrix::Dense* b, + matrix::Dense* x) const override; + + void rapply_sketch_impl(const matrix::Dense* b, + matrix::Dense* x) const override; + + explicit SparseStack(std::shared_ptr exec) + : EnableLinOp>(exec), + hash_map_{exec}, + signs_{exec}, + zeta_{1}, + seed_{0} + {} + + SparseStack(std::shared_ptr exec, size_type sketch_size, + size_type input_size, size_type zeta, uint64 seed); + +private: + size_type zeta_; + array hash_map_; + array signs_; + uint64 seed_; +}; + + +} // namespace sketch +} // namespace gko + + +#endif // GKO_PUBLIC_CORE_SKETCH_SPARSE_STACK_HPP_ diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index c2663a1803f..d696ac8f12b 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -53,6 +53,7 @@ target_sources( reorder/rcm_kernels.cpp sketch/count_sketch_kernels.cpp sketch/gaussian_sketch_kernels.cpp + sketch/sparse_stack_kernels.cpp solver/batch_bicgstab_kernels.cpp solver/batch_cg_kernels.cpp solver/cb_gmres_kernels.cpp diff --git a/omp/sketch/sparse_stack_kernels.cpp b/omp/sketch/sparse_stack_kernels.cpp new file mode 100644 index 00000000000..b72599532e1 --- /dev/null +++ b/omp/sketch/sparse_stack_kernels.cpp @@ -0,0 +1,109 @@ +// SPDX-FileCopyrightText: 2017 - 2026 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/sketch/sparse_stack_kernels.hpp" + +#include + +#include + +#include +#include + +namespace gko { +namespace kernels { +namespace omp { +namespace sparse_stack { + + +template +void generate(std::shared_ptr exec, + size_type sketch_size, size_type input_size, size_type zeta, + array& hash_map, array& signs, uint64 seed) +{ + auto hash_data = hash_map.get_data(); + auto sign_data = signs.get_data(); + std::mt19937_64 rng(seed); + std::uniform_int_distribution hash_dist( + 0, static_cast(std::max(static_cast(sketch_size - 1), static_cast(0)))); + std::bernoulli_distribution sign_dist(0.5); + for (size_type i = 0; i < input_size * zeta; ++i) { + hash_data[i] = hash_dist(rng); + sign_data[i] = sign_dist(rng) ? one() : -one(); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SPARSE_STACK_GENERATE); + +template +void apply(std::shared_ptr exec, size_type zeta, + const array& hash_map, const array& signs, + matrix::view::dense b, + matrix::view::dense x) +{ + auto input_size = hash_map.get_size() / zeta; + auto num_cols = b.size[1]; + auto hash_data = hash_map.get_const_data(); + auto sign_data = signs.get_const_data(); + +#pragma omp parallel for + for (size_type row = 0; row < x.size[0]; ++row) { + for (size_type col = 0; col < num_cols; ++col) { + x(row, col) = zero(); + } + } + + for (size_type i = 0; i < input_size; ++i) { + for (size_type z = 0; z < zeta; ++z) { + auto idx = i * zeta + z; + auto target_row = hash_data[idx]; + auto sign = sign_data[idx]; + for (size_type col = 0; col < num_cols; ++col) { + x(target_row, col) += sign * b(i, col); + } + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSE_STACK_APPLY); + +template +void rapply(std::shared_ptr exec, size_type zeta, + const array& hash_map, const array& signs, + matrix::view::dense b, + matrix::view::dense x) +{ + auto input_size = hash_map.get_size() / zeta; + auto num_rows = b.size[0]; + auto hash_data = hash_map.get_const_data(); + auto sign_data = signs.get_const_data(); + +#pragma omp parallel for + for (size_type row = 0; row < x.size[0]; ++row) { + for (size_type col = 0; col < x.size[1]; ++col) { + x(row, col) = zero(); + } + } + + for (size_type i = 0; i < input_size; ++i) { + for (size_type z = 0; z < zeta; ++z) { + auto idx = i * zeta + z; + auto target_col = hash_data[idx]; + auto sign = sign_data[idx]; +#pragma omp parallel for + for (size_type row = 0; row < num_rows; ++row) { + x(row, target_col) += sign * b(row, i); + } + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSE_STACK_RAPPLY); + + +} // namespace sparse_stack +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index cd727e10e2a..67c2817eefc 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -54,6 +54,7 @@ target_sources( reorder/rcm_kernels.cpp sketch/count_sketch_kernels.cpp sketch/gaussian_sketch_kernels.cpp + sketch/sparse_stack_kernels.cpp solver/batch_bicgstab_kernels.cpp solver/batch_cg_kernels.cpp solver/bicg_kernels.cpp diff --git a/reference/sketch/sparse_stack_kernels.cpp b/reference/sketch/sparse_stack_kernels.cpp new file mode 100644 index 00000000000..00005f45bab --- /dev/null +++ b/reference/sketch/sparse_stack_kernels.cpp @@ -0,0 +1,107 @@ +// SPDX-FileCopyrightText: 2017 - 2026 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/sketch/sparse_stack_kernels.hpp" + +#include + +#include +#include + + +namespace gko { +namespace kernels { +namespace reference { +namespace sparse_stack { + + +template +void generate(std::shared_ptr exec, + size_type sketch_size, size_type input_size, size_type zeta, + array& hash_map, array& signs, uint64 seed) +{ + auto hash_data = hash_map.get_data(); + auto sign_data = signs.get_data(); + std::mt19937_64 rng(seed); + std::uniform_int_distribution hash_dist( + 0, static_cast(sketch_size - 1)); + std::bernoulli_distribution sign_dist(0.5); + for (size_type i = 0; i < input_size * zeta; ++i) { + hash_data[i] = hash_dist(rng); + sign_data[i] = sign_dist(rng) ? one() : -one(); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SPARSE_STACK_GENERATE); + + +template +void apply(std::shared_ptr exec, size_type zeta, + const array& hash_map, const array& signs, + matrix::view::dense b, + matrix::view::dense x) +{ + auto input_size = hash_map.get_size() / zeta; + auto num_cols = b.size[1]; + auto hash_data = hash_map.get_const_data(); + auto sign_data = signs.get_const_data(); + // Zero output + for (size_type row = 0; row < x.size[0]; ++row) { + for (size_type col = 0; col < num_cols; ++col) { + x(row, col) = zero(); + } + } + // Scatter-add + for (size_type i = 0; i < input_size; ++i) { + for (size_type z = 0; z < zeta; ++z) { + auto idx = i * zeta + z; + auto target_row = hash_data[idx]; + auto sign = sign_data[idx]; + for (size_type col = 0; col < num_cols; ++col) { + x(target_row, col) += sign * b(i, col); + } + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSE_STACK_APPLY); + + +template +void rapply(std::shared_ptr exec, size_type zeta, + const array& hash_map, const array& signs, + matrix::view::dense b, + matrix::view::dense x) +{ + auto input_size = hash_map.get_size() / zeta; + auto num_rows = b.size[0]; + auto hash_data = hash_map.get_const_data(); + auto sign_data = signs.get_const_data(); + // Zero output + for (size_type row = 0; row < x.size[0]; ++row) { + for (size_type col = 0; col < x.size[1]; ++col) { + x(row, col) = zero(); + } + } + // Gather-accumulate + for (size_type i = 0; i < input_size; ++i) { + for (size_type z = 0; z < zeta; ++z) { + auto idx = i * zeta + z; + auto target_col = hash_data[idx]; + auto sign = sign_data[idx]; + for (size_type row = 0; row < num_rows; ++row) { + x(row, target_col) += sign * b(row, i); + } + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSE_STACK_RAPPLY); + + +} // namespace sparse_stack +} // namespace reference +} // namespace kernels +} // namespace gko