diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu index 5b4009d3b..8fbb804de 100644 --- a/benchmarks/hash_table/dynamic_map_bench.cu +++ b/benchmarks/hash_table/dynamic_map_bench.cu @@ -57,7 +57,7 @@ static void generate_keys(OutputIt output_begin, OutputIt output_end) static void gen_final_size(benchmark::internal::Benchmark* b) { - for (auto size = 10'000'000; size <= 150'000'000; size += 20'000'000) { + for (auto size = 10'000'000; size <= 310'000'000; size += 20'000'000) { b->Args({size}); } } @@ -135,6 +135,128 @@ static void BM_dynamic_search_all(::benchmark::State& state) int64_t(state.range(0))); } +template +static void BM_dynamic_search_none(::benchmark::State& state) +{ + using map_type = cuco::dynamic_map; + + std::size_t num_keys = state.range(0); + std::size_t initial_size = 1 << 27; + + std::vector h_keys(num_keys); + std::vector> h_pairs(num_keys); + + generate_keys(h_keys.begin(), h_keys.end()); + + for (std::size_t i = 0; i < num_keys; ++i) { + Key key = h_keys[i] + num_keys; + Value val = h_keys[i] + num_keys; + h_pairs[i].first = key; + h_pairs[i].second = val; + } + + thrust::device_vector d_keys(h_keys); + thrust::device_vector> d_pairs(h_pairs); + thrust::device_vector d_results(num_keys); + + map_type map{initial_size, cuco::empty_key{-1}, cuco::empty_value{-1}}; + map.insert(d_pairs.begin(), d_pairs.end()); + + for (auto _ : state) { + cuda_event_timer raii{state}; + map.find(d_keys.begin(), d_keys.end(), d_results.begin()); + } + + state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * + int64_t(state.range(0))); +} + +template +static void BM_dynamic_erase_all(::benchmark::State& state) +{ + using map_type = cuco::dynamic_map; + + std::size_t num_keys = state.range(0); + std::size_t initial_size = 1 << 27; + + std::vector h_keys(num_keys); + std::vector> h_pairs(num_keys); + + generate_keys(h_keys.begin(), h_keys.end()); + + for (uint32_t i = 0; i < num_keys; ++i) { + Key key = h_keys[i]; + Value val = h_keys[i]; + h_pairs[i].first = key; + h_pairs[i].second = val; + } + + thrust::device_vector> d_pairs(h_pairs); + thrust::device_vector d_keys(h_keys); + + std::size_t batch_size = 1E6; + for (auto _ : state) { + map_type map{initial_size, + cuco::empty_key{-1}, + cuco::empty_value{-1}, + cuco::erased_key{-2}}; + for (uint32_t i = 0; i < num_keys; i += batch_size) { + map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size); + } + { + cuda_event_timer raii{state}; + for (uint32_t i = 0; i < num_keys; i += batch_size) { + map.erase(d_keys.begin() + i, d_keys.begin() + i + batch_size); + } + } + } + + state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * + int64_t(state.range(0))); +} + +template +static void BM_dynamic_erase_none(::benchmark::State& state) +{ + using map_type = cuco::dynamic_map; + + std::size_t num_keys = state.range(0); + std::size_t initial_size = 1 << 27; + + std::vector h_keys(num_keys); + std::vector> h_pairs(num_keys); + + generate_keys(h_keys.begin(), h_keys.end()); + + for (std::size_t i = 0; i < num_keys; ++i) { + Key key = h_keys[i] + num_keys; + Value val = h_keys[i] + num_keys; + h_pairs[i].first = key; + h_pairs[i].second = val; + } + + thrust::device_vector> d_pairs(h_pairs); + thrust::device_vector d_keys(h_keys); + + std::size_t batch_size = 1E6; + for (auto _ : state) { + map_type map{initial_size, + cuco::empty_key{-1}, + cuco::empty_value{-1}, + cuco::erased_key{-2}}; + for (std::size_t i = 0; i < num_keys; i += batch_size) { + map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size); + } + { + cuda_event_timer raii{state}; + map.erase(d_keys.begin(), d_keys.end()); + } + } + + state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * + int64_t(state.range(0))); +} + BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) @@ -145,32 +267,37 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIQUE) ->Apply(gen_final_size) ->UseManualTime(); -BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM) +BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIFORM) +BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); -BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::GAUSSIAN) +BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::GAUSSIAN) +BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); -BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIQUE) +BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIQUE) +BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIFORM) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); + +BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); @@ -185,6 +312,26 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIFORM) ->Apply(gen_final_size) ->UseManualTime(); +BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::UNIFORM) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); + +BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::GAUSSIAN) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); + +BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::GAUSSIAN) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); + +BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::GAUSSIAN) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); + BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) @@ -194,3 +341,19 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::GAUSSIAN) ->Unit(benchmark::kMillisecond) ->Apply(gen_final_size) ->UseManualTime(); + +BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::GAUSSIAN) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); + +// TODO: comprehensive tests for erase_none and search_none? +BENCHMARK_TEMPLATE(BM_dynamic_search_none, int32_t, int32_t, dist_type::UNIFORM) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); + +BENCHMARK_TEMPLATE(BM_dynamic_erase_none, int32_t, int32_t, dist_type::UNIFORM) + ->Unit(benchmark::kMillisecond) + ->Apply(gen_final_size) + ->UseManualTime(); diff --git a/benchmarks/hash_table/static_map_bench.cu b/benchmarks/hash_table/static_map_bench.cu index c384e649f..1538a636a 100644 --- a/benchmarks/hash_table/static_map_bench.cu +++ b/benchmarks/hash_table/static_map_bench.cu @@ -155,6 +155,53 @@ static void BM_static_map_search_all(::benchmark::State& state) int64_t(state.range(0))); } +template +static void BM_static_map_search_none(::benchmark::State& state) +{ + using map_type = cuco::static_map; + + std::size_t num_keys = state.range(0); + float occupancy = state.range(1) / float{100}; + std::size_t size = num_keys / occupancy; + + map_type map{size, cuco::empty_key{-1}, cuco::empty_value{-1}}; + + std::vector h_keys(num_keys); + std::vector h_values(num_keys); + std::vector> h_pairs(num_keys); + std::vector h_results(num_keys); + + generate_keys(h_keys.begin(), h_keys.end()); + + for (std::size_t i = 0; i < num_keys; ++i) { + Key key = h_keys[i]; + Value val = h_keys[i]; + h_pairs[i].first = key; + h_pairs[i].second = val; + } + + // diff keys + for (std::size_t i = 0; i < num_keys; ++i) { + h_keys[i] += num_keys; + } + + thrust::device_vector d_keys(h_keys); + thrust::device_vector d_results(num_keys); + thrust::device_vector> d_pairs(h_pairs); + + map.insert(d_pairs.begin(), d_pairs.end()); + + for (auto _ : state) { + map.find(d_keys.begin(), d_keys.end(), d_results.begin()); + // TODO: get rid of sync and rewrite the benchmark with `nvbench` + // once https://github.com/NVIDIA/nvbench/pull/80 is merged + cudaDeviceSynchronize(); + } + + state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * + int64_t(state.range(0))); +} + template static void BM_static_map_erase_all(::benchmark::State& state) { @@ -198,6 +245,52 @@ static void BM_static_map_erase_all(::benchmark::State& state) int64_t(state.range(0))); } +template +static void BM_static_map_erase_none(::benchmark::State& state) +{ + using map_type = cuco::static_map; + + std::size_t num_keys = state.range(0); + float occupancy = state.range(1) / float{100}; + std::size_t size = num_keys / occupancy; + + map_type map{size, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; + + std::vector h_keys(num_keys); + std::vector h_values(num_keys); + std::vector> h_pairs(num_keys); + std::vector h_results(num_keys); + + generate_keys(h_keys.begin(), h_keys.end()); + + for (std::size_t i = 0; i < num_keys; ++i) { + Key key = h_keys[i]; + Value val = h_keys[i]; + h_pairs[i].first = key; + h_pairs[i].second = val; + } + + // diff keys + for (std::size_t i = 0; i < num_keys; ++i) { + h_keys[i] += num_keys; + } + + thrust::device_vector d_keys(h_keys); + thrust::device_vector d_results(num_keys); + thrust::device_vector> d_pairs(h_pairs); + + for (auto _ : state) { + state.PauseTiming(); + map.insert(d_pairs.begin(), d_pairs.end()); + state.ResumeTiming(); + + map.erase(d_keys.begin(), d_keys.end()); + } + + state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * + int64_t(state.range(0))); +} + BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIQUE) ->Unit(benchmark::kMillisecond) ->Apply(generate_size_and_occupancy) @@ -252,6 +345,15 @@ BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::GAUSSI ->Unit(benchmark::kMillisecond) ->Apply(generate_size_and_occupancy); -BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIQUE) +// TODO: comprehensive tests for erase_all, erase_none and search_none +BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIFORM) + ->Unit(benchmark::kMillisecond) + ->Apply(generate_size_and_occupancy); + +BENCHMARK_TEMPLATE(BM_static_map_search_none, int32_t, int32_t, dist_type::UNIFORM) + ->Unit(benchmark::kMillisecond) + ->Apply(generate_size_and_occupancy); + +BENCHMARK_TEMPLATE(BM_static_map_erase_none, int32_t, int32_t, dist_type::UNIFORM) ->Unit(benchmark::kMillisecond) ->Apply(generate_size_and_occupancy); diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index 0c1d2e377..bb7986071 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,9 +21,11 @@ dynamic_map::dynamic_map( std::size_t initial_capacity, sentinel::empty_key empty_key_sentinel, sentinel::empty_value empty_value_sentinel, - Allocator const& alloc) + Allocator const& alloc, + cudaStream_t stream) : empty_key_sentinel_(empty_key_sentinel.value), empty_value_sentinel_(empty_value_sentinel.value), + erased_key_sentinel_(empty_key_sentinel.value), size_(0), capacity_(initial_capacity), min_insert_size_(1E4), @@ -34,21 +36,47 @@ dynamic_map::dynamic_map( initial_capacity, sentinel::empty_key{empty_key_sentinel}, sentinel::empty_value{empty_value_sentinel}, - alloc)); + alloc, + stream)); submap_views_.push_back(submaps_[0]->get_device_view()); submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); - - CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type))); -} // namespace cuco + submap_num_successes_.push_back(submaps_[0]->num_successes_); +} template -dynamic_map::~dynamic_map() +dynamic_map::dynamic_map( + std::size_t initial_capacity, + sentinel::empty_key empty_key_sentinel, + sentinel::empty_value empty_value_sentinel, + sentinel::erased_key erased_key_sentinel, + Allocator const& alloc, + cudaStream_t stream) + : empty_key_sentinel_(empty_key_sentinel.value), + empty_value_sentinel_(empty_value_sentinel.value), + erased_key_sentinel_(erased_key_sentinel.value), + size_(0), + capacity_(initial_capacity), + min_insert_size_(1E4), + max_load_factor_(0.60), + alloc_{alloc} { - CUCO_ASSERT_CUDA_SUCCESS(cudaFree(num_successes_)); + CUCO_RUNTIME_EXPECTS(empty_key_sentinel_ != erased_key_sentinel_, + "The empty key sentinel and erased key sentinel cannot be the same value."); + + submaps_.push_back(std::make_unique>( + initial_capacity, + sentinel::empty_key{empty_key_sentinel_}, + sentinel::empty_value{empty_value_sentinel_}, + sentinel::erased_key{erased_key_sentinel_}, + alloc, + stream)); + submap_views_.push_back(submaps_[0]->get_device_view()); + submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); + submap_num_successes_.push_back(submaps_[0]->num_successes_); } template -void dynamic_map::reserve(std::size_t n) +void dynamic_map::reserve(std::size_t n, cudaStream_t stream) { int64_t num_elements_remaining = n; uint32_t submap_idx = 0; @@ -62,14 +90,25 @@ void dynamic_map::reserve(std::size_t n) // if the submap does not exist yet, create it else { submap_capacity = capacity_; - submaps_.push_back(std::make_unique>( - submap_capacity, - sentinel::empty_key{empty_key_sentinel_}, - sentinel::empty_value{empty_value_sentinel_}, - alloc_)); + if (erased_key_sentinel_ != empty_key_sentinel_) { + submaps_.push_back(std::make_unique>( + submap_capacity, + sentinel::empty_key{empty_key_sentinel_}, + sentinel::empty_value{empty_value_sentinel_}, + sentinel::erased_key{erased_key_sentinel_}, + alloc_, + stream)); + } else { + submaps_.push_back(std::make_unique>( + submap_capacity, + sentinel::empty_key{empty_key_sentinel_}, + sentinel::empty_value{empty_value_sentinel_}, + alloc_, + stream)); + } + submap_num_successes_.push_back(submaps_[submap_idx]->num_successes_); submap_views_.push_back(submaps_[submap_idx]->get_device_view()); submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view()); - capacity_ *= 2; } @@ -80,13 +119,20 @@ void dynamic_map::reserve(std::size_t n) template template -void dynamic_map::insert(InputIt first, - InputIt last, - Hash hash, - KeyEqual key_equal) +void dynamic_map::insert( + InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) { + // TODO: memset an atomic variable is unsafe + static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type), + "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t)."); + + auto constexpr block_size = 128; + auto constexpr stride = 1; + auto constexpr tile_size = 4; + std::size_t num_to_insert = std::distance(first, last); - reserve(size_ + num_to_insert); + + reserve(size_ + num_to_insert, stream); uint32_t submap_idx = 0; while (num_to_insert > 0) { @@ -95,30 +141,29 @@ void dynamic_map::insert(InputIt first, // If we are tying to insert some of the remaining keys into this submap, we can insert // only if we meet the minimum insert size. if (capacity_remaining >= min_insert_size_) { - *num_successes_ = 0; - int device_id; - CUCO_CUDA_TRY(cudaGetDevice(&device_id)); - CUCO_CUDA_TRY(cudaMemPrefetchAsync(num_successes_, sizeof(atomic_ctr_type), device_id)); + CUCO_CUDA_TRY( + cudaMemsetAsync(submap_num_successes_[submap_idx], 0, sizeof(atomic_ctr_type), stream)); - auto n = std::min(capacity_remaining, num_to_insert); - auto const block_size = 128; - auto const stride = 1; - auto const tile_size = 4; - auto const grid_size = (tile_size * n + stride * block_size - 1) / (stride * block_size); + auto const n = std::min(capacity_remaining, num_to_insert); + auto const grid_size = (tile_size * n + stride * block_size - 1) / (stride * block_size); detail::insert> - <<>>(first, - first + n, - submap_views_.data().get(), - submap_mutable_views_.data().get(), - num_successes_, - submap_idx, - submaps_.size(), - hash, - key_equal); - CUCO_CUDA_TRY(cudaDeviceSynchronize()); - - std::size_t h_num_successes = num_successes_->load(cuda::std::memory_order_relaxed); + <<>>(first, + first + n, + submap_views_.data().get(), + submap_mutable_views_.data().get(), + submap_num_successes_.data().get(), + submap_idx, + submaps_.size(), + hash, + key_equal); + + std::size_t h_num_successes; + CUCO_CUDA_TRY(cudaMemcpyAsync(&h_num_successes, + submap_num_successes_[submap_idx], + sizeof(atomic_ctr_type), + cudaMemcpyDeviceToHost, + stream)); submaps_[submap_idx]->size_ += h_num_successes; size_ += h_num_successes; first += n; @@ -128,34 +173,88 @@ void dynamic_map::insert(InputIt first, } } +template +template +void dynamic_map::erase( + InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) +{ + // TODO: memset an atomic variable is unsafe + static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type), + "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t)."); + + auto constexpr block_size = 128; + auto constexpr stride = 1; + auto constexpr tile_size = 4; + + auto const num_keys = std::distance(first, last); + auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + + // zero out submap success counters + for (uint32_t i = 0; i < submaps_.size(); ++i) { + CUCO_CUDA_TRY(cudaMemsetAsync(submap_num_successes_[i], 0, sizeof(atomic_ctr_type), stream)); + } + + auto const temp_storage_size = submaps_.size() * sizeof(unsigned long long); + + detail::erase + <<>>(first, + first + num_keys, + submap_mutable_views_.data().get(), + submap_num_successes_.data().get(), + submaps_.size(), + hash, + key_equal); + + for (uint32_t i = 0; i < submaps_.size(); ++i) { + std::size_t h_submap_num_successes; + CUCO_CUDA_TRY(cudaMemcpyAsync(&h_submap_num_successes, + submap_num_successes_[i], + sizeof(atomic_ctr_type), + cudaMemcpyDeviceToHost, + stream)); + submaps_[i]->size_ -= h_submap_num_successes; + size_ -= h_submap_num_successes; + } +} + template template -void dynamic_map::find( - InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal) +void dynamic_map::find(InputIt first, + InputIt last, + OutputIt output_begin, + Hash hash, + KeyEqual key_equal, + cudaStream_t stream) { - auto num_keys = std::distance(first, last); - auto const block_size = 128; - auto const stride = 1; - auto const tile_size = 4; - auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + auto constexpr block_size = 128; + auto constexpr stride = 1; + auto constexpr tile_size = 4; - detail::find<<>>( + auto const num_keys = std::distance(first, last); + auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + + detail::find<<>>( first, last, output_begin, submap_views_.data().get(), submaps_.size(), hash, key_equal); CUCO_CUDA_TRY(cudaDeviceSynchronize()); } template template -void dynamic_map::contains( - InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal) +void dynamic_map::contains(InputIt first, + InputIt last, + OutputIt output_begin, + Hash hash, + KeyEqual key_equal, + cudaStream_t stream) { - auto num_keys = std::distance(first, last); - auto const block_size = 128; - auto const stride = 1; - auto const tile_size = 4; - auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + auto constexpr block_size = 128; + auto constexpr stride = 1; + auto constexpr tile_size = 4; + + auto const num_keys = std::distance(first, last); + auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); - detail::contains<<>>( + detail::contains<<>>( first, last, output_begin, submap_views_.data().get(), submaps_.size(), hash, key_equal); CUCO_CUDA_TRY(cudaDeviceSynchronize()); } diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh index f261b49aa..566576e1e 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ b/include/cuco/detail/dynamic_map_kernels.cuh @@ -41,6 +41,7 @@ namespace cg = cooperative_groups; * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of key/value pairs * @param last End of the sequence of key/value pairs * @param submap_views Array of `static_map::device_view` objects used to @@ -71,7 +72,7 @@ __global__ void insert(InputIt first, Hash hash, KeyEqual key_equal) { - typedef cub::BlockReduce BlockReduce; + using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; @@ -97,8 +98,10 @@ __global__ void insert(InputIt first, tid += gridDim.x * blockDim.x; } - std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if (threadIdx.x == 0) { *num_successes += block_num_successes; } + std::size_t const block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { + num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); + } } /** @@ -122,13 +125,14 @@ __global__ void insert(InputIt first, * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of key/value pairs * @param last End of the sequence of key/value pairs * @param submap_views Array of `static_map::device_view` objects used to * perform `contains` operations on each underlying `static_map` * @param submap_mutable_views Array of `static_map::device_mutable_view` objects * used to perform an `insert` into the target `static_map` submap - * @param num_successes The number of successfully inserted key/value pairs + * @param submap_num_successes The number of successfully inserted key/value pairs for each submap * @param insert_idx The index of the submap we are inserting into * @param num_submaps The total number of submaps in the map * @param hash The unary function to apply to hash each key @@ -147,13 +151,13 @@ __global__ void insert(InputIt first, InputIt last, viewT* submap_views, mutableViewT* submap_mutable_views, - atomicT* num_successes, + atomicT** submap_num_successes, uint32_t insert_idx, uint32_t num_submaps, Hash hash, KeyEqual key_equal) { - typedef cub::BlockReduce BlockReduce; + using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; @@ -182,8 +186,154 @@ __global__ void insert(InputIt first, it += (gridDim.x * blockDim.x) / tile_size; } - std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if (threadIdx.x == 0) { *num_successes += block_num_successes; } + std::size_t const block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { + submap_num_successes[insert_idx]->fetch_add(block_num_successes, + cuda::std::memory_order_relaxed); + } +} + +/** + * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`. + * + * If the key `*(first + i)` exists in the map, its slot is erased and made available for future + insertions. + * Else, no effect. + * + * @tparam block_size The size of the thread block + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam mutableViewT Type of device view allowing modification of hash map storage + * @tparam atomicT Type of atomic storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param submap_mutable_views Array of `static_map::mutable_device_view` objects used to + * perform `erase` operations on each underlying `static_map` + * @param num_successes The number of successfully erased key/value pairs + * @param submap_num_successes The number of successfully erased key/value pairs + * in each submap + * @param num_submaps The number of submaps in the map + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ +template +__global__ void erase(InputIt first, + InputIt last, + mutableViewT* submap_mutable_views, + atomicT** submap_num_successes, + uint32_t num_submaps, + Hash hash, + KeyEqual key_equal) +{ + extern __shared__ unsigned long long submap_block_num_successes[]; + + auto tid = block_size * blockIdx.x + threadIdx.x; + auto it = first + tid; + + for (auto i = threadIdx.x; i < num_submaps; i += block_size) { + submap_block_num_successes[i] = 0; + } + __syncthreads(); + + while (it < last) { + for (auto i = 0; i < num_submaps; ++i) { + if (submap_mutable_views[i].erase(*it, hash, key_equal)) { + atomicAdd(&submap_block_num_successes[i], 1); + break; + } + } + it += gridDim.x * blockDim.x; + } + __syncthreads(); + + for (auto i = 0; i < num_submaps; ++i) { + if (threadIdx.x == 0) { + submap_num_successes[i]->fetch_add(static_cast(submap_block_num_successes[i]), + cuda::std::memory_order_relaxed); + } + } +} + +/** + * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`. + * + * If the key `*(first + i)` exists in the map, its slot is erased and made available for future + * insertions. + * Else, no effect. + * + * @tparam block_size The size of the thread block + * @tparam tile_size The number of threads in the Cooperative Groups used to perform erase + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam mutableViewT Type of device view allowing modification of hash map storage + * @tparam atomicT Type of atomic storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param submap_mutable_views Array of `static_map::mutable_device_view` objects used to + * perform `erase` operations on each underlying `static_map` + * @param num_successes The number of successfully erased key/value pairs + * @param submap_num_successes The number of successfully erased key/value pairs + * in each submap + * @param num_submaps The number of submaps in the map + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ +template +__global__ void erase(InputIt first, + InputIt last, + mutableViewT* submap_mutable_views, + atomicT** submap_num_successes, + uint32_t num_submaps, + Hash hash, + KeyEqual key_equal) +{ + extern __shared__ unsigned long long submap_block_num_successes[]; + + auto block = cg::this_thread_block(); + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = block_size * block.group_index().x + block.thread_rank(); + auto it = first + tid / tile_size; + + for (auto i = threadIdx.x; i < num_submaps; i += block_size) { + submap_block_num_successes[i] = 0; + } + block.sync(); + + while (it < last) { + auto erased = false; + int i = 0; + for (i = 0; i < num_submaps; ++i) { + erased = submap_mutable_views[i].erase(tile, *it, hash, key_equal); + if (erased) { break; } + } + if (erased && tile.thread_rank() == 0) { atomicAdd(&submap_block_num_successes[i], 1); } + it += (gridDim.x * blockDim.x) / tile_size; + } + block.sync(); + + for (auto i = 0; i < num_submaps; ++i) { + if (threadIdx.x == 0) { + submap_num_successes[i]->fetch_add(static_cast(submap_block_num_successes[i]), + cuda::std::memory_order_relaxed); + } + } } /** @@ -191,6 +341,7 @@ __global__ void insert(InputIt first, * * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + i)`. * Else, copies the empty value sentinel. + * * @tparam block_size The number of threads in the thread block * @tparam Value The mapped value type for the map * @tparam InputIt Device accessible input iterator whose `value_type` is @@ -200,6 +351,7 @@ __global__ void insert(InputIt first, * @tparam viewT Type of `static_map` device view * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param output_begin Beginning of the sequence of values retrieved for each key @@ -273,6 +425,7 @@ __global__ void find(InputIt first, * @tparam viewT Type of `static_map` device view * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param output_begin Beginning of the sequence of values retrieved for each key @@ -345,6 +498,7 @@ __global__ void find(InputIt first, * @tparam viewT Type of `static_map` device view * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param output_begin Beginning of the sequence of booleans for the presence of each key @@ -411,6 +565,7 @@ __global__ void contains(InputIt first, * @tparam viewT Type of `static_map` device view * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param output_begin Beginning of the sequence of booleans for the presence of each key diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index ee794e1c2..b28b2bad8 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -857,4 +857,4 @@ static_map::device_view::contains(CG const& g, current_slot = next_slot(g, current_slot); } } -} // namespace cuco +} // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh index 705aae7fb..73c22997a 100644 --- a/include/cuco/detail/static_map_kernels.cuh +++ b/include/cuco/detail/static_map_kernels.cuh @@ -167,6 +167,28 @@ __global__ void insert( if (threadIdx.x == 0) { *num_successes += block_num_successes; } } +/** + * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`. + * + * If the key `*(first + i)` exists in the map, its slot is erased and made available for future + * insertions. + * Else, no effect. + * + * @tparam block_size The size of the thread block + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam atomicT Type of atomic storage + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param num_successes The number of successfully erased key/value pairs + * @param view Device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ template ; ///< Type of key/value pairs using key_type = Key; ///< Key type using mapped_type = Value; ///< Type of mapped values - using atomic_ctr_type = cuda::atomic; ///< Type of atomic counters - using view_type = typename static_map::device_view; ///< Device view type - using mutable_view_type = typename static_map::device_mutable_view; - ///< Device mutable view type + using atomic_ctr_type = cuda::atomic; ///< Atomic counter type + using view_type = + typename static_map::device_view; ///< Type for submap device view + using mutable_view_type = + typename static_map::device_mutable_view; ///< Type for submap mutable + ///< device view dynamic_map(dynamic_map const&) = delete; dynamic_map(dynamic_map&&) = delete; + dynamic_map& operator=(dynamic_map const&) = delete; dynamic_map& operator=(dynamic_map&&) = delete; /** - * @brief Construct a dynamically-sized map with the specified initial capacity, growth factor and - * sentinel values. + * @brief Constructs a dynamically-sized map with the specified initial capacity, growth factor + * and sentinel values. + * + * The capacity of the map will automatically increase as the user adds key/value pairs using + * `insert`. + * + * Capacity increases by a factor of growth_factor each time the size of the map exceeds a + * threshold occupancy. The performance of `find` and `contains` decreases somewhat each time the + * map's capacity grows. + * + * The `empty_key_sentinel` and `empty_value_sentinel` values are reserved and + * undefined behavior results from attempting to insert any key/value pair + * that contains either. + * + * @param initial_capacity The initial number of slots in the map + * @param empty_key_sentinel The reserved key value for empty slots + * @param empty_value_sentinel The reserved mapped value for empty slots + * @param alloc Allocator used to allocate submap device storage + * @param stream Stream used for executing the kernels + */ + dynamic_map(std::size_t initial_capacity, + sentinel::empty_key empty_key_sentinel, + sentinel::empty_value empty_value_sentinel, + Allocator const& alloc = Allocator{}, + cudaStream_t stream = nullptr); + + /** + * @brief Constructs a dynamically-sized map with erase capability. * * The capacity of the map will automatically increase as the user adds key/value pairs using * `insert`. @@ -133,18 +162,25 @@ class dynamic_map { * @param initial_capacity The initial number of slots in the map * @param empty_key_sentinel The reserved key value for empty slots * @param empty_value_sentinel The reserved mapped value for empty slots + * @param erased_key_sentinel The reserved key value for erased slots * @param alloc Allocator used to allocate submap device storage + * @param stream Stream used for executing the kernels + * + * @throw std::runtime error if the empty key sentinel and erased key sentinel + * are the same value */ dynamic_map(std::size_t initial_capacity, sentinel::empty_key empty_key_sentinel, sentinel::empty_value empty_value_sentinel, - Allocator const& alloc = Allocator{}); + sentinel::erased_key erased_key_sentinel, + Allocator const& alloc = Allocator{}, + cudaStream_t stream = nullptr); /** - * @brief Destroy the map and frees its contents + * @brief Destroys the map and frees its contents * */ - ~dynamic_map(); + ~dynamic_map() {} /** * @brief Grows the capacity of the map so there is enough space for `n` key/value pairs. @@ -152,8 +188,9 @@ class dynamic_map { * If there is already enough space for `n` key/value pairs, the capacity remains the same. * * @param n The number of key value pairs for which there must be space + * @param stream Stream used for executing the kernels */ - void reserve(std::size_t n); + void reserve(std::size_t n, cudaStream_t stream = nullptr); /** * @brief Inserts all key/value pairs in the range `[first, last)`. @@ -169,11 +206,55 @@ class dynamic_map { * @param last End of the sequence of key/value pairs * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality + * @param stream Stream used for executing the kernels */ template , typename KeyEqual = thrust::equal_to> - void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}); + void insert(InputIt first, + InputIt last, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = nullptr); + + /** + * @brief Erases keys in the range `[first, last)`. + * + * For each key `k` in `[first, last)`, if `contains(k) == true), removes `k` and it's + * associated value from the map. Else, no effect. + * + * Side-effects: + * - `contains(k) == false` + * - `find(k) == end()` + * - `insert({k,v}) == true` + * - `get_size()` is reduced by the total number of erased keys + * + * This function synchronizes `stream`. + * + * Keep in mind that `erase` does not cause the map to shrink its memory allocation. + * + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `value_type` + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + * @param stream Stream used for executing the kernels + * + * @throw std::runtime_error if a unique erased key sentinel value was not + * provided at construction + */ + template , + typename KeyEqual = thrust::equal_to> + void erase(InputIt first, + InputIt last, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = nullptr); /** * @brief Finds the values corresponding to all keys in the range `[first, last)`. @@ -187,11 +268,13 @@ class dynamic_map { * convertible to the map's `mapped_type` * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param output_begin Beginning of the sequence of values retrieved for each key * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality + * @param stream Stream used for executing the kernels */ template >> submaps_; ///< vector of pointers to each submap thrust::device_vector submap_views_; ///< vector of device views for each submap thrust::device_vector - submap_mutable_views_; ///< vector of mutable device views for each submap - std::size_t min_insert_size_{}; ///< min remaining capacity of submap for insert - atomic_ctr_type* num_successes_; ///< number of successfully inserted keys on insert - Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage + submap_mutable_views_; ///< vector of mutable device views for each submap + std::size_t min_insert_size_{}; ///< min remaining capacity of submap for insert + thrust::device_vector + submap_num_successes_; ///< Number of successfully erased keys for each submap + Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage }; } // namespace cuco diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 7674d5b20..7a240da03 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -1415,7 +1415,7 @@ class static_map { } private: - pair_atomic_type* slots_{nullptr}; ///< Pointer to flat slots storage + pair_atomic_type* slots_{}; ///< Pointer to flat slots storage std::size_t capacity_{}; ///< Total number of slots std::size_t size_{}; ///< Number of keys in map Key empty_key_sentinel_{}; ///< Key value that represents an empty slot diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index dd3ea3bc3..bfb9cfbf0 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -68,7 +68,8 @@ ConfigureTest(STATIC_MAP_TEST ################################################################################################### # - dynamic_map tests ----------------------------------------------------------------------------- ConfigureTest(DYNAMIC_MAP_TEST - dynamic_map/unique_sequence_test.cu) + dynamic_map/unique_sequence_test.cu + dynamic_map/erase_test.cu) ################################################################################################### # - static_multimap tests ------------------------------------------------------------------------- diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu new file mode 100644 index 000000000..fc3dc3c28 --- /dev/null +++ b/tests/dynamic_map/erase_test.cu @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include + +#include + +TEMPLATE_TEST_CASE_SIG("erase key", + "", + ((typename Key, typename Value), Key, Value), + (int32_t, int32_t), + (int32_t, int64_t), + (int64_t, int32_t), + (int64_t, int64_t)) +{ + constexpr std::size_t num_keys = 1'000'000; + cuco::dynamic_map map{num_keys * 2, + cuco::empty_key{-1}, + cuco::empty_value{-1}, + cuco::erased_key{-2}}; + + SECTION("Check single submap insert/erase") + { + thrust::device_vector d_keys(num_keys); + thrust::device_vector d_values(num_keys); + thrust::device_vector d_keys_exist(num_keys); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1); + thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1); + + auto pairs_begin = + thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin())); + + map.insert(pairs_begin, pairs_begin + num_keys); + + REQUIRE(map.get_size() == num_keys); + + map.erase(d_keys.begin(), d_keys.end()); + + // delete decreases count correctly + REQUIRE(map.get_size() == 0); + + map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); + + // keys were actaully deleted + REQUIRE(cuco::test::none_of(d_keys_exist.begin(), + d_keys_exist.end(), + [] __device__(const bool key_found) { return key_found; })); + + // ensures that map is reusing deleted slots + map.insert(pairs_begin, pairs_begin + num_keys); + + REQUIRE(map.get_size() == num_keys); + + map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); + + REQUIRE(cuco::test::all_of(d_keys_exist.begin(), + d_keys_exist.end(), + [] __device__(const bool key_found) { return key_found; })); + + // erase can act selectively + map.erase(d_keys.begin(), d_keys.begin() + num_keys / 2); + map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); + + REQUIRE(cuco::test::none_of(d_keys_exist.begin(), + d_keys_exist.begin() + num_keys / 2, + [] __device__(const bool key_found) { return key_found; })); + + REQUIRE(cuco::test::all_of(d_keys_exist.begin() + num_keys / 2, + d_keys_exist.end(), + [] __device__(const bool key_found) { return key_found; })); + + // clear map + map.erase(d_keys.begin() + num_keys / 2, d_keys.end()); + } + + SECTION("Check multiple submaps insert/erase") + { + constexpr std::size_t num = 4 * num_keys; + + thrust::device_vector d_keys(num); + thrust::device_vector d_values(num); + thrust::device_vector d_keys_exist(num); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1); + thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1); + + auto pairs_begin = + thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin())); + + map.insert(pairs_begin, pairs_begin + num); + + // map should resize twice if the erased slots are successfully reused + REQUIRE(map.get_capacity() == 2 * num); + // check that keys can be successfully deleted from only the first and second submaps + map.erase(d_keys.begin(), d_keys.begin() + 2 * num_keys); + map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); + + REQUIRE(cuco::test::none_of(d_keys_exist.begin(), + d_keys_exist.begin() + 2 * num_keys, + [] __device__(const bool key_found) { return key_found; })); + + REQUIRE(cuco::test::all_of(d_keys_exist.begin() + 2 * num_keys, + d_keys_exist.end(), + [] __device__(const bool key_found) { return key_found; })); + + REQUIRE(map.get_size() == 2 * num_keys); + // check that keys can be successfully deleted from all submaps (some will be unsuccessful + // erases) + map.erase(d_keys.begin(), d_keys.end()); + + map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); + + REQUIRE(cuco::test::none_of(d_keys_exist.begin(), + d_keys_exist.end(), + [] __device__(const bool key_found) { return key_found; })); + + REQUIRE(map.get_size() == 0); + } +} diff --git a/tests/dynamic_map/unique_sequence_test.cu b/tests/dynamic_map/unique_sequence_test.cu index d97bac0a0..b42a7fa5a 100644 --- a/tests/dynamic_map/unique_sequence_test.cu +++ b/tests/dynamic_map/unique_sequence_test.cu @@ -38,6 +38,7 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", (int64_t, int64_t)) { constexpr std::size_t num_keys{50'000'000}; + cuco::dynamic_map map{ 30'000'000, cuco::empty_key{-1}, cuco::empty_value{-1}};