diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu
index 5b4009d3b..8fbb804de 100644
--- a/benchmarks/hash_table/dynamic_map_bench.cu
+++ b/benchmarks/hash_table/dynamic_map_bench.cu
@@ -57,7 +57,7 @@ static void generate_keys(OutputIt output_begin, OutputIt output_end)
 
 static void gen_final_size(benchmark::internal::Benchmark* b)
 {
-  for (auto size = 10'000'000; size <= 150'000'000; size += 20'000'000) {
+  for (auto size = 10'000'000; size <= 310'000'000; size += 20'000'000) {
     b->Args({size});
   }
 }
@@ -135,6 +135,128 @@ static void BM_dynamic_search_all(::benchmark::State& state)
                           int64_t(state.range(0)));
 }
 
+template <typename Key, typename Value, dist_type Dist>
+static void BM_dynamic_search_none(::benchmark::State& state)
+{
+  using map_type = cuco::dynamic_map<Key, Value>;
+
+  std::size_t num_keys     = state.range(0);
+  std::size_t initial_size = 1 << 27;
+
+  std::vector<Key> h_keys(num_keys);
+  std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);
+
+  generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());
+
+  for (std::size_t i = 0; i < num_keys; ++i) {
+    Key key           = h_keys[i] + num_keys;
+    Value val         = h_keys[i] + num_keys;
+    h_pairs[i].first  = key;
+    h_pairs[i].second = val;
+  }
+
+  thrust::device_vector<Key> d_keys(h_keys);
+  thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
+  thrust::device_vector<Value> d_results(num_keys);
+
+  map_type map{initial_size, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
+  map.insert(d_pairs.begin(), d_pairs.end());
+
+  for (auto _ : state) {
+    cuda_event_timer raii{state};
+    map.find(d_keys.begin(), d_keys.end(), d_results.begin());
+  }
+
+  state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
+                          int64_t(state.range(0)));
+}
+
+template <typename Key, typename Value, dist_type Dist>
+static void BM_dynamic_erase_all(::benchmark::State& state)
+{
+  using map_type = cuco::dynamic_map<Key, Value>;
+
+  std::size_t num_keys     = state.range(0);
+  std::size_t initial_size = 1 << 27;
+
+  std::vector<Key> h_keys(num_keys);
+  std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);
+
+  generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());
+
+  for (uint32_t i = 0; i < num_keys; ++i) {
+    Key key           = h_keys[i];
+    Value val         = h_keys[i];
+    h_pairs[i].first  = key;
+    h_pairs[i].second = val;
+  }
+
+  thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
+  thrust::device_vector<Key> d_keys(h_keys);
+
+  std::size_t batch_size = 1E6;
+  for (auto _ : state) {
+    map_type map{initial_size,
+                 cuco::empty_key<Key>{-1},
+                 cuco::empty_value<Value>{-1},
+                 cuco::erased_key<Key>{-2}};
+    for (uint32_t i = 0; i < num_keys; i += batch_size) {
+      map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size);
+    }
+    {
+      cuda_event_timer raii{state};
+      for (uint32_t i = 0; i < num_keys; i += batch_size) {
+        map.erase(d_keys.begin() + i, d_keys.begin() + i + batch_size);
+      }
+    }
+  }
+
+  state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
+                          int64_t(state.range(0)));
+}
+
+template <typename Key, typename Value, dist_type Dist>
+static void BM_dynamic_erase_none(::benchmark::State& state)
+{
+  using map_type = cuco::dynamic_map<Key, Value>;
+
+  std::size_t num_keys     = state.range(0);
+  std::size_t initial_size = 1 << 27;
+
+  std::vector<Key> h_keys(num_keys);
+  std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);
+
+  generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());
+
+  for (std::size_t i = 0; i < num_keys; ++i) {
+    Key key           = h_keys[i] + num_keys;
+    Value val         = h_keys[i] + num_keys;
+    h_pairs[i].first  = key;
+    h_pairs[i].second = val;
+  }
+
+  thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
+  thrust::device_vector<Key> d_keys(h_keys);
+
+  std::size_t batch_size = 1E6;
+  for (auto _ : state) {
+    map_type map{initial_size,
+                 cuco::empty_key<Key>{-1},
+                 cuco::empty_value<Value>{-1},
+                 cuco::erased_key<Key>{-2}};
+    for (std::size_t i = 0; i < num_keys; i += batch_size) {
+      map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size);
+    }
+    {
+      cuda_event_timer raii{state};
+      map.erase(d_keys.begin(), d_keys.end());
+    }
+  }
+
+  state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
+                          int64_t(state.range(0)));
+}
+
 BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
@@ -145,32 +267,37 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIQUE)
   ->Apply(gen_final_size)
   ->UseManualTime();
 
-BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM)
+BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
 
-BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIFORM)
+BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
 
-BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::GAUSSIAN)
+BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
 
-BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::GAUSSIAN)
+BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
 
-BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIQUE)
+BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
 
-BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIQUE)
+BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIFORM)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+
+BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::UNIFORM)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
@@ -185,6 +312,26 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIFORM)
   ->Apply(gen_final_size)
   ->UseManualTime();
 
+BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::UNIFORM)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+
+BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::GAUSSIAN)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+
+BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::GAUSSIAN)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+
+BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int32_t, int32_t, dist_type::GAUSSIAN)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+
 BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::GAUSSIAN)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
@@ -194,3 +341,19 @@ BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::GAUSSIAN)
   ->Unit(benchmark::kMillisecond)
   ->Apply(gen_final_size)
   ->UseManualTime();
+
+BENCHMARK_TEMPLATE(BM_dynamic_erase_all, int64_t, int64_t, dist_type::GAUSSIAN)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+
+// TODO: comprehensive tests for erase_none and search_none?
+BENCHMARK_TEMPLATE(BM_dynamic_search_none, int32_t, int32_t, dist_type::UNIFORM)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
+
+BENCHMARK_TEMPLATE(BM_dynamic_erase_none, int32_t, int32_t, dist_type::UNIFORM)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(gen_final_size)
+  ->UseManualTime();
diff --git a/benchmarks/hash_table/static_map_bench.cu b/benchmarks/hash_table/static_map_bench.cu
index c384e649f..1538a636a 100644
--- a/benchmarks/hash_table/static_map_bench.cu
+++ b/benchmarks/hash_table/static_map_bench.cu
@@ -155,6 +155,53 @@ static void BM_static_map_search_all(::benchmark::State& state)
                           int64_t(state.range(0)));
 }
 
+template <typename Key, typename Value, dist_type Dist>
+static void BM_static_map_search_none(::benchmark::State& state)
+{
+  using map_type = cuco::static_map<Key, Value>;
+
+  std::size_t num_keys = state.range(0);
+  float occupancy      = state.range(1) / float{100};
+  std::size_t size     = num_keys / occupancy;
+
+  map_type map{size, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
+
+  std::vector<Key> h_keys(num_keys);
+  std::vector<Value> h_values(num_keys);
+  std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);
+  std::vector<Value> h_results(num_keys);
+
+  generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());
+
+  for (std::size_t i = 0; i < num_keys; ++i) {
+    Key key           = h_keys[i];
+    Value val         = h_keys[i];
+    h_pairs[i].first  = key;
+    h_pairs[i].second = val;
+  }
+
+  // diff keys
+  for (std::size_t i = 0; i < num_keys; ++i) {
+    h_keys[i] += num_keys;
+  }
+
+  thrust::device_vector<Key> d_keys(h_keys);
+  thrust::device_vector<Value> d_results(num_keys);
+  thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
+
+  map.insert(d_pairs.begin(), d_pairs.end());
+
+  for (auto _ : state) {
+    map.find(d_keys.begin(), d_keys.end(), d_results.begin());
+    // TODO: get rid of sync and rewrite the benchmark with `nvbench`
+    // once https://github.com/NVIDIA/nvbench/pull/80 is merged
+    cudaDeviceSynchronize();
+  }
+
+  state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
+                          int64_t(state.range(0)));
+}
+
 template <typename Key, typename Value, dist_type Dist>
 static void BM_static_map_erase_all(::benchmark::State& state)
 {
@@ -198,6 +245,52 @@ static void BM_static_map_erase_all(::benchmark::State& state)
                           int64_t(state.range(0)));
 }
 
+template <typename Key, typename Value, dist_type Dist>
+static void BM_static_map_erase_none(::benchmark::State& state)
+{
+  using map_type = cuco::static_map<Key, Value>;
+
+  std::size_t num_keys = state.range(0);
+  float occupancy      = state.range(1) / float{100};
+  std::size_t size     = num_keys / occupancy;
+
+  map_type map{size, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}, cuco::erased_key{-2}};
+
+  std::vector<Key> h_keys(num_keys);
+  std::vector<Value> h_values(num_keys);
+  std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);
+  std::vector<Value> h_results(num_keys);
+
+  generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());
+
+  for (std::size_t i = 0; i < num_keys; ++i) {
+    Key key           = h_keys[i];
+    Value val         = h_keys[i];
+    h_pairs[i].first  = key;
+    h_pairs[i].second = val;
+  }
+
+  // diff keys
+  for (std::size_t i = 0; i < num_keys; ++i) {
+    h_keys[i] += num_keys;
+  }
+
+  thrust::device_vector<Key> d_keys(h_keys);
+  thrust::device_vector<bool> d_results(num_keys);
+  thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
+
+  for (auto _ : state) {
+    state.PauseTiming();
+    map.insert(d_pairs.begin(), d_pairs.end());
+    state.ResumeTiming();
+
+    map.erase(d_keys.begin(), d_keys.end());
+  }
+
+  state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
+                          int64_t(state.range(0)));
+}
+
 BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIQUE)
   ->Unit(benchmark::kMillisecond)
   ->Apply(generate_size_and_occupancy)
@@ -252,6 +345,15 @@ BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::GAUSSI
   ->Unit(benchmark::kMillisecond)
   ->Apply(generate_size_and_occupancy);
 
-BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIQUE)
+// TODO: comprehensive tests for erase_all, erase_none and search_none
+BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIFORM)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(generate_size_and_occupancy);
+
+BENCHMARK_TEMPLATE(BM_static_map_search_none, int32_t, int32_t, dist_type::UNIFORM)
+  ->Unit(benchmark::kMillisecond)
+  ->Apply(generate_size_and_occupancy);
+
+BENCHMARK_TEMPLATE(BM_static_map_erase_none, int32_t, int32_t, dist_type::UNIFORM)
   ->Unit(benchmark::kMillisecond)
   ->Apply(generate_size_and_occupancy);
diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl
index 0c1d2e377..bb7986071 100644
--- a/include/cuco/detail/dynamic_map.inl
+++ b/include/cuco/detail/dynamic_map.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,9 +21,11 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
   std::size_t initial_capacity,
   sentinel::empty_key<Key> empty_key_sentinel,
   sentinel::empty_value<Value> empty_value_sentinel,
-  Allocator const& alloc)
+  Allocator const& alloc,
+  cudaStream_t stream)
   : empty_key_sentinel_(empty_key_sentinel.value),
     empty_value_sentinel_(empty_value_sentinel.value),
+    erased_key_sentinel_(empty_key_sentinel.value),
     size_(0),
     capacity_(initial_capacity),
     min_insert_size_(1E4),
@@ -34,21 +36,47 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
     initial_capacity,
     sentinel::empty_key<Key>{empty_key_sentinel},
     sentinel::empty_value<Value>{empty_value_sentinel},
-    alloc));
+    alloc,
+    stream));
   submap_views_.push_back(submaps_[0]->get_device_view());
   submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
-
-  CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type)));
-}  // namespace cuco
+  submap_num_successes_.push_back(submaps_[0]->num_successes_);
+}
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-dynamic_map<Key, Value, Scope, Allocator>::~dynamic_map()
+dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
+  std::size_t initial_capacity,
+  sentinel::empty_key<Key> empty_key_sentinel,
+  sentinel::empty_value<Value> empty_value_sentinel,
+  sentinel::erased_key<Key> erased_key_sentinel,
+  Allocator const& alloc,
+  cudaStream_t stream)
+  : empty_key_sentinel_(empty_key_sentinel.value),
+    empty_value_sentinel_(empty_value_sentinel.value),
+    erased_key_sentinel_(erased_key_sentinel.value),
+    size_(0),
+    capacity_(initial_capacity),
+    min_insert_size_(1E4),
+    max_load_factor_(0.60),
+    alloc_{alloc}
 {
-  CUCO_ASSERT_CUDA_SUCCESS(cudaFree(num_successes_));
+  CUCO_RUNTIME_EXPECTS(empty_key_sentinel_ != erased_key_sentinel_,
+                       "The empty key sentinel and erased key sentinel cannot be the same value.");
+
+  submaps_.push_back(std::make_unique<static_map<Key, Value, Scope, Allocator>>(
+    initial_capacity,
+    sentinel::empty_key<Key>{empty_key_sentinel_},
+    sentinel::empty_value<Value>{empty_value_sentinel_},
+    sentinel::erased_key<Key>{erased_key_sentinel_},
+    alloc,
+    stream));
+  submap_views_.push_back(submaps_[0]->get_device_view());
+  submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view());
+  submap_num_successes_.push_back(submaps_[0]->num_successes_);
 }
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
-void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n)
+void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n, cudaStream_t stream)
 {
   int64_t num_elements_remaining = n;
   uint32_t submap_idx            = 0;
@@ -62,14 +90,25 @@ void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n)
     // if the submap does not exist yet, create it
     else {
       submap_capacity = capacity_;
-      submaps_.push_back(std::make_unique<static_map<Key, Value, Scope, Allocator>>(
-        submap_capacity,
-        sentinel::empty_key<Key>{empty_key_sentinel_},
-        sentinel::empty_value<Value>{empty_value_sentinel_},
-        alloc_));
+      if (erased_key_sentinel_ != empty_key_sentinel_) {
+        submaps_.push_back(std::make_unique<static_map<Key, Value, Scope, Allocator>>(
+          submap_capacity,
+          sentinel::empty_key<Key>{empty_key_sentinel_},
+          sentinel::empty_value<Value>{empty_value_sentinel_},
+          sentinel::erased_key<Key>{erased_key_sentinel_},
+          alloc_,
+          stream));
+      } else {
+        submaps_.push_back(std::make_unique<static_map<Key, Value, Scope, Allocator>>(
+          submap_capacity,
+          sentinel::empty_key<Key>{empty_key_sentinel_},
+          sentinel::empty_value<Value>{empty_value_sentinel_},
+          alloc_,
+          stream));
+      }
+      submap_num_successes_.push_back(submaps_[submap_idx]->num_successes_);
       submap_views_.push_back(submaps_[submap_idx]->get_device_view());
       submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view());
-
       capacity_ *= 2;
     }
 
@@ -80,13 +119,20 @@ void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n)
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
 template <typename InputIt, typename Hash, typename KeyEqual>
-void dynamic_map<Key, Value, Scope, Allocator>::insert(InputIt first,
-                                                       InputIt last,
-                                                       Hash hash,
-                                                       KeyEqual key_equal)
+void dynamic_map<Key, Value, Scope, Allocator>::insert(
+  InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream)
 {
+  // TODO: memset an atomic variable is unsafe
+  static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type),
+                "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t).");
+
+  auto constexpr block_size = 128;
+  auto constexpr stride     = 1;
+  auto constexpr tile_size  = 4;
+
   std::size_t num_to_insert = std::distance(first, last);
-  reserve(size_ + num_to_insert);
+
+  reserve(size_ + num_to_insert, stream);
 
   uint32_t submap_idx = 0;
   while (num_to_insert > 0) {
@@ -95,30 +141,29 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(InputIt first,
     // If we are tying to insert some of the remaining keys into this submap, we can insert
     // only if we meet the minimum insert size.
     if (capacity_remaining >= min_insert_size_) {
-      *num_successes_ = 0;
-      int device_id;
-      CUCO_CUDA_TRY(cudaGetDevice(&device_id));
-      CUCO_CUDA_TRY(cudaMemPrefetchAsync(num_successes_, sizeof(atomic_ctr_type), device_id));
+      CUCO_CUDA_TRY(
+        cudaMemsetAsync(submap_num_successes_[submap_idx], 0, sizeof(atomic_ctr_type), stream));
 
-      auto n                = std::min(capacity_remaining, num_to_insert);
-      auto const block_size = 128;
-      auto const stride     = 1;
-      auto const tile_size  = 4;
-      auto const grid_size  = (tile_size * n + stride * block_size - 1) / (stride * block_size);
+      auto const n         = std::min(capacity_remaining, num_to_insert);
+      auto const grid_size = (tile_size * n + stride * block_size - 1) / (stride * block_size);
 
       detail::insert<block_size, tile_size, cuco::pair_type<key_type, mapped_type>>
-        <<<grid_size, block_size>>>(first,
-                                    first + n,
-                                    submap_views_.data().get(),
-                                    submap_mutable_views_.data().get(),
-                                    num_successes_,
-                                    submap_idx,
-                                    submaps_.size(),
-                                    hash,
-                                    key_equal);
-      CUCO_CUDA_TRY(cudaDeviceSynchronize());
-
-      std::size_t h_num_successes = num_successes_->load(cuda::std::memory_order_relaxed);
+        <<<grid_size, block_size, 0, stream>>>(first,
+                                               first + n,
+                                               submap_views_.data().get(),
+                                               submap_mutable_views_.data().get(),
+                                               submap_num_successes_.data().get(),
+                                               submap_idx,
+                                               submaps_.size(),
+                                               hash,
+                                               key_equal);
+
+      std::size_t h_num_successes;
+      CUCO_CUDA_TRY(cudaMemcpyAsync(&h_num_successes,
+                                    submap_num_successes_[submap_idx],
+                                    sizeof(atomic_ctr_type),
+                                    cudaMemcpyDeviceToHost,
+                                    stream));
       submaps_[submap_idx]->size_ += h_num_successes;
       size_ += h_num_successes;
       first += n;
@@ -128,34 +173,88 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(InputIt first,
   }
 }
 
+template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
+template <typename InputIt, typename Hash, typename KeyEqual>
+void dynamic_map<Key, Value, Scope, Allocator>::erase(
+  InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream)
+{
+  // TODO: memset an atomic variable is unsafe
+  static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type),
+                "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t).");
+
+  auto constexpr block_size = 128;
+  auto constexpr stride     = 1;
+  auto constexpr tile_size  = 4;
+
+  auto const num_keys  = std::distance(first, last);
+  auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
+
+  // zero out submap success counters
+  for (uint32_t i = 0; i < submaps_.size(); ++i) {
+    CUCO_CUDA_TRY(cudaMemsetAsync(submap_num_successes_[i], 0, sizeof(atomic_ctr_type), stream));
+  }
+
+  auto const temp_storage_size = submaps_.size() * sizeof(unsigned long long);
+
+  detail::erase<block_size, tile_size>
+    <<<grid_size, block_size, temp_storage_size, stream>>>(first,
+                                                           first + num_keys,
+                                                           submap_mutable_views_.data().get(),
+                                                           submap_num_successes_.data().get(),
+                                                           submaps_.size(),
+                                                           hash,
+                                                           key_equal);
+
+  for (uint32_t i = 0; i < submaps_.size(); ++i) {
+    std::size_t h_submap_num_successes;
+    CUCO_CUDA_TRY(cudaMemcpyAsync(&h_submap_num_successes,
+                                  submap_num_successes_[i],
+                                  sizeof(atomic_ctr_type),
+                                  cudaMemcpyDeviceToHost,
+                                  stream));
+    submaps_[i]->size_ -= h_submap_num_successes;
+    size_ -= h_submap_num_successes;
+  }
+}
+
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
 template <typename InputIt, typename OutputIt, typename Hash, typename KeyEqual>
-void dynamic_map<Key, Value, Scope, Allocator>::find(
-  InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal)
+void dynamic_map<Key, Value, Scope, Allocator>::find(InputIt first,
+                                                     InputIt last,
+                                                     OutputIt output_begin,
+                                                     Hash hash,
+                                                     KeyEqual key_equal,
+                                                     cudaStream_t stream)
 {
-  auto num_keys         = std::distance(first, last);
-  auto const block_size = 128;
-  auto const stride     = 1;
-  auto const tile_size  = 4;
-  auto const grid_size  = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
+  auto constexpr block_size = 128;
+  auto constexpr stride     = 1;
+  auto constexpr tile_size  = 4;
 
-  detail::find<block_size, tile_size, Value><<<grid_size, block_size>>>(
+  auto const num_keys  = std::distance(first, last);
+  auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
+
+  detail::find<block_size, tile_size, Value><<<grid_size, block_size, 0, stream>>>(
     first, last, output_begin, submap_views_.data().get(), submaps_.size(), hash, key_equal);
   CUCO_CUDA_TRY(cudaDeviceSynchronize());
 }
 
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
 template <typename InputIt, typename OutputIt, typename Hash, typename KeyEqual>
-void dynamic_map<Key, Value, Scope, Allocator>::contains(
-  InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal)
+void dynamic_map<Key, Value, Scope, Allocator>::contains(InputIt first,
+                                                         InputIt last,
+                                                         OutputIt output_begin,
+                                                         Hash hash,
+                                                         KeyEqual key_equal,
+                                                         cudaStream_t stream)
 {
-  auto num_keys         = std::distance(first, last);
-  auto const block_size = 128;
-  auto const stride     = 1;
-  auto const tile_size  = 4;
-  auto const grid_size  = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
+  auto constexpr block_size = 128;
+  auto constexpr stride     = 1;
+  auto constexpr tile_size  = 4;
+
+  auto const num_keys  = std::distance(first, last);
+  auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
 
-  detail::contains<block_size, tile_size><<<grid_size, block_size>>>(
+  detail::contains<block_size, tile_size><<<grid_size, block_size, 0, stream>>>(
     first, last, output_begin, submap_views_.data().get(), submaps_.size(), hash, key_equal);
   CUCO_CUDA_TRY(cudaDeviceSynchronize());
 }
diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
index f261b49aa..566576e1e 100644
--- a/include/cuco/detail/dynamic_map_kernels.cuh
+++ b/include/cuco/detail/dynamic_map_kernels.cuh
@@ -41,6 +41,7 @@ namespace cg = cooperative_groups;
  * @tparam viewT Type of device view allowing access of hash map storage
  * @tparam Hash Unary callable type
  * @tparam KeyEqual Binary callable type
+ *
  * @param first Beginning of the sequence of key/value pairs
  * @param last End of the sequence of key/value pairs
  * @param submap_views Array of `static_map::device_view` objects used to
@@ -71,7 +72,7 @@ __global__ void insert(InputIt first,
                        Hash hash,
                        KeyEqual key_equal)
 {
-  typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
+  using BlockReduce = cub::BlockReduce<std::size_t, block_size>;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   std::size_t thread_num_successes = 0;
 
@@ -97,8 +98,10 @@ __global__ void insert(InputIt first,
     tid += gridDim.x * blockDim.x;
   }
 
-  std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
-  if (threadIdx.x == 0) { *num_successes += block_num_successes; }
+  std::size_t const block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
+  if (threadIdx.x == 0) {
+    num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed);
+  }
 }
 
 /**
@@ -122,13 +125,14 @@ __global__ void insert(InputIt first,
  * @tparam viewT Type of device view allowing access of hash map storage
  * @tparam Hash Unary callable type
  * @tparam KeyEqual Binary callable type
+ *
  * @param first Beginning of the sequence of key/value pairs
  * @param last End of the sequence of key/value pairs
  * @param submap_views Array of `static_map::device_view` objects used to
  * perform `contains` operations on each underlying `static_map`
  * @param submap_mutable_views Array of `static_map::device_mutable_view` objects
  * used to perform an `insert` into the target `static_map` submap
- * @param num_successes The number of successfully inserted key/value pairs
+ * @param submap_num_successes The number of successfully inserted key/value pairs for each submap
  * @param insert_idx The index of the submap we are inserting into
  * @param num_submaps The total number of submaps in the map
  * @param hash The unary function to apply to hash each key
@@ -147,13 +151,13 @@ __global__ void insert(InputIt first,
                        InputIt last,
                        viewT* submap_views,
                        mutableViewT* submap_mutable_views,
-                       atomicT* num_successes,
+                       atomicT** submap_num_successes,
                        uint32_t insert_idx,
                        uint32_t num_submaps,
                        Hash hash,
                        KeyEqual key_equal)
 {
-  typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
+  using BlockReduce = cub::BlockReduce<std::size_t, block_size>;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   std::size_t thread_num_successes = 0;
 
@@ -182,8 +186,154 @@ __global__ void insert(InputIt first,
     it += (gridDim.x * blockDim.x) / tile_size;
   }
 
-  std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
-  if (threadIdx.x == 0) { *num_successes += block_num_successes; }
+  std::size_t const block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
+  if (threadIdx.x == 0) {
+    submap_num_successes[insert_idx]->fetch_add(block_num_successes,
+                                                cuda::std::memory_order_relaxed);
+  }
+}
+
+/**
+ * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`.
+ *
+ * If the key `*(first + i)` exists in the map, its slot is erased and made available for future
+   insertions.
+ * Else, no effect.
+ *
+ * @tparam block_size The size of the thread block
+ * @tparam InputIt Device accessible input iterator whose `value_type` is
+ * convertible to the map's `key_type`
+ * @tparam mutableViewT Type of device view allowing modification of hash map storage
+ * @tparam atomicT Type of atomic storage
+ * @tparam Hash Unary callable type
+ * @tparam KeyEqual Binary callable type
+ *
+ * @param first Beginning of the sequence of keys
+ * @param last End of the sequence of keys
+ * @param submap_mutable_views Array of `static_map::mutable_device_view` objects used to
+ * perform `erase` operations on each underlying `static_map`
+ * @param num_successes The number of successfully erased key/value pairs
+ * @param submap_num_successes The number of successfully erased key/value pairs
+ * in each submap
+ * @param num_submaps The number of submaps in the map
+ * @param hash The unary function to apply to hash each key
+ * @param key_equal The binary function to compare two keys for equality
+ */
+template <uint32_t block_size,
+          typename InputIt,
+          typename mutableViewT,
+          typename atomicT,
+          typename Hash,
+          typename KeyEqual>
+__global__ void erase(InputIt first,
+                      InputIt last,
+                      mutableViewT* submap_mutable_views,
+                      atomicT** submap_num_successes,
+                      uint32_t num_submaps,
+                      Hash hash,
+                      KeyEqual key_equal)
+{
+  extern __shared__ unsigned long long submap_block_num_successes[];
+
+  auto tid = block_size * blockIdx.x + threadIdx.x;
+  auto it  = first + tid;
+
+  for (auto i = threadIdx.x; i < num_submaps; i += block_size) {
+    submap_block_num_successes[i] = 0;
+  }
+  __syncthreads();
+
+  while (it < last) {
+    for (auto i = 0; i < num_submaps; ++i) {
+      if (submap_mutable_views[i].erase(*it, hash, key_equal)) {
+        atomicAdd(&submap_block_num_successes[i], 1);
+        break;
+      }
+    }
+    it += gridDim.x * blockDim.x;
+  }
+  __syncthreads();
+
+  for (auto i = 0; i < num_submaps; ++i) {
+    if (threadIdx.x == 0) {
+      submap_num_successes[i]->fetch_add(static_cast<std::size_t>(submap_block_num_successes[i]),
+                                         cuda::std::memory_order_relaxed);
+    }
+  }
+}
+
+/**
+ * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`.
+ *
+ * If the key `*(first + i)` exists in the map, its slot is erased and made available for future
+ * insertions.
+ * Else, no effect.
+ *
+ * @tparam block_size The size of the thread block
+ * @tparam tile_size The number of threads in the Cooperative Groups used to perform erase
+ * @tparam InputIt Device accessible input iterator whose `value_type` is
+ * convertible to the map's `key_type`
+ * @tparam mutableViewT Type of device view allowing modification of hash map storage
+ * @tparam atomicT Type of atomic storage
+ * @tparam Hash Unary callable type
+ * @tparam KeyEqual Binary callable type
+ *
+ * @param first Beginning of the sequence of keys
+ * @param last End of the sequence of keys
+ * @param submap_mutable_views Array of `static_map::mutable_device_view` objects used to
+ * perform `erase` operations on each underlying `static_map`
+ * @param num_successes The number of successfully erased key/value pairs
+ * @param submap_num_successes The number of successfully erased key/value pairs
+ * in each submap
+ * @param num_submaps The number of submaps in the map
+ * @param hash The unary function to apply to hash each key
+ * @param key_equal The binary function to compare two keys for equality
+ */
+template <uint32_t block_size,
+          uint32_t tile_size,
+          typename InputIt,
+          typename mutableViewT,
+          typename atomicT,
+          typename Hash,
+          typename KeyEqual>
+__global__ void erase(InputIt first,
+                      InputIt last,
+                      mutableViewT* submap_mutable_views,
+                      atomicT** submap_num_successes,
+                      uint32_t num_submaps,
+                      Hash hash,
+                      KeyEqual key_equal)
+{
+  extern __shared__ unsigned long long submap_block_num_successes[];
+
+  auto block = cg::this_thread_block();
+  auto tile  = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto tid   = block_size * block.group_index().x + block.thread_rank();
+  auto it    = first + tid / tile_size;
+
+  for (auto i = threadIdx.x; i < num_submaps; i += block_size) {
+    submap_block_num_successes[i] = 0;
+  }
+  block.sync();
+
+  while (it < last) {
+    auto erased = false;
+    int i       = 0;
+    for (i = 0; i < num_submaps; ++i) {
+      erased = submap_mutable_views[i].erase(tile, *it, hash, key_equal);
+      if (erased) { break; }
+    }
+    if (erased && tile.thread_rank() == 0) { atomicAdd(&submap_block_num_successes[i], 1); }
+    it += (gridDim.x * blockDim.x) / tile_size;
+  }
+  block.sync();
+
+  for (auto i = 0; i < num_submaps; ++i) {
+    if (threadIdx.x == 0) {
+      submap_num_successes[i]->fetch_add(static_cast<std::size_t>(submap_block_num_successes[i]),
+                                         cuda::std::memory_order_relaxed);
+    }
+  }
 }
 
 /**
@@ -191,6 +341,7 @@ __global__ void insert(InputIt first,
  *
  * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + i)`.
  * Else, copies the empty value sentinel.
+ *
  * @tparam block_size The number of threads in the thread block
  * @tparam Value The mapped value type for the map
  * @tparam InputIt Device accessible input iterator whose `value_type` is
@@ -200,6 +351,7 @@ __global__ void insert(InputIt first,
  * @tparam viewT Type of `static_map` device view
  * @tparam Hash Unary callable type
  * @tparam KeyEqual Binary callable type
+ *
  * @param first Beginning of the sequence of keys
  * @param last End of the sequence of keys
  * @param output_begin Beginning of the sequence of values retrieved for each key
@@ -273,6 +425,7 @@ __global__ void find(InputIt first,
  * @tparam viewT Type of `static_map` device view
  * @tparam Hash Unary callable type
  * @tparam KeyEqual Binary callable type
+ *
  * @param first Beginning of the sequence of keys
  * @param last End of the sequence of keys
  * @param output_begin Beginning of the sequence of values retrieved for each key
@@ -345,6 +498,7 @@ __global__ void find(InputIt first,
  * @tparam viewT Type of `static_map` device view
  * @tparam Hash Unary callable type
  * @tparam KeyEqual Binary callable type
+ *
  * @param first Beginning of the sequence of keys
  * @param last End of the sequence of keys
  * @param output_begin Beginning of the sequence of booleans for the presence of each key
@@ -411,6 +565,7 @@ __global__ void contains(InputIt first,
  * @tparam viewT Type of `static_map` device view
  * @tparam Hash Unary callable type
  * @tparam KeyEqual Binary callable type
+ *
  * @param first Beginning of the sequence of keys
  * @param last End of the sequence of keys
  * @param output_begin Beginning of the sequence of booleans for the presence of each key
diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl
index ee794e1c2..b28b2bad8 100644
--- a/include/cuco/detail/static_map.inl
+++ b/include/cuco/detail/static_map.inl
@@ -857,4 +857,4 @@ static_map<Key, Value, Scope, Allocator>::device_view::contains(CG const& g,
     current_slot = next_slot(g, current_slot);
   }
 }
-}  // namespace cuco
+}  // namespace cuco
\ No newline at end of file
diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh
index 705aae7fb..73c22997a 100644
--- a/include/cuco/detail/static_map_kernels.cuh
+++ b/include/cuco/detail/static_map_kernels.cuh
@@ -167,6 +167,28 @@ __global__ void insert(
   if (threadIdx.x == 0) { *num_successes += block_num_successes; }
 }
 
+/**
+ * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`.
+ *
+ * If the key `*(first + i)` exists in the map, its slot is erased and made available for future
+ * insertions.
+ * Else, no effect.
+ *
+ * @tparam block_size The size of the thread block
+ * @tparam InputIt Device accessible input iterator whose `value_type` is
+ * convertible to the map's `key_type`
+ * @tparam atomicT Type of atomic storage
+ * @tparam viewT Type of device view allowing access of hash map storage
+ * @tparam Hash Unary callable type
+ * @tparam KeyEqual Binary callable type
+ *
+ * @param first Beginning of the sequence of keys
+ * @param last End of the sequence of keys
+ * @param num_successes The number of successfully erased key/value pairs
+ * @param view Device view used to access the hash map's slot storage
+ * @param hash The unary function to apply to hash each key
+ * @param key_equal The binary function to compare two keys for equality
+ */
 template <std::size_t block_size,
           typename InputIt,
           typename atomicT,
@@ -196,6 +218,29 @@ __global__ void erase(
   }
 }
 
+/**
+ * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`.
+ *
+ * If the key `*(first + i)` exists in the map, its slot is erased and made available for future
+ * insertions.
+ * Else, no effect.
+ *
+ * @tparam block_size The size of the thread block
+ * @tparam tile_size The number of threads in the Cooperative Groups used to perform erase
+ * @tparam InputIt Device accessible input iterator whose `value_type` is
+ * convertible to the map's `key_type`
+ * @tparam atomicT Type of atomic storage
+ * @tparam viewT Type of device view allowing access of hash map storage
+ * @tparam Hash Unary callable type
+ * @tparam KeyEqual Binary callable type
+ *
+ * @param first Beginning of the sequence of keys
+ * @param last End of the sequence of keys
+ * @param num_successes The number of successfully erased key/value pairs
+ * @param view Device view used to access the hash map's slot storage
+ * @param hash The unary function to apply to hash each key
+ * @param key_equal The binary function to compare two keys for equality
+ */
 template <std::size_t block_size,
           uint32_t tile_size,
           typename InputIt,
diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh
index 35a3cdf4a..f2285efc8 100644
--- a/include/cuco/dynamic_map.cuh
+++ b/include/cuco/dynamic_map.cuh
@@ -105,19 +105,48 @@ class dynamic_map {
   using value_type      = cuco::pair_type<Key, Value>;       ///< Type of key/value pairs
   using key_type        = Key;                               ///< Key type
   using mapped_type     = Value;                             ///< Type of mapped values
-  using atomic_ctr_type = cuda::atomic<std::size_t, Scope>;  ///< Type of atomic counters
-  using view_type = typename static_map<Key, Value, Scope>::device_view;  ///< Device view type
-  using mutable_view_type = typename static_map<Key, Value, Scope>::device_mutable_view;
-  ///< Device mutable view type
+  using atomic_ctr_type = cuda::atomic<std::size_t, Scope>;  ///< Atomic counter type
+  using view_type =
+    typename static_map<Key, Value, Scope>::device_view;  ///< Type for submap device view
+  using mutable_view_type =
+    typename static_map<Key, Value, Scope>::device_mutable_view;  ///< Type for submap mutable
+                                                                  ///< device view
 
   dynamic_map(dynamic_map const&) = delete;
   dynamic_map(dynamic_map&&)      = delete;
+
   dynamic_map& operator=(dynamic_map const&) = delete;
   dynamic_map& operator=(dynamic_map&&) = delete;
 
   /**
-   * @brief Construct a dynamically-sized map with the specified initial capacity, growth factor and
-   * sentinel values.
+   * @brief Constructs a dynamically-sized map with the specified initial capacity, growth factor
+   * and sentinel values.
+   *
+   * The capacity of the map will automatically increase as the user adds key/value pairs using
+   * `insert`.
+   *
+   * Capacity increases by a factor of growth_factor each time the size of the map exceeds a
+   * threshold occupancy. The performance of `find` and `contains` decreases somewhat each time the
+   * map's capacity grows.
+   *
+   * The `empty_key_sentinel` and `empty_value_sentinel` values are reserved and
+   * undefined behavior results from attempting to insert any key/value pair
+   * that contains either.
+   *
+   * @param initial_capacity The initial number of slots in the map
+   * @param empty_key_sentinel The reserved key value for empty slots
+   * @param empty_value_sentinel The reserved mapped value for empty slots
+   * @param alloc Allocator used to allocate submap device storage
+   * @param stream Stream used for executing the kernels
+   */
+  dynamic_map(std::size_t initial_capacity,
+              sentinel::empty_key<Key> empty_key_sentinel,
+              sentinel::empty_value<Value> empty_value_sentinel,
+              Allocator const& alloc = Allocator{},
+              cudaStream_t stream    = nullptr);
+
+  /**
+   * @brief Constructs a dynamically-sized map with erase capability.
    *
    * The capacity of the map will automatically increase as the user adds key/value pairs using
    * `insert`.
@@ -133,18 +162,25 @@ class dynamic_map {
    * @param initial_capacity The initial number of slots in the map
    * @param empty_key_sentinel The reserved key value for empty slots
    * @param empty_value_sentinel The reserved mapped value for empty slots
+   * @param erased_key_sentinel The reserved key value for erased slots
    * @param alloc Allocator used to allocate submap device storage
+   * @param stream Stream used for executing the kernels
+   *
+   * @throw std::runtime error if the empty key sentinel and erased key sentinel
+   * are the same value
    */
   dynamic_map(std::size_t initial_capacity,
               sentinel::empty_key<Key> empty_key_sentinel,
               sentinel::empty_value<Value> empty_value_sentinel,
-              Allocator const& alloc = Allocator{});
+              sentinel::erased_key<Key> erased_key_sentinel,
+              Allocator const& alloc = Allocator{},
+              cudaStream_t stream    = nullptr);
 
   /**
-   * @brief Destroy the map and frees its contents
+   * @brief Destroys the map and frees its contents
    *
    */
-  ~dynamic_map();
+  ~dynamic_map() {}
 
   /**
    * @brief Grows the capacity of the map so there is enough space for `n` key/value pairs.
@@ -152,8 +188,9 @@ class dynamic_map {
    * If there is already enough space for `n` key/value pairs, the capacity remains the same.
    *
    * @param n The number of key value pairs for which there must be space
+   * @param stream Stream used for executing the kernels
    */
-  void reserve(std::size_t n);
+  void reserve(std::size_t n, cudaStream_t stream = nullptr);
 
   /**
    * @brief Inserts all key/value pairs in the range `[first, last)`.
@@ -169,11 +206,55 @@ class dynamic_map {
    * @param last End of the sequence of key/value pairs
    * @param hash The unary function to apply to hash each key
    * @param key_equal The binary function to compare two keys for equality
+   * @param stream Stream used for executing the kernels
    */
   template <typename InputIt,
             typename Hash     = cuco::murmurhash3_32<key_type>,
             typename KeyEqual = thrust::equal_to<key_type>>
-  void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{});
+  void insert(InputIt first,
+              InputIt last,
+              Hash hash           = Hash{},
+              KeyEqual key_equal  = KeyEqual{},
+              cudaStream_t stream = nullptr);
+
+  /**
+   * @brief Erases keys in the range `[first, last)`.
+   *
+   * For each key `k` in `[first, last)`, if `contains(k) == true), removes `k` and it's
+   * associated value from the map. Else, no effect.
+   *
+   *  Side-effects:
+   *  - `contains(k) == false`
+   *  - `find(k) == end()`
+   *  - `insert({k,v}) == true`
+   *  - `get_size()` is reduced by the total number of erased keys
+   *
+   * This function synchronizes `stream`.
+   *
+   * Keep in mind that `erase` does not cause the map to shrink its memory allocation.
+   *
+   * @tparam InputIt Device accessible input iterator whose `value_type` is
+   * convertible to the map's `value_type`
+   * @tparam Hash Unary callable type
+   * @tparam KeyEqual Binary callable type
+   *
+   * @param first Beginning of the sequence of keys
+   * @param last End of the sequence of keys
+   * @param hash The unary function to apply to hash each key
+   * @param key_equal The binary function to compare two keys for equality
+   * @param stream Stream used for executing the kernels
+   *
+   * @throw std::runtime_error if a unique erased key sentinel value was not
+   * provided at construction
+   */
+  template <typename InputIt,
+            typename Hash     = cuco::murmurhash3_32<key_type>,
+            typename KeyEqual = thrust::equal_to<key_type>>
+  void erase(InputIt first,
+             InputIt last,
+             Hash hash           = Hash{},
+             KeyEqual key_equal  = KeyEqual{},
+             cudaStream_t stream = nullptr);
 
   /**
    * @brief Finds the values corresponding to all keys in the range `[first, last)`.
@@ -187,11 +268,13 @@ class dynamic_map {
    * convertible to the map's `mapped_type`
    * @tparam Hash Unary callable type
    * @tparam KeyEqual Binary callable type
+   *
    * @param first Beginning of the sequence of keys
    * @param last End of the sequence of keys
    * @param output_begin Beginning of the sequence of values retrieved for each key
    * @param hash The unary function to apply to hash each key
    * @param key_equal The binary function to compare two keys for equality
+   * @param stream Stream used for executing the kernels
    */
   template <typename InputIt,
             typename OutputIt,
@@ -200,8 +283,9 @@ class dynamic_map {
   void find(InputIt first,
             InputIt last,
             OutputIt output_begin,
-            Hash hash          = Hash{},
-            KeyEqual key_equal = KeyEqual{});
+            Hash hash           = Hash{},
+            KeyEqual key_equal  = KeyEqual{},
+            cudaStream_t stream = nullptr);
 
   /**
    * @brief Indicates whether the keys in the range `[first, last)` are contained in the map.
@@ -214,11 +298,13 @@ class dynamic_map {
    * convertible to the map's `mapped_type`
    * @tparam Hash Unary callable type
    * @tparam KeyEqual Binary callable type
+   *
    * @param first Beginning of the sequence of keys
    * @param last End of the sequence of keys
    * @param output_begin Beginning of the sequence of booleans for the presence of each key
    * @param hash The unary function to apply to hash each key
    * @param key_equal The binary function to compare two keys for equality
+   * @param stream Stream used for executing the kernels
    */
   template <typename InputIt,
             typename OutputIt,
@@ -227,8 +313,9 @@ class dynamic_map {
   void contains(InputIt first,
                 InputIt last,
                 OutputIt output_begin,
-                Hash hash          = Hash{},
-                KeyEqual key_equal = KeyEqual{});
+                Hash hash           = Hash{},
+                KeyEqual key_equal  = KeyEqual{},
+                cudaStream_t stream = nullptr);
 
   /**
    * @brief Gets the current number of elements in the map
@@ -254,18 +341,22 @@ class dynamic_map {
  private:
   key_type empty_key_sentinel_{};       ///< Key value that represents an empty slot
   mapped_type empty_value_sentinel_{};  ///< Initial value of empty slot
-  std::size_t size_{};                  ///< Number of keys in the map
-  std::size_t capacity_{};              ///< Maximum number of keys that can be inserted
-  float max_load_factor_{};             ///< Max load factor before capacity growth
+  key_type erased_key_sentinel_{};      ///< Key value that represents an erased slot
+
+  // TODO: initialize this
+  std::size_t size_{};       ///< Number of keys in the map
+  std::size_t capacity_{};   ///< Maximum number of keys that can be inserted
+  float max_load_factor_{};  ///< Max load factor before capacity growth
 
   std::vector<std::unique_ptr<static_map<key_type, mapped_type, Scope>>>
     submaps_;                                      ///< vector of pointers to each submap
   thrust::device_vector<view_type> submap_views_;  ///< vector of device views for each submap
   thrust::device_vector<mutable_view_type>
-    submap_mutable_views_;          ///< vector of mutable device views for each submap
-  std::size_t min_insert_size_{};   ///< min remaining capacity of submap for insert
-  atomic_ctr_type* num_successes_;  ///< number of successfully inserted keys on insert
-  Allocator alloc_{};  ///< Allocator passed to submaps to allocate their device storage
+    submap_mutable_views_;         ///< vector of mutable device views for each submap
+  std::size_t min_insert_size_{};  ///< min remaining capacity of submap for insert
+  thrust::device_vector<atomic_ctr_type*>
+    submap_num_successes_;  ///< Number of successfully erased keys for each submap
+  Allocator alloc_{};       ///< Allocator passed to submaps to allocate their device storage
 };
 }  // namespace cuco
 
diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh
index 7674d5b20..7a240da03 100644
--- a/include/cuco/static_map.cuh
+++ b/include/cuco/static_map.cuh
@@ -1415,7 +1415,7 @@ class static_map {
   }
 
  private:
-  pair_atomic_type* slots_{nullptr};            ///< Pointer to flat slots storage
+  pair_atomic_type* slots_{};                   ///< Pointer to flat slots storage
   std::size_t capacity_{};                      ///< Total number of slots
   std::size_t size_{};                          ///< Number of keys in map
   Key empty_key_sentinel_{};                    ///< Key value that represents an empty slot
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index dd3ea3bc3..bfb9cfbf0 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -68,7 +68,8 @@ ConfigureTest(STATIC_MAP_TEST
 ###################################################################################################
 # - dynamic_map tests -----------------------------------------------------------------------------
 ConfigureTest(DYNAMIC_MAP_TEST
-    dynamic_map/unique_sequence_test.cu)
+    dynamic_map/unique_sequence_test.cu
+    dynamic_map/erase_test.cu)
 
 ###################################################################################################
 # - static_multimap tests -------------------------------------------------------------------------
diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu
new file mode 100644
index 000000000..fc3dc3c28
--- /dev/null
+++ b/tests/dynamic_map/erase_test.cu
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <utils.hpp>
+
+#include <cuco/dynamic_map.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/sequence.h>
+
+#include <catch2/catch.hpp>
+
+TEMPLATE_TEST_CASE_SIG("erase key",
+                       "",
+                       ((typename Key, typename Value), Key, Value),
+                       (int32_t, int32_t),
+                       (int32_t, int64_t),
+                       (int64_t, int32_t),
+                       (int64_t, int64_t))
+{
+  constexpr std::size_t num_keys = 1'000'000;
+  cuco::dynamic_map<Key, Value> map{num_keys * 2,
+                                    cuco::empty_key<Key>{-1},
+                                    cuco::empty_value<Value>{-1},
+                                    cuco::erased_key<Key>{-2}};
+
+  SECTION("Check single submap insert/erase")
+  {
+    thrust::device_vector<Key> d_keys(num_keys);
+    thrust::device_vector<Value> d_values(num_keys);
+    thrust::device_vector<bool> d_keys_exist(num_keys);
+
+    thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1);
+    thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1);
+
+    auto pairs_begin =
+      thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin()));
+
+    map.insert(pairs_begin, pairs_begin + num_keys);
+
+    REQUIRE(map.get_size() == num_keys);
+
+    map.erase(d_keys.begin(), d_keys.end());
+
+    // delete decreases count correctly
+    REQUIRE(map.get_size() == 0);
+
+    map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin());
+
+    // keys were actaully deleted
+    REQUIRE(cuco::test::none_of(d_keys_exist.begin(),
+                                d_keys_exist.end(),
+                                [] __device__(const bool key_found) { return key_found; }));
+
+    // ensures that map is reusing deleted slots
+    map.insert(pairs_begin, pairs_begin + num_keys);
+
+    REQUIRE(map.get_size() == num_keys);
+
+    map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin());
+
+    REQUIRE(cuco::test::all_of(d_keys_exist.begin(),
+                               d_keys_exist.end(),
+                               [] __device__(const bool key_found) { return key_found; }));
+
+    // erase can act selectively
+    map.erase(d_keys.begin(), d_keys.begin() + num_keys / 2);
+    map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin());
+
+    REQUIRE(cuco::test::none_of(d_keys_exist.begin(),
+                                d_keys_exist.begin() + num_keys / 2,
+                                [] __device__(const bool key_found) { return key_found; }));
+
+    REQUIRE(cuco::test::all_of(d_keys_exist.begin() + num_keys / 2,
+                               d_keys_exist.end(),
+                               [] __device__(const bool key_found) { return key_found; }));
+
+    // clear map
+    map.erase(d_keys.begin() + num_keys / 2, d_keys.end());
+  }
+
+  SECTION("Check multiple submaps insert/erase")
+  {
+    constexpr std::size_t num = 4 * num_keys;
+
+    thrust::device_vector<Key> d_keys(num);
+    thrust::device_vector<Value> d_values(num);
+    thrust::device_vector<bool> d_keys_exist(num);
+
+    thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1);
+    thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1);
+
+    auto pairs_begin =
+      thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin()));
+
+    map.insert(pairs_begin, pairs_begin + num);
+
+    // map should resize twice if the erased slots are successfully reused
+    REQUIRE(map.get_capacity() == 2 * num);
+    // check that keys can be successfully deleted from only the first and second submaps
+    map.erase(d_keys.begin(), d_keys.begin() + 2 * num_keys);
+    map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin());
+
+    REQUIRE(cuco::test::none_of(d_keys_exist.begin(),
+                                d_keys_exist.begin() + 2 * num_keys,
+                                [] __device__(const bool key_found) { return key_found; }));
+
+    REQUIRE(cuco::test::all_of(d_keys_exist.begin() + 2 * num_keys,
+                               d_keys_exist.end(),
+                               [] __device__(const bool key_found) { return key_found; }));
+
+    REQUIRE(map.get_size() == 2 * num_keys);
+    // check that keys can be successfully deleted from all submaps (some will be unsuccessful
+    // erases)
+    map.erase(d_keys.begin(), d_keys.end());
+
+    map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin());
+
+    REQUIRE(cuco::test::none_of(d_keys_exist.begin(),
+                                d_keys_exist.end(),
+                                [] __device__(const bool key_found) { return key_found; }));
+
+    REQUIRE(map.get_size() == 0);
+  }
+}
diff --git a/tests/dynamic_map/unique_sequence_test.cu b/tests/dynamic_map/unique_sequence_test.cu
index d97bac0a0..b42a7fa5a 100644
--- a/tests/dynamic_map/unique_sequence_test.cu
+++ b/tests/dynamic_map/unique_sequence_test.cu
@@ -38,6 +38,7 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys",
                        (int64_t, int64_t))
 {
   constexpr std::size_t num_keys{50'000'000};
+
   cuco::dynamic_map<Key, Value> map{
     30'000'000, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};