NVIDIA · mythrocks · Mar 24, 2026 · Mar 10, 2026 · Mar 11, 2026 · Mar 12, 2026
diff --git a/src/main/cpp/benchmarks/bloom_filter.cu b/src/main/cpp/benchmarks/bloom_filter.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,17 +22,51 @@
 #include <hash/hash.hpp>
 #include <nvbench/nvbench.cuh>
 
-static void bloom_filter_put(nvbench::state& state)
+static void bloom_filter_put_v1(nvbench::state& state)  // nvbench: times bloom_filter_put on a version-1 filter
+{
+  constexpr int num_rows   = 150'000'000;  // rows in the generated input table
+  constexpr int num_hashes = 3;            // forwarded to bloom_filter_create
+
+  cudf::size_type const bloom_filter_bytes = state.get_int64("bloom_filter_bytes");  // benchmark axis value
+  cudf::size_type const bloom_filter_longs = bloom_filter_bytes / sizeof(int64_t);  // size in 64-bit words
+  auto bloom_filter                        = spark_rapids_jni::bloom_filter_create(
+    spark_rapids_jni::bloom_filter_version_1, num_hashes, bloom_filter_longs);
+
+  data_profile_builder builder;
+  builder.no_validity();  // presumably disables null generation -- TODO confirm
+  auto const src   = create_random_table({{cudf::type_id::INT64}}, row_count{num_rows}, builder);
+  auto const input = spark_rapids_jni::xxhash64(*src);  // hashed values handed to bloom_filter_put
+
+  auto const stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync,  // timer tag: the lambda drives the timer itself
+             [&](nvbench::launch& launch, auto& timer) {
+               timer.start();
+               spark_rapids_jni::bloom_filter_put(*bloom_filter, *input);
+               stream.synchronize();  // wait on the stream before stopping the timer
+               timer.stop();
+             });
+
+  size_t const bytes_read    = num_rows * sizeof(int64_t);  // counts one 64-bit value per row
+  size_t const bytes_written = num_rows * sizeof(cudf::bitmask_type) * num_hashes;  // counts one word per hash per row
+  auto const time            = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");  // mean GPU time summary
+  state.add_element_count(std::size_t{num_rows}, "Rows Inserted");
+  state.add_global_memory_reads(bytes_read, "Bytes read");
+  state.add_global_memory_writes(bytes_written, "Bytes written");
+  state.add_element_count(static_cast<double>(bytes_written) / time, "Write bytes/sec");  // bytes_written / time, reported as an element count
+}
+
+static void bloom_filter_put_v2(nvbench::state& state)
 {
   constexpr int num_rows   = 150'000'000;
   constexpr int num_hashes = 3;
 
   // create the bloom filter
   cudf::size_type const bloom_filter_bytes = state.get_int64("bloom_filter_bytes");
   cudf::size_type const bloom_filter_longs = bloom_filter_bytes / sizeof(int64_t);
-  auto bloom_filter = spark_rapids_jni::bloom_filter_create(num_hashes, bloom_filter_longs);
+  auto bloom_filter                        = spark_rapids_jni::bloom_filter_create(
+    spark_rapids_jni::bloom_filter_version_2, num_hashes, bloom_filter_longs);
 
-  // create a column of hashed values
   data_profile_builder builder;
   builder.no_validity();
   auto const src   = create_random_table({{cudf::type_id::INT64}}, row_count{num_rows}, builder);
@@ -57,7 +91,12 @@ static void bloom_filter_put(nvbench::state& state)
   state.add_element_count(static_cast<double>(bytes_written) / time, "Write bytes/sec");
 }
 
-NVBENCH_BENCH(bloom_filter_put)
-  .set_name("Bloom Filter Put")
+NVBENCH_BENCH(bloom_filter_put_v1)  // registers the version-1 put benchmark with nvbench
+  .set_name("Bloom Filter Put V1")
+  .add_int64_axis("bloom_filter_bytes",  // filter sizes swept: 512 KiB, 1, 2, 4 and 8 MiB
+                  {512 * 1024, 1024 * 1024, 2 * 1024 * 1024, 4 * 1024 * 1024, 8 * 1024 * 1024});
+
+NVBENCH_BENCH(bloom_filter_put_v2)  // registers the version-2 put benchmark with nvbench
+  .set_name("Bloom Filter Put V2")
   .add_int64_axis("bloom_filter_bytes",
                   {512 * 1024, 1024 * 1024, 2 * 1024 * 1024, 4 * 1024 * 1024, 8 * 1024 * 1024});
diff --git a/src/main/cpp/src/BloomFilterJni.cpp b/src/main/cpp/src/BloomFilterJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,17 +20,32 @@
 #include "jni_utils.hpp"
 #include "utilities.hpp"
 
+#include <limits>
+
 extern "C" {
 
 JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_BloomFilter_creategpu(
-  JNIEnv* env, jclass, jint numHashes, jlong bloomFilterBits)
+  JNIEnv* env, jclass, jint version, jint numHashes, jlong bloomFilterBits, jint seed)
 {
   JNI_TRY
   {
     cudf::jni::auto_set_device(env);
 
-    int bloom_filter_longs = static_cast<int>((bloomFilterBits + 63) / 64);
-    auto bloom_filter      = spark_rapids_jni::bloom_filter_create(numHashes, bloom_filter_longs);
+    // TODO (future): There is an impedance mismatch between the C++ and Java APIs.
+    // This seems to have been introduced in https://github.com/NVIDIA/spark-rapids-jni/pull/1303.
+    // The Java API accepts a long for the bloom filter bit count, but the C++ API accepts an int.
+    // This means that the Java API can represent a bloom filter bit count that is too large to
+    // be represented as an int in the C++ API.
+    // We should fix this by changing the C++ API to accept a long for the bloom filter bit count.
+    // We will address this in a future PR.  For now, we add error checking to avoid overflow.
+    JNI_ARG_CHECK(env,
+                  bloomFilterBits >= 0 && bloomFilterBits <= std::numeric_limits<int>::max() - 63,
+                  "bloom filter bit count overflows int when converted to longs",
+                  0);
+    auto const bloom_filter_longs_long = (bloomFilterBits + 63) / 64;
+    auto const bloom_filter_longs      = static_cast<int>(bloom_filter_longs_long);
+    auto bloom_filter =
+      spark_rapids_jni::bloom_filter_create(version, numHashes, bloom_filter_longs, seed);
     return reinterpret_cast<jlong>(bloom_filter.release());
   }
   JNI_CATCH(env, 0);