apache
diff --git a/native/spark-expr/src/bloom_filter/bloom_filter_agg.rs b/native/spark-expr/src/bloom_filter/bloom_filter_agg.rs
Lines changed: 47 additions & 5 deletions
diff --git a/native/spark-expr/src/bloom_filter/spark_bit_array.rs b/native/spark-expr/src/bloom_filter/spark_bit_array.rs
Lines changed: 44 additions & 29 deletions
diff --git a/native/spark-expr/src/bloom_filter/spark_bloom_filter.rs b/native/spark-expr/src/bloom_filter/spark_bloom_filter.rs
Lines changed: 157 additions & 8 deletions
@@ -25,7 +25,7 @@ use crate::bloom_filter::spark_bloom_filter::{SparkBloomFilter, SparkBloomFilter
 use arrow::array::ArrayRef;
 use arrow::array::BinaryArray;
 use datafusion::common::{downcast_value, ScalarValue};
-use datafusion::error::Result;
+use datafusion::error::{DataFusionError, Result};
 use datafusion::logical_expr::function::{AccumulatorArgs, StateFieldsArgs};
 use datafusion::logical_expr::{AggregateUDFImpl, Signature};
 use datafusion::physical_expr::expressions::Literal;
@@ -141,15 +141,30 @@ impl Accumulator for SparkBloomFilter {
                 ScalarValue::Utf8(Some(value)) => {
                     self.put_binary(value.as_bytes());
                 }
-                _ => {
-                    unreachable!()
+                // Spark's BloomFilterAggregate.update ignores null inputs.
+                ScalarValue::Int8(None)
+                | ScalarValue::Int16(None)
+                | ScalarValue::Int32(None)
+                | ScalarValue::Int64(None)
+                | ScalarValue::Utf8(None) => {}
+                other => {
+                    return Err(DataFusionError::Internal(format!(
+                        "bloom_filter_agg received an unsupported input type: {other:?}"
+                    )));
                 }
             }
             Ok(())
         })
     }
 
     fn evaluate(&mut self) -> Result<ScalarValue> {
+        // Spark's BloomFilterAggregate.eval returns NULL when no bit is set,
+        // i.e. the aggregate saw no non-null input. Mirror that here so an
+        // empty-input bloom_filter_agg yields NULL rather than a serialized
+        // empty filter.
+        if self.cardinality() == 0 {
+            return Ok(ScalarValue::Binary(None)); // typed Binary NULL, not empty bytes
+        }
         Ok(ScalarValue::Binary(Some(self.spark_serialization())))
     }
 
@@ -173,7 +188,34 @@ impl Accumulator for SparkBloomFilter {
         );
         assert_eq!(states[0].len(), 1);
         let state_sv = downcast_value!(states[0], BinaryArray);
-        self.merge_filter(state_sv.value_data());
-        Ok(())
+        self.merge_filter(state_sv.value_data())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Builds an untouched V1 accumulator: 1024 bits, hash count sized for 100 items, seed 0.
+    fn empty_v1_accumulator() -> SparkBloomFilter {
+        let bit_len = 1024;
+        let hash_fns = spark_bloom_filter::optimal_num_hash_functions(100, bit_len);
+        SparkBloomFilter::new(SparkBloomFilterVersion::V1, hash_fns, bit_len, 0)
+    }
+
+    /// With zero set bits the accumulator must evaluate to a NULL binary, matching
+    /// Spark's BloomFilterAggregate.eval when no non-null input was seen.
+    #[test]
+    fn evaluate_empty_filter_yields_null() {
+        let mut untouched = empty_v1_accumulator();
+        let result = untouched.evaluate().unwrap();
+        assert_eq!(result, ScalarValue::Binary(None));
+    }
+
+    /// Once a value has been inserted, evaluation produces a non-null binary payload.
+    #[test]
+    fn evaluate_non_empty_filter_yields_binary() {
+        let mut populated = empty_v1_accumulator();
+        populated.put_long(42);
+        let result = populated.evaluate().unwrap();
+        assert!(matches!(result, ScalarValue::Binary(Some(_))));
     }
 }
@@ -15,9 +15,6 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::datatypes::ToByteSlice;
-use std::iter::zip;
-
 /// A simple bit array implementation that simulates the behavior of Spark's BitArray which is
 /// used in the BloomFilter implementation. Some methods are not implemented as they are not
 /// required for the current use case.
@@ -61,41 +58,28 @@ impl SparkBitArray {
         self.word_size() as u64 * 64
     }
 
-    pub fn byte_size(&self) -> usize {
-        self.word_size() * 8
-    }
-
     pub fn word_size(&self) -> usize {
         self.data.len()
     }
 
-    #[allow(dead_code)] // this is only called from tests
-    pub fn cardinality(&self) -> usize {
-        self.bit_count
-    }
-
-    pub fn to_bytes(&self) -> Vec<u8> {
-        Vec::from(self.data.to_byte_slice())
-    }
-
     pub fn data(&self) -> Vec<u64> {
         self.data.clone()
     }
 
-    // Combines SparkBitArrays, however other is a &[u8] because we anticipate to come from an
-    // Arrow ScalarValue::Binary which is a byte vector underneath, rather than a word vector.
-    pub fn merge_bits(&mut self, other: &[u8]) {
-        assert_eq!(self.byte_size(), other.len());
+    /// Number of set bits in the array. Mirrors Spark's `BitArray.cardinality()`.
+    pub fn cardinality(&self) -> usize {
+        self.bit_count // cached counter; `merge_be_words` recomputes it after each merge
+    }
+
+    /// OR-merge `incoming` (big-endian `u64` words, one per word in `self`) into
+    /// `self.data` in place and refresh `bit_count` in the same pass. The caller must
+    /// ensure `incoming.len() == self.word_size() * 8`; release builds do not check it.
+    pub fn merge_be_words(&mut self, incoming: &[u8]) {
+        debug_assert_eq!(self.data.len() * 8, incoming.len()); // debug builds only
         let mut bit_count: usize = 0;
-        // For each word, merge the bits into self, and accumulate a new bit_count.
-        for i in zip(
-            self.data.iter_mut(),
-            other
-                .chunks(8)
-                .map(|chunk| u64::from_ne_bytes(chunk.try_into().unwrap())),
-        ) {
-            *i.0 |= i.1;
-            bit_count += i.0.count_ones() as usize;
+        for (word, chunk) in self.data.iter_mut().zip(incoming.chunks_exact(8)) {
+            *word |= u64::from_be_bytes(chunk.try_into().unwrap()); // chunk is exactly 8 bytes
+            bit_count += word.count_ones() as usize; // popcount of the merged word
         }
         self.bit_count = bit_count;
     }
@@ -108,6 +92,37 @@ pub fn num_words(num_bits: usize) -> usize {
 #[cfg(test)]
 mod test {
     use super::*;
+    use arrow::datatypes::ToByteSlice;
+    use std::iter::zip;
+
+    // Legacy native-endian byte helpers, compiled only for the tests in this module.
+    impl SparkBitArray {
+        /// Size of the backing storage in bytes (eight per 64-bit word).
+        fn byte_size(&self) -> usize {
+            8 * self.word_size()
+        }
+
+        /// Copies the backing words out as bytes in native byte order.
+        fn to_bytes(&self) -> Vec<u8> {
+            self.data.to_byte_slice().to_vec()
+        }
+
+        /// ORs a native-endian byte image of another bit array into `self` and recounts the
+        /// set bits. Takes `&[u8]` because the bytes were anticipated to come from an Arrow
+        /// `ScalarValue::Binary`. Panics unless `other` is exactly `byte_size()` bytes long.
+        fn merge_bits(&mut self, other: &[u8]) {
+            assert_eq!(self.byte_size(), other.len());
+            let incoming_words = other
+                .chunks(8)
+                .map(|raw| u64::from_ne_bytes(raw.try_into().unwrap()));
+            let mut set_bits: usize = 0;
+            for (word, incoming) in zip(self.data.iter_mut(), incoming_words) {
+                *word |= incoming;
+                set_bits += word.count_ones() as usize;
+            }
+            self.bit_count = set_bits;
+        }
+    }
 
     #[test]
     fn test_spark_bit_array() {
 
@@ -17,6 +17,7 @@
 
 use arrow::array::{ArrowNativeTypeOp, BooleanArray, Int64Array};
 use arrow::datatypes::ToByteSlice;
+use datafusion::common::{DataFusionError, Result as DFResult};
 use std::cmp;
 
 use crate::bloom_filter::spark_bit_array;
@@ -271,17 +272,72 @@ impl SparkBloomFilter {
             .collect()
     }
 
+    /// Number of set bits in the underlying bit array. Mirrors Spark's
+    /// `BloomFilter.cardinality()`: a filter that has seen no non-null input
+    /// has cardinality 0.
+    pub fn cardinality(&self) -> usize {
+        self.bits.cardinality() // delegates to the bit array's set-bit count
+    }
+
     pub fn state_as_bytes(&self) -> Vec<u8> {
-        self.bits.to_bytes()
+        self.spark_serialization() // same header + bit-word bytes that `merge_filter` parses
     }
 
-    pub fn merge_filter(&mut self, other: &[u8]) {
-        assert_eq!(
-            other.len(),
-            self.bits.byte_size(),
-            "Cannot merge SparkBloomFilters with different lengths."
-        );
-        self.bits.merge_bits(other);
+    /// OR-merges a Spark-serialized bloom filter (`other`) into `self`.
+    ///
+    /// `other` is read as big-endian fields: `i32` version, `i32` num_hash_functions,
+    /// `i32` seed (V2 only), `i32` num_words, followed by `num_words` 64-bit words.
+    /// Bytes after the bit array are ignored.
+    ///
+    /// # Errors
+    /// Returns `DataFusionError::Internal` when the header or bit array is truncated, or
+    /// when version, num_hash_functions, seed (V2) or num_words differ from `self`.
+    pub fn merge_filter(&mut self, other: &[u8]) -> DFResult<()> {
+        // Bounds-checked big-endian i32 read. Direct slicing (`other[offset..]`) panics on
+        // a short buffer; this function reports malformed state as an error instead.
+        let read_i32 = |at: usize, field: &str| -> DFResult<i32> {
+            other
+                .get(at..at + 4)
+                .and_then(|bytes| <[u8; 4]>::try_from(bytes).ok())
+                .map(i32::from_be_bytes)
+                .ok_or_else(|| {
+                    DataFusionError::Internal(format!(
+                        "BloomFilter merge: truncated header (no {field} at offset {at})"
+                    ))
+                })
+        };
+        let mut offset = 0;
+
+        let version_int = read_i32(offset, "version")?;
+        offset += 4;
+        if version_int != self.version.to_int() {
+            return Err(DataFusionError::Internal(format!(
+                "BloomFilter merge: version mismatch (got {}, expected {})",
+                version_int,
+                self.version.to_int(),
+            )));
+        }
+
+        let num_hash = read_i32(offset, "num_hash_functions")? as u32;
+        offset += 4;
+        if num_hash != self.num_hash_functions {
+            return Err(DataFusionError::Internal(format!(
+                "BloomFilter merge: num_hash_functions mismatch (got {}, expected {})",
+                num_hash, self.num_hash_functions,
+            )));
+        }
+
+        // Only the V2 layout carries a seed field between num_hash_functions and num_words.
+        if let SparkBloomFilterVersion::V2 = self.version {
+            let seed = read_i32(offset, "seed")?;
+            offset += 4;
+            if seed != self.seed {
+                return Err(DataFusionError::Internal(format!(
+                    "BloomFilter merge: seed mismatch (got {}, expected {})",
+                    seed, self.seed,
+                )));
+            }
+        }
+
+        let num_words = read_i32(offset, "num_words")? as usize;
+        offset += 4;
+        if num_words != self.bits.word_size() {
+            return Err(DataFusionError::Internal(format!(
+                "BloomFilter merge: num_words mismatch (got {}, expected {})",
+                num_words,
+                self.bits.word_size(),
+            )));
+        }
+
+        // Every header read succeeded, so `offset <= other.len()` and this cannot underflow.
+        let expected_bytes = num_words * 8;
+        if other.len() - offset < expected_bytes {
+            return Err(DataFusionError::Internal(format!(
+                "BloomFilter merge: truncated bit array (got {} bytes, expected {})",
+                other.len() - offset,
+                expected_bytes,
+            )));
+        }
+
+        self.bits
+            .merge_be_words(&other[offset..offset + expected_bytes]);
+        Ok(())
     }
 }
 
@@ -396,4 +452,97 @@ mod tests {
         buf.extend_from_slice(&[0u8; 32]); // 4 words * 8 bytes
         let _ = SparkBloomFilter::from(buf.as_slice());
     }
+
+    /// Round-trips a populated V1 filter through `state_as_bytes` into an empty V1 filter
+    /// built with the same parameters, then checks that every inserted value is reported
+    /// by the merged filter. Exercises the aggregator state → merge_batch path.
+    #[test]
+    fn state_round_trip_v1_merge() {
+        let inserted = [1_i64, 7, 42, 99, -3, i64::MAX];
+        let bit_len = 1024;
+        let hash_fns = optimal_num_hash_functions(100, bit_len);
+
+        let mut src = SparkBloomFilter::new(SparkBloomFilterVersion::V1, hash_fns, bit_len, 0);
+        for v in inserted {
+            src.put_long(v);
+        }
+        let state = src.state_as_bytes();
+
+        let mut dst = SparkBloomFilter::new(SparkBloomFilterVersion::V1, hash_fns, bit_len, 0);
+        dst.merge_filter(&state).unwrap();
+
+        for v in inserted {
+            assert!(dst.might_contain_long(v), "missing {v} after merge");
+        }
+    }
+
+    /// Same round-trip as the V1 case, but for V2 filters using the default seed (0):
+    /// `state_as_bytes` output is merged into an empty filter via `merge_filter`.
+    #[test]
+    fn state_round_trip_v2_default_seed() {
+        let inserted = [11_i64, 222, 3333];
+        let bit_len = 1024;
+        let hash_fns = optimal_num_hash_functions(100, bit_len);
+
+        let mut src = SparkBloomFilter::new(SparkBloomFilterVersion::V2, hash_fns, bit_len, 0);
+        for v in inserted {
+            src.put_long(v);
+        }
+        let state = src.state_as_bytes();
+
+        let mut dst = SparkBloomFilter::new(SparkBloomFilterVersion::V2, hash_fns, bit_len, 0);
+        dst.merge_filter(&state).unwrap();
+
+        for v in inserted {
+            assert!(dst.might_contain_long(v));
+        }
+    }
+
+    /// V2 round-trip with a non-zero seed: the seed field must be parsed and accepted,
+    /// and both filters must scatter hashes with the same seed for the lookup to hit.
+    #[test]
+    fn state_round_trip_v2_nonzero_seed() {
+        let seed = 0x5eed_5eed_u32 as i32;
+        let bit_len = 1024;
+        let hash_fns = optimal_num_hash_functions(100, bit_len);
+
+        let mut src = SparkBloomFilter::new(SparkBloomFilterVersion::V2, hash_fns, bit_len, seed);
+        src.put_long(123);
+        let state = src.state_as_bytes();
+
+        let mut dst = SparkBloomFilter::new(SparkBloomFilterVersion::V2, hash_fns, bit_len, seed);
+        dst.merge_filter(&state).unwrap();
+
+        assert!(dst.might_contain_long(123));
+    }
+
+    /// Asserts that merging `buf` into `filter` fails and that the rendered error
+    /// message contains `needle`. Panics if the merge unexpectedly succeeds.
+    fn assert_merge_err_contains(filter: &mut SparkBloomFilter, buf: &[u8], needle: &str) {
+        let failure = filter.merge_filter(buf).unwrap_err();
+        let err = failure.to_string();
+        assert!(err.contains(needle), "expected `{needle}` in error: {err}");
+    }
+
+    /// State serialized by a V2 filter must be rejected by a V1 filter.
+    #[test]
+    fn merge_rejects_version_mismatch() {
+        let bit_len = 1024;
+        let hash_fns = optimal_num_hash_functions(100, bit_len);
+        let src = SparkBloomFilter::new(SparkBloomFilterVersion::V2, hash_fns, bit_len, 0);
+        let state = src.state_as_bytes();
+        let mut dst = SparkBloomFilter::new(SparkBloomFilterVersion::V1, hash_fns, bit_len, 0);
+        assert_merge_err_contains(&mut dst, &state, "version mismatch");
+    }
+
+    /// Filters configured with different hash-function counts (5 vs 7) must not merge.
+    #[test]
+    fn merge_rejects_num_hash_mismatch() {
+        let bit_len = 1024;
+        let src = SparkBloomFilter::new(SparkBloomFilterVersion::V1, 5, bit_len, 0);
+        let state = src.state_as_bytes();
+        let mut dst = SparkBloomFilter::new(SparkBloomFilterVersion::V1, 7, bit_len, 0);
+        assert_merge_err_contains(&mut dst, &state, "num_hash_functions mismatch");
+    }
+
+    /// Two V2 filters that differ only in seed (1 vs 2) must not merge.
+    #[test]
+    fn merge_rejects_seed_mismatch_v2() {
+        let bit_len = 1024;
+        let hash_fns = optimal_num_hash_functions(100, bit_len);
+        let src = SparkBloomFilter::new(SparkBloomFilterVersion::V2, hash_fns, bit_len, 1);
+        let state = src.state_as_bytes();
+        let mut dst = SparkBloomFilter::new(SparkBloomFilterVersion::V2, hash_fns, bit_len, 2);
+        assert_merge_err_contains(&mut dst, &state, "seed mismatch");
+    }
+
+    /// Filters sized with different bit counts (512 vs 1024) must not merge.
+    #[test]
+    fn merge_rejects_num_words_mismatch() {
+        let hash_fns = 5;
+        let src = SparkBloomFilter::new(SparkBloomFilterVersion::V1, hash_fns, 512, 0);
+        let state = src.state_as_bytes();
+        let mut dst = SparkBloomFilter::new(SparkBloomFilterVersion::V1, hash_fns, 1024, 0);
+        assert_merge_err_contains(&mut dst, &state, "num_words mismatch");
+    }
 }