Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions polars_hash/polars_hash/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,26 @@ def xxhash64(self, *, seed: int = 0) -> pl.Expr:
kwargs={"seed": seed},
)

def xxh3_64(self, *, seed: int = 0) -> pl.Expr:
    """Takes Utf8 as input and returns a UInt64 hash computed with XXH3 64bit.

    Args:
        seed: Keyword-only seed forwarded to the ``xxh3_64`` plugin function
            through ``kwargs``. Defaults to 0.

    Returns:
        The expression produced by ``register_plugin_function`` for the
        ``xxh3_64`` plugin, registered as an element-wise operation.
    """
    return register_plugin_function(
        plugin_path=Path(__file__).parent,
        function_name="xxh3_64",
        args=self._expr,
        is_elementwise=True,
        kwargs={"seed": seed},
    )

def xxh3_128(self, *, seed: int = 0) -> pl.Expr:
    """Takes Utf8 as input and returns a Binary hash computed with XXH3 128bit.

    Args:
        seed: Keyword-only seed forwarded to the ``xxh3_128`` plugin function
            through ``kwargs``. Defaults to 0.

    Returns:
        The expression produced by ``register_plugin_function`` for the
        ``xxh3_128`` plugin, registered as an element-wise operation.
    """
    # The compiled plugin is looked up next to this module.
    plugin_dir = Path(__file__).parent
    plugin_kwargs = {"seed": seed}
    return register_plugin_function(
        plugin_path=plugin_dir,
        function_name="xxh3_128",
        args=self._expr,
        is_elementwise=True,
        kwargs=plugin_kwargs,
    )


@pl.api.register_expr_namespace("geohash")
class GeoHashingNameSpace:
Expand Down
18 changes: 18 additions & 0 deletions polars_hash/src/expressions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -343,3 +343,21 @@ fn xxhash64(inputs: &[Series], kwargs: SeedKwargs64bit) -> PolarsResult<Series>
let out: ChunkedArray<UInt64Type> = unary_elementwise(ca, seeded_hash_function);
Ok(out.into_series())
}

/// Polars expression: seeded XXH3 64-bit hash of every value in a string column.
///
/// Reads `inputs[0]` as a string column and applies `xxhash3_64` element-wise with
/// `kwargs.seed`, producing a `UInt64` series.
///
/// # Errors
///
/// Propagates the error returned by `str()` on the first input series.
///
/// # Panics
///
/// Indexing `inputs[0]` panics if `inputs` is empty.
#[polars_expr(output_type=UInt64)]
fn xxh3_64(inputs: &[Series], kwargs: SeedKwargs64bit) -> PolarsResult<Series> {
    let seed = kwargs.seed;
    let strings = inputs[0].str()?;

    let hashed: ChunkedArray<UInt64Type> =
        unary_elementwise(strings, |value| xxhash3_64(value, seed));
    Ok(hashed.into_series())
}

/// Polars expression: seeded XXH3 128-bit hash of every value in a string column.
///
/// Reads `inputs[0]` as a string column and applies `xxhash3_128` element-wise with
/// `kwargs.seed`, producing a `Binary` series.
///
/// # Errors
///
/// Propagates the error returned by `str()` on the first input series.
///
/// # Panics
///
/// Indexing `inputs[0]` panics if `inputs` is empty.
#[polars_expr(output_type=Binary)]
fn xxh3_128(inputs: &[Series], kwargs: SeedKwargs64bit) -> PolarsResult<Series> {
    let seed = kwargs.seed;
    let strings = inputs[0].str()?;

    let digests: ChunkedArray<BinaryType> =
        unary_elementwise(strings, |value| xxhash3_128(value, seed));
    Ok(digests.into_series())
}
14 changes: 14 additions & 0 deletions polars_hash/src/xxhash_hashers.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use xxhash_rust::xxh3::xxh3_128_with_seed;
use xxhash_rust::xxh3::xxh3_64_with_seed;
use xxhash_rust::xxh32::xxh32;
use xxhash_rust::xxh64::xxh64;

Expand All @@ -8,3 +10,15 @@ pub fn xxhash_32(value: Option<&str>, seed: u32) -> Option<u32> {
pub fn xxhash_64(value: Option<&str>, seed: u64) -> Option<u64> {
value.map(|v| xxh64(v.as_bytes(), seed))
}

/// Hashes `value` with seeded XXH3 (64-bit).
///
/// A `None` input yields `None`; otherwise the UTF-8 bytes of the string are hashed.
pub fn xxhash3_64(value: Option<&str>, seed: u64) -> Option<u64> {
    let text = value?;
    Some(xxh3_64_with_seed(text.as_bytes(), seed))
}

/// Hashes `value` with seeded XXH3 (128-bit) and returns the digest as bytes.
///
/// The digest is serialised with `to_le_bytes`, i.e. least-significant byte first.
/// A `None` input yields `None`; otherwise the UTF-8 bytes of the string are hashed.
pub fn xxhash3_128(value: Option<&str>, seed: u64) -> Option<Vec<u8>> {
    let text = value?;
    let digest = xxh3_128_with_seed(text.as_bytes(), seed);
    Some(Vec::from(digest.to_le_bytes()))
}
84 changes: 84 additions & 0 deletions polars_hash/tests/test_hash.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,3 +371,87 @@ def test_forced_missing_seed_errors(hash_fn_expr):

with pytest.raises(ComputeError, match="expected u32"):
df.select(hash_fn_expr)


def test_xxh3_64():
    """Unseeded xxh3_64 over a string column containing a null and an empty string."""
    frame = pl.DataFrame({"literal": ["hello_world", None, ""]})
    hashed = frame.select(plh.col("literal").nchash.xxh3_64())

    # The null row is expected to stay null; the empty string still gets a hash.
    expected_hashes = pl.Series(
        "literal",
        [7060460777671424209, None, 3244421341483603138],
        dtype=pl.UInt64,
    )

    assert_frame_equal(hashed, pl.DataFrame([expected_hashes]))


def test_xxh3_64_seeded():
    """xxh3_64 with seed=42 over a string column containing a null and an empty string."""
    frame = pl.DataFrame({"literal": ["hello_world", None, ""]})
    hashed = frame.select(plh.col("literal").nchash.xxh3_64(seed=42))

    # The null row is expected to stay null; the empty string still gets a hash.
    expected_hashes = pl.Series(
        "literal",
        [827481053383045869, None, 12693748630217917650],
        dtype=pl.UInt64,
    )

    assert_frame_equal(hashed, pl.DataFrame([expected_hashes]))


def test_xxh3_128():
    """Unseeded xxh3_128 over a string column containing a null and an empty string."""
    frame = pl.DataFrame({"literal": ["hello_world", None, ""]})
    hashed = frame.select(plh.col("literal").nchash.xxh3_128())

    # Each non-null digest is a 16-byte binary value; the null row stays null.
    expected_digests = pl.Series(
        "literal",
        [
            b'\x03o\xfe!^\x18\xfbg"\xc6=\xaf^\x1c\xd3\xbe',
            None,
            b"\x7fI\x8dF$\xc3\x01`\xd8\x98G\x01\xd3\x06\xaa\x99",
        ],
        dtype=pl.Binary,
    )

    assert_frame_equal(hashed, pl.DataFrame([expected_digests]))


def test_xxh3_128_seeded():
    """xxh3_128 with seed=42 over a string column containing a null and an empty string."""
    frame = pl.DataFrame({"literal": ["hello_world", None, ""]})
    hashed = frame.select(plh.col("literal").nchash.xxh3_128(seed=42))

    # Each non-null digest is a 16-byte binary value; the null row stays null.
    expected_digests = pl.Series(
        "literal",
        [
            b"BM\xd8\x9d\x8dX]|k\xd9\xb9\xc0|\xea\xc7\xec",
            None,
            b"d\x91$\xfe\xe9\t\x1d</\xaf\xf73\xcd\n\xc2\x16",
        ],
        dtype=pl.Binary,
    )

    assert_frame_equal(hashed, pl.DataFrame([expected_digests]))
Loading