diff --git a/.gitignore b/.gitignore index 81af57d..c91baaa 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,109 @@ /pyo3-polars/target -Cargo.lock -.idea/ -venv/ -target/ rust-toolchain.toml *.pyc -*.so \ No newline at end of file +*.dll +*.pyd + +# Created by https://www.toptal.com/developers/gitignore/api/linux,rust,python,osx +# Edit at https://www.toptal.com/developers/gitignore?templates=linux,rust,python,osx +### OSX ### +# General +.DS_Store +.AppleDouble +.LSOverride + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# PyBuilder +.pybuilder/ +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +### Rust ### +# Generated by Cargo +# will have compiled files and executables +debug/ + +# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries +# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html +Cargo.lock + +# These are backup files generated by rustfmt +**/*.rs.bk + +# MSVC Windows builds of rustc generate these, which store debugging information +*.pdb + +# End of https://www.toptal.com/developers/gitignore/api/linux,rust,python,osx \ No newline at end of file diff --git a/polars_hash/Cargo.toml b/polars_hash/Cargo.toml index 192bd5d..e9fce25 100644 --- a/polars_hash/Cargo.toml +++ b/polars_hash/Cargo.toml @@ -23,6 +23,7 @@ md5 = { version = "0.7.0" } h3o = { version = "0.6.4" } xxhash-rust = { version = "0.8.12", features = ["xxh32", "xxh64"] } mur3 = { version = "0.1.0" } +hex = {version = "0.4"} [target.'cfg(target_os = "linux")'.dependencies] diff --git a/polars_hash/polars_hash/__init__.py b/polars_hash/polars_hash/__init__.py index dec2158..607632f 100644 --- a/polars_hash/polars_hash/__init__.py +++ b/polars_hash/polars_hash/__init__.py @@ -106,6 +106,16 @@ def sha3_224(self) -> pl.Expr: is_elementwise=True, ) + def sha3_shake128(self, *, length: int) -> pl.Expr: + """Takes Utf8 as input and returns utf8 hash with shake128 from SHA-3 family.""" + return register_plugin_function( + plugin_path=Path(__file__).parent, + function_name="sha3_shake128", + args=self._expr, + is_elementwise=True, + kwargs={"length": length}, + ) + def blake3(self) -> pl.Expr: """Takes Utf8 as input and returns utf8 hash with blake3.""" return register_plugin_function( diff --git a/polars_hash/src/expressions.rs b/polars_hash/src/expressions.rs index 5d52770..f3f3507 100644 --- a/polars_hash/src/expressions.rs +++ b/polars_hash/src/expressions.rs @@ -30,6 +30,11 @@ struct SeedKwargs64bit { seed: u64, } +#[derive(Deserialize)] +struct LengthKwargs { + length: usize, +} + pub fn blake3_hash_str(value: &str, output: &mut string::String) { let hash = blake3::hash(value.as_bytes()); write!(output, "{}", hash).unwrap() @@ -186,6 +191,17 @@ fn sha3_224(inputs: &[Series]) -> PolarsResult { Ok(out.into_series()) } +#[polars_expr(output_type=String)] +fn sha3_shake128(inputs: &[Series], kwargs: LengthKwargs) -> PolarsResult { + + let ca = inputs[0].str()?; + let out: StringChunked = ca.apply_into_string_amortized(|value: &str, output: &mut string::String| { + sha3_shake128_hash(value, output, kwargs.length) + }); + + Ok(out.into_series()) +} + #[polars_expr(output_type=String)] fn ghash_encode(inputs: &[Series]) -> PolarsResult { let ca = inputs[0].struct_()?; diff --git a/polars_hash/src/sha_hashers.rs b/polars_hash/src/sha_hashers.rs index 2d991b1..6d8328a 100644 --- a/polars_hash/src/sha_hashers.rs +++ b/polars_hash/src/sha_hashers.rs @@ -1,6 +1,6 @@ use sha1::Sha1; use sha2::{Digest, Sha224, Sha256, Sha384, Sha512}; -use sha3::{Sha3_224, Sha3_256, Sha3_384, Sha3_512}; +use sha3::{digest::{ExtendableOutput, Update, XofReader}, Sha3_224, Sha3_256, Sha3_384, Sha3_512, Shake128}; use std::fmt::Write; pub fn sha1_hash(value: &str, output: &mut String) { @@ -47,3 +47,12 @@ pub fn sha3_224_hash(value: &str, output: &mut String) { let hash = Sha3_224::digest(value); write!(output, "{:x}", hash).unwrap() } + +pub fn sha3_shake128_hash(value: &str, output: &mut String, length: usize) { + let mut hasher = Shake128::default(); + hasher.update(value.as_bytes()); + let mut reader = hasher.finalize_xof(); + let mut result = vec![0u8; length]; + reader.read(&mut result); + write!(output, "{}", hex::encode(result)).unwrap() +} diff --git a/polars_hash/tests/test_hash.py b/polars_hash/tests/test_hash.py index 958964f..113a71c 100644 --- a/polars_hash/tests/test_hash.py +++ b/polars_hash/tests/test_hash.py @@ -36,6 +36,21 @@ def test_sha256(): assert_frame_equal(result, expected) +def test_sha3_shake128(): + result = pl.select(pl.lit("hello_world").chash.sha3_shake128(length=10)) # type: ignore + + expected = pl.DataFrame( + [ + pl.Series( + "literal", + ["6b57b385e070e3534257"], + dtype=pl.Utf8, + ), + ] + ) + assert_frame_equal(result, expected) + + def test_wyhash_str(): result = pl.select(pl.lit("hello_world").nchash.wyhash()) # type: ignore