Skip to content

Commit 24c590e

Browse files
Merge pull request #27 from ACCESS-NRI/xxhash
Add a faster (~6-10x) hashing algorithm, `xxh3`; see https://github.com/Cyan4973/xxHash#benchmarks
2 parents 82bc785 + afa589f commit 24c590e

File tree

5 files changed

+21
-13
lines changed

5 files changed

+21
-13
lines changed

.conda/env_dev.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,4 @@ dependencies:
1515
- pytest
1616
- pytest-cov
1717
- versioneer
18+
- python-xxhash

.conda/meta.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ requirements:
2727
- python
2828
- six
2929
- pyyaml
30+
- python-xxhash
3031
test:
3132
imports:
3233
- yamanifest

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ dynamic = ["version"]
1313
requires-python = ">=3.10"
1414
dependencies = [
1515
"PyYAML",
16-
"six"
16+
"six",
17+
"xxhash",
1718
]
1819

1920
[project.scripts]

test/test_manifest.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def test_manifest_read_write():
8282
files = ['file1','file2']
8383

8484
for filepath in files:
85-
mf1.add(os.path.join('test',filepath),['md5','sha1'])
85+
mf1.add(os.path.join('test',filepath),['binhash-xxh','md5','sha1'])
8686

8787
assert(len(mf1) == len(files))
8888

@@ -116,7 +116,7 @@ def test_manifest_netcdf():
116116
mf1 = mf.Manifest('mf1.yaml')
117117

118118
for filepath in glob.glob('*.nc'):
119-
mf1.add(filepath,['binhash','md5','sha1'])
119+
mf1.add(filepath,['binhash-xxh','binhash','md5','sha1'])
120120

121121
mf1.dump()
122122

@@ -125,7 +125,7 @@ def test_manifest_netcdf():
125125
mf2 = mf.Manifest('mf2.yaml')
126126

127127
for filepath in glob.glob('*.nc'):
128-
mf2.add(filepath,['binhash','md5','sha1'])
128+
mf2.add(filepath,['binhash-xxh','binhash','md5','sha1'])
129129

130130
mf2.dump()
131131

@@ -139,7 +139,7 @@ def test_manifest_netcdf():
139139

140140
mf1 = mf.Manifest('mf1.yaml')
141141

142-
mf1.add(glob.glob('*.nc'),['binhash'])
142+
mf1.add(glob.glob('*.nc'),['binhash-xxh','binhash'])
143143
mf1.add(hashfn=['md5','sha1'])
144144

145145
assert(mf1.equals(mf2))
@@ -152,10 +152,10 @@ def test_manifest_netcdf_changed_time():
152152

153153
for filepath in glob.glob('*.nc'):
154154
touch(filepath)
155-
mf3.add(filepath,['md5','sha1','binhash'])
155+
mf3.add(filepath,['md5','sha1','binhash','binhash-xxh'])
156156

157157
mf3.dump()
158-
mf3.add(filepath,['md5','sha1','binhash'])
158+
mf3.add(filepath,['md5','sha1','binhash','binhash-xxh'])
159159
mf2 = mf.Manifest('mf2.yaml')
160160
mf2.load()
161161

@@ -176,7 +176,7 @@ def test_manifest_hash_with_binhash():
176176
mf4 = mf.Manifest('mf4.yaml')
177177

178178
for filepath in glob.glob('*.bin'):
179-
mf4.add(filepath,hashfn=['binhash', 'binhash-nomtime'])
179+
mf4.add(filepath,hashfn=['binhash', 'binhash-nomtime', 'binhash-xxh'])
180180

181181
mf4.dump()
182182
assert(mf4.check())
@@ -185,7 +185,7 @@ def test_manifest_hash_with_binhash():
185185

186186
for filepath in glob.glob('*.bin'):
187187
touch(filepath)
188-
mf5.add(filepath,hashfn=['binhash', 'binhash-nomtime'])
188+
mf5.add(filepath,hashfn=['binhash', 'binhash-nomtime', 'binhash-xxh'])
189189

190190
hashvals = {}
191191
assert(not mf4.check())

yamanifest/hashing.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from __future__ import absolute_import, print_function
2222

2323
import hashlib
24+
import xxhash # Fast hashing library
2425
import io
2526
import os
2627
import sys
@@ -31,11 +32,13 @@
3132
# List of supported hashes and the ordering used to determine relative expense of
3233
# calculation
3334
supported_hashes = [
34-
'binhash', 'binhash-nomtime', 'md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'
35+
'binhash-xxh','binhash', 'binhash-nomtime', 'md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'
3536
]
3637

37-
def _binhash(path, size, include_mtime):
38-
m = hashlib.new('md5')
38+
def _binhash(path, size, include_mtime, use_xxh=False):
39+
40+
m = xxhash.xxh3_64() if use_xxh else hashlib.new('md5')
41+
3942
with io.open(path, mode="rb") as fd:
4043
# Size limited hashing, so prepend the filename, size and optionally modification time
4144
hashstring = os.path.basename(path) + str(os.path.getsize(path))
@@ -70,7 +73,9 @@ def hash(path, hashfn, size=one_hundred_megabytes):
7073
if hashfn not in supported_hashes:
7174
sys.stderr.write('\nUnsupported hash function {}, skipping {}\n'.format(hashfn, path))
7275
try:
73-
if hashfn == 'binhash':
76+
if hashfn == 'binhash-xxh':
77+
return _binhash(path, one_hundred_megabytes, True, use_xxh=True)
78+
elif hashfn == 'binhash':
7479
return _binhash(path, one_hundred_megabytes, True)
7580
elif hashfn == 'binhash-nomtime':
7681
return _binhash(path, one_hundred_megabytes, False)

0 commit comments

Comments
 (0)