11"""
2- borghash - hashtable implementations in cython.
3-
42HashTable: low-level ht mapping fully random bytes keys to bytes values.
53 key and value length can be chosen, but is fixed afterwards.
64 the keys and values are stored in arrays separate from the hashtable.
75 the hashtable only stores the 32bit indexes into the key/value arrays.
8-
9- HashTableNT: wrapper around HashTable, providing namedtuple values and serialization.
106"""
117from __future__ import annotations
128from typing import BinaryIO, Iterator, Any
@@ -15,10 +11,7 @@ from libc.stdlib cimport malloc, free, realloc
1511from libc.string cimport memcpy, memset, memcmp
1612from libc.stdint cimport uint8_t, uint32_t
1713
18- from collections import namedtuple
1914from collections.abc import Mapping
20- import json
21- import struct
2215
# Magic bytes identifying a serialized hashtable file.
# The assert below pins the length to exactly 8 bytes.
MAGIC = b"BORGHASH"
assert len(MAGIC) == 8
@@ -335,246 +328,3 @@ cdef class HashTable:
335328 "resize_table": self.stats_resize_table ,
336329 "resize_kv": self.stats_resize_kv ,
337330 }
338-
339-
cdef class HashTableNT:
    """
    Wrapper around HashTable that stores namedtuple values and supports serialization.

    Keys are bytes objects of exactly ``key_size`` bytes. Values are instances of
    ``value_type`` (or plain tuples convertible to it) and are packed to / unpacked
    from their binary form using the struct format ``value_format``.
    """

    def __init__(self, items=None, *,
                 key_size: int = 0, value_format: str = "", value_type: Any = None,
                 capacity: int = MIN_CAPACITY) -> None:
        """
        Create a table and optionally fill it from *items*.

        :param items: initial content, handed to the module-level ``_fill`` helper.
        :param key_size: length of every key in bytes, must be > 0.
        :param value_format: struct format string describing the packed value.
        :param value_type: namedtuple type corresponding to value_format.
        :param capacity: initial capacity passed on to the inner HashTable.
        :raises ValueError: if key_size, value_format or value_type is missing.
        """
        if not key_size:
            raise ValueError("key_size must be specified and must be > 0.")
        if not value_format:
            raise ValueError("value_format must be specified and must be non-empty.")
        if value_type is None:
            raise ValueError("value_type must be specified (a namedtuple type corresponding to value_format).")
        self.key_size = key_size
        self.value_struct = struct.Struct(value_format)
        self.value_size = self.value_struct.size
        self.value_type = value_type
        self.inner = HashTable(key_size=self.key_size, value_size=self.value_size, capacity=capacity)
        _fill(self, items)

    def clear(self) -> None:
        """Clear the inner hashtable."""
        self.inner.clear()

    def _check_key(self, key: bytes) -> None:
        """Raise TypeError / ValueError unless key is a bytes object of key_size bytes."""
        if not isinstance(key, bytes):
            raise TypeError(f"Expected an instance of bytes, got {type(key)}")
        if len(key) != self.key_size:
            raise ValueError(f"Key must be {self.key_size} bytes long")

    def _to_binary_value(self, value: Any) -> bytes:
        """Pack a value_type instance (or a plain tuple convertible to it) into bytes."""
        if not isinstance(value, self.value_type):
            if isinstance(value, tuple):
                # plain tuples are accepted and converted to the namedtuple type first
                value = self.value_type(*value)
            else:
                raise TypeError(f"Expected an instance of {self.value_type}, got {type(value)}")
        return self.value_struct.pack(*value)

    def _to_namedtuple_value(self, binary_value: bytes) -> Any:
        """Unpack a binary value into a value_type instance."""
        unpacked_data = self.value_struct.unpack(binary_value)
        return self.value_type(*unpacked_data)

    def _set_raw(self, key: bytes, value: bytes) -> None:
        """Store an already packed value, without the checks done by __setitem__."""
        self.inner[key] = value

    def _get_raw(self, key: bytes) -> bytes:
        """Return the packed value for key, without the checks done by __getitem__."""
        return self.inner[key]

    # mapping protocol: validate the key, convert the value, delegate to the inner table.

    def __setitem__(self, key: bytes, value: Any) -> None:
        self._check_key(key)
        self.inner[key] = self._to_binary_value(value)

    def __getitem__(self, key: bytes) -> Any:
        self._check_key(key)
        binary_value = self.inner[key]
        return self._to_namedtuple_value(binary_value)

    def __delitem__(self, key: bytes) -> None:
        self._check_key(key)
        del self.inner[key]

    def __contains__(self, key: bytes) -> bool:
        self._check_key(key)
        return key in self.inner

    def items(self) -> Iterator[tuple[bytes, Any]]:
        """Yield (key, namedtuple value) pairs for all entries of the inner table."""
        for key, binary_value in self.inner.items():
            yield (key, self._to_namedtuple_value(binary_value))

    def __len__(self) -> int:
        return len(self.inner)

    def get(self, key: bytes, default: Any = None) -> Any:
        """Return the value for key, or default if the key is not present."""
        self._check_key(key)
        try:
            binary_value = self.inner[key]
        except KeyError:
            return default
        else:
            return self._to_namedtuple_value(binary_value)

    def setdefault(self, key: bytes, default: Any) -> Any:
        """Delegate to the inner table's setdefault and return the result as a namedtuple."""
        self._check_key(key)
        binary_default = self._to_binary_value(default)
        binary_value = self.inner.setdefault(key, binary_default)
        return self._to_namedtuple_value(binary_value)

    def pop(self, key: bytes, default: Any = _NoDefault) -> Any:
        """
        Pop key from the inner table and return its value.

        If the key is missing, return default if one was given, else re-raise KeyError.
        """
        self._check_key(key)
        try:
            binary_value = self.inner.pop(key)
        except KeyError:
            if default is _NoDefault:
                raise
            return default
        else:
            return self._to_namedtuple_value(binary_value)

    def k_to_idx(self, key: bytes) -> int:
        """Delegate to HashTable.k_to_idx."""
        return self.inner.k_to_idx(key)

    def idx_to_k(self, idx: int) -> bytes:
        """Delegate to HashTable.idx_to_k."""
        return self.inner.idx_to_k(idx)

    def kv_to_idx(self, key: bytes, value: Any) -> int:
        """Pack value and delegate to HashTable.kv_to_idx."""
        binary_value = self._to_binary_value(value)
        return self.inner.kv_to_idx(key, binary_value)

    def idx_to_kv(self, idx: int) -> tuple[bytes, Any]:
        """Delegate to HashTable.idx_to_kv and return the value as a namedtuple."""
        key, binary_value = self.inner.idx_to_kv(idx)
        return key, self._to_namedtuple_value(binary_value)

    @property
    def stats(self) -> dict[str, int]:
        """The stats of the inner hashtable."""
        return self.inner.stats

    def write(self, file: BinaryIO|str|bytes):
        """Serialize the table to file (a path given as str / bytes, or a binary file object)."""
        if isinstance(file, (str, bytes)):
            with open(file, 'wb') as fd:
                self._write_fd(fd)
        else:
            self._write_fd(file)

    def _write_fd(self, fd: BinaryIO):
        """Write header, JSON metadata and all raw key / value pairs to fd."""
        meta = {
            'key_size': self.key_size,
            'value_size': self.value_size,
            'value_format': self.value_struct.format,
            'value_type_name': self.value_type.__name__,
            'value_type_fields': self.value_type._fields,
            'capacity': self.inner.capacity,
            'used': self.inner.used,  # count of keys / values
        }
        meta_bytes = json.dumps(meta).encode("utf-8")
        meta_size = len(meta_bytes)
        header_bytes = struct.pack(HEADER_FMT, MAGIC, VERSION, meta_size)
        fd.write(header_bytes)
        fd.write(meta_bytes)
        count = 0
        for key, value in self.inner.items():
            fd.write(key)
            fd.write(value)
            count += 1
        # sanity check: the metadata announced exactly this many entries
        assert count == self.inner.used

    @classmethod
    def read(cls, file: BinaryIO|str|bytes):
        """Create a table from file (a path given as str / bytes, or a binary file object)."""
        if isinstance(file, (str, bytes)):
            with open(file, 'rb') as fd:
                return cls._read_fd(fd)
        else:
            return cls._read_fd(file)

    @classmethod
    def _read_fd(cls, fd: BinaryIO):
        """
        Read header, JSON metadata and all key / value pairs from fd.

        :raises ValueError: if the file is truncated, has a wrong magic or an
                            unsupported version.
        """
        header_size = struct.calcsize(HEADER_FMT)
        header_bytes = fd.read(header_size)
        if len(header_bytes) < header_size:
            raise ValueError("Invalid file, file is too short.")
        magic, version, meta_size = struct.unpack(HEADER_FMT, header_bytes)
        if magic != MAGIC:
            raise ValueError(f"Invalid file, magic {MAGIC.decode()} not found.")
        if version != VERSION:
            raise ValueError(f"Unsupported file version {version}.")
        meta_bytes = fd.read(meta_size)
        if len(meta_bytes) < meta_size:
            raise ValueError("Invalid file, file is too short.")
        meta = json.loads(meta_bytes.decode("utf-8"))
        value_type = namedtuple(meta['value_type_name'], meta['value_type_fields'])
        ht = cls(key_size=meta['key_size'], value_format=meta['value_format'],
                 value_type=value_type, capacity=meta['capacity'])
        ksize, vsize = meta['key_size'], meta['value_size']
        for _ in range(meta['used']):
            key = fd.read(ksize)
            value = fd.read(vsize)
            # a short read means the file ends before all announced entries were read
            if len(key) < ksize or len(value) < vsize:
                raise ValueError("Invalid file, file is too short.")
            ht._set_raw(key, value)
        return ht

    def size(self) -> int:
        """
        do a rough worst-case estimate of the on-disk size when using .write().

        the serialized size of the metadata is a bit hard to predict, but we cover that with one_time_overheads.
        """
        one_time_overheads = 4096  # very rough
        N = self.inner.used
        return int(N * (self.key_size + self.value_size) + one_time_overheads)
523-
524-
def demo():
    """
    Print the source of a small HashTableNT demo, then execute it.

    The demo inserts, looks up, serializes, deserializes and pops ``count``
    entries and prints the time each phase took. The temporary file it creates
    is removed again at the end.
    """
    print("BorgHash demo")
    print("=============")
    print("Code:")
    code = """
import os
from tempfile import NamedTemporaryFile
from time import time

count = 50000
value_type = namedtuple("Chunk", ["refcount", "size"])
# 256bit (32Byte) key, 2x 32bit (4Byte) values
ht = HashTableNT(key_size=32, value_format="<II", value_type=value_type)

t0 = time()
for i in range(count):
    # make up a 256bit key from i, first 32bits need to be well distributed.
    key = f"{i:4x}{' '*28}".encode()
    value = value_type(refcount=i, size=i * 2)
    ht[key] = value
assert len(ht) == count

t1 = time()
found = 0
for key, value in ht.items():
    i = int(key.decode(), 16)
    expected_value = value_type(refcount=i, size=i * 2)
    assert ht[key] == expected_value
    found += 1
assert found == count

t2 = time()
ht_written = ht
with NamedTemporaryFile(prefix="borghash-demo-ht-read", suffix=".tmp", delete=False) as tmpfile:
    ht_written.write(tmpfile)
    filename = tmpfile.name
assert len(ht_written) == count, f"{len(ht_written)} != {count}"

t3 = time()
ht_read = HashTableNT.read(filename)
assert len(ht_read) == count, f"{len(ht_read)} != {count}"

t4 = time()
for i in range(count):
    # make up a 256bit key from i, first 32bits need to be well distributed.
    key = f"{i:4x}{' '*28}".encode()
    expected_value = value_type(refcount=i, size=i * 2)
    assert ht_read.pop(key) == expected_value
assert len(ht_read) == 0

t5 = time()
# the temporary file was created with delete=False, so remove it ourselves.
os.unlink(filename)
print("Result:")
print(f"HashTableNT in-memory ops (count={count}): insert: {t1-t0:.3f}s, lookup: {t2-t1:.3f}s, pop: {t5-t4:.3f}s.")
print(f"HashTableNT serialization (count={count}): write: {t3-t2:.3f}s, read: {t4-t3:.3f}s.")
"""
    print(code)
    exec(code)
0 commit comments