Skip to content

Commit efab864

Browse files
Merge pull request #25 from borgbackup/package
move code to a Package
2 parents fb09377 + 9a27697 commit efab864

File tree

14 files changed

+321
-279
lines changed

14 files changed

+321
-279
lines changed

.github/workflows/ci.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@ jobs:
5454
python -m pip install --upgrade pip setuptools
5555
pip install -r requirements.d/dev.txt
5656
- name: Install borghash
57-
run: pip install -ve .
57+
run: |
58+
python setup.py build_ext --inplace
59+
python -m build
60+
pip install -v dist/borghash*.tar.gz
5861
- name: run tox env
5962
run: tox --skip-missing-interpreters

.gitignore

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
.idea
22
.pytest_cache
33
.tox
4+
build
45
dist
56
__pycache__
6-
src/borghash.egg-info
77
src/borghash/_version.py
8-
src/borghash/borghash.cpp
9-
src/*.so
8+
**/*.so
9+
**/*.c
10+
**/*.egg-info

README.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,15 @@ Results on an Apple MacBook Pro (M3 Pro CPU) are like:
129129
HashTableNT serialization (count=50000): write: 0.020s, read: 0.021s.
130130

131131

132+
Building / Installing
133+
---------------------
134+
::
135+
136+
python setup.py build_ext --inplace
137+
python -m build
138+
pip install dist/borghash*.tar.gz
139+
140+
132141
State of this project
133142
---------------------
134143

pyproject.toml

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -30,19 +30,10 @@ dependencies = []
3030
"Changelog" = "https://github.com/borgbackup/borghash/blob/master/changes.rst"
3131

3232
[project.scripts]
33-
borghash-demo = "borghash:demo"
34-
35-
[tool.setuptools]
36-
# See also the MANIFEST.in file.
37-
# We want to install all the files in the package directories...
38-
include-package-data = true
39-
40-
[tool.setuptools.exclude-package-data]
41-
# ...except the source files which have been compiled (C extensions):
42-
"*" = ["*.c", "*.h", "*.pyx"]
33+
borghash-demo = "borghash.__main__:demo"
4334

4435
[build-system]
45-
requires = ["setuptools", "wheel", "Cython>=3.0.3", "setuptools_scm[toml]>=6.2"]
36+
requires = ["setuptools", "wheel", "setuptools_scm[toml]>=6.2"]
4637
build-backend = "setuptools.build_meta"
4738

4839
[tool.setuptools_scm]

requirements.d/dev.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@ pytest
33
pytest-benchmark
44
build
55
twine
6+
Cython

setup.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,21 @@
1-
from setuptools import setup
2-
from Cython.Build import cythonize
1+
from setuptools import Extension, setup
2+
3+
try:
4+
from Cython.Build import cythonize
5+
except ImportError:
6+
cythonize = None # we don't have cython installed
7+
8+
ext = '.pyx' if cythonize else '.c'
9+
10+
extensions = [
11+
Extension("borghash.HashTable", ["src/borghash/HashTable" + ext]),
12+
Extension("borghash.HashTableNT", ["src/borghash/HashTableNT" + ext]),
13+
]
14+
15+
if cythonize:
16+
extensions = cythonize(extensions, language_level="3str")
317

418
setup(
5-
package_data=dict(borghash=["borghash.pxd"]),
6-
ext_modules=cythonize("borghash.pyx")
19+
package_data={"borghash": ["*.pxd", "*.pyx"]},
20+
ext_modules=extensions,
721
)

borghash.pxd renamed to src/borghash/HashTable.pxd

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@ from libc.stdint cimport uint8_t, uint32_t
22

33
cdef class HashTable:
44
cdef int ksize, vsize
5-
cdef int initial_capacity, capacity, used, tombstones
5+
cdef readonly int capacity, used
6+
cdef int initial_capacity, tombstones
67
cdef float max_load_factor, min_load_factor, shrink_factor, grow_factor
78
cdef uint32_t* table
89
cdef int kv_capacity, kv_used
@@ -16,11 +17,3 @@ cdef class HashTable:
1617
cdef int _lookup_index(self, uint8_t* key_ptr, int* index_ptr)
1718
cdef void _resize_table(self, int new_capacity)
1819
cdef void _resize_kv(self, int new_capacity)
19-
20-
21-
cdef class HashTableNT:
22-
cdef int key_size
23-
cdef object value_type
24-
cdef object value_struct
25-
cdef int value_size
26-
cdef HashTable inner

borghash.pyx renamed to src/borghash/HashTable.pyx

Lines changed: 0 additions & 250 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,8 @@
11
"""
2-
borghash - hashtable implementations in cython.
3-
42
HashTable: low-level ht mapping fully random bytes keys to bytes values.
53
key and value length can be chosen, but is fixed afterwards.
64
the keys and values are stored in arrays separate from the hashtable.
75
the hashtable only stores the 32bit indexes into the key/value arrays.
8-
9-
HashTableNT: wrapper around HashTable, providing namedtuple values and serialization.
106
"""
117
from __future__ import annotations
128
from typing import BinaryIO, Iterator, Any
@@ -15,10 +11,7 @@ from libc.stdlib cimport malloc, free, realloc
1511
from libc.string cimport memcpy, memset, memcmp
1612
from libc.stdint cimport uint8_t, uint32_t
1713

18-
from collections import namedtuple
1914
from collections.abc import Mapping
20-
import json
21-
import struct
2215

2316
MAGIC = b"BORGHASH"
2417
assert len(MAGIC) == 8
@@ -335,246 +328,3 @@ cdef class HashTable:
335328
"resize_table": self.stats_resize_table,
336329
"resize_kv": self.stats_resize_kv,
337330
}
338-
339-
340-
cdef class HashTableNT:
341-
def __init__(self, items=None, *,
342-
key_size: int = 0, value_format: str = "", value_type: Any = None,
343-
capacity: int = MIN_CAPACITY) -> None:
344-
if not key_size:
345-
raise ValueError("key_size must be specified and must be > 0.")
346-
if not value_format:
347-
raise ValueError("value_format must be specified and must be non-empty.")
348-
if value_type is None:
349-
raise ValueError("value_type must be specified (a namedtuple type corresponding to value_format).")
350-
self.key_size = key_size
351-
self.value_struct = struct.Struct(value_format)
352-
self.value_size = self.value_struct.size
353-
self.value_type = value_type
354-
self.inner = HashTable(key_size=self.key_size, value_size=self.value_size, capacity=capacity)
355-
_fill(self, items)
356-
357-
def clear(self) -> None:
358-
self.inner.clear()
359-
360-
def _check_key(self, key: bytes) -> None:
361-
if not isinstance(key, bytes):
362-
raise TypeError(f"Expected an instance of bytes, got {type(key)}")
363-
if len(key) != self.key_size:
364-
raise ValueError(f"Key must be {self.key_size} bytes long")
365-
366-
def _to_binary_value(self, value: Any) -> bytes:
367-
if not isinstance(value, self.value_type):
368-
if isinstance(value, tuple):
369-
value = self.value_type(*value)
370-
else:
371-
raise TypeError(f"Expected an instance of {self.value_type}, got {type(value)}")
372-
return self.value_struct.pack(*value)
373-
374-
def _to_namedtuple_value(self, binary_value: bytes) -> Any:
375-
unpacked_data = self.value_struct.unpack(binary_value)
376-
return self.value_type(*unpacked_data)
377-
378-
def _set_raw(self, key: bytes, value: bytes) -> None:
379-
self.inner[key] = value
380-
381-
def _get_raw(self, key: bytes) -> bytes:
382-
return self.inner[key]
383-
384-
def __setitem__(self, key: bytes, value: Any) -> None:
385-
self._check_key(key)
386-
self.inner[key] = self._to_binary_value(value)
387-
388-
def __getitem__(self, key: bytes) -> Any:
389-
self._check_key(key)
390-
binary_value = self.inner[key]
391-
return self._to_namedtuple_value(binary_value)
392-
393-
def __delitem__(self, key: bytes) -> None:
394-
self._check_key(key)
395-
del self.inner[key]
396-
397-
def __contains__(self, key: bytes) -> bool:
398-
self._check_key(key)
399-
return key in self.inner
400-
401-
def items(self) -> Iterator[tuple[bytes, Any]]:
402-
for key, binary_value in self.inner.items():
403-
yield (key, self._to_namedtuple_value(binary_value))
404-
405-
def __len__(self) -> int:
406-
return len(self.inner)
407-
408-
def get(self, key: bytes, default: Any = None) -> Any:
409-
self._check_key(key)
410-
try:
411-
binary_value = self.inner[key]
412-
except KeyError:
413-
return default
414-
else:
415-
return self._to_namedtuple_value(binary_value)
416-
417-
def setdefault(self, key: bytes, default: Any) -> Any:
418-
self._check_key(key)
419-
binary_default = self._to_binary_value(default)
420-
binary_value = self.inner.setdefault(key, binary_default)
421-
return self._to_namedtuple_value(binary_value)
422-
423-
def pop(self, key: bytes, default: Any = _NoDefault) -> Any:
424-
self._check_key(key)
425-
try:
426-
binary_value = self.inner.pop(key)
427-
except KeyError:
428-
if default is _NoDefault:
429-
raise
430-
return default
431-
else:
432-
return self._to_namedtuple_value(binary_value)
433-
434-
def k_to_idx(self, key: bytes) -> int:
435-
return self.inner.k_to_idx(key)
436-
437-
def idx_to_k(self, idx: int) -> bytes:
438-
return self.inner.idx_to_k(idx)
439-
440-
def kv_to_idx(self, key: bytes, value: Any) -> int:
441-
binary_value = self._to_binary_value(value)
442-
return self.inner.kv_to_idx(key, binary_value)
443-
444-
def idx_to_kv(self, idx: int) -> tuple[bytes, Any]:
445-
key, binary_value = self.inner.idx_to_kv(idx)
446-
return key, self._to_namedtuple_value(binary_value)
447-
448-
@property
449-
def stats(self) -> dict[str, int]:
450-
return self.inner.stats
451-
452-
def write(self, file: BinaryIO|str|bytes):
453-
if isinstance(file, (str, bytes)):
454-
with open(file, 'wb') as fd:
455-
self._write_fd(fd)
456-
else:
457-
self._write_fd(file)
458-
459-
def _write_fd(self, fd: BinaryIO):
460-
meta = {
461-
'key_size': self.key_size,
462-
'value_size': self.value_size,
463-
'value_format': self.value_struct.format,
464-
'value_type_name': self.value_type.__name__,
465-
'value_type_fields': self.value_type._fields,
466-
'capacity': self.inner.capacity,
467-
'used': self.inner.used, # count of keys / values
468-
}
469-
meta_bytes = json.dumps(meta).encode("utf-8")
470-
meta_size = len(meta_bytes)
471-
header_bytes = struct.pack(HEADER_FMT, MAGIC, VERSION, meta_size)
472-
fd.write(header_bytes)
473-
fd.write(meta_bytes)
474-
count = 0
475-
for key, value in self.inner.items():
476-
fd.write(key)
477-
fd.write(value)
478-
count += 1
479-
assert count == self.inner.used
480-
481-
@classmethod
482-
def read(cls, file: BinaryIO|str|bytes):
483-
if isinstance(file, (str, bytes)):
484-
with open(file, 'rb') as fd:
485-
return cls._read_fd(fd)
486-
else:
487-
return cls._read_fd(file)
488-
489-
@classmethod
490-
def _read_fd(cls, fd: BinaryIO):
491-
header_size = struct.calcsize(HEADER_FMT)
492-
header_bytes = fd.read(header_size)
493-
if len(header_bytes) < header_size:
494-
raise ValueError(f"Invalid file, file is too short.")
495-
magic, version, meta_size = struct.unpack(HEADER_FMT, header_bytes)
496-
if magic != MAGIC:
497-
raise ValueError(f"Invalid file, magic {MAGIC.decode()} not found.")
498-
if version != VERSION:
499-
raise ValueError(f"Unsupported file version {version}.")
500-
meta_bytes = fd.read(meta_size)
501-
if len(meta_bytes) < meta_size:
502-
raise ValueError(f"Invalid file, file is too short.")
503-
meta = json.loads(meta_bytes.decode("utf-8"))
504-
value_type = namedtuple(meta['value_type_name'], meta['value_type_fields'])
505-
ht = cls(key_size=meta['key_size'], value_format=meta['value_format'], value_type=value_type, capacity=meta['capacity'])
506-
count = 0
507-
ksize, vsize = meta['key_size'], meta['value_size']
508-
for i in range(meta['used']):
509-
key = fd.read(ksize)
510-
value = fd.read(vsize)
511-
ht._set_raw(key, value)
512-
return ht
513-
514-
def size(self) -> int:
515-
"""
516-
do a rough worst-case estimate of the on-disk size when using .write().
517-
518-
the serialized size of the metadata is a bit hard to predict, but we cover that with one_time_overheads.
519-
"""
520-
one_time_overheads = 4096 # very rough
521-
N = self.inner.used
522-
return int(N * (self.key_size + self.value_size) + one_time_overheads)
523-
524-
525-
def demo():
526-
print("BorgHash demo")
527-
print("=============")
528-
print("Code:")
529-
code = """
530-
from tempfile import NamedTemporaryFile
531-
from time import time
532-
533-
count = 50000
534-
value_type = namedtuple("Chunk", ["refcount", "size"])
535-
# 256bit (32Byte) key, 2x 32bit (4Byte) values
536-
ht = HashTableNT(key_size=32, value_format="<II", value_type=value_type)
537-
538-
t0 = time()
539-
for i in range(count):
540-
# make up a 256bit key from i, first 32bits need to be well distributed.
541-
key = f"{i:4x}{' '*28}".encode()
542-
value = value_type(refcount=i, size=i * 2)
543-
ht[key] = value
544-
assert len(ht) == count
545-
546-
t1 = time()
547-
found = 0
548-
for key, value in ht.items():
549-
i = int(key.decode(), 16)
550-
expected_value = value_type(refcount=i, size=i * 2)
551-
assert ht[key] == expected_value
552-
found += 1
553-
assert found == count
554-
555-
t2 = time()
556-
ht_written = ht
557-
with NamedTemporaryFile(prefix="borghash-demo-ht-read", suffix=".tmp", delete=False) as tmpfile:
558-
ht_written.write(tmpfile)
559-
filename = tmpfile.name
560-
assert len(ht_written) == count, f"{len(ht_written)} != {count}"
561-
562-
t3 = time()
563-
ht_read = HashTableNT.read(filename)
564-
assert len(ht_read) == count, f"{len(ht_read)} != {count}"
565-
566-
t4 = time()
567-
for i in range(count):
568-
# make up a 256bit key from i, first 32bits need to be well distributed.
569-
key = f"{i:4x}{' '*28}".encode()
570-
expected_value = value_type(refcount=i, size=i * 2)
571-
assert ht_read.pop(key) == expected_value
572-
assert len(ht_read) == 0
573-
574-
t5 = time()
575-
print("Result:")
576-
print(f"HashTableNT in-memory ops (count={count}): insert: {t1-t0:.3f}s, lookup: {t2-t1:.3f}s, pop: {t5-t4:.3f}s.")
577-
print(f"HashTableNT serialization (count={count}): write: {t3-t2:.3f}s, read: {t4-t3:.3f}s.")
578-
"""
579-
print(code)
580-
exec(code)

src/borghash/HashTableNT.pxd

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
cdef class HashTableNT:
2+
cdef int key_size
3+
cdef object value_type
4+
cdef object value_struct
5+
cdef int value_size
6+
cdef object inner

0 commit comments

Comments
 (0)