Skip to content

Commit 28cd564

Browse files
authored
Merge pull request #33 from offx-zinth/master
Specific improvements
2 parents c43e0e3 + 11e1701 commit 28cd564

11 files changed

Lines changed: 2291 additions & 0 deletions

File tree

SPEC.md

Lines changed: 746 additions & 0 deletions
Large diffs are not rendered by default.

smp/store/graph/edge_store.py

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,28 @@
1+
from __future__ import annotations
2+
3+
import struct
4+
from typing import TYPE_CHECKING
5+
6+
if TYPE_CHECKING:
7+
from smp.store.graph.mmap_file import MMapFile
8+
9+
10+
class EdgeStore:
    """Adjacency-list store layered over a memory-mapped graph file.

    NOTE: ``write_edges`` and ``read_edges`` are placeholders in this
    version; neither of them reads from or writes to the backing file.
    """

    def __init__(self, mmap_file: MMapFile) -> None:
        # Handle on the backing file; not used by the placeholder methods.
        self.mmap = mmap_file

    def write_edges(self, source_offset: int, targets: list[tuple[int, int]]) -> int:
        """Pack the edge list of a node and return its pointer.

        The list is encoded little-endian as a ``uint32`` edge count followed
        by one ``(target_offset, edge_type)`` pair of ``uint32`` per edge.

        Placeholder behaviour: the packed bytes are not stored anywhere,
        ``source_offset`` is unused, and the returned pointer is always the
        constant ``200000``.

        Raises:
            struct.error: if a value does not fit in an unsigned 32-bit field.
        """
        chunks = [struct.pack("<I", len(targets))]
        chunks.extend(struct.pack("<II", dest, kind) for dest, kind in targets)
        payload = b"".join(chunks)  # noqa: F841 - built but not yet persisted

        return 200000

    def read_edges(self, ptr: int) -> list[tuple[int, int]]:
        """Return the edges stored at ``ptr`` (placeholder: always empty)."""
        no_edges: list[tuple[int, int]] = []
        return no_edges

smp/store/graph/index.py

Lines changed: 60 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,60 @@
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING
4+
5+
if TYPE_CHECKING:
6+
from smp.store.graph.mmap_file import MMapFile
7+
8+
# Tag values for the two kinds of crit-bit tree node.
# NOTE(review): neither constant is referenced by the code visible in this
# module yet - confirm where they are consumed.
NODE_LEAF: int = 1
NODE_INTERNAL: int = 0
11+
12+
13+
class CritBitIndex:
    """Lookup table from node-id strings to inode pointers.

    Despite the name, this version keeps its entries in an in-process
    ``dict``; the mmap handle and root-pointer offset are only stored, and
    the root-offset accessors are placeholders.
    """

    def __init__(self, mmap_file: MMapFile, root_ptr_offset: int) -> None:
        self._index: dict[str, int] = {}
        self.root_ptr_offset = root_ptr_offset
        self.mmap = mmap_file

    def _get_root_offset(self) -> int:
        """Return the root node offset (placeholder: always ``0``)."""
        return 0

    def _set_root_offset(self, offset: int) -> None:
        """Record a new root node offset (placeholder: does nothing)."""
        return None

    def find(self, key: str) -> int | None:
        """Return the pointer stored under ``key``, or ``None`` if absent."""
        try:
            return self._index[key]
        except KeyError:
            return None

    def insert(self, key: str, value: int) -> None:
        """Store ``value`` under ``key``, replacing any previous entry."""
        self._index.update({key: value})

    @property
    def keys(self) -> list[str]:
        """All keys currently in the index, in insertion order."""
        return [*self._index]
38+
39+
40+
class RadixIndex:
    """File-path index supporting prefix queries.

    This version keeps a plain ``dict`` from path to the list of pointers
    inserted for it and answers prefix queries by scanning every path; the
    mmap handle and root-pointer offset are only stored.
    """

    def __init__(self, mmap_file: MMapFile, root_ptr_offset: int) -> None:
        self._paths: dict[str, list[int]] = {}
        self.root_ptr_offset = root_ptr_offset
        self.mmap = mmap_file

    def find_by_prefix(self, prefix: str) -> list[int]:
        """Collect the pointers of every path that starts with ``prefix``.

        Results follow path insertion order, then per-path insertion order.
        An empty prefix matches every path.
        """
        return [
            ptr
            for stored_path, ptrs in self._paths.items()
            if stored_path.startswith(prefix)
            for ptr in ptrs
        ]

    def insert(self, path: str, node_id_ptr: int) -> None:
        """Append ``node_id_ptr`` to the pointers recorded for ``path``."""
        self._paths.setdefault(path, []).append(node_id_ptr)

smp/store/graph/manifest.py

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,24 @@
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING, Final
4+
5+
if TYPE_CHECKING:
6+
from smp.store.graph.mmap_file import MMapFile
7+
8+
# Size of one manifest entry - presumably in bytes; TODO confirm against the
# on-disk layout. Not referenced by the code visible in this module yet.
MANIFEST_ENTRY_SIZE: Final[int] = 128
9+
10+
11+
class FileManifest:
    """Registry of source files and their parse status.

    NOTE: both accessors are placeholders in this version; nothing is read
    from or written to the backing file.
    """

    def __init__(self, mmap_file: MMapFile, manifest_ptr_offset: int) -> None:
        self.manifest_ptr_offset = manifest_ptr_offset
        self.mmap = mmap_file

    def get_entry(self, path_off: int) -> dict[str, int] | None:
        """Look up the manifest entry for ``path_off``.

        Placeholder: always reports "no entry" by returning ``None``.
        """
        entry: dict[str, int] | None = None
        return entry

    def upsert_entry(self, path_off: int, hash_val: int, status: int) -> None:
        """Create or update the entry for ``path_off`` (placeholder: no-op)."""
        return None

smp/store/graph/mmap_file.py

Lines changed: 178 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,178 @@
1+
from __future__ import annotations
2+
3+
import mmap
4+
import os
5+
import struct
6+
import zlib
7+
from pathlib import Path
8+
from typing import Any, Final
9+
10+
# -- Constants -----------------------------------------------------------------

# File identification.
MAGIC: Final[bytes] = b"SMPG"
VERSION: Final[int] = 1

# Region sizes: the header comes first and the WAL region follows it.
HEADER_SIZE: Final[int] = 4096
WAL_SIZE: Final[int] = 64 * 1024  # 64 KiB reserved for the initial WAL
PAGE_SIZE: Final[int] = 4096  # granularity used when growing the file

# Byte offsets of the fields inside the header.
OFF_MAGIC: Final[int] = 0
OFF_VERSION: Final[int] = 4
OFF_FLAGS: Final[int] = 6
OFF_CRC: Final[int] = 8
OFF_ROOTS: Final[int] = 12  # pointers to index, string pool, etc.
OFF_WAL_HEAD: Final[int] = 64
OFF_WAL_TAIL: Final[int] = 68

# Record-type tags stored in the first byte of a WAL record.
WAL_TYPE_INSERT: Final[int] = 0x01
WAL_TYPE_DELETE: Final[int] = 0x02
WAL_TYPE_COMMIT: Final[int] = 0x06
31+
32+
33+
class WALRecord:
    """One entry of the write-ahead log: a type tag plus an opaque payload."""

    def __init__(self, rtype: int, payload: bytes) -> None:
        self.rtype = rtype
        self.payload = payload

    def serialize(self) -> bytes:
        """Return the on-disk encoding of this record.

        Layout (little-endian, unpadded):
            * 1 byte   record type
            * 2 bytes  zero
            * 4 bytes  payload length
            * 4 bytes  CRC-32 over the 7 bytes above plus the payload
            * N bytes  payload
        """
        body = self.payload
        prefix = struct.pack("<BBBI", self.rtype, 0, 0, len(body))
        checksum = zlib.crc32(prefix + body) & 0xFFFFFFFF
        return prefix + checksum.to_bytes(4, "little") + body
45+
46+
47+
class MMapFile:
    """Low-level memory-mapped file with header and WAL management.

    The file starts with a ``HEADER_SIZE``-byte header followed by a
    ``WAL_SIZE``-byte write-ahead-log region. The header stores a CRC-32 of
    the bytes ``OFF_ROOTS..HEADER_SIZE``; every method here that changes a
    field in that range refreshes the CRC afterwards.

    Usable as a context manager: ``with MMapFile(path) as f: ...``.
    """

    def __init__(self, path: Path) -> None:
        self.path = path
        self.fd: int = -1  # -1 means "no open descriptor"
        self.mmap: mmap.mmap | None = None
        self._size: int = 0
        # Absolute byte range [start, end) of the WAL region within the file.
        self._wal_start: int = HEADER_SIZE
        self._wal_end: int = HEADER_SIZE + WAL_SIZE

    def open(self, create: bool = True) -> None:
        """Open the file and map it into memory.

        A missing file is created (when ``create`` is true) with a fresh
        header and an empty WAL. An existing file has its header validated
        and its WAL replayed.

        Raises:
            FileNotFoundError: the file is missing and ``create`` is false.
            FileExistsError: the file appeared between the existence check
                and its creation (it is left untouched).
            ValueError: an existing file is too small or has a bad header.
        """
        exists = self.path.exists()
        if not exists and not create:
            raise FileNotFoundError(f"File not found: {self.path}")

        flags = os.O_RDWR
        if not exists:
            # O_EXCL guarantees that the initialisation below only ever runs
            # on a file this call created, never on one that raced into
            # existence after the check above.
            flags |= os.O_CREAT | os.O_EXCL

        # 0o666 (subject to umask) instead of os.open's executable default.
        self.fd = os.open(self.path, flags, 0o666)
        try:
            if not exists:
                # Initialize with header + empty WAL
                self._size = HEADER_SIZE + WAL_SIZE
                os.ftruncate(self.fd, self._size)
                self.mmap = mmap.mmap(self.fd, self._size)
                self._init_header()
            else:
                self._size = os.fstat(self.fd).st_size
                if self._size < self._wal_end:
                    raise ValueError(
                        f"File too small to be an SMPG file: {self._size} bytes, "
                        f"expected at least {self._wal_end}"
                    )
                self.mmap = mmap.mmap(self.fd, self._size)
                self._validate_header()
                self.replay_wal()
        except BaseException:
            # Do not leak the descriptor/mapping when opening fails half-way.
            self.close()
            if not exists:
                # Best effort: remove the half-initialised file we created so
                # a later open() does not trip over it.
                try:
                    os.unlink(self.path)
                except OSError:
                    pass
            raise

    def write_wal_record(self, rtype: int, payload: bytes) -> None:
        """Append a record to the WAL, checkpointing first if it is full.

        Raises:
            ValueError: the serialized record is larger than the whole WAL
                region and therefore can never be stored.
        """
        assert self.mmap is not None
        record = WALRecord(rtype, payload).serialize()
        rec_size = len(record)

        capacity = self._wal_end - self._wal_start
        if rec_size > capacity:
            # Without this guard the write below would run past the WAL
            # region into whatever follows it.
            raise ValueError(
                f"WAL record of {rec_size} bytes exceeds WAL capacity of {capacity} bytes"
            )

        head = struct.unpack("<I", self.mmap[OFF_WAL_HEAD : OFF_WAL_HEAD + 4])[0]

        # Simple non-circular append for MVP: when the record does not fit,
        # checkpoint (which empties the WAL) and restart from the beginning.
        if head + rec_size > capacity:
            self.checkpoint()
            head = 0

        pos = self._wal_start + head
        self.mmap[pos : pos + rec_size] = record

        self.mmap[OFF_WAL_HEAD : OFF_WAL_HEAD + 4] = struct.pack("<I", head + rec_size)
        # The head pointer lies inside the CRC-covered header range.
        self.update_header_crc()

    def checkpoint(self) -> None:
        """Flush the mapping, then reset the WAL head/tail pointers to zero."""
        assert self.mmap is not None
        self.mmap.flush()
        self.mmap[OFF_WAL_HEAD : OFF_WAL_HEAD + 4] = struct.pack("<I", 0)
        self.mmap[OFF_WAL_TAIL : OFF_WAL_TAIL + 4] = struct.pack("<I", 0)
        self.update_header_crc()

    def replay_wal(self) -> None:
        """Replay uncommitted WAL records (stub for now)."""
        pass

    def close(self) -> None:
        """Sync and close the mapping and the descriptor.

        Safe to call more than once and on a partially opened instance.
        """
        try:
            if self.mmap is not None:
                mapping, self.mmap = self.mmap, None
                # grow() may leave a closed mapping behind if it fails.
                if not mapping.closed:
                    mapping.flush()
                    mapping.close()
        finally:
            # Release the descriptor even if flushing the mapping failed.
            if self.fd != -1:
                fd, self.fd = self.fd, -1
                os.close(fd)

    def _init_header(self) -> None:
        """Write the initial header: magic, version, flags, empty WAL, CRC."""
        assert self.mmap is not None
        self.mmap[OFF_MAGIC : OFF_MAGIC + 4] = MAGIC
        self.mmap[OFF_VERSION : OFF_VERSION + 2] = struct.pack("<H", VERSION)
        self.mmap[OFF_FLAGS : OFF_FLAGS + 2] = struct.pack("<H", 0)
        # WAL pointers (initially empty)
        self.mmap[OFF_WAL_HEAD : OFF_WAL_HEAD + 4] = struct.pack("<I", 0)
        self.mmap[OFF_WAL_TAIL : OFF_WAL_TAIL + 4] = struct.pack("<I", 0)
        self.update_header_crc()

    def _validate_header(self) -> None:
        """Check magic bytes, format version and header CRC.

        Raises:
            ValueError: wrong magic bytes, or a format version newer than
                this code supports.

        A CRC mismatch is reported as a ``RuntimeWarning`` rather than an
        error: files written before WAL appends refreshed the CRC carry a
        stale checksum and must remain readable.
        """
        assert self.mmap is not None
        if self.mmap[OFF_MAGIC : OFF_MAGIC + 4] != MAGIC:
            raise ValueError("Invalid magic bytes: not an SMPG file")

        version = struct.unpack("<H", self.mmap[OFF_VERSION : OFF_VERSION + 2])[0]
        if version > VERSION:
            raise ValueError(f"Unsupported version: {version}")

        stored_crc = struct.unpack("<I", self.mmap[OFF_CRC : OFF_CRC + 4])[0]
        # The CRC covers OFF_ROOTS..HEADER_SIZE, i.e. everything after the
        # CRC field itself.
        header_data = self.mmap[OFF_ROOTS:HEADER_SIZE]
        actual_crc = zlib.crc32(header_data) & 0xFFFFFFFF
        if actual_crc != stored_crc:
            import warnings

            warnings.warn(
                f"Header CRC mismatch in {self.path}: "
                f"stored {stored_crc:#010x}, computed {actual_crc:#010x}",
                RuntimeWarning,
                stacklevel=2,
            )

    def update_header_crc(self) -> None:
        """Recalculate the CRC of ``OFF_ROOTS..HEADER_SIZE`` and store it."""
        assert self.mmap is not None
        header_data = self.mmap[OFF_ROOTS:HEADER_SIZE]
        crc = zlib.crc32(header_data) & 0xFFFFFFFF
        self.mmap[OFF_CRC : OFF_CRC + 4] = struct.pack("<I", crc)

    def grow(self, new_size: int) -> None:
        """Enlarge the file to at least ``new_size`` bytes and remap it.

        The size is rounded up to a multiple of ``PAGE_SIZE``. Requests that
        do not exceed the current size are ignored; the file never shrinks.
        """
        assert self.mmap is not None
        if new_size <= self._size:
            return

        # Ensure aligned to PAGE_SIZE
        new_size = (new_size + PAGE_SIZE - 1) // PAGE_SIZE * PAGE_SIZE

        self.mmap.flush()
        self.mmap.close()
        # Drop the stale handle so a failure below leaves "no mapping"
        # instead of a closed one.
        self.mmap = None
        os.ftruncate(self.fd, new_size)
        self.mmap = mmap.mmap(self.fd, new_size)
        self._size = new_size

    def __enter__(self) -> MMapFile:
        self.open()
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        self.close()

0 commit comments

Comments
 (0)