Skip to content

Commit e29f53b

Browse files
authored
Merge pull request #109 from nvt/bytecode/v6-header-rev
bytecode: upgrade to v6 with more descriptive header
2 parents 81884e7 + 7c715ba commit e29f53b

8 files changed

Lines changed: 234 additions & 83 deletions

File tree

core/vm-loader.c

Lines changed: 61 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,7 @@ read_program(const char *name)
253253
{
254254
vm_program_t *program;
255255
vm_loader_handle_t handle;
256-
unsigned char buf[VM_HEADER_SIZE];
256+
unsigned char buf[VM_HEADER_FIXED_SIZE];
257257
unsigned i;
258258
unsigned code_size;
259259

@@ -279,30 +279,60 @@ read_program(const char *name)
279279
program->exec_count = NULL;
280280
#endif
281281

282-
if(extract_program_name(&program->name, name) == 0) {
283-
VM_DEBUG(VM_DEBUG_LOW, "Failed to extract the program name from \"%s\"",
284-
name);
285-
goto error;
286-
}
287-
288-
VM_DEBUG(VM_DEBUG_LOW, "Loading program \"%s\"", program->name);
289-
290-
if(VM_LOADER_READ(handle, buf, 3) != 3) {
282+
if(VM_LOADER_READ(handle, buf, VM_HEADER_FIXED_SIZE) !=
283+
VM_HEADER_FIXED_SIZE) {
291284
VM_DEBUG(VM_DEBUG_LOW, "Read error on program header");
292285
goto error;
293286
}
294287

295-
if(buf[0] != VM_FILE_ID1 || buf[1] != VM_FILE_ID2) {
288+
if(buf[VM_HEADER_OFFSET_MAGIC1] != VM_FILE_ID1 ||
289+
buf[VM_HEADER_OFFSET_MAGIC2] != VM_FILE_ID2) {
296290
VM_DEBUG(VM_DEBUG_LOW, "%s: invalid program header", name);
297291
goto error;
298292
}
299293

300-
if(buf[2] != VM_BYTECODE_VERSION) {
301-
VM_DEBUG(VM_DEBUG_LOW, "%s: unsupported bytecode version %d",
302-
name, buf[2]);
303-
goto error;
294+
{
295+
uint16_t version =
296+
(uint16_t)buf[VM_HEADER_OFFSET_VERSION]
297+
| ((uint16_t)buf[VM_HEADER_OFFSET_VERSION + 1] << 8);
298+
if(version != VM_BYTECODE_VERSION) {
299+
VM_DEBUG(VM_DEBUG_LOW, "%s: unsupported bytecode version %u",
300+
name, (unsigned)version);
301+
goto error;
302+
}
303+
}
304+
305+
uint32_t total_len =
306+
(uint32_t)buf[VM_HEADER_OFFSET_TOTAL_LEN]
307+
| ((uint32_t)buf[VM_HEADER_OFFSET_TOTAL_LEN + 1] << 8)
308+
| ((uint32_t)buf[VM_HEADER_OFFSET_TOTAL_LEN + 2] << 16)
309+
| ((uint32_t)buf[VM_HEADER_OFFSET_TOTAL_LEN + 3] << 24);
310+
311+
/* Read the variable-length program name. */
312+
{
313+
unsigned name_len = buf[VM_HEADER_OFFSET_NAME_LEN];
314+
if(name_len > 0) {
315+
char *header_name = VM_MALLOC(name_len + 1);
316+
if(header_name == NULL) {
317+
goto error;
318+
}
319+
if(VM_LOADER_READ(handle, header_name, name_len) !=
320+
(vm_loader_offset_t)name_len) {
321+
VM_DEBUG(VM_DEBUG_LOW, "Read error on program name");
322+
VM_FREE(header_name);
323+
goto error;
324+
}
325+
header_name[name_len] = '\0';
326+
program->name = header_name;
327+
} else if(extract_program_name(&program->name, name) == 0) {
328+
VM_DEBUG(VM_DEBUG_LOW,
329+
"Failed to extract the program name from \"%s\"", name);
330+
goto error;
331+
}
304332
}
305333

334+
VM_DEBUG(VM_DEBUG_LOW, "Loading program \"%s\"", program->name);
335+
306336
if(read_table(&program->strings, handle) == 0) {
307337
VM_DEBUG(VM_DEBUG_LOW, "Failed to read the string table");
308338
goto error;
@@ -420,6 +450,22 @@ read_program(const char *name)
420450
goto error;
421451
}
422452

453+
/* Truncation / padding check: the position after the last body byte
454+
must equal the total length declared in the header. */
455+
{
456+
vm_loader_offset_t end_offset = VM_LOADER_SEEK_RELATIVE(handle, 0);
457+
if(end_offset == (vm_loader_offset_t)-1) {
458+
VM_DEBUG(VM_DEBUG_LOW, "loader:seek failed");
459+
goto error;
460+
}
461+
if((uint32_t)end_offset != total_len) {
462+
VM_DEBUG(VM_DEBUG_LOW,
463+
"%s: total-length mismatch (header says %u, parsed %u)",
464+
name, (unsigned)total_len, (unsigned)end_offset);
465+
goto error;
466+
}
467+
}
468+
423469
VM_DEBUG(VM_DEBUG_MEDIUM, "Loading bytecode consisting of %u byte%s",
424470
code_size, code_size != 1 ? "s" : "");
425471

doc/bytecode-format.md

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,25 @@ VeloxVM uses a custom bytecode format designed for efficient storage and executi
88

99
### Magic Number and Header
1010

11-
VeloxVM bytecode files (.vm) begin with a 3-byte header:
11+
VeloxVM bytecode files (.vm) begin with a 9-byte fixed prefix followed by an N-byte program name:
1212

1313
```
1414
Offset Size Description
1515
------ ---- -----------
1616
0x00 1 File ID 1: 0x5E (94 decimal)
1717
0x01 1 File ID 2: 0xB5 (181 decimal)
18-
0x02 1 Bytecode version (currently 5; see `VM_BYTECODE_VERSION` in `include/vm-bytecode.h`)
18+
0x02 2 Bytecode version (uint16 LE; currently 6, see `VM_BYTECODE_VERSION` in `include/vm-bytecode.h`)
19+
0x04 4 Total file length in bytes (uint32 LE) — header + body
20+
0x08 1 Program name length (N, 0..255)
21+
0x09 N Program name (UTF-8, no terminator)
1922
```
2023

2124
The magic number `0x5E 0xB5` (or `0xB55E` in little-endian short format) can be used to identify VeloxVM bytecode files with the `file` utility.
2225

26+
The **total length** field is the size of the entire file in bytes: the 9-byte fixed prefix, the program name, and the body. After parsing the body, the loader checks that its read position equals this value and rejects the file on a mismatch (truncation or padding).
27+
28+
The **program name** is the canonical identity used by logs, profiler output, and the control plane. If N is 0 the loader falls back to deriving the name from the filename (with `.vm` stripped). Encoders typically populate the field from the source file's basename.
29+
2330
### File Structure
2431

2532
After the header, the file contains three variable-length tables followed by a captures block:

include/vm-bytecode.h

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,25 @@
3535

3636
#include <stdint.h>
3737

38-
/* The header consists of two bytes for the file ID and a byte for the
39-
bytecode version. */
40-
#define VM_HEADER_SIZE 3
38+
/* v6 header layout (9 fixed bytes + N-byte program name):
39+
0x00 2 Magic (VM_FILE_ID1, VM_FILE_ID2)
40+
0x02 2 Bytecode version (uint16 LE)
41+
0x04 4 Total file length in bytes (uint32 LE)
42+
0x08 1 Program name length N (0..255)
43+
0x09 N Program name (UTF-8, no terminator)
44+
The body (tables + captures + expression bytecode) follows the name. */
45+
#define VM_HEADER_FIXED_SIZE 9
46+
#define VM_HEADER_OFFSET_MAGIC1 0
47+
#define VM_HEADER_OFFSET_MAGIC2 1
48+
#define VM_HEADER_OFFSET_VERSION 2
49+
#define VM_HEADER_OFFSET_TOTAL_LEN 4
50+
#define VM_HEADER_OFFSET_NAME_LEN 8
51+
#define VM_HEADER_OFFSET_NAME 9
4152

4253
#define VM_FILE_ID1 94
4354
#define VM_FILE_ID2 181
4455

45-
#define VM_BYTECODE_VERSION 5
56+
#define VM_BYTECODE_VERSION 6
4657

4758
#define VM_TOKEN_ATOM 0
4859
#define VM_TOKEN_FORM 1

languages/python/pyvelox/bytecode.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ class Bytecode:
108108
def __init__(self):
109109
# File header
110110
self.magic = 0x5EB5 # File bytes 0x5E, 0xB5 in order (big-endian; 0xB55E if read as a little-endian short)
111-
self.version = 5 # VM_BYTECODE_VERSION from include/vm-bytecode.h
111+
self.version = 6 # VM_BYTECODE_VERSION from include/vm-bytecode.h
112112

113113
# Symbol table management
114114
self.symbol_table = SymbolTable()

languages/python/pyvelox/disassembler.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,16 +71,24 @@ def read_bytes(self, n: int) -> bytes:
7171

7272
def disassemble(self) -> str:
7373
"""Disassemble the entire bytecode file."""
74-
# Read header
74+
# Read v6 header (9 fixed bytes + N-byte program name).
7575
magic1 = self.read_byte()
7676
magic2 = self.read_byte()
77-
version = self.read_byte()
78-
7977
if magic1 != 0x5E or magic2 != 0xB5:
8078
raise ValueError(f"Invalid magic bytes: 0x{magic1:02X}{magic2:02X}")
8179

80+
version = self.read_u16()
81+
total_len = (self.read_byte() | (self.read_byte() << 8)
82+
| (self.read_byte() << 16) | (self.read_byte() << 24))
83+
name_len = self.read_byte()
84+
prog_name = self.read_bytes(name_len).decode('utf-8',
85+
errors='replace') \
86+
if name_len > 0 else ''
87+
8288
output = [f"; VeloxVM Bytecode Disassembly"]
83-
output.append(f"; Magic: 0x{magic1:02X}{magic2:02X}, Version: {version}")
89+
output.append(
90+
f"; Magic: 0x{magic1:02X}{magic2:02X}, Version: {version}, "
91+
f"Total length: {total_len}, Program: {prog_name!r}")
8492
output.append("")
8593

8694
# Read string table

languages/python/pyvelox/writer.py

Lines changed: 41 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
following the VeloxVM bytecode format specification.
3434
"""
3535

36+
import io
3637
import struct
3738
from pathlib import Path
3839
from typing import Union, List
@@ -41,37 +42,49 @@
4142

4243
def write_bytecode_file(path: Union[str, Path], bc: Bytecode):
4344
"""
44-
Write bytecode to a .vm file.
45-
46-
File format:
47-
- Header (3 bytes): 0x5E, 0xB5, version
45+
Write bytecode to a .vm file in the v6 format.
46+
47+
File layout:
48+
- Header (9 fixed bytes + N-byte program name):
49+
0x00 2 Magic (0x5E 0xB5)
50+
0x02 2 Version (uint16 LE)
51+
0x04 4 Total file length (uint32 LE)
52+
0x08 1 Program name length N
53+
0x09 N Program name (UTF-8, no terminator)
4854
- String table: count (16-bit) + items (16-bit length + data)
4955
- Symbol table: count (16-bit) + items (16-bit length + data)
5056
- Expression table: count (16-bit) + items (16-bit length + data)
51-
- Captures section: count (16-bit) + entries. Each entry is
52-
(length:uint16, expr_id:uint16, symbol_id:uint16 ...). The entry
53-
length is the byte count of the entry's payload, i.e.
54-
2 + 2 * len(symbol_ids).
57+
- Captures section: count (16-bit) + entries (length:uint16,
58+
expr_id:uint16, symbol_id:uint16 ...).
5559
5660
Args:
5761
path: Output file path
5862
bc: Bytecode container to write
5963
"""
60-
with open(path, 'wb') as f:
61-
# Write header (3 bytes)
62-
f.write(bytes([0x5E, 0xB5, bc.version]))
63-
64-
# Write string table
65-
_write_table(f, bc.symbol_table.strings, _encode_string_item)
64+
path = Path(path)
65+
prog_name = path.stem.encode('utf-8')
66+
if len(prog_name) > 255:
67+
raise ValueError(f"Program name too long ({len(prog_name)} bytes, "
68+
f"max 255): {path.stem}")
69+
70+
# Build the body in memory so we can fill in the total-length header
71+
# field before any bytes hit disk.
72+
body = io.BytesIO()
73+
_write_table(body, bc.symbol_table.strings, _encode_string_item)
74+
_write_table(body, bc.symbol_table.symbols, _encode_string_item)
75+
_write_table(body, bc.expressions, _encode_bytes_item)
76+
_write_captures_section(body, bc.captures)
77+
body_bytes = body.getvalue()
78+
79+
total_len = 9 + len(prog_name) + len(body_bytes)
6680

67-
# Write symbol table
68-
_write_table(f, bc.symbol_table.symbols, _encode_string_item)
69-
70-
# Write expression table
71-
_write_table(f, bc.expressions, _encode_bytes_item)
72-
73-
# Write captures section
74-
_write_captures_section(f, bc.captures)
81+
with open(path, 'wb') as f:
82+
f.write(bytes([0x5E, 0xB5])) # Magic
83+
f.write(struct.pack('<H', bc.version)) # Version (uint16 LE)
84+
f.write(struct.pack('<I', total_len)) # Total length (uint32 LE)
85+
f.write(struct.pack('<B', len(prog_name))) # Name length
86+
f.write(prog_name) # Name bytes
87+
f.write(body_bytes)
7588

7689

7790
def _write_captures_section(f, captures):
@@ -150,12 +163,16 @@ def read_bytecode_file(path: Union[str, Path]) -> Bytecode:
150163
has the authoritative bytecode loader.
151164
"""
152165
with open(path, 'rb') as f:
153-
# Read header
166+
# Read v6 fixed prefix (9 bytes)
154167
magic_bytes = f.read(2)
155168
if magic_bytes != bytes([0x5E, 0xB5]):
156169
raise ValueError(f"Invalid magic number: {magic_bytes.hex()}")
157170

158-
version = f.read(1)[0]
171+
version = struct.unpack('<H', f.read(2))[0]
172+
_total_len = struct.unpack('<I', f.read(4))[0]
173+
name_len = struct.unpack('<B', f.read(1))[0]
174+
if name_len > 0:
175+
_ = f.read(name_len) # consume program name
159176

160177
# Create bytecode container
161178
bc = Bytecode()

languages/scheme/bytecode.rkt

Lines changed: 49 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@
7979
;; Create new bytecode container
8080
(define (make-bytecode)
8181
(bytecode #x5EB5 ; Magic number
82-
5 ; Version
82+
6 ; Version
8383
'() ; strings-rev
8484
(make-hash) ; strings-index
8585
'() ; symbols-rev
@@ -405,40 +405,59 @@
405405
;; ============================================================================
406406

407407
(define (write-bytecode-file filename bc)
408+
;; v6 header: 9 fixed bytes + N-byte program name.
409+
;; The program name is derived from the destination filename's basename.
410+
(define prog-name
411+
(let* ([p (if (path? filename) filename (string->path filename))]
412+
[base (file-name-from-path p)])
413+
(path->string (path-replace-extension base #""))))
414+
(define name-bytes (string->bytes/utf-8 prog-name))
415+
(define name-len (bytes-length name-bytes))
416+
(when (> name-len 255)
417+
(error 'write-bytecode-file
418+
"Program name too long (~a bytes, max 255): ~a"
419+
name-len prog-name))
420+
421+
;; Build the body in memory so we can compute the total length for the
422+
;; header before flushing to disk.
423+
(define body
424+
(with-output-to-bytes
425+
(lambda ()
426+
(write-table (bytecode-strings bc) write-string-entry)
427+
(write-table (bytecode-symbols bc) write-symbol-entry)
428+
(write-table (bytecode-expressions bc) write-expr-entry)
429+
;; Captures section: count of {expr_id, [symbol_id, ...]} entries.
430+
;; compile-lambda populates bc's captures-list as it analyzes lambda
431+
;; bodies for free variables; here we serialize that out.
432+
(let ([entries (bytecode-captures-list bc)])
433+
(write-u16 (length entries))
434+
(for ([entry entries])
435+
(let* ([expr-id (car entry)]
436+
[sym-ids (cdr entry)]
437+
[entry-bytes (+ 2 (* 2 (length sym-ids)))])
438+
(write-u16 entry-bytes)
439+
(write-u16 expr-id)
440+
(for ([sid sym-ids])
441+
(write-u16 sid))))))))
442+
443+
(define total-len (+ 9 name-len (bytes-length body)))
444+
408445
(with-output-to-file filename
409446
#:exists 'replace
410447
#:mode 'binary
411448
(lambda ()
412-
;; Magic number
449+
;; Magic (bytes 0-1, big-endian for `file(1)` compatibility)
413450
(write-bytes (integer->integer-bytes (bytecode-magic bc) 2 #f #t))
414-
415-
;; Version
416-
(write-byte (bytecode-version bc))
417-
418-
;; String table
419-
(write-table (bytecode-strings bc) write-string-entry)
420-
421-
;; Symbol table
422-
(write-table (bytecode-symbols bc) write-symbol-entry)
423-
424-
;; Expression table
425-
(write-table (bytecode-expressions bc) write-expr-entry)
426-
427-
;; Captures section: count of {expr_id, [symbol_id, ...]} entries.
428-
;; Each entry is uint16-length-prefixed, then uint16 expr_id +
429-
;; uint16 symbol_id per captured free variable. compile-lambda
430-
;; populates bc's captures-list as it analyzes lambda bodies for
431-
;; free variables; here we serialize that out.
432-
(let ([entries (bytecode-captures-list bc)])
433-
(write-u16 (length entries))
434-
(for ([entry entries])
435-
(let* ([expr-id (car entry)]
436-
[sym-ids (cdr entry)]
437-
[entry-bytes (+ 2 (* 2 (length sym-ids)))])
438-
(write-u16 entry-bytes)
439-
(write-u16 expr-id)
440-
(for ([sid sym-ids])
441-
(write-u16 sid))))))))
451+
;; Version (bytes 2-3, uint16 LE)
452+
(write-bytes (integer->integer-bytes (bytecode-version bc) 2 #f #f))
453+
;; Total file length (bytes 4-7, uint32 LE)
454+
(write-bytes (integer->integer-bytes total-len 4 #f #f))
455+
;; Program name length (byte 8)
456+
(write-byte name-len)
457+
;; Program name (N bytes starting at byte 9)
458+
(write-bytes name-bytes)
459+
;; Body
460+
(write-bytes body))))
442461

443462
(define (write-table items write-item)
444463
(write-u16 (length items))

0 commit comments

Comments
 (0)