-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathreal_test.py
More file actions
179 lines (152 loc) · 8.34 KB
/
Copy pathreal_test.py
File metadata and controls
179 lines (152 loc) · 8.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#!/usr/bin/env python3
"""
Path B -- a REAL test on REAL data (not synthetic gen() puzzles).
The synthetic ladder (sec 43-51) showed the induction MECHANISM works on toy L=12 bit-tasks
solved to full-domain-exact 1.000. That is mechanism, not the goal. The goal is a real
bit-native model. This is the honest real test: run Path B's induction loop -- search a space
of bit-native computations and SELECT the ones that predict the next bit, by HELD-OUT bits/bit
-- on REAL streams, and report whether (a) it beats real baselines/gzip and (b) Path B's own
induced primitives (running counters, pattern detectors) actually get selected on real data,
or whether only simple context features matter (an honest negative is a real result).
Real metric = held-out bits/bit (cross-entropy = compression; raw 1.0000, lower better), the
project's standard, externally referenced against gzip on the SAME stream. Two real corpora:
* English text (data/corpus.txt) -- bytes -> bits, MSB first
* E. coli DNA (data/ecoli.fasta, 2 bit/base, A=00 C=01 G=10 T=11) -- the purest real bit stream
Two selection pools, to ISOLATE Path B's contribution:
BASE = {lag d, phase i mod p} -- simple context/periodicity (the Path A representation)
PATHB = BASE + {running count-mod-m, popcount-bucket, pattern-detector} -- Path B's induced primitives
Greedy forward selection from order-0 is the real-data analogue of WAKE: keep the computation
that most reduces held-out bits/bit. No leakage (table built on train only; test is held out).
"""
import sys, math, gzip
from scale import load # bytes -> bits, MSB first
# ---------------------------------------------------------------------------
# real DNA loader: FASTA ACGT -> 2-bit stream (A=00 C=01 G=10 T=11)
# ---------------------------------------------------------------------------
def load_dna(path, base_cap):
code = {'A': (0, 0), 'C': (0, 1), 'G': (1, 0), 'T': (1, 1)}
bits = bytearray(); nb = 0
with open(path, 'r') as f:
for line in f:
if line.startswith('>'):
continue
for ch in line.strip().upper():
if ch in code:
a, b = code[ch]; bits.append(a); bits.append(b); nb += 1
if nb >= base_cap:
return bits
return bits
# ---------------------------------------------------------------------------
# bit-native computations (each maps (bits, i) -> a small integer)
# ---------------------------------------------------------------------------
def fval(f, bits, i):
kind = f[0]
if kind == 'lag': # context: the bit d steps back
j = i - f[1]; return bits[j] if j >= 0 else 2
if kind == 'mod': # periodicity / phase
return i % f[1]
if kind == 'cntmod': # Path B: running count mod m over last w
w, m = f[1], f[2]; return sum(bits[max(0, i - w):i]) % m
if kind == 'bucket': # Path B: popcount bucket over last w
w, cap = f[1], f[2]; return min(sum(bits[max(0, i - w):i]), cap)
if kind == 'det': # Path B: does pattern occur in last w bits
pat, w = f[1], f[2]; seg = bits[max(0, i - w):i]; m = len(pat)
return int(any(tuple(seg[t:t + m]) == pat for t in range(len(seg) - m + 1)))
raise ValueError(f)
def address(rep, bits, i):
return tuple(fval(f, bits, i) for f in rep)
def eval_rep(bits, split, rep, alpha=0.5):
"""Held-out bits/bit: KT-smoothed next-bit table keyed on the representation (train only)."""
ones = sum(bits[:split]); g1 = (ones + alpha) / (split + 2 * alpha) # order-0 backoff
t = {}
for i in range(split):
a = address(rep, bits, i); c = t.get(a)
if c is None:
c = [0, 0]; t[a] = c
c[bits[i]] += 1
tot = 0.0; n = len(bits) - split
for i in range(split, len(bits)):
c = t.get(address(rep, bits, i))
p1 = g1 if c is None else (c[1] + alpha) / (c[0] + c[1] + 2 * alpha)
p = p1 if bits[i] == 1 else 1 - p1
tot += -math.log2(max(p, 1e-12))
return tot / n
def greedy(bits, split, pool, maxf=6, eps=0.0005):
rep, cur = [], eval_rep(bits, split, []) # order-0 baseline
trace = []
while len(rep) < maxf:
best_c, best_s = None, cur
for c in pool:
if c in rep:
continue
s = eval_rep(bits, split, rep + [c])
if s < best_s - eps:
best_s, best_c = s, c
if best_c is None:
break
rep.append(best_c); cur = best_s; trace.append((best_c, cur))
return rep, cur, trace
# pools ---------------------------------------------------------------------
BASE = [('lag', d) for d in range(1, 17)] + [('mod', p) for p in range(2, 17)]
PATHB_EXTRA = ([('cntmod', w, m) for w in (4, 8, 12, 16) for m in (2, 3)] +
[('bucket', w, 4) for w in (4, 8, 16)] +
[('det', pat, w) for pat in [(1, 1), (0, 0), (1, 0, 1), (0, 1, 0)] for w in (4, 8)])
PATHB = BASE + PATHB_EXTRA
PATHB_FORMS = {'cntmod', 'bucket', 'det'}
def run_corpus(name, bits, raw_bytes, maxf=6):
split = int(len(bits) * 0.8)
gz = len(gzip.compress(bytes(raw_bytes), 9)) * 8 / len(bits)
base0 = eval_rep(bits, split, [])
print(f"\n{'=' * 78}\n{name}: bits={len(bits)} train/test=80/20")
print(f" order-0 baseline : {base0:.4f} bits/bit")
print(f" gzip (same stream): {gz:.4f} bits/bit [external reference]")
repB, sB, trB = greedy(bits, split, BASE, maxf)
print(f"\n BASE pool (simple context/periodicity) -> {sB:.4f} bits/bit")
for c, s in trB:
print(f" + {str(c):16} {s:.4f}")
repP, sP, trP = greedy(bits, split, PATHB, maxf)
pathb_used = [c for c, _ in trP if c[0] in PATHB_FORMS]
print(f"\n PATHB pool (+ induced counters/detectors) -> {sP:.4f} bits/bit")
for c, s in trP:
mark = " <- Path B primitive" if c[0] in PATHB_FORMS else ""
print(f" + {str(c):16} {s:.4f}{mark}")
print(f"\n VERDICT [{name}]:")
print(f" beats order-0 : {sP < base0 - 1e-6} ({base0:.4f} -> {sP:.4f})")
print(f" beats gzip : {sP < gz - 1e-6} (gzip {gz:.4f})")
print(f" Path B primitives selected over BASE: "
f"{'YES ' + str(pathb_used) if pathb_used else 'NO (simple context suffices here)'}")
print(f" Path B pool improves on BASE pool : {sP < sB - 1e-6} ({sB:.4f} -> {sP:.4f})")
return base0, gz, sB, sP, pathb_used
def main():
text_cap = int(sys.argv[1]) if len(sys.argv) > 1 else 80000 # bytes of text
dna_cap = int(sys.argv[2]) if len(sys.argv) > 2 else 120000 # bases of DNA
print("PATH B -- REAL TEST: induce predictive bit-native computations on REAL data")
print("(held-out bits/bit; lower = better; raw = 1.0000)")
results = {}
try:
raw, tbits = load('data/corpus.txt', text_cap)
results['English text'] = run_corpus('English text (corpus.txt)', tbits, raw)
except FileNotFoundError:
print(" (data/corpus.txt not found -- skipping text)")
try:
dbits = load_dna('data/ecoli.fasta', dna_cap)
packed = bytearray()
for k in range(0, len(dbits) - 7, 8):
byte = 0
for j in range(8):
byte = (byte << 1) | dbits[k + j]
packed.append(byte)
results['E. coli DNA'] = run_corpus('E. coli DNA (ecoli.fasta, 2bit)', dbits, packed)
except FileNotFoundError:
print(" (data/ecoli.fasta not found -- skipping DNA)")
print(f"\n{'=' * 78}\nSUMMARY -- is Path B a real model on real data?")
print(f"{'corpus':16} | order0 | gzip | BASE | PATHB | Path B primitives used")
for nm, (b0, gz, sB, sP, used) in results.items():
print(f"{nm:16} | {b0:.4f} | {gz:.4f} | {sB:.4f} | {sP:.4f} | "
f"{'yes: ' + ','.join(c[0] for c in used) if used else 'no'}")
print("\n Honest reading: a real model is one whose induced computation predicts HELD-OUT real")
print(" data below the raw and gzip baselines. Where Path B's own primitives (counters/")
print(" detectors) are selected and lower bits/bit, the induction adds real value beyond simple")
print(" context; where they are not, simple context is the lever and that is the honest finding.")
if __name__ == "__main__":
main()