-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalyze_cantor_coverage.py
More file actions
243 lines (191 loc) · 9.61 KB
/
analyze_cantor_coverage.py
File metadata and controls
243 lines (191 loc) · 9.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
"""
Analyze Cantor Set Global Spectrum Coverage for Liminal Staircase
==================================================================
Calculate the containment zones and global attentiveness capacity.
"""
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
def compute_cantor_coord(position: float, depth: int = 8) -> float:
"""Pure Cantor set mapping (ternary)."""
x = max(1e-6, min(position, 1.0 - 1e-6))
cantor_val = 0.0
factor = 0.5
for _ in range(depth):
x_scaled = x * 3.0
digit = int(x_scaled)
x_frac = x_scaled - digit
# In Cantor set: keep left (0) and right (2) thirds, remove middle (1)
if digit == 2:
cantor_val += factor
x = x_frac
factor *= 0.5
return cantor_val
def analyze_cantor_coverage(num_positions: int = 512, depth: int = 8, k_neighbors: int = 64):
"""Analyze global spectrum coverage."""
print("=" * 80)
print("CANTOR SET GLOBAL SPECTRUM COVERAGE ANALYSIS")
print("=" * 80)
print(f"\nParameters:")
print(f" Sequence length: {num_positions}")
print(f" Cantor depth: {depth}")
print(f" k-NN neighbors: {k_neighbors}")
# Map positions to Cantor coordinates
positions = np.linspace(0, 1, num_positions)
cantor_coords = np.array([compute_cantor_coord(pos, depth) for pos in positions])
# Theoretical analysis
print(f"\n{'='*80}")
print("THEORETICAL PROPERTIES:")
print(f"{'='*80}")
# Ternary Cantor set properties
num_segments = 2 ** depth
segment_width = (1/3) ** depth
total_measure = (2/3) ** depth
print(f"\n1. Containment Zones (Cantor segments):")
print(f" - Number of segments: {num_segments}")
print(f" - Segment width: {segment_width:.2e}")
print(f" - Total measure: {total_measure:.6f} ({total_measure*100:.2f}% of [0,1])")
print(f" - Gap measure: {1-total_measure:.6f} ({(1-total_measure)*100:.2f}% of [0,1])")
# Resolution
print(f"\n2. Spatial Resolution:")
print(f" - Minimum distinguishable distance: {segment_width:.2e}")
print(f" - Positions per segment (avg): {num_positions / num_segments:.2f}")
# k-NN coverage
print(f"\n3. k-NN Global Coverage:")
print(f" - Neighbors per position: {k_neighbors}")
print(f" - Coverage fraction: {k_neighbors / num_positions * 100:.2f}%")
print(f" - Theoretical receptive field: O(log n) scales due to fractal structure")
# Empirical analysis
print(f"\n{'='*80}")
print("EMPIRICAL ANALYSIS:")
print(f"{'='*80}")
# Compute pairwise distances in Cantor space
distances = np.abs(cantor_coords[:, None] - cantor_coords[None, :])
# For each position, find k nearest neighbors
neighbor_indices = np.argpartition(distances, k_neighbors, axis=1)[:, :k_neighbors]
# Analyze neighbor distribution
max_distances = []
position_spans = []
for i in range(num_positions):
neighbors = neighbor_indices[i]
max_dist = distances[i, neighbors].max()
max_distances.append(max_dist)
# Span in original position space
position_span = (neighbors.max() - neighbors.min())
position_spans.append(position_span)
print(f"\n1. Neighbor Distribution:")
print(f" - Max Cantor distance (mean): {np.mean(max_distances):.4f}")
print(f" - Max Cantor distance (std): {np.std(max_distances):.4f}")
print(f" - Position span (mean): {np.mean(position_spans):.1f} positions")
print(f" - Position span (max): {np.max(position_spans)} positions")
# Multi-scale coverage
scale_bins = [0, 0.1, 0.25, 0.5, 1.0]
print(f"\n2. Multi-Scale Attention Coverage:")
for i in range(len(scale_bins) - 1):
lower, upper = scale_bins[i], scale_bins[i+1]
fraction = np.mean((distances > lower) & (distances <= upper))
scale_range = int(num_positions * (upper - lower))
print(f" - Range {lower:.2f}-{upper:.2f} (±{scale_range} pos): {fraction*100:.2f}% of pairs")
# Clustering analysis
print(f"\n3. Containment Zone Utilization:")
# Discretize Cantor coords to segments
segment_assignments = (cantor_coords * num_segments).astype(int)
segment_assignments = np.clip(segment_assignments, 0, num_segments - 1)
unique_segments, counts = np.unique(segment_assignments, return_counts=True)
print(f" - Occupied segments: {len(unique_segments)} / {num_segments}")
print(f" - Occupancy rate: {len(unique_segments) / num_segments * 100:.2f}%")
print(f" - Positions per segment (mean): {np.mean(counts):.2f}")
print(f" - Positions per segment (std): {np.std(counts):.2f}")
print(f" - Max positions in one segment: {np.max(counts)}")
# Global attentiveness capacity
print(f"\n{'='*80}")
print("GLOBAL ATTENTIVENESS CAPACITY:")
print(f"{'='*80}")
# For each position, check how many unique "distant" positions it can attend to
distant_threshold = num_positions * 0.25 # Consider "distant" as >25% of sequence length
global_reach_counts = []
for i in range(num_positions):
neighbors = neighbor_indices[i]
distant_neighbors = neighbors[np.abs(neighbors - i) > distant_threshold]
global_reach_counts.append(len(distant_neighbors))
print(f"\n1. Long-Range Dependency Capacity:")
print(f" - Distant neighbors (>25% seq len) per position (mean): {np.mean(global_reach_counts):.2f}")
print(f" - Distant neighbors (std): {np.std(global_reach_counts):.2f}")
print(f" - Positions with global reach (>10 distant): {np.sum(np.array(global_reach_counts) > 10)}/{num_positions}")
# Complexity verification
computation_per_position = k_neighbors
total_computation = num_positions * k_neighbors
full_attention_computation = num_positions * num_positions
print(f"\n2. Complexity Validation:")
print(f" - Computation per position: {computation_per_position} (O(k))")
print(f" - Total computation: {total_computation} (O(n·k) = O(n))")
print(f" - Full attention would be: {full_attention_computation} (O(n²))")
print(f" - Speedup: {full_attention_computation / total_computation:.1f}x")
print(f"\n{'='*80}")
print("VERDICT:")
print(f"{'='*80}")
verdict = f"""
The Cantor set with depth={depth} provides:
✓ O(n) complexity: {k_neighbors} operations per position vs {num_positions} for full attention
✓ Global coverage: {num_segments} containment zones spanning the full [0,1] spectrum
✓ Multi-scale: Fractal structure enables {np.mean(global_reach_counts):.1f} long-range connections per position
✓ Efficiency: {full_attention_computation / total_computation:.1f}x speedup over O(n²) attention
⚠ Coverage density: Only {len(unique_segments)}/{num_segments} segments occupied ({len(unique_segments)/num_segments*100:.1f}%)
⚠ Clustering: Average {np.mean(counts):.1f} positions per occupied segment
RECOMMENDATION: Depth={depth} is adequate for sequences up to {num_positions}.
For longer sequences, increase depth by 1 per 3x sequence length increase.
"""
print(verdict)
return {
'num_segments': num_segments,
'occupied_segments': len(unique_segments),
'occupancy_rate': len(unique_segments) / num_segments,
'mean_global_reach': np.mean(global_reach_counts),
'speedup': full_attention_computation / total_computation,
'cantor_coords': cantor_coords,
'segment_assignments': segment_assignments
}
if __name__ == "__main__":
# Analyze for different sequence lengths
for seq_len in [77, 256, 512]:
print("\n" * 2)
results = analyze_cantor_coverage(num_positions=seq_len, depth=8, k_neighbors=64)
if seq_len == 512:
# Visualize for longest sequence
import matplotlib.pyplot as plt
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
# Plot 1: Cantor coordinate distribution
ax = axes[0, 0]
ax.hist(results['cantor_coords'], bins=50, alpha=0.7, edgecolor='black')
ax.set_xlabel('Cantor Coordinate')
ax.set_ylabel('Frequency')
ax.set_title(f'Cantor Coordinate Distribution (n={seq_len})')
ax.grid(True, alpha=0.3)
# Plot 2: Position vs Cantor coordinate
ax = axes[0, 1]
positions = np.arange(seq_len)
ax.scatter(positions, results['cantor_coords'], alpha=0.5, s=10)
ax.set_xlabel('Sequence Position')
ax.set_ylabel('Cantor Coordinate')
ax.set_title('Position → Cantor Mapping')
ax.grid(True, alpha=0.3)
# Plot 3: Segment occupancy
ax = axes[1, 0]
unique_segs, counts = np.unique(results['segment_assignments'], return_counts=True)
ax.bar(unique_segs, counts, alpha=0.7, edgecolor='black')
ax.set_xlabel('Segment ID')
ax.set_ylabel('Positions in Segment')
ax.set_title(f'Containment Zone Occupancy ({results["occupied_segments"]}/{results["num_segments"]} zones)')
ax.grid(True, alpha=0.3)
# Plot 4: Cantor set structure
ax = axes[1, 1]
# Show the fractal structure
sorted_coords = np.sort(results['cantor_coords'])
ax.scatter(sorted_coords, np.zeros_like(sorted_coords), alpha=0.6, s=20)
ax.set_xlabel('Cantor Coordinate')
ax.set_title('Cantor Set Structure (Fractal Distribution)')
ax.set_yticks([])
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('/home/user/lattice_vocabulary/cantor_coverage_analysis.png', dpi=150)
print(f"\n📊 Visualization saved to: cantor_coverage_analysis.png")