|
| 1 | +% ====================================================================== |
| 2 | +% figures.tex -- all figures for the GenoLeWM paper. |
| 3 | +% Self-contained: requires tikz, pgfplots, xcolor (loaded in main preamble). |
| 4 | +% Color + style definitions are guarded so this file also compiles standalone. |
| 5 | +% ====================================================================== |
% TikZ libraries used by the figures in this file: relative placement
% (`right=... of`), Stealth arrow tips, `fit` nodes, the background layer and
% hatched bar patterns.  Requesting a library that is already loaded is a
% no-op, so this is safe when the main preamble loads them as well.
\usetikzlibrary{positioning,arrows.meta,fit,backgrounds,patterns}

% Palette.  \providecolor (xcolor) defines a colour only if the name is still
% free, so colours already set up by the main preamble are left untouched and
% the file still compiles when they are absent.
% NOTE(review): the empty \providecommand macros are kept only for backward
% compatibility; nothing in this file expands them -- candidates for removal.
\providecommand{\genocolor}{}\providecolor{genocolor}{HTML}{2F6F9F}
\providecommand{\carboncolor}{}\providecolor{carboncolor}{HTML}{C24A3B}
\providecommand{\baselinecolor}{}\providecolor{baselinecolor}{HTML}{8A8A8A}
% Light fills for the frozen (state) and trainable (action) boxes.
\providecolor{statefill}{HTML}{EAF1F7}
\providecolor{actfill}{HTML}{FBE9E6}
| 11 | + |
| 12 | +% ---------------------------------------------------------------------- |
| 13 | +% Figure 1: system architecture / latent-edit pipeline |
| 14 | +% ---------------------------------------------------------------------- |
% \figArchitecture -- emits the architecture float (label fig:architecture).
% Takes no arguments.  The picture is scaled to \linewidth via \resizebox.
% Needs the TikZ libraries positioning, arrows.meta, fit and backgrounds.
\newcommand{\figArchitecture}{%
\begin{figure}[t]
  \centering
  \resizebox{\linewidth}{!}{%
  \begin{tikzpicture}[
      font=\small,
      % generic rounded box; `frozen` and `train` are colour variants of it
      box/.style={draw,rounded corners=2pt,minimum height=8mm,inner sep=4pt,align=center},
      frozen/.style={box,draw=genocolor,fill=statefill,very thick},
      train/.style={box,draw=carboncolor,fill=actfill,very thick},
      % unframed input labels, circular latent nodes, arrow style
      io/.style={align=center,font=\footnotesize},
      lat/.style={circle,draw,minimum size=8mm,inner sep=1pt,font=\footnotesize},
      arr/.style={-{Stealth[length=2.2mm]},thick},
    ]
    % column 1: inputs (edit action, reference window, edited window)
    \node[io] (edit) {edit action $a$\\[-1pt]\scriptsize(pos, type, ref, alt)};
    \node[io,below=10mm of edit] (wref) {reference window\\[-1pt]$w_{\mathrm{ref}}$ ($\approx$12\,kbp)};
    \node[io,below=12mm of wref] (walt) {edited window\\[-1pt]$w_{\mathrm{alt}}$};

    % column 2: encoders
    \node[train,right=12mm of edit] (aenc) {action\\encoder};
    \node[frozen,right=12mm of wref] (enc1) {Carbon-500M\\encoder \textbf{(frozen)}};
    \node[frozen,right=12mm of walt] (enc2) {Carbon-500M\\encoder \textbf{(frozen)}};

    % column 3: latents
    \node[lat,right=10mm of aenc] (aemb) {$a_{\mathrm{emb}}$};
    \node[lat,right=10mm of enc1] (st) {$s_t$};
    \node[lat,right=10mm of enc2] (stp1) {$s_{t+1}$};

    % predictor, its output, the loss, and the downstream-use annotation
    \node[train,right=16mm of st,minimum height=13mm] (pred) {cross-attention\\predictor $g$};
    \node[lat,right=12mm of pred] (shat) {$\hat s_{t+1}$};
    \node[box,draw=black!55,right=10mm of shat] (loss) {$\mathcal{L}_{\mathrm{pred}}$};
    \node[io,below=7mm of pred,text=black!75,align=center] (down)
      {\textbf{downstream (latent-only, no Carbon call):}\\[-1pt]
       surprise $\|\hat s_{t+1}-s_{t+1}\|$ \;$\cdot$\; AR rollout \;$\cdot$\; CEM planning};

    % data flow
    \draw[arr] (edit) -- (aenc);
    \draw[arr] (wref) -- (enc1);
    \draw[arr] (walt) -- (enc2);
    \draw[arr] (aenc) -- (aemb);
    \draw[arr] (enc1) -- (st);
    \draw[arr] (enc2) -- (stp1);
    % Enter the predictor from above, 4mm left of its top centre.  The path
    % stops on the border so the arrow tip points down into the box rather
    % than running sideways along the top edge.
    \draw[arr] (aemb) -| ([xshift=-4mm]pred.north);
    \draw[arr] (st) -- (pred);
    \draw[arr] (pred) -- (shat);
    \draw[arr] (shat) -- (loss);
    \draw[arr] (stp1) -| (loss.south);
    % dashed link from the prediction down to the downstream annotation
    \draw[arr,dashed,black!55] (shat.south) -- ([yshift=2mm]down.east -| shat.south) ;

    % dashed frame around the frozen encoder path, drawn behind the nodes
    \begin{scope}[on background layer]
      \node[fit=(enc1)(enc2)(wref)(walt),draw=genocolor,dashed,rounded corners,inner sep=6pt,
            label={[genocolor,font=\footnotesize]above:frozen state encoder --- paid once per window}] {};
    \end{scope}
  \end{tikzpicture}}
  \caption{\textbf{GenoLeWM as an action-conditioned latent world model over genomic edits.}
  A frozen Carbon-500M DNA encoder maps a reference window $w_{\mathrm{ref}}$ to a state
  $s_t\in\mathbb{R}^{1024}$; a small trainable action encoder maps an explicit edit
  $a=(\text{pos},\text{type},\text{ref},\text{alt})$ to $a_{\mathrm{emb}}$; and a cross-attention
  predictor $g$ estimates the post-edit state $\hat s_{t+1}=g(s_t,a_{\mathrm{emb}})$. Only the action
  encoder and predictor are trained ($\approx$40M params); the training target $s_{t+1}$ is the frozen
  encoding of the edited window $w_{\mathrm{alt}}$. Once $\hat s_{t+1}$ is available, every downstream
  use (surprise scoring, multi-edit rollout, CEM planning) operates entirely in latent space and never
  re-invokes Carbon.}
  \label{fig:architecture}
\end{figure}}
| 76 | + |
| 77 | +% ---------------------------------------------------------------------- |
| 78 | +% Figure 2: VEP results vs Carbon zero-shot |
| 79 | +% ---------------------------------------------------------------------- |
% \figVEP -- emits the variant-effect-prediction bar chart (label fig:vep).
% Takes no arguments.  Two grouped ybar series (GenoLeWM, Carbon zero-shot)
% over four benchmark slices.
\newcommand{\figVEP}{%
\begin{figure}[t]
  \centering
  \begin{tikzpicture}
    \begin{axis}[
      width=\linewidth, height=6cm, ybar, bar width=9pt,
      ymin=-0.15, ymax=1.05,
      ylabel={score (higher is better)},
      % Plain identifiers serve as coordinate keys; the printed two-line
      % labels live in `xticklabels` (same order).  This keeps `\\` and
      % parentheses out of the key lookup.
      symbolic x coords={cvcoding,cvnoncoding,brca2,traitgym},
      xtick=data,
      xticklabels={{ClinVar coding\\(AUROC)},{ClinVar non-coding\\(AUROC)},{BRCA2\\(Spearman)},{TraitGym\\(Spearman)}},
      xticklabel style={align=center,font=\small},
      legend pos=north east, legend style={font=\small,draw=black!30},
      enlarge x limits=0.18, ymajorgrids, grid style=dotted,
      % print each bar's value above it, fixed-point with 3 decimals
      nodes near coords, every node near coord/.append style={font=\tiny,/pgf/number format/.cd,fixed,precision=3},
    ]
      % series 1: GenoLeWM
      \addplot[fill=genocolor,draw=genocolor] coordinates {
        (cvcoding,0.734375)
        (cvnoncoding,0.5625)
        (brca2,0.149194)
        (traitgym,-0.0279645)};
      % series 2: Carbon zero-shot baseline
      \addplot[fill=carboncolor,draw=carboncolor] coordinates {
        (cvcoding,0.921875)
        (cvnoncoding,0.875)
        (brca2,0.476907)
        (traitgym,-0.0838935)};
      \legend{GenoLeWM, Carbon zero-shot}
    \end{axis}
  \end{tikzpicture}
  % NOTE(review): 0.734375 = 47/64 and 0.921875 = 59/64 are multiples of 1/64,
  % not of 1/16 as the caption states -- confirm the quantisation claim.
  \caption{\textbf{Variant-effect prediction: GenoLeWM trails the Carbon zero-shot baseline on the
  pathogenicity-bearing slices.} Bars are the released v0.2.1 values; Carbon values are recovered as
  GenoLeWM~$-$~$\Delta$. GenoLeWM is worse on ClinVar coding/non-coding AUROC and BRCA2 saturation
  Spearman; the only ``win'' (TraitGym) is between two scores indistinguishable from zero. Slices are
  small (metrics quantized in steps of $0.0625{=}1/16$, consistent with $\approx$16 evaluated variants
  per ClinVar slice); see \S\ref{sec:limitations}.}
  \label{fig:vep}
\end{figure}}
| 115 | + |
| 116 | +% ---------------------------------------------------------------------- |
| 117 | +% Figure 3: rollout fidelity vs source-state baseline (the baseline trap) |
| 118 | +% ---------------------------------------------------------------------- |
% \figRollout -- emits the rollout-fidelity bar chart (label fig:rollout).
% Takes no arguments.  Compares the trained predictor with the "no change"
% source-state baseline on two evaluation sets.
\newcommand{\figRollout}{%
\begin{figure}[t]
  \centering
  \begin{tikzpicture}
    \begin{axis}[
      % geometry and bar layout
      width=0.86\linewidth,
      height=5.4cm,
      ybar,
      bar width=18pt,
      % y axis
      ymin=0,
      ymax=1.08,
      ylabel={mean cosine to true $s_{t+1}$},
      % x axis: one category per evaluation set
      symbolic x coords={phased haplotypes,synthetic edit chains},
      xtick=data,
      xticklabel style={font=\small},
      % legend, padding and grid
      legend pos=south east,
      legend style={font=\small,draw=black!30},
      enlarge x limits=0.5,
      ymajorgrids,
      grid style=dotted,
      % value printed above each bar, fixed-point with 3 decimals
      nodes near coords,
      every node near coord/.append style={font=\footnotesize,/pgf/number format/.cd,fixed,precision=3},
    ]
      % trained predictor
      \addplot[fill=genocolor,draw=genocolor] coordinates {
        (phased haplotypes,0.288861)
        (synthetic edit chains,0.301608)};
      \addlegendentry{GenoLeWM predictor}
      % do-nothing baseline
      \addplot[fill=baselinecolor,draw=baselinecolor] coordinates {
        (phased haplotypes,0.997831)
        (synthetic edit chains,0.991239)};
      \addlegendentry{source-state baseline $\hat s_{t+1}{=}s_t$}
    \end{axis}
  \end{tikzpicture}
  \caption{\textbf{The latent-residual baseline trap.} A trivial baseline that predicts \emph{no change}
  ($\hat s_{t+1}{=}s_t$) attains cosine $\approx0.99$ to the true post-edit state, because a single
  SNV barely perturbs a 1024-dimensional frozen Carbon embedding. The trained predictor's outputs sit at
  cosine $\approx0.29$--$0.30$ --- far below the do-nothing baseline (deltas $-0.709$ and $-0.690$).
  This is the central diagnostic for why edit-conditioned latent prediction is hard in this regime
  (\S\ref{sec:discussion}).}
  \label{fig:rollout}
\end{figure}}
| 146 | + |
| 147 | +% ---------------------------------------------------------------------- |
| 148 | +% Figure 4: AR rollout speedup vs target |
| 149 | +% ---------------------------------------------------------------------- |
% \figARspeed -- emits the AR-rollout speedup bar chart (label fig:arspeed).
% Takes no arguments.  One ybar series plus a dashed horizontal line marking
% the 5x target.
\newcommand{\figARspeed}{%
\begin{figure}[t]
  \centering
  \begin{tikzpicture}
    \begin{axis}[
      width=0.7\linewidth, height=4.8cm, ybar, bar width=24pt,
      ymin=0, ymax=5.7,
      ylabel={speedup over naive rollout ($\times$)},
      % Plain identifiers serve as coordinate keys; the math-mode labels are
      % supplied separately so markup is not repeated in every coordinate.
      symbolic x coords={K5,K20},
      xtick=data,
      xticklabels={{$K{=}5$},{$K{=}20$}},
      enlarge x limits=0.6, ymajorgrids, grid style=dotted,
      % value printed above each bar, fixed-point with 2 decimals
      nodes near coords, every node near coord/.append style={font=\small,/pgf/number format/.cd,fixed,precision=2},
    ]
      \addplot[fill=genocolor,draw=genocolor] coordinates {(K5,2.41386) (K20,2.47322)};
      % Target line: x spans the full axis width (rel axis cs 0..1), y is
      % taken from the data coordinate 5.0.
      \draw[carboncolor,thick,dashed]
        ({rel axis cs:0,0}|-{axis cs:K5,5.0}) -- ({rel axis cs:1,0}|-{axis cs:K20,5.0})
        node[pos=0.5,above,font=\small,carboncolor]{RFC-0004 target $5.0\times$ (open, \#42)};
    \end{axis}
  \end{tikzpicture}
  \caption{\textbf{KV-cached autoregressive rollout speedup.} Cached \texttt{ARPredictor} rollout is
  $2.41\times$ ($K{=}5$) and $2.47\times$ ($K{=}20$) faster than naive repeated one-step prediction ---
  real but short of the $5\times$ target at $K{=}20$, which remains open and was explicitly rescoped.
  Measured on toy synthetic dimensions ($d_{\mathrm{state}}{=}64$, CPU, fp32); not Carbon-scale.}
  \label{fig:arspeed}
\end{figure}}
| 174 | + |
| 175 | +% ---------------------------------------------------------------------- |
| 176 | +% Figure: efficiency three-regime latency decomposition |
| 177 | +% ---------------------------------------------------------------------- |
% \figEfficiency -- emits the three-regime latency chart (label fig:efficiency).
% Takes no arguments.  Log-scale ybar plot: one solid series for the measured
% regime and one hatched series for the two estimated regimes.
% Needs the TikZ `patterns` library for the hatched fill.
\newcommand{\figEfficiency}{%
\begin{figure}[t]
  \centering
  \begin{tikzpicture}
    \begin{semilogyaxis}[
      width=0.92\linewidth, height=5.8cm, ybar, bar width=30pt,
      % The two series cover disjoint categories, so the automatic per-series
      % offset of grouped bars would push every bar off its tick.
      bar shift=0pt,
      % bars grow from the bottom of the axis rather than from y=1
      ymin=0.3, ymax=900000, log origin=infty,
      ylabel={per-variant latency (ms, log scale)},
      % Plain identifiers serve as coordinate keys; the printed two-line
      % labels live in `xticklabels` (same order).
      symbolic x coords={cold,warm,latent},
      xtick=data,
      xticklabels={{Regime 1\\cold subprocess},{Regime 2\\warm cache},{Regime 3\\latent rollout}},
      xticklabel style={align=center,font=\small},
      enlarge x limits=0.28, ymajorgrids, grid style=dotted,
      % bar annotations come from the bracketed text after each coordinate
      point meta=explicit symbolic,
      nodes near coords, every node near coord/.append style={font=\footnotesize,anchor=south},
    ]
      % measured
      \addplot[fill=carboncolor,draw=carboncolor] coordinates {
        (cold,115262.94) [115{,}263\,ms]};
      % estimated, not measured
      \addplot[pattern=north east lines,pattern color=baselinecolor,draw=baselinecolor] coordinates {
        (warm,95) [${\sim}95$\,ms (est.)]
        (latent,1) [${\sim}1$\,ms (est.)]};
      \legend{measured, estimated (unmeasured)}
    \end{semilogyaxis}
  \end{tikzpicture}
  \caption{\textbf{The efficiency thesis is architecturally sound but unmeasured in the regime it
  describes.} Only \emph{Regime 1} --- a cold \texttt{geno-lewm-score} subprocess (interpreter start,
  ${\sim}1$\,GiB Carbon load, two Carbon forward passes) --- is measured, at $115{,}262.94$\,ms; the
  ${\sim}160$\,ms of actual Carbon compute is dwarfed by process start and model loading. The
  ``pay-Carbon-once'' thesis applies to \emph{Regime 2} (warm reference cache: one Carbon call on the
  edited window, in-process) and \emph{Regime 3} (latent-only rollout/planning: zero further Carbon
  calls), both \emph{unmeasured} on this checkpoint; the hatched bars are order-of-magnitude estimates,
  not results (\S\ref{sec:efficiency}). The $115$\,s figure is therefore evidence of cold-start cost,
  neither evidence that GenoLeWM is slow per inference nor that it is fast versus Carbon.}
  \label{fig:efficiency}
\end{figure}}
0 commit comments