Skip to content

Commit 50c6833

Browse files
committed
feat: support blob decoding to Parquet with tables-first querying (fixed/local auto switch, append-only)
1 parent 80b4157 commit 50c6833

37 files changed

Lines changed: 1433 additions & 654 deletions

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,4 +35,7 @@ node_modules/
3535

3636
apps/web/data/local/
3737
apps/web/data/_backups/
38-
apps/web/data/*/objects/
38+
apps/web/data/*/objects/
39+
scrape/
40+
41+
.cache/

apps/web/README.md

Lines changed: 0 additions & 8 deletions
This file was deleted.

apps/web/app.js

Lines changed: 110 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -3,135 +3,150 @@ import * as duckdb from "https://cdn.jsdelivr.net/npm/@duckdb/duckdb-wasm@1.28.0
33

44
// Shorthand for document.getElementById, used for every element lookup in this file.
const $id = (id) => {
  return document.getElementById(id);
};
55
const statusEl = $id("status");
6-
const metaEl = $id("meta");
7-
const errorEl = $id("error");
6+
const metaEl = $id("meta");
7+
const errorEl = $id("error");
88
const resultEl = $id("result");
9-
const sqlBox = $id("sql");
9+
const sqlBox = $id("sql");
1010
const examples = $id("examples");
1111

12-
const setStatus = (t, bg="#eef5ff") => { if (statusEl) { statusEl.textContent = t; statusEl.style.background = bg; }};
12+
/**
 * Show a status message in the status element; does nothing when that element is absent.
 * @param {string} t - text to display
 * @param {string} [bg="#eef5ff"] - CSS background applied to the status element
 */
const setStatus = (t, bg = "#eef5ff") => {
  if (!statusEl) return;
  statusEl.textContent = t;
  statusEl.style.background = bg;
};
1313
/**
 * Write an error message into the error element; a falsy message clears it.
 * Does nothing when the error element is absent.
 * @param {string} [m] - message to show
 */
const showError = (m) => {
  if (!errorEl) return;
  errorEl.textContent = m || "";
};
1414

1515
// ---------- config helpers ----------
16-
function getMeta(name){ const el=document.querySelector(`meta[name="${name}"]`); return el?.content || ""; }
17-
function getQ(name){ return new URLSearchParams(location.search).get(name) || ""; }
16+
/**
 * Read the `content` of the first <meta name="..."> tag with the given name.
 * @param {string} name - value of the meta tag's `name` attribute
 * @returns {string} the content, or "" when the tag is missing or has no content
 */
function getMeta(name) {
  const selector = `meta[name="${name}"]`;
  const tag = document.querySelector(selector);
  if (!tag) return "";
  return tag.content || "";
}
17+
/**
 * Read one parameter from the page's query string.
 * @param {string} name - query parameter name
 * @returns {string} the parameter value, or "" when it is absent or empty
 */
function getQ(name) {
  const params = new URLSearchParams(location.search);
  const value = params.get(name);
  return value ? value : "";
}
1818

1919
/**
 * Decide which data subdirectory the page reads from: "fixed" or "local".
 *
 * An explicit override is looked up in this order: window.HX_DATA_SUBDIR,
 * <meta name="hx-data-subdir">, the `data_subdir` query parameter, then the
 * `mode` query parameter. The first non-empty one is compared
 * case-insensitively; if it is not "fixed" or "local" it is ignored.
 * Without a valid override, hosts whose name ends in "harborx.tech" use
 * "fixed" and every other host uses "local".
 *
 * @returns {"fixed"|"local"}
 */
function resolveDataSubdir() {
  // Parse the query string once instead of once per parameter.
  const query = new URLSearchParams(location.search);
  const override =
    window.HX_DATA_SUBDIR ||
    document.querySelector('meta[name="hx-data-subdir"]')?.content ||
    query.get("data_subdir") ||
    query.get("mode") ||
    "";

  // String() keeps a non-string global (e.g. a number) from throwing on toLowerCase().
  const normalized = String(override).toLowerCase();
  if (normalized === "fixed" || normalized === "local") return normalized;

  return location.hostname.endsWith("harborx.tech") ? "fixed" : "local";
}
2628

27-
function resolveManifestURL(){
28-
const override = window.HX_MANIFEST_URL || getMeta("hx-manifest-url") || getQ("manifest");
29-
if (override) return new URL(override, document.baseURI);
29+
/**
 * Create the four state-diff views in DuckDB from a `_tables.json` index.
 *
 * The index is fetched from `data/<sub>/state_diff/_tables.json` (relative to
 * document.baseURI). The subdirectory returned by resolveDataSubdir() is tried
 * first and the other one ("fixed" / "local") second. The index is read as an
 * object whose keys are view names and whose values are arrays of Parquet file
 * paths; non-string entries are ignored.
 *
 * For each known view:
 *   - no files  -> an empty view with the expected VARCHAR columns, so queries
 *                  against it still work;
 *   - otherwise -> files are grouped CHUNK at a time into TEMP views
 *                  `v_<view>_<n>`, and the public view is the UNION ALL of
 *                  those, which keeps each individual statement short.
 *
 * Finally a summary is written to the element with id "meta", if present.
 *
 * @param conn DuckDB connection; only `conn.query(sql)` is used.
 * @throws {Error} when no index could be loaded from either subdirectory, or
 *   when one of the CREATE VIEW statements fails.
 */
async function registerTablesFromJSON(conn) {
  const preferred = resolveDataSubdir();
  const candidates = preferred === "fixed" ? ["fixed", "local"] : ["local", "fixed"];

  let tables = null;
  let usedSub = null;
  for (const sub of candidates) {
    const indexURL = new URL(`data/${sub}/state_diff/_tables.json`, document.baseURI);
    try {
      const res = await fetch(indexURL.href, { cache: "no-store" });
      if (!res.ok) continue;
      tables = await res.json();
      usedSub = sub;
      break;
    } catch (err) {
      // A network failure or unparsable index must not prevent trying the
      // other subdirectory.
      console.warn(`could not load ${indexURL.href}:`, err);
    }
  }
  if (!tables) throw new Error("missing _tables.json in both fixed and local");

  try {
    await conn.query("INSTALL httpfs;");
    await conn.query("LOAD httpfs;");
  } catch (err) {
    // Best effort, as before: a failure here is not treated as fatal, but it
    // is reported instead of being dropped silently.
    console.warn("httpfs extension was not loaded:", err);
  }

  // Zero-row SELECTs that give each view its columns when it has no files.
  const EMPTY_SCHEMAS = {
    storage_diffs: `
      SELECT CAST(NULL AS VARCHAR) AS address,
             CAST(NULL AS VARCHAR) AS key,
             CAST(NULL AS VARCHAR) AS value
      WHERE 1=0`,
    declared_classes: `
      SELECT CAST(NULL AS VARCHAR) AS class_hash,
             CAST(NULL AS VARCHAR) AS compiled_class_hash
      WHERE 1=0`,
    deployed_or_replaced: `
      SELECT CAST(NULL AS VARCHAR) AS address,
             CAST(NULL AS VARCHAR) AS class_hash
      WHERE 1=0`,
    nonces: `
      SELECT CAST(NULL AS VARCHAR) AS contract_address,
             CAST(NULL AS VARCHAR) AS nonce
      WHERE 1=0`,
  };

  // Resolve against the page and encode "#" so it is not treated as a fragment.
  const toAbs = (rel) => new URL(rel, document.baseURI).href.replace(/#/g, "%23");
  // Quote a value as a SQL string literal; embedded single quotes are doubled.
  const sqlString = (text) => `'${text.replace(/'/g, "''")}'`;
  const CHUNK = 16;
  const loaded = [];

  for (const [viewName, emptySelect] of Object.entries(EMPTY_SCHEMAS)) {
    const listed = Array.isArray(tables[viewName]) ? tables[viewName] : [];
    const files = listed.filter((f) => typeof f === "string");

    if (files.length === 0) {
      await conn.query(`CREATE OR REPLACE VIEW ${viewName} AS ${emptySelect}`);
      loaded.push(`${viewName}(0)`);
      continue;
    }

    const chunkViews = [];
    for (let i = 0; i < files.length; i += CHUNK) {
      const select = files
        .slice(i, i + CHUNK)
        .map((f) => `SELECT * FROM read_parquet(${sqlString(toAbs(f))})`)
        .join(" UNION ALL ");
      const chunkView = `v_${viewName}_${(i / CHUNK) | 0}`;
      await conn.query(`CREATE OR REPLACE TEMP VIEW ${chunkView} AS ${select}`);
      chunkViews.push(chunkView);
    }

    const union = chunkViews.map((v) => `SELECT * FROM ${v}`).join(" UNION ALL ");
    await conn.query(`CREATE OR REPLACE VIEW ${viewName} AS ${union}`);
    loaded.push(`${viewName}(${files.length})`);
  }

  const metaNode = document.getElementById("meta");
  if (metaNode) {
    metaNode.textContent = `Tables: ${loaded.join(", ")} · Source: /data/${usedSub}/state_diff/_tables.json`;
  }
}
8591

86-
if (metaEl) {
87-
// 人类可读:显示使用的 manifest 与引擎
88-
const used = manifestURL.href.replace(location.origin,"");
89-
metaEl.textContent = `Files: ${files.length} · Engine: read_${engine} · Manifest: ${used}`;
90-
}
92+
/**
 * Work out the URL of manifest.json.
 *
 * An explicit override wins, looked up in this order: window.HX_MANIFEST_URL,
 * <meta name="hx-manifest-url">, then the `manifest` query parameter. Without
 * one, the manifest inside the active data subdirectory is used. Either way
 * the path is resolved against document.baseURI.
 *
 * @returns {URL}
 */
function resolveManifestURL() {
  const explicit = window.HX_MANIFEST_URL || getMeta("hx-manifest-url") || getQ("manifest");
  const target = explicit ? explicit : `data/${resolveDataSubdir()}/manifest.json`;
  return new URL(target, document.baseURI);
}
9298

9399
// ---------- render ----------
94-
function renderTable(table){
95-
const rows=table.toArray(); const cols=table.schema.fields.map(f=>f.name);
96-
let html="<table><thead><tr>"; for(const c of cols) html+=`<th>${c}</th>`; html+="</tr></thead><tbody>";
97-
for(const r of rows){ html+="<tr>"; for(const c of cols){ let v=r[c];
98-
if(v && (v.BYTES_PER_ELEMENT || v instanceof ArrayBuffer)){
99-
const b=v instanceof ArrayBuffer?new Uint8Array(v):new Uint8Array(v.buffer||v);
100-
const hex=Array.from(b.slice(0,16)).map(x=>x.toString(16).padStart(2,'0')).join('');
101-
v=`0x${hex}${b.length>16?'…':''}`;
102-
}
103-
html+=`<td>${(v===null||v===undefined)?"":String(v)}</td>`;
104-
} html+="</tr>"; }
105-
html+="</tbody></table>"; resultEl.innerHTML=html;
100+
/** Escape text so it can be placed inside HTML element content or attributes. */
function escapeHTML(text) {
  return String(text)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}

/**
 * Turn one cell value into display text.
 * null/undefined become ""; binary values (ArrayBuffer or a typed-array view)
 * become "0x" plus the hex of their first 16 bytes, with "…" appended when
 * longer; everything else goes through String().
 */
function formatCell(value) {
  if (value === null || value === undefined) return "";
  if (value instanceof ArrayBuffer || ArrayBuffer.isView(value)) {
    // Respect the view's own window: a sub-array shares a larger backing
    // buffer, and reading `.buffer` alone would show the wrong bytes.
    const bytes = value instanceof ArrayBuffer
      ? new Uint8Array(value)
      : new Uint8Array(value.buffer, value.byteOffset, value.byteLength);
    const hex = Array.from(bytes.slice(0, 16), (b) => b.toString(16).padStart(2, "0")).join("");
    return `0x${hex}${bytes.length > 16 ? "…" : ""}`;
  }
  return String(value);
}

/**
 * Render a query result as an HTML table inside the result element.
 *
 * Column names and cell text are HTML-escaped before being assigned to
 * innerHTML, because both come from query results (file contents and
 * user-written SQL aliases) and must not be interpreted as markup.
 *
 * @param table Result object providing `toArray()` (rows indexable by column
 *   name) and `schema.fields` (each with a `name`).
 */
function renderTable(table) {
  const rows = table.toArray();
  const columns = table.schema.fields.map((f) => f.name);

  const head = columns.map((c) => `<th>${escapeHTML(c)}</th>`).join("");
  const body = rows
    .map((row) => {
      const cells = columns.map((c) => `<td>${escapeHTML(formatCell(row[c]))}</td>`).join("");
      return `<tr>${cells}</tr>`;
    })
    .join("");

  resultEl.innerHTML = `<table><thead><tr>${head}</tr></thead><tbody>${body}</tbody></table>`;
}
107116

108117
// ---------- boot ----------
109-
function pickPreferBrowser(bundles){ return bundles.browser ?? bundles.mvp ?? Object.values(bundles)[0]; }
110-
async function sameOriginWorkerURL(url){ const r=await fetch(url,{cache:"no-store"}); if(!r.ok) throw new Error("fetch worker "+r.status); return URL.createObjectURL(new Blob([await r.text()],{type:"text/javascript"})); }
118+
/**
 * Choose a bundle from a bundle map: the `browser` entry if set, else the
 * `mvp` entry, else whichever value comes first in the map.
 * @param {object} bundles - bundle descriptors keyed by name
 * @returns the selected bundle (undefined for an empty map)
 */
function pickPreferBrowser(bundles) {
  const preferred = bundles.browser;
  if (preferred != null) return preferred;

  const fallback = bundles.mvp;
  if (fallback != null) return fallback;

  return Object.values(bundles)[0];
}
119+
/**
 * Download a worker script and re-expose it as a blob: URL, so the worker can
 * be constructed from a URL created by this page rather than the remote one.
 * @param {string} url - location of the worker script
 * @returns {Promise<string>} object URL of a text/javascript Blob holding the script
 * @throws {Error} "fetch worker <status>" when the response is not ok
 */
async function sameOriginWorkerURL(url) {
  const response = await fetch(url, { cache: "no-store" });
  if (!response.ok) {
    throw new Error(`fetch worker ${response.status}`);
  }
  const source = await response.text();
  const blob = new Blob([source], { type: "text/javascript" });
  return URL.createObjectURL(blob);
}
111120

112-
async function boot(){
113-
try{
121+
/**
 * Start the page: bring up DuckDB in a worker, register the data views, and
 * wire the three buttons.
 *
 *   - "run":  execute the SQL in the #sql box and render the result;
 *   - "fill": copy the selected example query into the #sql box;
 *   - "init": clear the output and register the views again.
 *
 * Failures during start-up are logged and shown as "Boot error"; failures in
 * the "run" and "init" handlers are logged and shown as "Error". Nothing is
 * rethrown, so the returned promise always resolves.
 */
async function boot() {
  // Shared failure path for the button handlers.
  const reportError = (e) => {
    console.error(e);
    setStatus("Error", "#ffecec");
    showError(e?.message || String(e));
  };

  try {
    setStatus("Booting…");
    const bundles = duckdb.getJsDelivrBundles();
    const bundle = pickPreferBrowser(bundles);
    const workerURL = await sameOriginWorkerURL(bundle.mainWorker);
    const worker = new Worker(workerURL);
    const db = new duckdb.AsyncDuckDB(new duckdb.ConsoleLogger(), worker);
    await db.instantiate(bundle.mainModule);
    const conn = await db.connect();

    setStatus("Loading tables…");
    await registerTablesFromJSON(conn);
    setStatus("Ready");

    $id("run").onclick = async () => {
      const sql = ($id("sql").value || "").trim();
      if (!sql) return;
      // Drop the message from an earlier failed query so it does not linger
      // next to a fresh, successful result.
      showError("");
      const t0 = performance.now();
      try {
        const tbl = await conn.query(sql);
        renderTable(tbl);
        setStatus(`Done in ${(performance.now() - t0).toFixed(0)} ms`);
      } catch (e) {
        reportError(e);
      }
    };

    $id("fill").onclick = () => {
      const q = $id("examples").value;
      if (q) $id("sql").value = q;
    };

    $id("init").onclick = async () => {
      showError("");
      resultEl.innerHTML = "";
      setStatus("Reloading tables…");
      try {
        await registerTablesFromJSON(conn);
        setStatus("Ready");
      } catch (e) {
        // Without this the rejection is unhandled and the status stays at
        // "Reloading tables…" forever.
        reportError(e);
      }
    };
  } catch (e) {
    console.error(e);
    setStatus("Boot error", "#ffecec");
    showError(e?.message || String(e));
  }
}
137152
boot();
-561 KB
Binary file not shown.
-1.24 MB
Binary file not shown.
-1.24 MB
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)