Skip to content

Commit 6cdf107

Browse files
authored
feat(data): publish parquet files to R2 with download page (#51)
## Summary

- Add `scripts/copy_data_to_dist.py`, which copies parquet files for rendered dates to `site/dist/data/`
- Add a `/data/` page listing available downloads by date, with file sizes and row counts
- Update `justfile` with a `copy-data` recipe that is included in `publish`
- Update the sync workflow to run `copy-data` before the R2 upload
- Add a "Data downloads" link to the sidebar navigation (desktop and mobile)

## URL structure

- Data index: `/data/`
- Individual files: `/data/{date}/{query}.parquet`

Example: `/data/2026-01-20/blobs_per_slot.parquet`
1 parent fd525a4 commit 6cdf107

7 files changed

Lines changed: 320 additions & 4 deletions

File tree

.github/workflows/sync.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,9 @@ jobs:
147147
ASTRO_BASE: ${{ github.event_name == 'pull_request' && format('/pr-{0}/', github.event.pull_request.number) || '/' }}
148148
run: just build
149149

150+
- name: Copy data to dist
151+
run: just copy-data
152+
150153
# ============================================
151154
# Save Caches
152155
# ============================================

justfile

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,12 @@ render target="all" force="false":
6565
build:
6666
cd site && pnpm build
6767

68-
# Render all + build Astro
69-
publish: render build
68+
# Copy parquet files to dist for R2 publishing (only rendered dates)
69+
copy-data:
70+
uv run python scripts/copy_data_to_dist.py
71+
72+
# Render all + build Astro + copy data for publishing
73+
publish: render build copy-data
7074

7175
# ============================================
7276
# CI / Full Pipeline

scripts/copy_data_to_dist.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
"""Copy parquet files to site/dist for R2 publishing.
2+
3+
Only copies data for dates that have rendered notebooks (from site/rendered/manifest.json).
4+
This ensures data availability aligns with published notebook content.
5+
6+
Usage:
7+
uv run python scripts/copy_data_to_dist.py
8+
"""
9+
10+
from __future__ import annotations
11+
12+
import json
13+
import shutil
14+
from pathlib import Path
15+
16+
17+
def load_rendered_dates(rendered_manifest_path: Path) -> list[str]:
    """Load the list of rendered dates from the rendered-notebook manifest.

    Args:
        rendered_manifest_path: Path to the rendered manifest JSON file.

    Returns:
        The entries of the manifest's ``"dates"`` value (the keys when it is
        a mapping), in file order. An empty list is returned when the file
        does not exist or when ``"dates"`` is missing or null.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    try:
        # JSON is UTF-8 by specification; do not rely on the locale default.
        raw = rendered_manifest_path.read_text(encoding="utf-8")
    except FileNotFoundError:
        return []

    # ``or {}`` also covers an explicit ``"dates": null`` in the manifest.
    dates = json.loads(raw).get("dates") or {}
    # Iterating a mapping yields its keys; a plain list is passed through.
    return list(dates)
25+
26+
27+
def copy_data_for_date(
    source_dir: Path, dest_dir: Path, date: str
) -> tuple[int, int]:
    """Copy the parquet files for a single date into the destination tree.

    Regular files matching ``*.parquet`` directly inside ``source_dir / date``
    are copied with ``shutil.copy2`` to ``dest_dir / date``, keeping their
    file names.

    Args:
        source_dir: Root directory holding one sub-directory per date.
        dest_dir: Root directory to copy into; ``dest_dir / date`` is created
            on demand.
        date: Name of the date sub-directory to copy.

    Returns:
        ``(file_count, total_bytes)`` for the files copied. ``(0, 0)`` when
        the source date directory is missing or holds no parquet files; in
        that case nothing is created under ``dest_dir``.
    """
    source_date_dir = source_dir / date
    if not source_date_dir.is_dir():
        return 0, 0

    # Sorted for a deterministic copy order. Non-file matches are skipped
    # because copy2 cannot copy a directory.
    parquet_files = sorted(
        path for path in source_date_dir.glob("*.parquet") if path.is_file()
    )
    if not parquet_files:
        # Do not leave an empty date directory behind in the publish tree.
        return 0, 0

    dest_date_dir = dest_dir / date
    dest_date_dir.mkdir(parents=True, exist_ok=True)

    total_bytes = 0
    for parquet_file in parquet_files:
        shutil.copy2(parquet_file, dest_date_dir / parquet_file.name)
        total_bytes += parquet_file.stat().st_size

    return len(parquet_files), total_bytes
49+
50+
51+
def format_size(size_bytes: int) -> str:
    """Render a byte count as a short human-readable string.

    Counts below 1024 are shown as whole bytes (``"512 B"``). Counts below
    1024 * 1024 are shown in KB, and everything else in MB, both with one
    decimal place.
    """
    kib = 1024
    mib = kib * kib

    if size_bytes < kib:
        return f"{size_bytes} B"

    if size_bytes < mib:
        scaled, unit = size_bytes / kib, "KB"
    else:
        scaled, unit = size_bytes / mib, "MB"
    return f"{scaled:.1f} {unit}"
59+
60+
61+
def main() -> None:
    """Copy parquet data for every rendered date into ``site/dist/data``.

    Reads the rendered dates from ``site/rendered/manifest.json``, copies each
    date's parquet files out of ``notebooks/data`` and, when it exists, copies
    ``notebooks/data/manifest.json`` next to them. Progress and totals are
    printed to stdout. Returns early, without creating the destination, when
    no rendered dates are found.
    """
    # Resolve first so the project root is still correct when __file__ is a
    # bare relative filename (script launched from inside scripts/).
    project_root = Path(__file__).resolve().parent.parent
    data_source = project_root / "notebooks" / "data"
    data_dest = project_root / "site" / "dist" / "data"
    rendered_manifest = project_root / "site" / "rendered" / "manifest.json"
    data_manifest = data_source / "manifest.json"

    print("Copying parquet data to site/dist for R2 publishing...")

    # Sort once; the same order is used for the summary line and the copy.
    rendered_dates = sorted(load_rendered_dates(rendered_manifest))
    if not rendered_dates:
        print("No rendered dates found in manifest. Nothing to copy.")
        return

    print(f"Found {len(rendered_dates)} rendered date(s): {', '.join(rendered_dates)}")

    data_dest.mkdir(parents=True, exist_ok=True)

    total_files = 0
    total_size = 0

    for date in rendered_dates:
        files, size = copy_data_for_date(data_source, data_dest, date)
        if files > 0:
            print(f"  {date}: {files} file(s), {format_size(size)}")
            total_files += files
            total_size += size
        else:
            print(f"  {date}: No parquet files found in source")

    # The data manifest is copied as-is, independent of the per-date results.
    if data_manifest.exists():
        shutil.copy2(data_manifest, data_dest / "manifest.json")
        print("  Copied manifest.json")

    print(f"\nTotal: {total_files} file(s), {format_size(total_size)}")
    print(f"Output: {data_dest}")
98+
99+
100+
if __name__ == "__main__":
101+
main()

site/src/components/Icon.astro

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
---
2-
import { Activity, AlertCircle, AlertTriangle, Calendar, ChevronLeft, ChevronRight, Eye, FileText, Gavel, Grid3x3, Layers, Link, List, PanelLeft, Timer, XCircle } from 'lucide-react';
2+
import { Activity, AlertCircle, AlertTriangle, Calendar, ChevronLeft, ChevronRight, Clock, Download, Eye, FileText, Gauge, Gavel, Grid3x3, Layers, Link, List, PanelLeft, Timer, XCircle } from 'lucide-react';
33
44
interface Props {
55
name: string;
@@ -19,8 +19,11 @@ const icons: Record<string, any> = {
1919
Calendar,
2020
ChevronLeft,
2121
ChevronRight,
22+
Clock,
23+
Download,
2224
Eye,
2325
FileText,
26+
Gauge,
2427
Gavel,
2528
Grid3x3,
2629
Layers,

site/src/components/Sidebar.astro

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,4 +93,29 @@ const base = import.meta.env.BASE_URL;
9393
</div>
9494
)
9595
}
96+
97+
<!-- Resources Section -->
98+
<div class="flex flex-col gap-1">
99+
<div class="mb-0.5 px-2">
100+
<span class="text-muted-foreground text-[0.625rem] font-semibold tracking-wide uppercase">Resources</span>
101+
</div>
102+
<ul class="m-0 flex list-none flex-col p-0">
103+
<li>
104+
<a
105+
href={`${base}data`}
106+
class:list={[
107+
'group text-muted-foreground relative flex items-center gap-2 px-2 py-1.5 text-[0.8125rem] no-underline transition-all duration-200',
108+
"before:bg-primary before:absolute before:top-1/2 before:left-0 before:h-0 before:w-[2px] before:-translate-y-1/2 before:transition-all before:duration-200 before:content-['']",
109+
'hover:text-foreground hover:bg-muted hover:before:h-1/2',
110+
currentPath.startsWith('/data') ? 'text-foreground bg-[var(--mauve-4)] font-medium before:!h-[60%]' : '',
111+
]}
112+
>
113+
<span class="group-hover:text-primary flex shrink-0 items-center justify-center opacity-50 transition-all duration-200 group-hover:opacity-100">
114+
<Icon name="Download" size={12} />
115+
</span>
116+
<span class="min-w-0 flex-1 overflow-hidden text-ellipsis whitespace-nowrap">Data downloads</span>
117+
</a>
118+
</li>
119+
</ul>
120+
</div>
96121
</nav>

site/src/components/ui/SidebarList.tsx

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import React from 'react';
22
import { cn, toPathDate, formatShortDate } from '@/lib/utils';
3-
import { FileText, Calendar } from 'lucide-react';
3+
import { FileText, Calendar, Download } from 'lucide-react';
44

55
interface Notebook {
66
id: string;
@@ -92,6 +92,31 @@ export function SidebarList({ notebooks, latestDate, historicalDates, currentPat
9292
</ul>
9393
</div>
9494
)}
95+
96+
{/* Resources Section */}
97+
<div className="flex flex-col gap-1">
98+
<div className="mb-0.5 px-2">
99+
<span className="text-muted-foreground text-[0.625rem] font-semibold tracking-wide uppercase">Resources</span>
100+
</div>
101+
<ul className="m-0 flex list-none flex-col p-0">
102+
<li>
103+
<a
104+
href={`${base}data`}
105+
className={cn(
106+
'group text-muted-foreground relative flex items-center gap-2 px-2 py-1.5 text-[0.8125rem] no-underline transition-all duration-200',
107+
"before:bg-primary before:absolute before:top-1/2 before:left-0 before:h-0 before:w-[2px] before:-translate-y-1/2 before:transition-all before:duration-200 before:content-['']",
108+
'hover:text-foreground hover:bg-muted hover:before:h-1/2',
109+
currentPath.startsWith('/data') ? 'text-foreground bg-[var(--mauve-4)] font-medium before:!h-[60%]' : '',
110+
)}
111+
>
112+
<span className="group-hover:text-primary flex shrink-0 items-center justify-center opacity-50 transition-all duration-200 group-hover:opacity-100">
113+
<Download size={12} />
114+
</span>
115+
<span className="min-w-0 flex-1 overflow-hidden text-ellipsis whitespace-nowrap">Data downloads</span>
116+
</a>
117+
</li>
118+
</ul>
119+
</div>
95120
</nav>
96121
);
97122
}

site/src/pages/data/index.astro

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
---
2+
import BaseLayout from '@/layouts/BaseLayout.astro';
3+
import Icon from '@/components/Icon.astro';
4+
import { formatDisplayDate } from '@/lib/utils';
5+
import fs from 'node:fs';
6+
import path from 'node:path';
7+
8+
// Data manifest structure
9+
interface QueryData {
10+
fetched_at: string;
11+
query_hash: string;
12+
row_count: number;
13+
file_size_bytes: number;
14+
}
15+
16+
interface DataManifest {
17+
dates: string[];
18+
latest: string;
19+
date_queries: Record<string, Record<string, QueryData>>;
20+
}
21+
22+
// Load data manifest from notebooks/data/manifest.json
23+
/**
 * Load the data manifest from `../notebooks/data/manifest.json`, resolved
 * against the current working directory.
 *
 * Returns an empty manifest when the file is missing, cannot be read, or is
 * not valid JSON (read/parse failures are logged). Fields that are absent or
 * of the wrong basic type fall back to empty defaults, so callers can always
 * iterate `dates` and index `date_queries`.
 */
function loadDataManifest(): DataManifest {
  const empty: DataManifest = { dates: [], latest: '', date_queries: {} };
  const manifestPath = path.join(process.cwd(), '..', 'notebooks', 'data', 'manifest.json');
  try {
    if (!fs.existsSync(manifestPath)) {
      return empty;
    }
    const parsed = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
    // JSON.parse can yield null, a primitive, or an array; none is a manifest.
    if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
      return empty;
    }
    const dateQueries = parsed.date_queries;
    return {
      dates: Array.isArray(parsed.dates) ? parsed.dates : empty.dates,
      latest: typeof parsed.latest === 'string' ? parsed.latest : empty.latest,
      date_queries:
        dateQueries !== null && typeof dateQueries === 'object' && !Array.isArray(dateQueries)
          ? dateQueries
          : empty.date_queries,
    };
  } catch (e) {
    console.error('Failed to load data manifest', e);
    return empty;
  }
}
35+
36+
// Format bytes to human-readable size
37+
/**
 * Render a byte count as a short human-readable string: whole bytes below
 * 1024, otherwise KB (below 1024 * 1024) or MB, each with one decimal place.
 */
function formatFileSize(bytes: number): string {
  const KIB = 1024;
  const MIB = KIB * KIB;

  let label: string;
  if (bytes < KIB) {
    label = `${bytes} B`;
  } else if (bytes < MIB) {
    label = `${(bytes / KIB).toFixed(1)} KB`;
  } else {
    label = `${(bytes / MIB).toFixed(1)} MB`;
  }
  return label;
}
42+
43+
const manifest = loadDataManifest();
44+
const base = import.meta.env.BASE_URL;
45+
46+
// Sort dates newest first
47+
const sortedDates = [...manifest.dates].sort().reverse();
48+
49+
// Get total file count and size
50+
let totalFiles = 0;
51+
let totalSize = 0;
52+
for (const date of sortedDates) {
53+
const queries = manifest.date_queries[date] || {};
54+
for (const query of Object.values(queries)) {
55+
totalFiles++;
56+
totalSize += query.file_size_bytes;
57+
}
58+
}
59+
---
60+
61+
<BaseLayout title="Data downloads">
62+
<div class="max-w-4xl">
63+
<!-- Header -->
64+
<header class="mb-12">
65+
<div class="text-muted-foreground bg-muted mb-6 inline-flex items-center gap-2 px-3 py-1.5 font-mono text-[0.6875rem] tracking-widest uppercase">
66+
<Icon name="Download" size={12} />
67+
<span>Parquet Files</span>
68+
</div>
69+
<h1 class="m-0 mb-6 font-serif text-4xl leading-tight font-normal -tracking-wide max-md:text-3xl">
70+
<span class="text-foreground">Data downloads</span>
71+
</h1>
72+
<p class="text-muted-foreground max-w-xl text-base leading-relaxed">
73+
Parquet files generated by our data pipelines, from which the notebook visualizations are rendered.
74+
Download them for your own analysis.
75+
</p>
76+
<div class="text-muted-foreground mt-4 flex items-center gap-4 text-sm">
77+
<span>{totalFiles} files</span>
78+
<span class="text-border">|</span>
79+
<span>{formatFileSize(totalSize)} total</span>
80+
<span class="text-border">|</span>
81+
<span>{sortedDates.length} dates</span>
82+
</div>
83+
</header>
84+
85+
<!-- Data by Date -->
86+
{sortedDates.map((date) => {
87+
const queries = manifest.date_queries[date] || {};
88+
const queryNames = Object.keys(queries).sort();
89+
const isLatest = date === manifest.latest;
90+
91+
return (
92+
<section class="mb-10">
93+
<div class="mb-4 flex items-center gap-3">
94+
<h2 class="m-0 font-serif text-xl font-normal -tracking-tight">
95+
{formatDisplayDate(date)}
96+
</h2>
97+
{isLatest && (
98+
<span class="bg-primary text-primary-foreground px-2 py-0.5 text-[0.625rem] font-mono tracking-wide uppercase">
99+
Latest
100+
</span>
101+
)}
102+
</div>
103+
104+
<div class="border-border bg-card overflow-hidden border">
105+
<table class="w-full text-sm">
106+
<thead>
107+
<tr class="border-border bg-muted border-b">
108+
<th class="text-muted-foreground px-4 py-2.5 text-left font-mono text-[0.6875rem] tracking-wide uppercase font-medium">File</th>
109+
<th class="text-muted-foreground px-4 py-2.5 text-right font-mono text-[0.6875rem] tracking-wide uppercase font-medium">Rows</th>
110+
<th class="text-muted-foreground px-4 py-2.5 text-right font-mono text-[0.6875rem] tracking-wide uppercase font-medium">Size</th>
111+
</tr>
112+
</thead>
113+
<tbody>
114+
{queryNames.map((queryName, idx) => {
115+
const query = queries[queryName];
116+
const filename = `${queryName}.parquet`;
117+
const downloadUrl = `${base}data/${date}/${filename}`;
118+
119+
return (
120+
<tr class={`group border-border ${idx < queryNames.length - 1 ? 'border-b' : ''} hover:bg-muted/50 transition-colors`}>
121+
<td class="px-4 py-3">
122+
<a
123+
href={downloadUrl}
124+
download
125+
class="text-foreground hover:text-primary inline-flex items-center gap-2 no-underline transition-colors"
126+
>
127+
<code class="font-mono text-[0.8125rem]">{filename}</code>
128+
<span class="text-muted-foreground group-hover:text-primary opacity-0 group-hover:opacity-100 transition-opacity">
129+
<Icon name="Download" size={14} />
130+
</span>
131+
</a>
132+
</td>
133+
<td class="text-muted-foreground px-4 py-3 text-right font-mono text-[0.8125rem]">
134+
{query.row_count.toLocaleString()}
135+
</td>
136+
<td class="text-muted-foreground px-4 py-3 text-right font-mono text-[0.8125rem]">
137+
{formatFileSize(query.file_size_bytes)}
138+
</td>
139+
</tr>
140+
);
141+
})}
142+
</tbody>
143+
</table>
144+
</div>
145+
</section>
146+
);
147+
})}
148+
149+
{sortedDates.length === 0 && (
150+
<div class="border-border bg-muted text-muted-foreground border p-8 text-center">
151+
No data files available yet.
152+
</div>
153+
)}
154+
</div>
155+
</BaseLayout>

0 commit comments

Comments
 (0)