#!/usr/bin/env bash
# build-duckdb.sh — Build a local DuckDB database from NovaMart CSV files
#
# Usage:
#   bash scripts/build-duckdb.sh          # Build from CSVs in data/novamart/
#   bash scripts/build-duckdb.sh --help   # Show this help
#
# Creates data/novamart/novamart.duckdb with all tables loaded.
# This is optional — Claude Code can query CSVs directly via DuckDB's
# read_csv() function, but a pre-built .duckdb file is faster for
# repeated queries.

# Strict mode: abort on an unhandled failure (-e), on use of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Input directory and output database. Both are relative paths, so the
# script has to be started from the repository root.
readonly DATA_DIR="data/novamart"
readonly DB_FILE="${DATA_DIR}/novamart.duckdb"

# ANSI colour codes. They are stored as literal backslash sequences and only
# become escape characters when printed with `echo -e` or `printf '%b'`.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m' # reset ("no colour")

#######################################
# Print the help text for this script.
# Globals:   DB_FILE (read), DATA_DIR (read)
# Arguments: none
# Outputs:   the usage text on stdout
#######################################
usage() {
  # Unquoted delimiter on purpose: ${DB_FILE} and ${DATA_DIR} must expand.
  cat <<EOF
Usage: bash scripts/build-duckdb.sh [--help]

Builds ${DB_FILE} from CSV files in ${DATA_DIR}/

Prerequisites:
 - Python 3.9+ with duckdb package: pip install duckdb
 - OR DuckDB CLI: brew install duckdb (macOS)
EOF
}

# --- Main ---
#
# Flow: handle --help, validate the working directory and the CSV inputs,
# pick a loader (Python duckdb package, else the DuckDB CLI), and only then
# replace any existing database. Reads DATA_DIR, DB_FILE and the colour
# constants defined near the top of the script.

#######################################
# Print an error message in red on stderr.
# Globals:   RED (read), NC (read)
# Arguments: $* - message text
# Outputs:   the coloured message on stderr
#######################################
err() {
  printf '%b\n' "${RED}$*${NC}" >&2
}

if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then
  usage
  exit 0
fi

# Ensure we're in the repo root: every path used below is relative to it.
if [[ ! -f "CLAUDE.md" ]]; then
  err "Error: Run this script from the AI Analyst repo root."
  echo " cd ~/Desktop/ai-analyst && bash scripts/build-duckdb.sh" >&2
  exit 1
fi

# Collect the CSV files with a glob instead of parsing `ls` output.
# nullglob makes a non-matching pattern (or a missing directory) expand to
# an empty list rather than to the literal pattern.
shopt -s nullglob
csv_files=("$DATA_DIR"/*.csv)
shopt -u nullglob

if ((${#csv_files[@]} == 0)); then
  err "Error: No CSV files found in ${DATA_DIR}/"
  {
    echo ""
    echo "Run the download script first:"
    echo " bash scripts/download-data.sh"
  } >&2
  exit 1
fi

# Choose the loader BEFORE touching the existing database, so that a machine
# with neither tool installed keeps whatever database it already has.
# Python+duckdb is preferred; the DuckDB CLI is the fallback.
if python3 -c "import duckdb" 2>/dev/null; then
  backend="python"
elif command -v duckdb >/dev/null 2>&1; then
  backend="cli"
else
  err "Error: Neither Python duckdb package nor DuckDB CLI found."
  {
    echo ""
    echo "Install one of:"
    echo " pip install duckdb # Python package"
    echo " brew install duckdb # macOS CLI"
    echo " apt install duckdb # Linux CLI"
  } >&2
  exit 1
fi

# Remove existing DB if present (plus a leftover write-ahead log, if any).
if [[ -f "$DB_FILE" ]]; then
  printf '%b\n' "${YELLOW}Removing existing ${DB_FILE}${NC}"
  rm -f -- "$DB_FILE" "${DB_FILE}.wal"
fi

# If loading fails part-way, delete the half-built database on exit so that
# nothing picks up an incomplete file later. The trap is a no-op once
# build_ok has been set to 1.
build_ok=0
remove_partial_db() {
  if ((build_ok == 0)); then
    rm -f -- "$DB_FILE" "${DB_FILE}.wal"
  fi
}
trap remove_partial_db EXIT

echo "Building DuckDB database from CSV files..."
echo ""

case "$backend" in
  python)
    # The script is fed on stdin (`python3 -`); the directory and database
    # path are passed as arguments so they are defined in one place only.
    python3 - "$DATA_DIR" "$DB_FILE" <<'PYEOF'
import glob
import os
import sys

import duckdb

data_dir, db_file = sys.argv[1], sys.argv[2]


def quote_ident(name):
    """Return name as a double-quoted SQL identifier."""
    return '"' + name.replace('"', '""') + '"'


def quote_literal(text):
    """Return text as a single-quoted SQL string literal."""
    return "'" + text.replace("'", "''") + "'"


con = duckdb.connect(db_file)
loaded = 0
try:
    # One table per CSV, named after the file without its extension.
    for csv_path in sorted(glob.glob(os.path.join(data_dir, "*.csv"))):
        table_name = os.path.splitext(os.path.basename(csv_path))[0]
        print(f" Loading {table_name}...", end="", flush=True)
        table = quote_ident(table_name)
        source = quote_literal(csv_path)
        con.execute(f"CREATE TABLE {table} AS SELECT * FROM read_csv_auto({source})")
        row_count = con.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
        print(f" {row_count:,} rows")
        loaded += 1
finally:
    # Close the connection even when a load fails.
    con.close()

print(f"\nLoaded {loaded} tables into {db_file}")
PYEOF
    ;;
  cli)
    # Quote characters held in variables keep the escaping below readable.
    dq='"'
    sq="'"
    for csv_file in "${csv_files[@]}"; do
      # Table name = file name without directory and .csv suffix.
      table_name=${csv_file##*/}
      table_name=${table_name%.csv}
      echo " Loading ${table_name}..."
      # Double embedded quotes so unusual file names stay valid SQL.
      escaped_name=${table_name//$dq/$dq$dq}
      escaped_path=${csv_file//$sq/$sq$sq}
      duckdb "$DB_FILE" \
        "CREATE TABLE \"${escaped_name}\" AS SELECT * FROM read_csv_auto('${escaped_path}');"
    done
    echo ""
    echo "Tables loaded into ${DB_FILE}"
    ;;
esac

build_ok=1

# Report file size
if [[ -f "$DB_FILE" ]]; then
  # Column 5 of `ls -lh` is the human-readable size.
  size=$(ls -lh "$DB_FILE" | awk '{print $5}')
  echo ""
  printf '%b\n' "${GREEN}DuckDB database ready: ${DB_FILE} (${size})${NC}"
  echo ""
  echo "Claude Code will automatically use this database for faster queries."
fi