#!/bin/bash
# ============================================================================
# Doublets Airlines Demo - Benchmark Script (Placeholder)
# ============================================================================
# This script runs timetable queries against the Doublets implementation
# of the Airlines demo database and collects timing measurements.
#
# Usage: ./run.sh <durability_mode> <dataset_size> [num_runs]
# durability_mode: volatile or nonvolatile
# dataset_size: 3m, 6m, 1y, or 2y
# num_runs: number of iterations per query (default: 10)
#
# Example:
# ./run.sh volatile 6m 10
# ./run.sh nonvolatile 1y 20
#
# Output:
# - CSV file: ../results/doublets_<mode>_<dataset>_<timestamp>.csv
#
# TODO: This is a placeholder. Implement actual Doublets benchmarking logic.
# ============================================================================
set -euo pipefail

# ----------------------------------------------------------------------------
# Configuration (positional arguments with defaults; see usage header above)
# ----------------------------------------------------------------------------
DURABILITY_MODE="${1:-volatile}"
DATASET_SIZE="${2:-6m}"
NUM_RUNS="${3:-10}"

# Validate arguments up front so a typo fails fast instead of silently
# producing a mislabeled results file.
case "${DURABILITY_MODE}" in
  volatile|nonvolatile) ;;
  *)
    echo "Error: durability_mode must be 'volatile' or 'nonvolatile' (got '${DURABILITY_MODE}')" >&2
    exit 2
    ;;
esac
case "${DATASET_SIZE}" in
  3m|6m|1y|2y) ;;
  *)
    echo "Error: dataset_size must be one of 3m, 6m, 1y, 2y (got '${DATASET_SIZE}')" >&2
    exit 2
    ;;
esac
case "${NUM_RUNS}" in
  ''|*[!0-9]*)
    echo "Error: num_runs must be a positive integer (got '${NUM_RUNS}')" >&2
    exit 2
    ;;
esac

TIMESTAMP=$(date +%Y%m%d_%H%M%S)
readonly RESULTS_DIR="../results"
OUTPUT_CSV="${RESULTS_DIR}/doublets_${DURABILITY_MODE}_${DATASET_SIZE}_${TIMESTAMP}.csv"

# ANSI color codes for terminal output (interpreted via echo -e / printf %b)
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m' # No Color

# Create results directory
mkdir -p "${RESULTS_DIR}"

# Initialize CSV with the same header as the PostgreSQL benchmark so the
# analysis pipeline can consume both files interchangeably.
echo "system,durability_mode,dataset,query_id,run,rows,ms" > "${OUTPUT_CSV}"
# ----------------------------------------------------------------------------
# Banner: announce run parameters, then print the implementation TODO list.
# Colored lines use printf '%b\n' so the ANSI escapes in the color variables
# are interpreted; plain text is emitted via here-docs (only ${NUM_RUNS} and
# the run parameters are expanded — the text contains no other $ or \).
# ----------------------------------------------------------------------------
printf '%b\n' "${GREEN}=== Doublets Benchmark ===${NC}"
cat <<EOF
Mode: ${DURABILITY_MODE}
Dataset: ${DATASET_SIZE}
Runs per query: ${NUM_RUNS}
Output: ${OUTPUT_CSV}

EOF
printf '%b\n' "${YELLOW}=== TODO: Doublets Implementation ===${NC}"
cat <<EOF

This is a placeholder script. To complete the Doublets benchmark, implement:

1. Data Loading:
 - Load Airlines data from PostgreSQL or CSV export
 - Convert entities to Doublets links (see bench/schema-mapping.md)
 - Store in Doublets database (volatile or nonvolatile mode)

2. Query Implementation:
 - Implement equivalent queries using Doublets link API
 - Ensure result sets match PostgreSQL exactly
 - See bench/schema-mapping.md for query mappings

3. Benchmark Execution:
 - Warm-up: run each query once
 - Measure: run each query ${NUM_RUNS} times
 - Record: wall-clock time (ms) and row count
 - Write results to CSV with same format as PostgreSQL benchmark

4. Validation:
 - Compare result sets with PostgreSQL (checksums)
 - Verify performance improvements
 - Report any discrepancies

Suggested implementation approaches:

 a) Rust implementation (matching existing rust/ directory):
 - Use existing Doublets Rust library
 - Create Airlines data model
 - Implement queries using Doublets API
 - Add benchmark harness

 b) C++ implementation (matching existing cpp/ directory):
 - Use existing Doublets C++ library
 - Follow same approach as Rust

 c) Standalone tool:
 - Create separate benchmark binary
 - Load data from CSV export
 - Run queries and output CSV

Reference implementations:
 - rust/benches/bench.rs - existing Doublets benchmarks
 - bench/pg/run.sh - PostgreSQL benchmark (for CSV format)
 - bench/schema-mapping.md - detailed mapping documentation

EOF
printf '%b\n' "${YELLOW}Until implementation is complete, this script generates mock data.${NC}"
printf '\n'
# Generate mock data for testing the analysis pipeline
printf '%b\n' "${YELLOW}Generating mock benchmark data...${NC}"
# ----------------------------------------------------------------------------
# Mock measurement generation.
# Produces one CSV row per (query, run) in the same format as the PostgreSQL
# benchmark so the downstream analysis pipeline can be exercised before the
# real Doublets implementation exists.
# ----------------------------------------------------------------------------

# Query IDs must match the PostgreSQL benchmark exactly so results can be
# joined on (query_id).
readonly -a QUERY_IDS=(
  "departures_svo"
  "arrivals_svo"
  "next_flight_svx_wuh"
  "manual_departures_svo"
  "manual_arrivals_svo"
  "route_details"
  "status_counts"
  "busiest_routes"
  "date_range"
)

# Mock: Doublets should be ~1000-10000x faster than PostgreSQL, so timings
# are generated in the tens-of-milliseconds range.
for query_id in "${QUERY_IDS[@]}"; do
  # Simulated row counts (would come from actual queries). Queries with a
  # known fixed cardinality get that value; the rest get a random 10-109.
  case "${query_id}" in
    "next_flight_svx_wuh")
      row_count=1
      ;;
    "route_details")
      row_count=20
      ;;
    "status_counts")
      row_count=5
      ;;
    "busiest_routes")
      row_count=10
      ;;
    "date_range")
      row_count=7
      ;;
    *)
      row_count=$((RANDOM % 100 + 10))
      ;;
  esac

  # Per-query base latency: 10-59 ms (vs seconds for PostgreSQL), then
  # ${NUM_RUNS} measurements with small per-run jitter around it.
  base_time=$((RANDOM % 50 + 10))
  for ((run = 1; run <= NUM_RUNS; run++)); do
    # Jitter in the range -10..+9 ms; clamp so the reported time stays >= 1.
    # NOTE: 'elapsed_ms' instead of 'time' — 'time' is a shell reserved word.
    variance=$((RANDOM % 20 - 10))
    elapsed_ms=$((base_time + variance))
    if (( elapsed_ms < 1 )); then
      elapsed_ms=1
    fi
    echo "doublets,${DURABILITY_MODE},${DATASET_SIZE},${query_id},${run},${row_count},${elapsed_ms}" >> "${OUTPUT_CSV}"
  done
done
# Final summary: report where the (mock) results landed and warn loudly that
# the numbers are synthetic until the real Doublets queries are implemented.
printf '%b\n' "${GREEN}Mock data generated${NC}"
printf '\n'
printf '%b\n' "${GREEN}=== Benchmark Complete (Mock) ===${NC}"
printf '%s\n' "Mock results saved to: ${OUTPUT_CSV}"
printf '\n'
printf '%b\n' "${RED}WARNING: This data is MOCK data for testing purposes.${NC}"
printf '%b\n' "${RED}Implement actual Doublets queries to get real measurements.${NC}"
printf '\n'