|
| 1 | +#!/bin/bash |
| 2 | + |
| 3 | +# EC Repair Plugin Worker Integration Test |
| 4 | +# |
| 5 | +# Tests the ec_repair plugin worker end-to-end: |
| 6 | +# 1. Start weed mini (master + filer + volume + admin) + 4 standalone volume servers |
| 7 | +# 2. Upload data to fill a volume, then EC-encode it |
| 8 | +# 3. Simulate shard loss by stopping a volume server and deleting shard files |
| 9 | +# 4. Trigger ec_repair detection + execution via admin plugin API |
| 10 | +# 5. Verify all shards are restored and no temp files leaked to /tmp |
| 11 | + |
# Abort on errors, on unset variables, and on failed pipeline stages.
set -euo pipefail

# Configuration. WEED_BINARY and the data dir are overridable via the
# environment; everything else is fixed for the test and marked readonly.
readonly WEED_BINARY="${WEED_BINARY:-weed}"
readonly DATA_DIR="${WEED_DATA_DIR:-/tmp/ec-repair-test-$$}"  # $$ isolates concurrent runs
readonly MASTER_IP="127.0.0.1"
readonly MASTER_PORT=9333
readonly VOLUME_PORT_START=9340   # mini's volume server; extras use +1..+N
ADMIN_PORT=0                      # not readonly: discovered later from mini's log
readonly NUM_EXTRA_VOLUMES=4      # standalone volume servers for shard spread
readonly VOLUME_SIZE_LIMIT_MB=100 # small limit so the uploads trip EC encoding

# ANSI colors for test output.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m'
| 27 | + |
# Logging helpers: colorized status prefixes on stdout. %b expands the
# escape sequences stored in the color variables (same as 'echo -e').
# fail() also records the overall result in EXIT_CODE so the script can
# keep running after a failed check and report the outcome at the end.
pass() { printf '%b\n' "${GREEN}[PASS]${NC} $1"; }
fail() { printf '%b\n' "${RED}[FAIL]${NC} $1"; EXIT_CODE=1; }
info() { printf '%b\n' "${YELLOW}[INFO]${NC} $1"; }

EXIT_CODE=0
| 33 | + |
# Stop every process started under DATA_DIR and remove the directory.
# Registered on EXIT so it runs on success, failure, or interrupt.
cleanup() {
  # Guard: if DATA_DIR were empty/unset, 'pkill -f ""' would match every
  # process and the rm below would be catastrophic — bail out instead.
  [ -n "${DATA_DIR:-}" ] || return 0
  info "Cleaning up..."
  # Graceful TERM first, then force-kill stragglers after a grace period.
  pkill -f "${DATA_DIR}" 2>/dev/null || true
  sleep 2
  pkill -9 -f "${DATA_DIR}" 2>/dev/null || true
  rm -rf -- "${DATA_DIR}"
}
trap cleanup EXIT
| 42 | + |
# wait_for_http <name> <url> [max_attempts=30]
# Poll <url> once per second until curl -f reports success.
# Returns 0 on success; on timeout prints a diagnostic to stderr and
# returns 1. Fixes vs. original: diagnostic goes to stderr (not stdout,
# where it could pollute captured output), and there is no pointless
# extra sleep after the final failed attempt.
wait_for_http() {
  local name=$1 url=$2 max=${3:-30}
  local attempt
  for (( attempt = 1; attempt <= max; attempt++ )); do
    if curl -sf "$url" > /dev/null 2>&1; then
      return 0
    fi
    if (( attempt < max )); then
      sleep 1
    fi
  done
  echo "Timed out waiting for $name at $url" >&2
  return 1
}
| 54 | + |
# wait_for_port <name> <host> <port> [max_attempts=30]
# Poll for a TCP listener with 'nc -z' once per second.
# Returns 0 on success; on timeout prints a diagnostic to stderr and
# returns 1. Same fixes as wait_for_http: stderr diagnostic, and no
# trailing sleep after the last failed attempt.
wait_for_port() {
  local name=$1 host=$2 port=$3 max=${4:-30}
  local attempt
  for (( attempt = 1; attempt <= max; attempt++ )); do
    if nc -z "$host" "$port" 2>/dev/null; then
      return 0
    fi
    if (( attempt < max )); then
      sleep 1
    fi
  done
  echo "Timed out waiting for $name on $host:$port" >&2
  return 1
}
| 66 | + |
# ── Setup ──────────────────────────────────────────────────────────────

info "Creating data directories"
mkdir -p "${DATA_DIR}/mini"
# One data dir per standalone volume server so shard files can later be
# located (and deleted) per-server.
for i in $(seq 1 $NUM_EXTRA_VOLUMES); do
  mkdir -p "${DATA_DIR}/vol${i}"
done

info "Starting weed mini"
# 'weed mini' bundles master + filer + volume + admin in one process.
# S3/WebDAV are disabled; the admin UI stays on because the test drives
# the ec_repair plugin through the admin HTTP API.
$WEED_BINARY mini \
  -dir="${DATA_DIR}/mini" \
  -master.volumeSizeLimitMB=$VOLUME_SIZE_LIMIT_MB \
  -volume.port=$VOLUME_PORT_START \
  -s3=false -webdav=false -admin.ui=true \
  -ip=$MASTER_IP -ip.bind=0.0.0.0 \
  > "${DATA_DIR}/mini.log" 2>&1 &
echo $! > "${DATA_DIR}/mini.pid"  # pid file so cleanup/debugging can find it

# Blocks until the master answers; under 'set -e' a timeout aborts here.
wait_for_http "Master" "http://${MASTER_IP}:${MASTER_PORT}/cluster/status" 30
| 86 | + |
# Detect admin port from logs. mini assigns the admin port dynamically,
# so the only way to learn it is to scrape the startup banner; retry
# because the banner may not have been written yet.
for attempt in $(seq 1 20); do
  # First grep isolates the banner line (-o prints the whole match; the
  # \(...\) group is not actually used for capture), second grep strips
  # the trailing digits (the port).
  # NOTE(review): assumes the banner appears at most once in the log; a
  # restart would make ADMIN_PORT multi-line — confirm.
  ADMIN_PORT=$(grep -o "Admin server is ready at http://[^:]*:\([0-9]*\)" "${DATA_DIR}/mini.log" | grep -o '[0-9]*$' || true)
  if [ -n "$ADMIN_PORT" ]; then
    break
  fi
  sleep 1
done

# Fail fast (dumping the full log for debugging) if detection never hit.
if [ -z "$ADMIN_PORT" ] || [ "$ADMIN_PORT" = "0" ]; then
  fail "Could not detect admin port from mini logs"
  cat "${DATA_DIR}/mini.log"
  exit 1
fi

wait_for_http "Admin" "http://${MASTER_IP}:${ADMIN_PORT}/health" 15
info "Admin server ready on port $ADMIN_PORT"
| 104 | + |
# Wait for the plugin worker to register its job types with the admin
# server; '|| echo "[]"' keeps a failed curl from tripping 'set -e'.
for attempt in $(seq 1 30); do
  JOB_TYPES=$(curl -sf "http://${MASTER_IP}:${ADMIN_PORT}/api/plugin/job-types" || echo "[]")
  if echo "$JOB_TYPES" | grep -q "ec_repair"; then
    break
  fi
  sleep 1
done

# JOB_TYPES holds the last poll result (the loop always runs at least once).
if ! echo "$JOB_TYPES" | grep -q "ec_repair"; then
  fail "ec_repair job type not registered with admin"
  exit 1
fi
info "ec_repair plugin worker registered"
| 119 | + |
info "Starting $NUM_EXTRA_VOLUMES additional volume servers"
# Extra standalone servers give the master enough nodes to spread EC
# shards across, and give the test per-server data dirs to corrupt.
for i in $(seq 1 $NUM_EXTRA_VOLUMES); do
  PORT=$((VOLUME_PORT_START + i))
  $WEED_BINARY volume \
    -dir="${DATA_DIR}/vol${i}" \
    -port=${PORT} \
    -master="${MASTER_IP}:${MASTER_PORT}" \
    -max=10 \
    -ip=$MASTER_IP \
    > "${DATA_DIR}/vol${i}.log" 2>&1 &
  echo $! > "${DATA_DIR}/vol${i}.pid"  # pid file used later to kill the victim
done

# Wait for all volume servers to answer their status endpoint.
for i in $(seq 1 $NUM_EXTRA_VOLUMES); do
  PORT=$((VOLUME_PORT_START + i))
  if ! wait_for_http "Volume $i" "http://${MASTER_IP}:${PORT}/status" 30; then
    fail "Volume server $i failed to start"
    cat "${DATA_DIR}/vol${i}.log"
    exit 1
  fi
done

# Wait for volume servers to register with master, then count registered
# nodes by walking DataCenters -> racks in the /vol/status JSON.
sleep 5
REGISTERED=$(curl -sf "http://${MASTER_IP}:${MASTER_PORT}/vol/status" | python3 -c "
import sys, json
data = json.load(sys.stdin)
dcs = data['Volumes']['DataCenters']
count = 0
for dc in dcs.values():
    for rack in dc.values():
        count += len(rack)
print(count)
" 2>/dev/null || echo "0")

# mini's built-in volume server plus the standalone ones.
EXPECTED=$((1 + NUM_EXTRA_VOLUMES))
if [ "$REGISTERED" -lt "$EXPECTED" ]; then
  fail "Expected $EXPECTED volume servers, got $REGISTERED"
  exit 1
fi
pass "All $REGISTERED volume servers registered"
| 162 | + |
# ── Upload data and EC-encode ──────────────────────────────────────────

info "Generating 10MB test file"
dd if=/dev/urandom of="${DATA_DIR}/testfile" bs=1M count=10 2>/dev/null

info "Uploading files to collection 'ectest'"
# 12 x 10MB against a 100MB volume size limit guarantees at least one
# 'ectest' volume exceeds the limit and becomes EC-eligible.
for i in $(seq 1 12); do
  # /dir/assign returns JSON with the file id and the volume server URL.
  ASSIGN=$(curl -sf "http://${MASTER_IP}:${MASTER_PORT}/dir/assign?collection=ectest")
  FID=$(echo "$ASSIGN" | python3 -c "import sys,json; print(json.load(sys.stdin)['fid'])")
  URL=$(echo "$ASSIGN" | python3 -c "import sys,json; print(json.load(sys.stdin)['url'])")
  curl -sf -F file=@"${DATA_DIR}/testfile" "http://${URL}/${FID}" > /dev/null
done
info "Uploaded 12 files (~120MB total)"
| 176 | + |
# Find a volume in collection 'ectest' that is over the size limit and
# therefore a valid EC-encoding candidate. The python source is inside
# an UNQUOTED double-quoted string on purpose: the shell substitutes
# ${VOLUME_SIZE_LIMIT_MB} into the code before python runs it.
ECVOL=$(curl -sf "http://${MASTER_IP}:${MASTER_PORT}/vol/status" | python3 -c "
import sys, json
data = json.load(sys.stdin)
dcs = data['Volumes']['DataCenters']
for dc in dcs.values():
    for rack in dc.values():
        for node_url, vols in rack.items():
            if vols is None:
                continue
            for v in vols:
                if v.get('Collection') == 'ectest' and v['Size'] > ${VOLUME_SIZE_LIMIT_MB} * 1024 * 1024:
                    print(v['Id'])
                    sys.exit(0)
sys.exit(1)
" 2>/dev/null || echo "")

if [ -z "$ECVOL" ]; then
  fail "No ectest volume exceeded ${VOLUME_SIZE_LIMIT_MB}MB"
  exit 1
fi
info "EC-encoding volume $ECVOL"
| 199 | + |
# EC-encode the chosen volume via the weed shell; lock/unlock guards the
# topology against concurrent maintenance operations.
echo "lock; ec.encode -collection=ectest -volumeId=${ECVOL} -force; unlock" | \
  $WEED_BINARY shell -master="${MASTER_IP}:${MASTER_PORT}" > "${DATA_DIR}/ec-encode.log" 2>&1

# Verify EC shards exist and are spread across multiple nodes.
# NB: the trailing '|| true' is load-bearing — with 'set -e -o pipefail',
# a grep that matches nothing fails the command substitution and would
# silently kill the script BEFORE the fail() branch below can report it.
# grep -c also replaces the 'grep | wc -l' pipeline.
EC_NODES=$(echo 'volume.list' | $WEED_BINARY shell -master="${MASTER_IP}:${MASTER_PORT}" 2>&1 | grep -c "ec volume id:${ECVOL}" || true)
if [ "$EC_NODES" -lt 2 ]; then
  fail "EC shards not distributed (found on $EC_NODES nodes)"
  exit 1
fi
pass "Volume $ECVOL EC-encoded and distributed across $EC_NODES nodes"

# Count total shards before corruption: each node line carries a
# 'shards:[...]' list; splitting on commas and counting words gives the
# total shard count. Same '|| true' guard as above.
SHARDS_BEFORE=$(echo 'volume.list' | $WEED_BINARY shell -master="${MASTER_IP}:${MASTER_PORT}" 2>&1 | \
  grep "ec volume id:${ECVOL}" | grep -o 'shards:\[[^]]*\]' | tr ',' '\n' | wc -w || true)
info "Total EC shards before corruption: $SHARDS_BEFORE"
| 215 | + |
# ── Simulate shard loss ────────────────────────────────────────────────

# count_shard_files <dir> — print how many EC shard data files
# (ectest_<vol>.ecNN) exist in <dir>. Uses compgen -G instead of parsing
# ls: with 'set -o pipefail', the old 'ls <glob> | wc -l || echo 0'
# produced "0\n0" on a non-matching glob (wc's 0 plus the fallback 0),
# which then broke the integer -gt comparison below.
count_shard_files() {
  local dir=$1
  compgen -G "${dir}/ectest_${ECVOL}.ec[0-9]*" | wc -l || true
}

# Find a standalone volume server that actually holds EC shards; it
# becomes the victim whose shards get deleted.
VICTIM_PORT=""
VICTIM_IDX=""
for i in $(seq 1 $NUM_EXTRA_VOLUMES); do
  PORT=$((VOLUME_PORT_START + i))
  EC_FILES=$(count_shard_files "${DATA_DIR}/vol${i}")
  if [ "$EC_FILES" -gt 0 ]; then
    VICTIM_PORT=$PORT
    VICTIM_IDX=$i
    break
  fi
done

if [ -z "$VICTIM_PORT" ]; then
  fail "No standalone volume server has EC shard files"
  exit 1
fi

DELETED_SHARDS=$(count_shard_files "${DATA_DIR}/vol${VICTIM_IDX}")
info "Simulating shard loss: killing vol${VICTIM_IDX} (port $VICTIM_PORT) and deleting $DELETED_SHARDS shard files"

# Stop the victim before touching its files so it cannot rewrite them.
kill "$(cat "${DATA_DIR}/vol${VICTIM_IDX}.pid")" 2>/dev/null || true
sleep 2

# Delete EC shard data files (keep .ecx/.ecj so the node doesn't
# re-report old shards); drop the .vif volume-info file as well.
rm -f "${DATA_DIR}/vol${VICTIM_IDX}"/ectest_${ECVOL}.ec[0-9]*
rm -f "${DATA_DIR}/vol${VICTIM_IDX}"/ectest_${ECVOL}.vif

# Restart the victim so it re-registers with the master minus the
# deleted shards — the gap that ec_repair detection must notice.
$WEED_BINARY volume \
  -dir="${DATA_DIR}/vol${VICTIM_IDX}" \
  -port=${VICTIM_PORT} \
  -master="${MASTER_IP}:${MASTER_PORT}" \
  -max=10 \
  -ip=$MASTER_IP \
  > "${DATA_DIR}/vol${VICTIM_IDX}.log" 2>&1 &
echo $! > "${DATA_DIR}/vol${VICTIM_IDX}.pid"

wait_for_http "Volume ${VICTIM_IDX}" "http://${MASTER_IP}:${VICTIM_PORT}/status" 30
sleep 3  # let a heartbeat propagate the reduced shard set to the master

pass "Shard loss simulated on vol${VICTIM_IDX}"
| 260 | + |
# ── Trigger ec_repair via plugin API ───────────────────────────────────

info "Triggering ec_repair detection + execution via admin API"

# One-shot run: detect missing shards and execute repairs synchronously.
# NB: under 'set -e', a failed curl aborts the script right here with no
# fail() message — acceptable, since with no response there is nothing
# left to verify.
RUN_RESULT=$(curl -sf -X POST "http://${MASTER_IP}:${ADMIN_PORT}/api/plugin/job-types/ec_repair/run" \
  -H "Content-Type: application/json" \
  -d '{"max_results": 100, "timeout_seconds": 120}')

# Extract the counters from the JSON response (defaulting to 0 if absent).
DETECTED=$(echo "$RUN_RESULT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('detected_count', 0))")
EXECUTED=$(echo "$RUN_RESULT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('executed_count', 0))")
SUCCEEDED=$(echo "$RUN_RESULT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('success_count', 0))")
ERRORS=$(echo "$RUN_RESULT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('error_count', 0))")

# At least one repair task must have been detected for the shard loss.
if [ "$DETECTED" -lt 1 ]; then
  fail "Detection found 0 repair tasks (expected >= 1)"
  echo "$RUN_RESULT" | python3 -m json.tool
  exit 1
fi
pass "Detection found $DETECTED repair task(s)"

# Execution must have succeeded at least once with zero errors.
if [ "$SUCCEEDED" -lt 1 ] || [ "$ERRORS" -gt 0 ]; then
  fail "Execution: succeeded=$SUCCEEDED errors=$ERRORS (expected succeeded>=1, errors=0)"
  echo "$RUN_RESULT" | python3 -m json.tool
  exit 1
fi
pass "Execution succeeded: $SUCCEEDED job(s) completed, $ERRORS errors"
| 287 | + |
# ── Verify repair results ──────────────────────────────────────────────

# Count shards after repair. The '|| true' guards the command
# substitution: under 'set -e -o pipefail' a non-matching grep would
# otherwise abort the script before the comparison below can report.
SHARDS_AFTER=$(echo 'volume.list' | $WEED_BINARY shell -master="${MASTER_IP}:${MASTER_PORT}" 2>&1 | \
  grep "ec volume id:${ECVOL}" | grep -o 'shards:\[[^]]*\]' | tr ',' '\n' | wc -w || true)

if [ "$SHARDS_AFTER" -lt "$SHARDS_BEFORE" ]; then
  fail "Shards after repair ($SHARDS_AFTER) < shards before corruption ($SHARDS_BEFORE)"
else
  pass "All shards restored: $SHARDS_AFTER shards (was $SHARDS_BEFORE before corruption)"
fi

# Verify no temp files leaked to /tmp. compgen -G (instead of parsing
# ls) avoids the pipefail artifact where 'ls | wc -l || echo 0' emitted
# "0\n0" on a non-matching glob and broke the -gt comparison.
LEAKED=$(compgen -G "/tmp/ec-repair-${ECVOL}-*" | wc -l || true)
if [ "$LEAKED" -gt 0 ]; then
  fail "Found $LEAKED leaked ec-repair temp dirs in /tmp (workingDir fix not applied)"
else
  pass "No ec-repair temp files leaked to /tmp"
fi
| 307 | + |
# Verify a second detection pass reports a clean state now that the
# shards have been restored.
DETECT2=$(curl -sf -X POST "http://${MASTER_IP}:${ADMIN_PORT}/api/plugin/job-types/ec_repair/detect" \
  -H "Content-Type: application/json" \
  -d '{"max_results": 100, "timeout_seconds": 60}')
REMAINING=$(echo "$DETECT2" | python3 -c "import sys,json; print(json.load(sys.stdin).get('total_proposals', -1))")

# NOTE(review): if the response lacks 'total_proposals', REMAINING is -1
# and this branch reports a failure — fail-closed, presumably intended.
if [ "$REMAINING" -eq 0 ]; then
  pass "Second detection confirms no remaining issues"
else
  fail "Second detection still found $REMAINING repair proposals"
fi

# ── Summary ────────────────────────────────────────────────────────────

# EXIT_CODE was set to 1 by any fail() call above; 0 means a clean run.
echo ""
if [ "$EXIT_CODE" -eq 0 ]; then
  echo -e "${GREEN}All EC repair plugin worker tests passed${NC}"
else
  echo -e "${RED}Some tests failed${NC}"
fi

exit $EXIT_CODE
0 commit comments