Skip to content

Commit 3e1e1f5

Browse files
committed
add EC repair plugin worker e2e test
Tests the ec_repair plugin worker through the admin API: starts weed mini plus 4 volume servers; uploads data and EC-encodes a volume; simulates shard loss and triggers repair; verifies shards are restored and that no temp files leak into /tmp.
1 parent 5fa9526 commit 3e1e1f5

File tree

2 files changed

+429
-0
lines changed

2 files changed

+429
-0
lines changed
Lines changed: 329 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,329 @@
#!/bin/bash

# EC Repair Plugin Worker Integration Test
#
# Tests the ec_repair plugin worker end-to-end:
#   1. Start weed mini (master + filer + volume + admin) + 4 standalone volume servers
#   2. Upload data to fill a volume, then EC-encode it
#   3. Simulate shard loss by stopping a volume server and deleting shard files
#   4. Trigger ec_repair detection + execution via admin plugin API
#   5. Verify all shards are restored and no temp files leaked to /tmp
#
# Environment:
#   WEED_BINARY   - path to the weed binary (default: "weed" on PATH)
#   WEED_DATA_DIR - scratch directory for all server state (default: /tmp/ec-repair-test-$$)

set -euo pipefail

WEED_BINARY="${WEED_BINARY:-weed}"
DATA_DIR="${WEED_DATA_DIR:-/tmp/ec-repair-test-$$}"
MASTER_IP="127.0.0.1"
MASTER_PORT=9333
VOLUME_PORT_START=9340
ADMIN_PORT=0                 # discovered later by scraping the mini server log
NUM_EXTRA_VOLUMES=4
VOLUME_SIZE_LIMIT_MB=100

# ANSI colors for test output.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Logging helpers; fail() also flips the script's overall exit status.
pass() { echo -e "${GREEN}[PASS]${NC} $1"; }
fail() { echo -e "${RED}[FAIL]${NC} $1"; EXIT_CODE=1; }
info() { echo -e "${YELLOW}[INFO]${NC} $1"; }

EXIT_CODE=0

# Kill every process whose command line references our data dir, then remove
# the scratch directory.  Registered on EXIT so it runs on every exit path.
cleanup() {
  info "Cleaning up..."
  pkill -f "${DATA_DIR}" 2>/dev/null || true
  sleep 2
  # Escalate to SIGKILL for anything that ignored the TERM above.
  pkill -9 -f "${DATA_DIR}" 2>/dev/null || true
  # ':?' aborts rather than run 'rm -rf' on an empty/unset path.
  rm -rf -- "${DATA_DIR:?}"
}
trap cleanup EXIT

# wait_for_http NAME URL [MAX_SECONDS]
# Poll URL once per second until curl gets a success response, or give up
# after MAX_SECONDS attempts (default 30).  Returns 0 on success, 1 on timeout.
wait_for_http() {
  local name=$1 url=$2 max=${3:-30}
  local i
  for i in $(seq 1 "$max"); do
    if curl -sf "$url" > /dev/null 2>&1; then
      return 0
    fi
    sleep 1
  done
  echo "Timed out waiting for $name at $url"
  return 1
}

# wait_for_port NAME HOST PORT [MAX_SECONDS]
# Poll with nc until HOST:PORT accepts TCP connections, or give up after
# MAX_SECONDS attempts (default 30).  Returns 0 on success, 1 on timeout.
# NOTE(review): currently unused; kept for parity with wait_for_http.
wait_for_port() {
  local name=$1 host=$2 port=$3 max=${4:-30}
  local i
  for i in $(seq 1 "$max"); do
    if nc -z "$host" "$port" 2>/dev/null; then
      return 0
    fi
    sleep 1
  done
  echo "Timed out waiting for $name on $host:$port"
  return 1
}

# ── Setup ──────────────────────────────────────────────────────────────

info "Creating data directories"
mkdir -p "${DATA_DIR}/mini"
for i in $(seq 1 $NUM_EXTRA_VOLUMES); do
  mkdir -p "${DATA_DIR}/vol${i}"
done

info "Starting weed mini"
$WEED_BINARY mini \
  -dir="${DATA_DIR}/mini" \
  -master.volumeSizeLimitMB=$VOLUME_SIZE_LIMIT_MB \
  -volume.port=$VOLUME_PORT_START \
  -s3=false -webdav=false -admin.ui=true \
  -ip=$MASTER_IP -ip.bind=0.0.0.0 \
  > "${DATA_DIR}/mini.log" 2>&1 &
echo $! > "${DATA_DIR}/mini.pid"

wait_for_http "Master" "http://${MASTER_IP}:${MASTER_PORT}/cluster/status" 30

# The admin server picks a dynamic port; scrape it from the mini log.
# 'head -n1' pins the value to a single line in case the readiness banner
# is logged more than once.
for attempt in $(seq 1 20); do
  ADMIN_PORT=$(grep -o "Admin server is ready at http://[^:]*:[0-9]*" "${DATA_DIR}/mini.log" \
    | head -n1 | grep -o '[0-9]*$' || true)
  if [ -n "$ADMIN_PORT" ]; then
    break
  fi
  sleep 1
done

if [ -z "$ADMIN_PORT" ] || [ "$ADMIN_PORT" = "0" ]; then
  fail "Could not detect admin port from mini logs"
  cat "${DATA_DIR}/mini.log"
  exit 1
fi

wait_for_http "Admin" "http://${MASTER_IP}:${ADMIN_PORT}/health" 15
info "Admin server ready on port $ADMIN_PORT"

# Wait for the ec_repair plugin worker to register its job type.
JOB_TYPES="[]"
for attempt in $(seq 1 30); do
  JOB_TYPES=$(curl -sf "http://${MASTER_IP}:${ADMIN_PORT}/api/plugin/job-types" || echo "[]")
  if echo "$JOB_TYPES" | grep -q "ec_repair"; then
    break
  fi
  sleep 1
done

if ! echo "$JOB_TYPES" | grep -q "ec_repair"; then
  fail "ec_repair job type not registered with admin"
  exit 1
fi
info "ec_repair plugin worker registered"

info "Starting $NUM_EXTRA_VOLUMES additional volume servers"
for i in $(seq 1 $NUM_EXTRA_VOLUMES); do
  PORT=$((VOLUME_PORT_START + i))
  $WEED_BINARY volume \
    -dir="${DATA_DIR}/vol${i}" \
    -port=${PORT} \
    -master="${MASTER_IP}:${MASTER_PORT}" \
    -max=10 \
    -ip=$MASTER_IP \
    > "${DATA_DIR}/vol${i}.log" 2>&1 &
  echo $! > "${DATA_DIR}/vol${i}.pid"
done

# Wait for every volume server to answer HTTP.
for i in $(seq 1 $NUM_EXTRA_VOLUMES); do
  PORT=$((VOLUME_PORT_START + i))
  if ! wait_for_http "Volume $i" "http://${MASTER_IP}:${PORT}/status" 30; then
    fail "Volume server $i failed to start"
    cat "${DATA_DIR}/vol${i}.log"
    exit 1
  fi
done

# Give the servers a moment to heartbeat, then count nodes known to the master.
sleep 5
REGISTERED=$(curl -sf "http://${MASTER_IP}:${MASTER_PORT}/vol/status" | python3 -c "
import sys, json
data = json.load(sys.stdin)
dcs = data['Volumes']['DataCenters']
count = 0
for dc in dcs.values():
    for rack in dc.values():
        count += len(rack)
print(count)
" 2>/dev/null || echo "0")

EXPECTED=$((1 + NUM_EXTRA_VOLUMES))
if [ "$REGISTERED" -lt "$EXPECTED" ]; then
  fail "Expected $EXPECTED volume servers, got $REGISTERED"
  exit 1
fi
pass "All $REGISTERED volume servers registered"

# ── Upload data and EC-encode ──────────────────────────────────────────

info "Generating 10MB test file"
dd if=/dev/urandom of="${DATA_DIR}/testfile" bs=1M count=10 2>/dev/null

info "Uploading files to collection 'ectest'"
for i in $(seq 1 12); do
  ASSIGN=$(curl -sf "http://${MASTER_IP}:${MASTER_PORT}/dir/assign?collection=ectest")
  # Parse fid and url in a single pass (neither value contains whitespace).
  read -r FID URL <<< "$(echo "$ASSIGN" | python3 -c \
    "import sys,json; d=json.load(sys.stdin); print(d['fid'], d['url'])")"
  curl -sf -F file=@"${DATA_DIR}/testfile" "http://${URL}/${FID}" > /dev/null
done
info "Uploaded 12 files (~120MB total)"

# Find a volume in the 'ectest' collection that exceeded the size limit.
ECVOL=$(curl -sf "http://${MASTER_IP}:${MASTER_PORT}/vol/status" | python3 -c "
import sys, json
data = json.load(sys.stdin)
dcs = data['Volumes']['DataCenters']
for dc in dcs.values():
    for rack in dc.values():
        for node_url, vols in rack.items():
            if vols is None:
                continue
            for v in vols:
                if v.get('Collection') == 'ectest' and v['Size'] > ${VOLUME_SIZE_LIMIT_MB} * 1024 * 1024:
                    print(v['Id'])
                    sys.exit(0)
sys.exit(1)
" 2>/dev/null || echo "")

if [ -z "$ECVOL" ]; then
  fail "No ectest volume exceeded ${VOLUME_SIZE_LIMIT_MB}MB"
  exit 1
fi
info "EC-encoding volume $ECVOL"

echo "lock; ec.encode -collection=ectest -volumeId=${ECVOL} -force; unlock" | \
  $WEED_BINARY shell -master="${MASTER_IP}:${MASTER_PORT}" > "${DATA_DIR}/ec-encode.log" 2>&1

# Verify EC shards were distributed.  '|| true' keeps a zero-match grep
# (exit status 1) from silently killing the script under 'set -e -o pipefail';
# grep -c still prints 0 in that case.
EC_NODES=$(echo 'volume.list' | $WEED_BINARY shell -master="${MASTER_IP}:${MASTER_PORT}" 2>&1 \
  | grep -c "ec volume id:${ECVOL}" || true)
if [ "$EC_NODES" -lt 2 ]; then
  fail "EC shards not distributed (found on $EC_NODES nodes)"
  exit 1
fi
pass "Volume $ECVOL EC-encoded and distributed across $EC_NODES nodes"

# Count total shards before corruption.  Both greps are guarded for the
# same pipefail reason as above.
SHARDS_BEFORE=$(echo 'volume.list' | $WEED_BINARY shell -master="${MASTER_IP}:${MASTER_PORT}" 2>&1 \
  | { grep "ec volume id:${ECVOL}" || true; } \
  | { grep -o 'shards:\[[^]]*\]' || true; } \
  | tr ',' '\n' | wc -w)
info "Total EC shards before corruption: $SHARDS_BEFORE"

# ── Simulate shard loss ────────────────────────────────────────────────

# count_shard_files DIR
# Count ectest EC shard data files for volume $ECVOL in DIR.  Uses a glob
# loop instead of parsing 'ls' output; the old
# 'ls ... | wc -l || echo 0' could emit "0\n0" under pipefail when ls failed.
count_shard_files() {
  local dir=$1 count=0 f
  for f in "$dir"/ectest_"${ECVOL}".ec[0-9]*; do
    if [ -e "$f" ]; then
      count=$((count + 1))
    fi
  done
  echo "$count"
}

# Find a standalone volume server that holds EC shards; it becomes the victim.
VICTIM_PORT=""
VICTIM_IDX=""
for i in $(seq 1 $NUM_EXTRA_VOLUMES); do
  PORT=$((VOLUME_PORT_START + i))
  EC_FILES=$(count_shard_files "${DATA_DIR}/vol${i}")
  if [ "$EC_FILES" -gt 0 ]; then
    VICTIM_PORT=$PORT
    VICTIM_IDX=$i
    break
  fi
done

if [ -z "$VICTIM_PORT" ]; then
  fail "No standalone volume server has EC shard files"
  exit 1
fi

DELETED_SHARDS=$(count_shard_files "${DATA_DIR}/vol${VICTIM_IDX}")
info "Simulating shard loss: killing vol${VICTIM_IDX} (port $VICTIM_PORT) and deleting $DELETED_SHARDS shard files"

kill "$(cat "${DATA_DIR}/vol${VICTIM_IDX}.pid")" 2>/dev/null || true
sleep 2

# Delete EC shard data files (keep .ecx/.ecj so the node doesn't re-report old shards)
rm -f "${DATA_DIR}/vol${VICTIM_IDX}"/ectest_${ECVOL}.ec[0-9]*
rm -f "${DATA_DIR}/vol${VICTIM_IDX}"/ectest_${ECVOL}.vif

# Restart the volume server so it re-registers without the deleted shards.
$WEED_BINARY volume \
  -dir="${DATA_DIR}/vol${VICTIM_IDX}" \
  -port=${VICTIM_PORT} \
  -master="${MASTER_IP}:${MASTER_PORT}" \
  -max=10 \
  -ip=$MASTER_IP \
  > "${DATA_DIR}/vol${VICTIM_IDX}.log" 2>&1 &
echo $! > "${DATA_DIR}/vol${VICTIM_IDX}.pid"

wait_for_http "Volume ${VICTIM_IDX}" "http://${MASTER_IP}:${VICTIM_PORT}/status" 30
sleep 3

pass "Shard loss simulated on vol${VICTIM_IDX}"

# ── Trigger ec_repair via plugin API ───────────────────────────────────

info "Triggering ec_repair detection + execution via admin API"

RUN_RESULT=$(curl -sf -X POST "http://${MASTER_IP}:${ADMIN_PORT}/api/plugin/job-types/ec_repair/run" \
  -H "Content-Type: application/json" \
  -d '{"max_results": 100, "timeout_seconds": 120}')

# Parse all four counters from the response in a single pass (the old code
# invoked python3 four times on the same JSON).  EXECUTED is captured for
# completeness; only detected/success/error counts are asserted below.
read -r DETECTED EXECUTED SUCCEEDED ERRORS <<< "$(echo "$RUN_RESULT" | python3 -c "
import sys, json
d = json.load(sys.stdin)
print(d.get('detected_count', 0), d.get('executed_count', 0),
      d.get('success_count', 0), d.get('error_count', 0))
")"

if [ "$DETECTED" -lt 1 ]; then
  fail "Detection found 0 repair tasks (expected >= 1)"
  echo "$RUN_RESULT" | python3 -m json.tool
  exit 1
fi
pass "Detection found $DETECTED repair task(s)"

if [ "$SUCCEEDED" -lt 1 ] || [ "$ERRORS" -gt 0 ]; then
  fail "Execution: succeeded=$SUCCEEDED errors=$ERRORS (expected succeeded>=1, errors=0)"
  echo "$RUN_RESULT" | python3 -m json.tool
  exit 1
fi
pass "Execution succeeded: $SUCCEEDED job(s) completed, $ERRORS errors"

# ── Verify repair results ─────────────────────────────────────────────

# Count shards after repair.  Greps are guarded so zero matches yields 0
# instead of killing the script under 'set -e -o pipefail'.
SHARDS_AFTER=$(echo 'volume.list' | $WEED_BINARY shell -master="${MASTER_IP}:${MASTER_PORT}" 2>&1 \
  | { grep "ec volume id:${ECVOL}" || true; } \
  | { grep -o 'shards:\[[^]]*\]' || true; } \
  | tr ',' '\n' | wc -w)

if [ "$SHARDS_AFTER" -lt "$SHARDS_BEFORE" ]; then
  fail "Shards after repair ($SHARDS_AFTER) < shards before corruption ($SHARDS_BEFORE)"
else
  pass "All shards restored: $SHARDS_AFTER shards (was $SHARDS_BEFORE before corruption)"
fi

# Verify no temp files leaked to /tmp.  'find' exits 0 on no matches, unlike
# the previous 'ls | wc -l || echo 0', which could emit "0\n0" under pipefail.
LEAKED=$(find /tmp -maxdepth 1 -name "ec-repair-${ECVOL}-*" 2>/dev/null | wc -l)
if [ "$LEAKED" -gt 0 ]; then
  fail "Found $LEAKED leaked ec-repair temp dirs in /tmp (workingDir fix not applied)"
else
  pass "No ec-repair temp files leaked to /tmp"
fi

# A second detection pass should report nothing left to repair.  Fallbacks
# ensure REMAINING is always an integer so the -eq test below cannot error.
DETECT2=$(curl -sf -X POST "http://${MASTER_IP}:${ADMIN_PORT}/api/plugin/job-types/ec_repair/detect" \
  -H "Content-Type: application/json" \
  -d '{"max_results": 100, "timeout_seconds": 60}' || echo "{}")
REMAINING=$(echo "$DETECT2" | python3 -c \
  "import sys,json; print(json.load(sys.stdin).get('total_proposals', -1))" 2>/dev/null || echo "-1")

if [ "$REMAINING" -eq 0 ]; then
  pass "Second detection confirms no remaining issues"
else
  fail "Second detection still found $REMAINING repair proposals"
fi

# ── Summary ────────────────────────────────────────────────────────────

echo ""
if [ "$EXIT_CODE" -eq 0 ]; then
  echo -e "${GREEN}All EC repair plugin worker tests passed${NC}"
else
  echo -e "${RED}Some tests failed${NC}"
fi

exit $EXIT_CODE

0 commit comments

Comments
 (0)