-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwait_and_train.sh
More file actions
executable file
·46 lines (38 loc) · 1.32 KB
/
wait_and_train.sh
File metadata and controls
executable file
·46 lines (38 loc) · 1.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/usr/bin/env bash
# Wait for Wikipedia dump download to finish, then start BM25 training.
# Usage: nohup bash wait_and_train.sh > data/train.log 2>&1 &
#
# The download is treated as finished when either
#   * the dump has reached EXPECTED_SIZE bytes, or
#   * a non-empty dump has not changed size for
#     STABLE_POLLS * POLL_INTERVAL seconds.
set -euo pipefail

SCRIPT_DIR="$(cd -- "$(dirname -- "$0")" && pwd)"
DUMP="$SCRIPT_DIR/data/ruwiki-latest-pages-articles.xml.bz2"
EXPECTED_SIZE=5911642060
readonly POLL_INTERVAL=5 # seconds between two size checks
readonly STABLE_POLLS=6  # consecutive unchanged checks required (6 * 5s = 30s)

#######################################
# Print the current size of the dump file in bytes.
# Globals:   DUMP (read)
# Arguments: none
# Outputs:   the size on stdout; 0 when the file is missing or unreadable
# Returns:   always 0, so callers are safe under `set -e`
#######################################
dump_size() {
  local bytes
  # `stat -c` is the GNU/BusyBox spelling, `stat -f` the BSD/macOS one.
  # Each attempt is captured separately so that partial output from a
  # failed attempt can never leak into the result.
  if bytes=$(stat -c '%s' -- "$DUMP" 2>/dev/null) ||
    bytes=$(stat -f '%z' -- "$DUMP" 2>/dev/null); then
    if [[ "$bytes" =~ ^[0-9]+$ ]]; then
      printf '%s\n' "$bytes"
      return 0
    fi
  fi
  printf '0\n'
}

echo "[$(date)] Waiting for download to complete..."

# Wait until the file reaches the expected size or stops growing.
PREV_SIZE=0
STABLE_COUNT=0
while true; do
  CURR_SIZE=$(dump_size)

  if ((CURR_SIZE >= EXPECTED_SIZE)); then
    echo "[$(date)] Download complete! Size: $CURR_SIZE bytes"
    break
  fi

  # A missing or still-empty file reports 0 on every poll. That means
  # "download not started yet", not "download stalled", so it must never
  # count towards the stability threshold.
  if ((CURR_SIZE > 0 && CURR_SIZE == PREV_SIZE)); then
    STABLE_COUNT=$((STABLE_COUNT + 1))
    if ((STABLE_COUNT >= STABLE_POLLS)); then
      echo "[$(date)] File size stable at $CURR_SIZE for $((STABLE_POLLS * POLL_INTERVAL))s. Assuming download complete."
      # We only get here below EXPECTED_SIZE, so make the risk of a
      # truncated dump visible in the log instead of proceeding silently.
      echo "[$(date)] WARNING: size $CURR_SIZE is below the expected $EXPECTED_SIZE bytes; the dump may be truncated." >&2
      break
    fi
  else
    STABLE_COUNT=0
  fi

  PREV_SIZE=$CURR_SIZE
  sleep "$POLL_INTERVAL"
done
# Launch BM25 training from the script directory so that the relative
# paths below (.venv, data/, output file) resolve next to this script.
# Under `set -e` a failing `cd` or a non-zero trainer exit aborts the
# script before the completion message is printed.
printf '[%s] Starting BM25 training...\n' "$(date)"
cd "$SCRIPT_DIR"

# Trainer command line, kept as an array so every argument stays one word.
train_cmd=(
  .venv/bin/python train_bm25_wiki.py
  --input data/ruwiki-latest-pages-articles.xml.bz2
  --output bm25_ru_default.json
  --checkpoint data/checkpoint.json
  --checkpoint-every 100000
)

# PYTHONUNBUFFERED=1 is set for the trainer process only.
PYTHONUNBUFFERED=1 "${train_cmd[@]}"

printf '[%s] Training complete!\n' "$(date)"
# Show the size of the resulting model file.
ls -lh bm25_ru_default.json