Skip to content

Commit 14a4262

Browse files
authored
add big tensor monitor config (#592)
1 parent 68d1605 commit 14a4262

File tree

2 files changed

+3447
-0
lines changed

2 files changed

+3447
-0
lines changed
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
#!/bin/bash
#
# Launch engineV2.py against the big-tensor GPU monitoring configs and record
# its combined stdout/stderr in a timestamped log file under LOG_DIR.
#
# Usage:
#   ./run.sh                 run in the foreground, mirroring output to the log
#   backprocess=1 ./run.sh   run detached via nohup and print monitoring hints
#
# Environment:
#   backprocess  optional; "1" selects background mode, anything else (or
#                unset) selects foreground mode.

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# When NUM_GPUS != 0, engineV2 is not affected by an externally set
# "CUDA_VISIBLE_DEVICES".
# FILE_INPUT="tester/api_config/8_big_tensor/big_tensor_1_8.txt"
FILE_PATTERN="tester/api_config/monitor_config/big_tensor/GPU/monitoring_configs*.txt"
LOG_DIR="tester/api_config/test_log_gpu_bigtensor_full"
NUM_GPUS=-1
NUM_WORKERS_PER_GPU=1
GPU_IDS="-1"
# REQUIRED_MEMORY=10

# Test-mode switches forwarded to engineV2.py; uncomment to enable a mode.
TEST_MODE_ARGS=(
    --accuracy=True
    # --paddle_only=True
    # --paddle_cinn=True
    # --test_amp=True
    # --test_cpu=True
    --use_cached_numpy=True
)

# Input selection and log destination.
IN_OUT_ARGS=(
    # --api_config_file="$FILE_INPUT"
    --api_config_file_pattern="$FILE_PATTERN"
    --log_dir="$LOG_DIR"
)

# Parallelism settings.
PARALLEL_ARGS=(
    --num_gpus="$NUM_GPUS"
    --num_workers_per_gpu="$NUM_WORKERS_PER_GPU"
    --gpu_ids="$GPU_IDS"
    # --required_memory="$REQUIRED_MEMORY"
)

mkdir -p "$LOG_DIR" || {
    echo "错误:无法创建日志目录 '$LOG_DIR'" >&2
    exit 1
}

# ---------------------------------------------------------------------------
# Run the program
# ---------------------------------------------------------------------------
LOG_FILE="$LOG_DIR/log_$(date +%Y%m%d_%H%M%S).log"

# Default to foreground mode when the caller did not set `backprocess`;
# comparing as a string avoids "integer expression expected" on empty or
# non-numeric values.
if [[ "${backprocess:-0}" == "1" ]]; then
    nohup python engineV2.py \
        "${TEST_MODE_ARGS[@]}" \
        "${IN_OUT_ARGS[@]}" \
        "${PARALLEL_ARGS[@]}" \
        >> "$LOG_FILE" 2>&1 &
    PYTHON_PID=$!

    # Give the process a moment to start, then make sure it is still alive.
    sleep 1
    if ! ps -p "$PYTHON_PID" > /dev/null; then
        echo "错误:engineV2 启动失败,请检查 $LOG_FILE" >&2
        exit 1
    fi

    echo -e "\n\033[32m执行中... 另开终端运行监控:\033[0m"
    echo -e "1. GPU使用:   watch -n 1 nvidia-smi"
    echo -e "2. 日志目录:  ls -lh $LOG_DIR"
    echo -e "3. 详细日志:  tail -f $LOG_FILE"
    echo -e "4. 终止任务:  kill $PYTHON_PID"
    echo -e "\n进程已在后台运行,关闭终端不会影响进程执行"
else
    python engineV2.py \
        "${TEST_MODE_ARGS[@]}" \
        "${IN_OUT_ARGS[@]}" \
        "${PARALLEL_ARGS[@]}" \
        2>&1 | tee -a "$LOG_FILE"
    # The pipeline's own status is tee's; PIPESTATUS[0] is python's. It must
    # be read immediately, before any other command overwrites PIPESTATUS.
    # ($! is not usable here: it only tracks background jobs.)
    PYTHON_STATUS=${PIPESTATUS[0]}

    if [[ "$PYTHON_STATUS" -ne 0 ]]; then
        echo "错误:engineV2 执行失败,请检查 $LOG_FILE" >&2
        exit 1
    fi
fi

exit 0

# watch -n 1 nvidia-smi --query-compute-apps=pid,process_name,used_memory,gpu_uuid --format=csv

0 commit comments

Comments
 (0)