版本和环境信息
- NebulaGraph 版本: 3.6.0
- 部署方式:基于源码编译,k8s容器化部署(3节点)
当前现象
测试发现创建图空间时,同时下发leader均衡任务会出现storage服务offline的现象,且服务容器CPU占用接近100%,增删改查任务失败
期望现象
创建图空间成功,leader均衡任务成功,服务运行稳定,增删改查任务正常
复现步骤
- 预置图空间 space1,nGQL:create space space1(partition_num=20,replica_factor=3,vid_type=fixed_string(32));
- 同时执行压测脚本
bash create_space.sh & bash leader_balance.sh &(两个脚本在后台并发运行;注意 `&` 后面不能再接 `;`,否则 shell 会报语法错误;脚本使用了数组、`[[ ]]` 等 bash 语法,需用 bash 执行)。压测脚本如下:
# create_space.sh script: creates graph spaces in a loop while leader_balance.sh runs concurrently
#!/bin/bash
# Directory that holds the nebula-console binary; the expect transcripts are written here as well.
nebula_console_path="/usr/local/nebula/scripts"
# Password sent at the console's Password prompt (hard-coded for this test setup).
pwd='nebula'
#######################################
# Create one graph space by driving nebula-console through expect.
# Globals:   nebula_console_path (read), pwd (read), GRAPHD_SERVICE (read)
# Arguments: $1 - name of the graph space to create (required)
# Outputs:   appends the expect session transcript to
#            $nebula_console_path/create_space
# Returns:   1 when no space name is given, otherwise the exit status of expect
#######################################
function create_space() {
  local space=${1:-}
  # Without a name the console would receive "create space (...)"; refuse early.
  if [[ -z "$space" ]]; then
    echo "create_space: missing space name" >&2
    return 1
  fi
  local keytoolCmd="$nebula_console_path/nebula-console -addr $GRAPHD_SERVICE -port 9669 -t 0 -u root"
  # The heredoc delimiter is left unquoted on purpose: the shell substitutes
  # ${keytoolCmd}, $pwd and $space, while "\n" is passed through to expect.
  expect <<EOF >>"$nebula_console_path/create_space"
set timeout 100
spawn ${keytoolCmd}
expect "*Password*" {send "$pwd\n"}
expect "*]>" {send "create space $space(partition_num=20,replica_factor=3,vid_type=fixed_string(32)); \n"}
expect "*]>" {send "quit\n"}
expect eof
EOF
}
#######################################
# Drop one graph space by driving nebula-console through expect.
# Globals:   nebula_console_path (read), pwd (read), GRAPHD_SERVICE (read)
# Arguments: $1 - name of the graph space to drop (required)
# Outputs:   appends the expect session transcript to
#            $nebula_console_path/drop_space
# Returns:   1 when no space name is given, otherwise the exit status of expect
#######################################
function drop_space() {
  local space=${1:-}
  # Without a name the console would receive "drop space ;"; refuse early.
  if [[ -z "$space" ]]; then
    echo "drop_space: missing space name" >&2
    return 1
  fi
  local keytoolCmd="$nebula_console_path/nebula-console -addr $GRAPHD_SERVICE -port 9669 -t 0 -u root"
  # The heredoc delimiter is left unquoted on purpose: the shell substitutes
  # ${keytoolCmd}, $pwd and $space, while "\n" is passed through to expect.
  expect <<EOF >>"$nebula_console_path/drop_space"
set timeout 100
spawn ${keytoolCmd}
expect "*Password*" {send "$pwd\n"}
expect "*]>" {send "drop space $space; \n"}
expect "*]>" {send "quit\n"}
expect eof
EOF
}
#######################################
# Check whether any storaged host is reported as OFFLINE.
# Runs "show hosts storage" through nebula-console (driven by expect) and
# looks for the text OFFLINE in the session output.
# Globals:   nebula_console_path (read), pwd (read), GRAPHD_SERVICE (read)
# Arguments: none
# Outputs:   overwrites $nebula_console_path/check_node_status with the
#            session transcript; prints "storage is offline" to stdout when
#            OFFLINE is found
# Returns:   1 when the output contains OFFLINE, 0 otherwise
# NOTE: a session that produces no OFFLINE text (for example one that never
# reaches the prompt) is reported as healthy.
#######################################
function check_node_status() {
  local keytoolCmd="$nebula_console_path/nebula-console -addr $GRAPHD_SERVICE -port 9669 -t 0 -u root"
  local status_file="$nebula_console_path/check_node_status"
  local hosts_output
  # Capture the transcript in memory. create_space.sh and leader_balance.sh run
  # concurrently and both truncate the same status file, so deciding from the
  # file could read the other script's half-written output.
  hosts_output=$(
expect <<EOF
set timeout 100
spawn ${keytoolCmd}
expect "*Password*" {send "$pwd\n"}
expect "*]>" {send "show hosts storage; \n"}
expect "*]>" {send "quit\n"}
expect eof
EOF
  )
  # Still leave the transcript on disk for inspection; a write failure must
  # not hide an OFFLINE host, so it only produces a warning.
  printf '%s\n' "$hosts_output" >"$status_file" \
    || echo "check_node_status: cannot write $status_file" >&2
  if [[ "$hosts_output" == *OFFLINE* ]]; then
    echo "storage is offline"
    return 1
  fi
  return 0
}
# Graph space names used by the stress loop: "basketballplayer" followed by
# "basketballplayer1" through "basketballplayer50" (51 names in total).
names=("basketballplayer" "basketballplayer"{1..50})
#######################################
# Stress loop: create every space in names, then drop every space, forever.
# Before each create the storage hosts are checked; the script stops as soon
# as check_node_status reports a problem.
# Globals:   names (read)
# Arguments: none
# Outputs:   prints "have some storaged offline" before exiting
# Returns:   does not return; exits the script with status 0 once an offline
#            host is detected
#######################################
function main() {
  local name
  while true; do
    for name in "${names[@]}"; do
      # Stop the whole run once any storaged is offline.
      if ! check_node_status; then
        echo "have some storaged offline"
        exit 0
      fi
      # Create the graph space, then pause before the next one.
      create_space "$name"
      sleep 2
    done
    for name in "${names[@]}"; do
      drop_space "$name"
      sleep 2
    done
  done
}
# Entry point: start the create/drop stress loop.
main "$@"
# leader_balance.sh script: submits leader-balance jobs in a loop while create_space.sh runs concurrently
#!/bin/bash
# Directory that holds the nebula-console binary; the expect transcripts are written here as well.
nebula_console_path="/usr/local/nebula/scripts"
# Password sent at the console's Password prompt (hard-coded for this test setup).
pwd='nebula'
#######################################
# Submit a leader-balance job by driving nebula-console through expect.
# Globals:   nebula_console_path (read), pwd (read), GRAPHD_SERVICE (read)
# Arguments: $1 - graph space to switch to before submitting the job
#                 (optional, defaults to space1)
# Outputs:   appends the expect session transcript to
#            $nebula_console_path/leader_balance
# Returns:   the exit status of expect
#######################################
function leader_balance() {
  local target_space=${1:-space1}
  local keytoolCmd="$nebula_console_path/nebula-console -addr $GRAPHD_SERVICE -port 9669 -t 0 -u root"
  # The heredoc delimiter is left unquoted on purpose: the shell substitutes
  # ${keytoolCmd}, $pwd and $target_space, while "\n" is passed through to expect.
  expect <<EOF >>"$nebula_console_path/leader_balance"
set timeout 100
spawn ${keytoolCmd}
expect "*Password*" {send "$pwd\n"}
expect "*]>" {send "use $target_space;submit job balance leader \n"}
expect "*]>" {send "quit\n"}
expect eof
EOF
}
#######################################
# Check whether any storaged host is reported as OFFLINE.
# Runs "show hosts storage" through nebula-console (driven by expect) and
# looks for the text OFFLINE in the session output.
# Globals:   nebula_console_path (read), pwd (read), GRAPHD_SERVICE (read)
# Arguments: none
# Outputs:   overwrites $nebula_console_path/check_node_status with the
#            session transcript; prints "storage is offline" to stdout when
#            OFFLINE is found
# Returns:   1 when the output contains OFFLINE, 0 otherwise
# NOTE: a session that produces no OFFLINE text (for example one that never
# reaches the prompt) is reported as healthy.
#######################################
function check_node_status() {
  local keytoolCmd="$nebula_console_path/nebula-console -addr $GRAPHD_SERVICE -port 9669 -t 0 -u root"
  local status_file="$nebula_console_path/check_node_status"
  local hosts_output
  # Capture the transcript in memory. create_space.sh and leader_balance.sh run
  # concurrently and both truncate the same status file, so deciding from the
  # file could read the other script's half-written output.
  hosts_output=$(
expect <<EOF
set timeout 100
spawn ${keytoolCmd}
expect "*Password*" {send "$pwd\n"}
expect "*]>" {send "show hosts storage; \n"}
expect "*]>" {send "quit\n"}
expect eof
EOF
  )
  # Still leave the transcript on disk for inspection; a write failure must
  # not hide an OFFLINE host, so it only produces a warning.
  printf '%s\n' "$hosts_output" >"$status_file" \
    || echo "check_node_status: cannot write $status_file" >&2
  if [[ "$hosts_output" == *OFFLINE* ]]; then
    echo "storage is offline"
    return 1
  fi
  return 0
}
#######################################
# Balance loop: as long as check_node_status succeeds, submit a leader-balance
# job and wait 10 seconds before the next round.
# Arguments: none
# Outputs:   prints "have some storaged offline" before exiting
# Returns:   does not return; exits the script with status 0 once
#            check_node_status fails
#######################################
function main() {
  while check_node_status; do
    leader_balance
    sleep 10
  done
  # Reached only when the status check failed.
  echo "have some storaged offline"
  exit 0
}
# Entry point: start the leader-balance loop.
main "$@"
版本和环境信息
当前现象
测试发现创建图空间时,同时下发leader均衡任务会出现storage服务offline的现象,且服务容器CPU占用接近100%,增删改查任务失败
期望现象
创建图空间成功,leader均衡任务成功,服务运行稳定,增删改查任务正常
复现步骤
bash create_space.sh & bash leader_balance.sh &(两个脚本在后台并发运行;注意 `&` 后面不能再接 `;`,否则 shell 会报语法错误;脚本使用了数组、`[[ ]]` 等 bash 语法,需用 bash 执行)。压测脚本如下: